/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Core of the accelerated CRC algorithm.
 * In your file, define the constants and CRC_FUNCTION_NAME,
 * then include this file.
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in these 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * https://en.wikipedia.org/wiki/Barrett_reduction
 *
 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
 */
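/*
 * Illustrative only: a rough C-like sketch of the Barrett step described
 * above, matching the comments in the reduction code near the end of this
 * file (clmul() stands for a carry-less multiply and is not a real helper):
 *
 *   // n = the 33-bit CRC polynomial, m = floor(x^64 / n), both precomputed
 *   q = clmul(a, m) >> 64;     // estimate of floor(a / n)
 *   r = a ^ clmul(q, n);       // a - q*n in GF(2); the CRC is the low 32 bits
 */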

#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>
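/*
 * VPMSUMD/VPMSUMW and MTVRD/MFVRD below are macros supplied by the headers
 * above, presumably emitted as raw opcodes so the file also builds with
 * assemblers that lack the POWER8 mnemonics. vpmsumd performs two 64 x 64
 * carry-less (GF(2)) multiplies, one per doubleword, and xors the products
 * into a single 128 bit result; vpmsumw does the same with four 32 x 32
 * products.
 */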

#define MAX_SIZE    32768

    .text

#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif
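/*
 * In other words, the data must reach vpmsum in little endian byte order for
 * reflected CRCs and in big endian order otherwise, so BYTESWAP_DATA is
 * defined whenever the build's endianness does not already match.
 */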

#define off16       r25
#define off32       r26
#define off48       r27
#define off64       r28
#define off80       r29
#define off96       r30
#define off112      r31

#define const1      v24
#define const2      v25

#define byteswap    v26
#define mask_32bit  v27
#define mask_64bit  v28
#define zeroes      v29

#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif
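/*
 * VPERM expands to a real vperm (a byte shuffle through the byteswap
 * constant) only when BYTESWAP_DATA is defined; otherwise it expands to
 * nothing, so builds with matching endianness pay no cost for the swaps.
 */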

/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
FUNC_START(CRC_FUNCTION_NAME)
    std r31,-8(r1)
    std r30,-16(r1)
    std r29,-24(r1)
    std r28,-32(r1)
    std r27,-40(r1)
    std r26,-48(r1)
    std r25,-56(r1)

    li  off16,16
    li  off32,32
    li  off48,48
    li  off64,64
    li  off80,80
    li  off96,96
    li  off112,112
    li  r0,0

    /* Enough room for saving 10 non-volatile VMX registers */
    subi    r6,r1,56+10*16
    subi    r7,r1,56+2*16
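    /*
     * The GPR stores above and the vector stores below all land below the
     * stack pointer; the 64-bit ELF ABIs guarantee a 288 byte protected area
     * (red zone) under r1, so this leaf function never sets up a stack frame.
     */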

    stvx    v20,0,r6
    stvx    v21,off16,r6
    stvx    v22,off32,r6
    stvx    v23,off48,r6
    stvx    v24,off64,r6
    stvx    v25,off80,r6
    stvx    v26,off96,r6
    stvx    v27,off112,r6
    stvx    v28,0,r7
    stvx    v29,off16,r7

    mr  r10,r3

    vxor    zeroes,zeroes,zeroes
    vspltisw v0,-1

    vsldoi  mask_32bit,zeroes,v0,4
    vsldoi  mask_64bit,zeroes,v0,8

    /* Get the initial value into v8 */
    vxor    v8,v8,v8
    MTVRD(v8, R3)
#ifdef REFLECT
    vsldoi  v8,zeroes,v8,8  /* shift into bottom 32 bits */
#else
    vsldoi  v8,v8,zeroes,4  /* shift into top 32 bits */
#endif

#ifdef BYTESWAP_DATA
    addis   r3,r2,.byteswap_constant@toc@ha
    addi    r3,r3,.byteswap_constant@toc@l

    lvx byteswap,0,r3
    addi    r3,r3,16
#endif

    cmpdi   r5,256
    blt .Lshort

    rldicr  r6,r5,0,56

    /* Checksum in blocks of MAX_SIZE */
1:  lis r7,MAX_SIZE@h
    ori r7,r7,MAX_SIZE@l
    mr  r9,r7
    cmpd    r6,r7
    bgt 2f
    mr  r7,r6
2:  subf    r6,r7,r6

    /* our main loop does 128 bytes at a time */
    srdi    r7,r7,7

    /*
     * Work out the offset into the constants table to start at. Each
     * constant is 16 bytes, and it is used against 128 bytes of input
     * data - 128 / 16 = 8
     */
    sldi    r8,r7,4
    srdi    r9,r9,3
    subf    r8,r8,r9
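    /*
     * For example, a full 32 kB block gives r7 = 32768/128 = 256 iterations,
     * r8 = 256*16 = 4096 and r9 = 32768/8 = 4096, so the offset is 0 and we
     * start at the top of the table; a 16 kB block starts 2048 bytes in.
     */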

    /* We reduce our final 128 bytes in a separate step */
    addi    r7,r7,-1
    mtctr   r7

    addis   r3,r2,.constants@toc@ha
    addi    r3,r3,.constants@toc@l

    /* Find the start of our constants */
    add r3,r3,r8

    /* zero v0-v7 which will contain our checksums */
    vxor    v0,v0,v0
    vxor    v1,v1,v1
    vxor    v2,v2,v2
    vxor    v3,v3,v3
    vxor    v4,v4,v4
    vxor    v5,v5,v5
    vxor    v6,v6,v6
    vxor    v7,v7,v7

    lvx const1,0,r3

    /*
     * If we are looping back to consume more data we use the values
     * already in v16-v23.
     */
    cmpdi   r0,1
    beq 2f

    /* First warm up pass */
    lvx v16,0,r4
    lvx v17,off16,r4
    VPERM(v16,v16,v16,byteswap)
    VPERM(v17,v17,v17,byteswap)
    lvx v18,off32,r4
    lvx v19,off48,r4
    VPERM(v18,v18,v18,byteswap)
    VPERM(v19,v19,v19,byteswap)
    lvx v20,off64,r4
    lvx v21,off80,r4
    VPERM(v20,v20,v20,byteswap)
    VPERM(v21,v21,v21,byteswap)
    lvx v22,off96,r4
    lvx v23,off112,r4
    VPERM(v22,v22,v22,byteswap)
    VPERM(v23,v23,v23,byteswap)
    addi    r4,r4,8*16

    /* xor in initial value */
    vxor    v16,v16,v8

2:  bdz .Lfirst_warm_up_done

    addi    r3,r3,16
    lvx const2,0,r3

    /* Second warm up pass */
    VPMSUMD(v8,v16,const1)
    lvx v16,0,r4
    VPERM(v16,v16,v16,byteswap)
    ori r2,r2,0
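    /*
     * ori r2,r2,0 is a no-op (it ORs the TOC pointer with zero); the copies
     * scattered through the unrolled code appear to serve only as padding to
     * space out the instruction stream for the dispatcher.
     */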

    VPMSUMD(v9,v17,const1)
    lvx v17,off16,r4
    VPERM(v17,v17,v17,byteswap)
    ori r2,r2,0

    VPMSUMD(v10,v18,const1)
    lvx v18,off32,r4
    VPERM(v18,v18,v18,byteswap)
    ori r2,r2,0

    VPMSUMD(v11,v19,const1)
    lvx v19,off48,r4
    VPERM(v19,v19,v19,byteswap)
    ori r2,r2,0

    VPMSUMD(v12,v20,const1)
    lvx v20,off64,r4
    VPERM(v20,v20,v20,byteswap)
    ori r2,r2,0

    VPMSUMD(v13,v21,const1)
    lvx v21,off80,r4
    VPERM(v21,v21,v21,byteswap)
    ori r2,r2,0

    VPMSUMD(v14,v22,const1)
    lvx v22,off96,r4
    VPERM(v22,v22,v22,byteswap)
    ori r2,r2,0

    VPMSUMD(v15,v23,const1)
    lvx v23,off112,r4
    VPERM(v23,v23,v23,byteswap)

    addi    r4,r4,8*16

    bdz .Lfirst_cool_down

    /*
     * main loop. We modulo schedule it such that it takes three iterations
     * to complete - first iteration load, second iteration vpmsum, third
     * iteration xor.
     */
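    /*
     * Concretely, each pass xors the vpmsum results computed on the previous
     * pass into the checksums v0-v7, computes new vpmsums into v8-v15 from
     * the data loaded on the previous pass, and loads fresh data into
     * v16-v23 for the next pass.
     */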
    .balign 16
4:  lvx const1,0,r3
    addi    r3,r3,16
    ori r2,r2,0

    vxor    v0,v0,v8
    VPMSUMD(v8,v16,const2)
    lvx v16,0,r4
    VPERM(v16,v16,v16,byteswap)
    ori r2,r2,0

    vxor    v1,v1,v9
    VPMSUMD(v9,v17,const2)
    lvx v17,off16,r4
    VPERM(v17,v17,v17,byteswap)
    ori r2,r2,0

    vxor    v2,v2,v10
    VPMSUMD(v10,v18,const2)
    lvx v18,off32,r4
    VPERM(v18,v18,v18,byteswap)
    ori r2,r2,0

    vxor    v3,v3,v11
    VPMSUMD(v11,v19,const2)
    lvx v19,off48,r4
    VPERM(v19,v19,v19,byteswap)
    lvx const2,0,r3
    ori r2,r2,0

    vxor    v4,v4,v12
    VPMSUMD(v12,v20,const1)
    lvx v20,off64,r4
    VPERM(v20,v20,v20,byteswap)
    ori r2,r2,0

    vxor    v5,v5,v13
    VPMSUMD(v13,v21,const1)
    lvx v21,off80,r4
    VPERM(v21,v21,v21,byteswap)
    ori r2,r2,0

    vxor    v6,v6,v14
    VPMSUMD(v14,v22,const1)
    lvx v22,off96,r4
    VPERM(v22,v22,v22,byteswap)
    ori r2,r2,0

    vxor    v7,v7,v15
    VPMSUMD(v15,v23,const1)
    lvx v23,off112,r4
    VPERM(v23,v23,v23,byteswap)

    addi    r4,r4,8*16

    bdnz    4b

.Lfirst_cool_down:
    /* First cool down pass */
    lvx const1,0,r3
    addi    r3,r3,16

    vxor    v0,v0,v8
    VPMSUMD(v8,v16,const1)
    ori r2,r2,0

    vxor    v1,v1,v9
    VPMSUMD(v9,v17,const1)
    ori r2,r2,0

    vxor    v2,v2,v10
    VPMSUMD(v10,v18,const1)
    ori r2,r2,0

    vxor    v3,v3,v11
    VPMSUMD(v11,v19,const1)
    ori r2,r2,0

    vxor    v4,v4,v12
    VPMSUMD(v12,v20,const1)
    ori r2,r2,0

    vxor    v5,v5,v13
    VPMSUMD(v13,v21,const1)
    ori r2,r2,0

    vxor    v6,v6,v14
    VPMSUMD(v14,v22,const1)
    ori r2,r2,0

    vxor    v7,v7,v15
    VPMSUMD(v15,v23,const1)
    ori r2,r2,0

.Lsecond_cool_down:
    /* Second cool down pass */
    vxor    v0,v0,v8
    vxor    v1,v1,v9
    vxor    v2,v2,v10
    vxor    v3,v3,v11
    vxor    v4,v4,v12
    vxor    v5,v5,v13
    vxor    v6,v6,v14
    vxor    v7,v7,v15

#ifdef REFLECT
    /*
     * vpmsumd produces a 96 bit result in the least significant bits
     * of the register. Since we are bit reflected we have to shift it
     * left 32 bits so it occupies the least significant bits in the
     * bit reflected domain.
     */
    vsldoi  v0,v0,zeroes,4
    vsldoi  v1,v1,zeroes,4
    vsldoi  v2,v2,zeroes,4
    vsldoi  v3,v3,zeroes,4
    vsldoi  v4,v4,zeroes,4
    vsldoi  v5,v5,zeroes,4
    vsldoi  v6,v6,zeroes,4
    vsldoi  v7,v7,zeroes,4
#endif

    /* xor with last 1024 bits */
    lvx v8,0,r4
    lvx v9,off16,r4
    VPERM(v8,v8,v8,byteswap)
    VPERM(v9,v9,v9,byteswap)
    lvx v10,off32,r4
    lvx v11,off48,r4
    VPERM(v10,v10,v10,byteswap)
    VPERM(v11,v11,v11,byteswap)
    lvx v12,off64,r4
    lvx v13,off80,r4
    VPERM(v12,v12,v12,byteswap)
    VPERM(v13,v13,v13,byteswap)
    lvx v14,off96,r4
    lvx v15,off112,r4
    VPERM(v14,v14,v14,byteswap)
    VPERM(v15,v15,v15,byteswap)

    addi    r4,r4,8*16

    vxor    v16,v0,v8
    vxor    v17,v1,v9
    vxor    v18,v2,v10
    vxor    v19,v3,v11
    vxor    v20,v4,v12
    vxor    v21,v5,v13
    vxor    v22,v6,v14
    vxor    v23,v7,v15

    li  r0,1
    cmpdi   r6,0
    addi    r6,r6,128
    bne 1b

    /* Work out how many bytes we have left */
    andi.   r5,r5,127

    /* Calculate where in the constant table we need to start */
    subfic  r6,r5,128
    add r3,r3,r6

    /* How many 16 byte chunks are in the tail */
    srdi    r7,r5,4
    mtctr   r7

    /*
     * Reduce the previously calculated 1024 bits to 64 bits, shifting
     * 32 bits to include the trailing 32 bits of zeros
     */
    lvx v0,0,r3
    lvx v1,off16,r3
    lvx v2,off32,r3
    lvx v3,off48,r3
    lvx v4,off64,r3
    lvx v5,off80,r3
    lvx v6,off96,r3
    lvx v7,off112,r3
    addi    r3,r3,8*16

    VPMSUMW(v0,v16,v0)
    VPMSUMW(v1,v17,v1)
    VPMSUMW(v2,v18,v2)
    VPMSUMW(v3,v19,v3)
    VPMSUMW(v4,v20,v4)
    VPMSUMW(v5,v21,v5)
    VPMSUMW(v6,v22,v6)
    VPMSUMW(v7,v23,v7)

    /* Now reduce the tail (0 - 112 bytes) */
    cmpdi   r7,0
    beq 1f

    lvx v16,0,r4
    lvx v17,0,r3
    VPERM(v16,v16,v16,byteswap)
    VPMSUMW(v16,v16,v17)
    vxor    v0,v0,v16
    bdz 1f

    lvx v16,off16,r4
    lvx v17,off16,r3
    VPERM(v16,v16,v16,byteswap)
    VPMSUMW(v16,v16,v17)
    vxor    v0,v0,v16
    bdz 1f

    lvx v16,off32,r4
    lvx v17,off32,r3
    VPERM(v16,v16,v16,byteswap)
    VPMSUMW(v16,v16,v17)
    vxor    v0,v0,v16
    bdz 1f

    lvx v16,off48,r4
    lvx v17,off48,r3
    VPERM(v16,v16,v16,byteswap)
    VPMSUMW(v16,v16,v17)
    vxor    v0,v0,v16
    bdz 1f

    lvx v16,off64,r4
    lvx v17,off64,r3
    VPERM(v16,v16,v16,byteswap)
    VPMSUMW(v16,v16,v17)
    vxor    v0,v0,v16
    bdz 1f

    lvx v16,off80,r4
    lvx v17,off80,r3
    VPERM(v16,v16,v16,byteswap)
    VPMSUMW(v16,v16,v17)
    vxor    v0,v0,v16
    bdz 1f

    lvx v16,off96,r4
    lvx v17,off96,r3
    VPERM(v16,v16,v16,byteswap)
    VPMSUMW(v16,v16,v17)
    vxor    v0,v0,v16

    /* Now xor all the parallel chunks together */
1:  vxor    v0,v0,v1
    vxor    v2,v2,v3
    vxor    v4,v4,v5
    vxor    v6,v6,v7

    vxor    v0,v0,v2
    vxor    v4,v4,v6

    vxor    v0,v0,v4

.Lbarrett_reduction:
    /* Barrett constants */
    addis   r3,r2,.barrett_constants@toc@ha
    addi    r3,r3,.barrett_constants@toc@l

    lvx const1,0,r3
    lvx const2,off16,r3

    vsldoi  v1,v0,v0,8
    vxor    v0,v0,v1        /* xor two 64 bit results together */

#ifdef REFLECT
    /* shift left one bit */
    vspltisb v1,1
    vsl v0,v0,v1
#endif
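    /*
     * The one bit shift compensates for carry-less multiplication in the
     * bit reflected domain: the product of two reflected values is the
     * reflected product shifted down by one bit, so the accumulated result
     * is realigned here before the reduction below.
     */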

    vand    v0,v0,mask_64bit
#ifndef REFLECT
    /*
     * Now for the Barrett reduction algorithm. The idea is to calculate q,
     * the multiple of our polynomial that we need to subtract. By
     * doing the computation 2x bits higher (i.e. 64 bits) and shifting the
     * result back down 2x bits, we round down to the nearest multiple.
     */
    VPMSUMD(v1,v0,const1)   /* ma */
    vsldoi  v1,zeroes,v1,8  /* q = floor(ma/(2^64)) */
    VPMSUMD(v1,v1,const2)   /* qn */
    vxor    v0,v0,v1    /* a - qn, subtraction is xor in GF(2) */

    /*
     * Get the result into r3. We need to shift it left 8 bytes:
     * V0 [ 0 1 2 X ]
     * V0 [ 0 X 2 3 ]
     */
    vsldoi  v0,v0,zeroes,8  /* shift result into top 64 bits */
#else
    /*
     * The reflected version of Barrett reduction. Instead of bit
     * reflecting our data (which is expensive to do), we bit reflect our
     * constants and our algorithm, which means the intermediate data in
     * our vector registers goes from 0-63 instead of 63-0. We can reflect
     * the algorithm because we don't carry in mod 2 arithmetic.
     */
    vand    v1,v0,mask_32bit    /* bottom 32 bits of a */
    VPMSUMD(v1,v1,const1)       /* ma */
    vand    v1,v1,mask_32bit    /* bottom 32 bits of ma */
    VPMSUMD(v1,v1,const2)       /* qn */
    vxor    v0,v0,v1        /* a - qn, subtraction is xor in GF(2) */

    /*
     * Since we are bit reflected, the result (i.e. the low 32 bits) is in
     * the high 32 bits. We just need to shift it left 4 bytes:
     * V0 [ 0 1 X 3 ]
     * V0 [ 0 X 2 3 ]
     */
    vsldoi  v0,v0,zeroes,4      /* shift result into top 64 bits */
#endif

    /* Get it into r3 */
    MFVRD(R3, v0)

.Lout:
    subi    r6,r1,56+10*16
    subi    r7,r1,56+2*16

    lvx v20,0,r6
    lvx v21,off16,r6
    lvx v22,off32,r6
    lvx v23,off48,r6
    lvx v24,off64,r6
    lvx v25,off80,r6
    lvx v26,off96,r6
    lvx v27,off112,r6
    lvx v28,0,r7
    lvx v29,off16,r7

    ld  r31,-8(r1)
    ld  r30,-16(r1)
    ld  r29,-24(r1)
    ld  r28,-32(r1)
    ld  r27,-40(r1)
    ld  r26,-48(r1)
    ld  r25,-56(r1)

    blr

.Lfirst_warm_up_done:
    lvx const1,0,r3
    addi    r3,r3,16

    VPMSUMD(v8,v16,const1)
    VPMSUMD(v9,v17,const1)
    VPMSUMD(v10,v18,const1)
    VPMSUMD(v11,v19,const1)
    VPMSUMD(v12,v20,const1)
    VPMSUMD(v13,v21,const1)
    VPMSUMD(v14,v22,const1)
    VPMSUMD(v15,v23,const1)

    b   .Lsecond_cool_down

.Lshort:
    cmpdi   r5,0
    beq .Lzero

    addis   r3,r2,.short_constants@toc@ha
    addi    r3,r3,.short_constants@toc@l

    /* Calculate where in the constant table we need to start */
    subfic  r6,r5,256
    add r3,r3,r6

    /* How many 16 byte chunks? */
    srdi    r7,r5,4
    mtctr   r7

    vxor    v19,v19,v19
    vxor    v20,v20,v20
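    /*
     * v20 and v19 accumulate the even and odd numbered 16 byte chunks
     * respectively; the two running sums are combined after the .Lv* chain
     * at the bottom of this path.
     */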

    lvx v0,0,r4
    lvx v16,0,r3
    VPERM(v0,v0,v16,byteswap)
    vxor    v0,v0,v8    /* xor in initial value */
    VPMSUMW(v0,v0,v16)
    bdz .Lv0

    lvx v1,off16,r4
    lvx v17,off16,r3
    VPERM(v1,v1,v17,byteswap)
    VPMSUMW(v1,v1,v17)
    bdz .Lv1

    lvx v2,off32,r4
    lvx v16,off32,r3
    VPERM(v2,v2,v16,byteswap)
    VPMSUMW(v2,v2,v16)
    bdz .Lv2

    lvx v3,off48,r4
    lvx v17,off48,r3
    VPERM(v3,v3,v17,byteswap)
    VPMSUMW(v3,v3,v17)
    bdz .Lv3

    lvx v4,off64,r4
    lvx v16,off64,r3
    VPERM(v4,v4,v16,byteswap)
    VPMSUMW(v4,v4,v16)
    bdz .Lv4

    lvx v5,off80,r4
    lvx v17,off80,r3
    VPERM(v5,v5,v17,byteswap)
    VPMSUMW(v5,v5,v17)
    bdz .Lv5

    lvx v6,off96,r4
    lvx v16,off96,r3
    VPERM(v6,v6,v16,byteswap)
    VPMSUMW(v6,v6,v16)
    bdz .Lv6

    lvx v7,off112,r4
    lvx v17,off112,r3
    VPERM(v7,v7,v17,byteswap)
    VPMSUMW(v7,v7,v17)
    bdz .Lv7

    addi    r3,r3,128
    addi    r4,r4,128

    lvx v8,0,r4
    lvx v16,0,r3
    VPERM(v8,v8,v16,byteswap)
    VPMSUMW(v8,v8,v16)
    bdz .Lv8

    lvx v9,off16,r4
    lvx v17,off16,r3
    VPERM(v9,v9,v17,byteswap)
    VPMSUMW(v9,v9,v17)
    bdz .Lv9

    lvx v10,off32,r4
    lvx v16,off32,r3
    VPERM(v10,v10,v16,byteswap)
    VPMSUMW(v10,v10,v16)
    bdz .Lv10

    lvx v11,off48,r4
    lvx v17,off48,r3
    VPERM(v11,v11,v17,byteswap)
    VPMSUMW(v11,v11,v17)
    bdz .Lv11

    lvx v12,off64,r4
    lvx v16,off64,r3
    VPERM(v12,v12,v16,byteswap)
    VPMSUMW(v12,v12,v16)
    bdz .Lv12

    lvx v13,off80,r4
    lvx v17,off80,r3
    VPERM(v13,v13,v17,byteswap)
    VPMSUMW(v13,v13,v17)
    bdz .Lv13

    lvx v14,off96,r4
    lvx v16,off96,r3
    VPERM(v14,v14,v16,byteswap)
    VPMSUMW(v14,v14,v16)
    bdz .Lv14

    lvx v15,off112,r4
    lvx v17,off112,r3
    VPERM(v15,v15,v17,byteswap)
    VPMSUMW(v15,v15,v17)

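    /*
     * Each bdz above enters this chain at the label matching the last chunk
     * processed; execution then falls through, xoring every chunk that was
     * actually computed into the two accumulators.
     */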
.Lv15:  vxor    v19,v19,v15
.Lv14:  vxor    v20,v20,v14
.Lv13:  vxor    v19,v19,v13
.Lv12:  vxor    v20,v20,v12
.Lv11:  vxor    v19,v19,v11
.Lv10:  vxor    v20,v20,v10
.Lv9:   vxor    v19,v19,v9
.Lv8:   vxor    v20,v20,v8
.Lv7:   vxor    v19,v19,v7
.Lv6:   vxor    v20,v20,v6
.Lv5:   vxor    v19,v19,v5
.Lv4:   vxor    v20,v20,v4
.Lv3:   vxor    v19,v19,v3
.Lv2:   vxor    v20,v20,v2
.Lv1:   vxor    v19,v19,v1
.Lv0:   vxor    v20,v20,v0

    vxor    v0,v19,v20

    b   .Lbarrett_reduction

.Lzero:
    mr  r3,r10
    b   .Lout

FUNC_END(CRC_FUNCTION_NAME)