/*
 * Core of a vpmsum-accelerated CRC implementation for 64-bit PowerPC
 * (POWER8 and later). This file is presumably meant to be #included by a
 * wrapper that defines CRC_FUNCTION_NAME, the constant tables
 * (.constants, .short_constants, .barrett_constants, .byteswap_constant)
 * and, for bit-reflected CRC variants, REFLECT.
 *
 * NOTE(review): lines 1-26 of the extracted source contained only stray
 * line-number residue; the original file header (license/copyright) was
 * lost in extraction and should be restored from the upstream source.
 */
#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

/* Process the buffer in chunks of at most MAX_SIZE bytes per outer pass. */
#define MAX_SIZE	32768

.text

/*
 * Data is loaded with lvx (big-endian element order). A byte swap is
 * needed exactly when the memory byte order does not match the bit
 * ordering the algorithm expects: big-endian with a reflected CRC, or
 * little-endian with a non-reflected CRC.
 */
#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif

/* Non-volatile GPRs holding the fixed 16..112 byte offsets used as the
 * index operand of lvx/stvx throughout (saved/restored in the function). */
#define off16		r25
#define off32		r26
#define off48		r27
#define off64		r28
#define off80		r29
#define off96		r30
#define off112		r31

/* Two rotating constant registers for the multiply constants. */
#define const1		v24
#define const2		v25

/* Long-lived helper vectors (non-volatile VRs, saved in the prologue). */
#define byteswap	v26	/* permute control for byte swapping */
#define mask_32bit	v27	/* low 32 bits set */
#define mask_64bit	v28	/* low 64 bits set */
#define zeroes		v29	/* all zero */

/* VPERM(data,...) byte-swaps the loaded data, or compiles away entirely
 * when no swap is needed for this endian/REFLECT combination. */
#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif
0063
0064
/*
 * unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len)
 *
 * In:	r3 = incoming CRC value
 *	r4 = data pointer (loaded with lvx, which ignores the low 4 address
 *	     bits, so the buffer is expected to be 16-byte aligned and len a
 *	     multiple of 16 — TODO confirm against the caller)
 *	r5 = length in bytes
 * Out:	r3 = new CRC value
 *
 * Strategy: for >= 256 bytes, run 8 independent vpmsumd accumulator
 * streams over 128-byte blocks (hiding the multiply latency), fold the
 * streams together, then do a Barrett reduction down to 32 bits.
 * Shorter buffers take the .Lshort table-driven path.
 */
FUNC_START(CRC_FUNCTION_NAME)
	/* Save the non-volatile GPRs used for the offset constants. */
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	li	r0,0			/* r0 == 0: first pass of outer loop */

	/* Save the non-volatile VRs (v20-v29) below the GPR save area. */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7

	mr	r10,r3			/* keep incoming CRC for the len==0 case */

	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1			/* v0 = all ones */

	vsldoi	mask_32bit,zeroes,v0,4	/* mask of the low 32 bits */
	vsldoi	mask_64bit,zeroes,v0,8	/* mask of the low 64 bits */

	/* Get the initial CRC into v8, positioned so that xor-ing it into
	 * the first data vector lines it up with the polynomial multiply
	 * (placement differs between reflected and non-reflected forms). */
	vxor	v8,v8,v8
	MTVRD(v8, R3)
#ifdef REFLECT
	vsldoi	v8,zeroes,v8,8
#else
	vsldoi	v8,v8,zeroes,4
#endif

#ifdef BYTESWAP_DATA
	addis	r3,r2,.byteswap_constant@toc@ha
	addi	r3,r3,.byteswap_constant@toc@l

	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif

	cmpdi	r5,256
	blt	.Lshort

	rldicr	r6,r5,0,56		/* r6 = len rounded down to 128B multiple */

	/* Checksum in chunks of at most MAX_SIZE bytes. */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7
	cmpd	r6,r7
	bgt	2f
	mr	r7,r6			/* this chunk = min(remaining, MAX_SIZE) */
2:	subf	r6,r7,r6		/* r6 = bytes left after this chunk */

	srdi	r7,r7,7			/* r7 = number of 128-byte blocks */

	/* Offset into the constants table: the table is laid out for a full
	 * MAX_SIZE chunk, so a shorter chunk starts deeper in.
	 * r8 = MAX_SIZE/8 - 16*blocks */
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9

	addi	r7,r7,-1
	mtctr	r7

	addis	r3,r2,.constants@toc@ha
	addi	r3,r3,.constants@toc@l

	add	r3,r3,r8

	/* Clear the eight parallel accumulators. */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/* On the second and later outer passes v16-v23 already hold data
	 * (preloaded at the bottom of the previous pass), so skip the
	 * warm-up load. */
	cmpdi	r0,1
	beq	2f

	/* First warm-up: load 128 bytes and fold the incoming CRC in. */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	vxor	v16,v16,v8		/* xor initial CRC into first vector */

2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/* Second warm-up: start the first multiplies and load the next
	 * 128 bytes.  ori r2,r2,0 is a no-op used only to pad instruction
	 * dispatch groups — presumably a POWER8 scheduling aid. */
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down

	/*
	 * Main loop, software pipelined three deep: for each 128-byte block,
	 * xor the previous products into the accumulators (v0-v7), start the
	 * next eight vpmsumd multiplies (v8-v15), and load the following
	 * 128 bytes (v16-v23).  Constants alternate between const1/const2 so
	 * the next constant can be loaded while the current one is in use.
	 */
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx	const2,0,r3
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b

.Lfirst_cool_down:
	/* No more loads to issue: multiply the last-loaded data. */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Drain the pipeline: fold the final products in. */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15

#ifdef REFLECT
	/*
	 * For the reflected algorithm the vpmsumd products need to be
	 * shifted left 32 bits before the final fold so the bit ordering
	 * lines up — NOTE(review): exact justification lives in the
	 * upstream algorithm write-up; confirm there.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4
#endif

	/* The final 128 bytes of the chunk have an effective multiplier of
	 * one: just load them and xor into the accumulators. */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	/* If more full chunks remain, loop.  r0=1 marks that v16-v23 now
	 * hold live data; the 128 bytes just consumed are counted as part
	 * of the next chunk, hence the +128. */
	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b

	/* Tail: 0-127 bytes remain. */
	andi.	r5,r5,127

	/* Skip the constants for the 16-byte blocks we don't have. */
	subfic	r6,r5,128
	add	r3,r3,r6

	srdi	r7,r5,4			/* r7 = remaining 16-byte chunks */
	mtctr	r7

	/* Reduce the eight accumulator streams: multiply each by its final
	 * per-stream constant. */
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Fold in each remaining 16-byte chunk, one per ctr decrement. */
	cmpdi	r7,0
	beq	1f

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* xor-tree the eight accumulators down into v0. */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4

.Lbarrett_reduction:
	/* Barrett reduction: reduce the 64-bit remainder in v0 to the final
	 * 32-bit CRC using the two precomputed Barrett constants. */
	addis	r3,r2,.barrett_constants@toc@ha
	addi	r3,r3,.barrett_constants@toc@l

	lvx	const1,0,r3
	lvx	const2,off16,r3

	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1		/* fold the two 64-bit halves together */

#ifdef REFLECT
	/* Reflected variant works one bit to the left. */
	vspltisb v1,1
	vsl	v0,v0,v1
#endif

	vand	v0,v0,mask_64bit
#ifndef REFLECT
	/*
	 * q = high bits of v0 * (const1 = floor(2^64/poly));
	 * subtract (xor) q * poly (const2) to get the remainder.
	 */
	VPMSUMD(v1,v0,const1)
	vsldoi	v1,zeroes,v1,8
	VPMSUMD(v1,v1,const2)
	vxor	v0,v0,v1

	/* Result is in the low doubleword; shift into doubleword 0 where
	 * MFVRD reads it. */
	vsldoi	v0,v0,zeroes,8
#else
	/*
	 * Reflected form of the same reduction, operating on the low
	 * 32 bits at each step.
	 */
	vand	v1,v0,mask_32bit
	VPMSUMD(v1,v1,const1)
	vand	v1,v1,mask_32bit
	VPMSUMD(v1,v1,const2)
	vxor	v0,v0,v1

	/* Move the 32-bit result into doubleword 0 for MFVRD. */
	vsldoi	v0,v0,zeroes,4
#endif

	MFVRD(R3, v0)

.Lout:
	/* Restore non-volatile VRs and GPRs, return CRC in r3. */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	lvx	v20,0,r6
	lvx	v21,off16,r6
	lvx	v22,off32,r6
	lvx	v23,off48,r6
	lvx	v24,off64,r6
	lvx	v25,off80,r6
	lvx	v26,off96,r6
	lvx	v27,off112,r6
	lvx	v28,0,r7
	lvx	v29,off16,r7

	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	ld	r26,-48(r1)
	ld	r25,-56(r1)

	blr

.Lfirst_warm_up_done:
	/* Chunk was exactly two 128-byte blocks: multiply the first block's
	 * data once and go straight to the drain. */
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down

.Lshort:
	/*
	 * Short-buffer path (< 256 bytes).  Each 16-byte chunk is multiplied
	 * by a length-dependent constant from .short_constants; results are
	 * accumulated alternately into v19/v20 via the .LvN fall-through
	 * ladder below, entered at the label matching the last chunk loaded.
	 */
	cmpdi	r5,0
	beq	.Lzero

	addis	r3,r2,.short_constants@toc@ha
	addi	r3,r3,.short_constants@toc@l

	/* Constants are laid out for 256 bytes; start deeper in for less. */
	subfic	r6,r5,256
	add	r3,r3,r6

	srdi	r7,r5,4			/* ctr = number of 16-byte chunks */
	mtctr	r7

	vxor	v19,v19,v19
	vxor	v20,v20,v20

	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8		/* fold the incoming CRC in */
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	addi	r3,r3,128
	addi	r4,r4,128

	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

	/* Accumulate whichever products were produced (fall-through ladder:
	 * entry at .LvN xors chunks N..0). */
.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	vxor	v0,v19,v20

	b	.Lbarrett_reduction

.Lzero:
	/* len == 0: return the CRC unchanged. */
	mr	r3,r10
	b	.Lout

FUNC_END(CRC_FUNCTION_NAME)