/* (removed: LXR web-scrape navigation residue — not part of the source file) */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast SHA-256 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 */

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>

/*
 * Register roles. rHP/rWP arrive as the first two C arguments (r3/r4);
 * rKP lives in a non-volatile register so it survives the whole transform.
 */
#define rHP	r3	/* pointer to hash values in memory		*/
#define rKP	r24	/* pointer to round constants			*/
#define rWP	r4	/* pointer to input data			*/

#define rH0	r5	/* 8 32 bit hash values in 8 registers		*/
#define rH1	r6
#define rH2	r7
#define rH3	r8
#define rH4	r9
#define rH5	r10
#define rH6	r11
#define rH7	r12

#define rW0	r14	/* 64 bit SPE registers. 16 message-schedule	*/
#define rW1	r15	/* words kept as 8 word-pairs			*/
#define rW2	r16
#define rW3	r17
#define rW4	r18
#define rW5	r19
#define rW6	r20
#define rW7	r21

#define rT0	r22	/* 64 bit temporaries				*/
#define rT1	r23
#define rT2	r0	/* 32 bit temporaries				*/
#define rT3	r25
/*
 * Loop-control hook pasted into R_CALC_W via CMP_K##k##_LOOP.
 * "N" rounds do nothing; the "C" round (last of each group of 16)
 * compares the just-loaded round constant against 0 so that the
 * 16-round loop can terminate once the final, negative constant
 * (0xc67178f2) has been consumed.
 */
#define CMP_KN_LOOP
#define CMP_KC_LOOP \
	cmpwi		rT1,0;
/*
 * Function prologue: allocate a 128-byte stack frame and save the
 * non-volatile registers this code uses (r14-r25).  evstdw stores the
 * full 64-bit SPE register, so the upper halves are preserved too.
 * Frame layout: 8..87 = SPE save area (r14-r23), 88/92 = r24/r25.
 */
#define INITIALIZE \
	stwu		r1,-128(r1);	/* create stack frame		*/ \
	evstdw		r14,8(r1);	/* We must save non volatile	*/ \
	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
	evstdw		r17,32(r1);					\
	evstdw		r18,40(r1);					\
	evstdw		r19,48(r1);					\
	evstdw		r20,56(r1);					\
	evstdw		r21,64(r1);					\
	evstdw		r22,72(r1);					\
	evstdw		r23,80(r1);					\
	stw		r24,88(r1);	/* save normal registers	*/ \
	stw		r25,92(r1);
/*
 * Function epilogue: restore the saved registers, then scrub the SPE
 * save area before releasing the frame.  The save area held message
 * schedule / intermediate hash state, so it is wiped to avoid leaking
 * sensitive data to later users of this stack memory.  Only the lower
 * word of each 64-bit slot is cleared; the comments below explain why
 * the upper words are assumed already overwritten.
 */
#define FINALIZE \
	evldw		r14,8(r1);	/* restore SPE registers	*/ \
	evldw		r15,16(r1);					\
	evldw		r16,24(r1);					\
	evldw		r17,32(r1);					\
	evldw		r18,40(r1);					\
	evldw		r19,48(r1);					\
	evldw		r20,56(r1);					\
	evldw		r21,64(r1);					\
	evldw		r22,72(r1);					\
	evldw		r23,80(r1);					\
	lwz		r24,88(r1);	/* restore normal registers	*/ \
	lwz		r25,92(r1);					\
	xor		r0,r0,r0;					\
	stw		r0,8(r1);	/* Delete sensitive data	*/ \
	stw		r0,16(r1);	/* that we might have pushed	*/ \
	stw		r0,24(r1);	/* from other context that runs	*/ \
	stw		r0,32(r1);	/* the same code. Assume that	*/ \
	stw		r0,40(r1);	/* the lower part of the GPRs	*/ \
	stw		r0,48(r1);	/* was already overwritten on	*/ \
	stw		r0,56(r1);	/* the way down to here		*/ \
	stw		r0,64(r1);					\
	stw		r0,72(r1);					\
	stw		r0,80(r1);					\
	addi		r1,r1,128;	/* cleanup stack frame		*/
/*
 * Input loading, endian-dependent.  SHA-256 consumes big-endian words:
 * on BE a plain lwz suffices and rWP advances once per 64-byte block;
 * on LE lwbrx byte-swaps on load, and because lwbrx has no displacement
 * form, rWP is advanced 4 bytes per word (so NEXT_BLOCK is a no-op).
 */
#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz		reg,off(rWP);	/* load data			*/
#define NEXT_BLOCK \
	addi		rWP,rWP,64;	/* increment per block		*/
#else
#define LOAD_DATA(reg, off) \
	lwbrx		reg,0,rWP;	/* load data			*/ \
	addi		rWP,rWP,4;	/* increment per word		*/
#define NEXT_BLOCK			/* nothing to do		*/
#endif
/*
 * Two interleaved SHA-256 rounds (rounds 0-15) that load their message
 * words directly from the input block.  The two loaded 32-bit words are
 * packed into one 64-bit SPE register w (evmergelo) for later use by
 * R_CALC_W.  Scalar and SPE instructions are interleaved per the AN2665
 * dual-issue guidance.  Round 2 uses the rotated register assignment
 * (a..h shifted by two), hence e.g. "d" acting as round-2 e.
 * Note: maj is computed as ((a|b)&c) | (a&b), equivalent to the
 * canonical (a&b)^(a&c)^(b&c).
 */
#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
	LOAD_DATA(w, off)		/* 1: W				*/ \
	rotrwi		rT0,e,6;	/* 1: S1 = e rotr 6		*/ \
	rotrwi		rT1,e,11;	/* 1: S1' = e rotr 11		*/ \
	rotrwi		rT2,e,25;	/* 1: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 1: S1 = S1 xor S1'		*/ \
	and		rT3,e,f;	/* 1: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 1: S1 = S1 xor S1"		*/ \
	andc		rT1,g,e;	/* 1: ch' = ~e and g		*/ \
	lwz		rT2,off(rKP);	/* 1: K				*/ \
	xor		rT3,rT3,rT1;	/* 1: ch = ch xor ch'		*/ \
	add		h,h,rT0;	/* 1: temp1 = h + S1		*/ \
	add		rT3,rT3,w;	/* 1: temp1' = ch + w		*/ \
	rotrwi		rT0,a,2;	/* 1: S0 = a rotr 2		*/ \
	add		h,h,rT3;	/* 1: temp1 = temp1 + temp1'	*/ \
	rotrwi		rT1,a,13;	/* 1: S0' = a rotr 13		*/ \
	add		h,h,rT2;	/* 1: temp1 = temp1 + K		*/ \
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 1: S0 = S0 xor S0'		*/ \
	add		d,d,h;		/* 1: d = d + temp1		*/ \
	xor		rT3,rT0,rT3;	/* 1: S0 = S0 xor S0"		*/ \
	evmergelo	w,w,w;		/*    shift W			*/ \
	or		rT2,a,b;	/* 1: maj = a or b		*/ \
	and		rT1,a,b;	/* 1: maj' = a and b		*/ \
	and		rT2,rT2,c;	/* 1: maj = maj and c		*/ \
	LOAD_DATA(w, off+4)		/* 2: W				*/ \
	or		rT2,rT1,rT2;	/* 1: maj = maj or maj'		*/ \
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
	add		rT3,rT3,rT2;	/* 1: temp2 = S0 + maj		*/ \
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
	add		h,h,rT3;	/* 1: h = temp1 + temp2		*/ \
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
	and		rT3,d,e;	/* 2: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
	lwz		rT2,off+4(rKP);	/* 2: K				*/ \
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
	add		g,g,rT0;	/* 2: temp1 = h + S1		*/ \
	add		rT3,rT3,w;	/* 2: temp1' = ch + w		*/ \
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
	add		g,g,rT3;	/* 2: temp1 = temp1 + temp1'	*/ \
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
	add		g,g,rT2;	/* 2: temp1 = temp1 + K		*/ \
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
	or		rT2,h,a;	/* 2: maj = a or b		*/ \
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
	add		c,c,g;		/* 2: d = d + temp1		*/ \
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
	add		g,g,rT3		/* 2: h = temp1 + temp2		*/
/*
 * Two interleaved SHA-256 rounds (rounds 16-63) that extend the message
 * schedule on the fly using SPE 2x32-bit vector ops:
 *   w = w[-16] + s0(w[-15]) + w[-7] + s1(w[-2])
 * computed for a word PAIR per invocation (w0 holds w[-16], the
 * evmergelohi forms build the unaligned w[-15]/w[-7] pairs from
 * neighbouring registers).  The round constant pair k is loaded with
 * evldw and added to w; evmergehi splits wk for the two scalar rounds.
 * Round 1 here computes maj as (a&b) + ((a^b)&c), a valid reformulation
 * of the majority function.  CMP_K##k##_LOOP pastes the loop-exit
 * compare on the final ("C") invocation of each 16-round group.
 */
#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
	rotrwi		rT2,e,6;	/* 1: S1 = e rotr 6		*/ \
	evmergelohi	rT0,w0,w1;	/*    w[-15]			*/ \
	rotrwi		rT3,e,11;	/* 1: S1' = e rotr 11		*/ \
	evsrwiu		rT1,rT0,3;	/*    s0 = w[-15] >> 3		*/ \
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1'		*/ \
	evrlwi		rT0,rT0,25;	/*    s0' = w[-15] rotr 7	*/ \
	rotrwi		rT3,e,25;	/* 1: S1" = e rotr 25		*/ \
	evxor		rT1,rT1,rT0;	/*    s0 = s0 xor s0'		*/ \
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1"		*/ \
	evrlwi		rT0,rT0,21;	/*    s0' = w[-15] rotr 18	*/ \
	add		h,h,rT2;	/* 1: temp1 = h + S1		*/ \
	evxor		rT0,rT0,rT1;	/*    s0 = s0 xor s0'		*/ \
	and		rT2,e,f;	/* 1: ch = e and f		*/ \
	evaddw		w0,w0,rT0;	/*    w = w[-16] + s0		*/ \
	andc		rT3,g,e;	/* 1: ch' = ~e and g		*/ \
	evsrwiu		rT0,w7,10;	/*    s1 = w[-2] >> 10		*/ \
	xor		rT2,rT2,rT3;	/* 1: ch = ch xor ch'		*/ \
	evrlwi		rT1,w7,15;	/*    s1' = w[-2] rotr 17	*/ \
	add		h,h,rT2;	/* 1: temp1 = temp1 + ch	*/ \
	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
	rotrwi		rT2,a,2;	/* 1: S0 = a rotr 2		*/ \
	evrlwi		rT1,w7,13;	/*    s1' = w[-2] rotr 19	*/ \
	rotrwi		rT3,a,13;	/* 1: S0' = a rotr 13		*/ \
	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0'		*/ \
	evldw		rT1,off(rKP);	/*    k				*/ \
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22		*/ \
	evaddw		w0,w0,rT0;	/*    w = w + s1		*/ \
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0"		*/ \
	evmergelohi	rT0,w4,w5;	/*    w[-7]			*/ \
	and		rT3,a,b;	/* 1: maj = a and b		*/ \
	evaddw		w0,w0,rT0;	/*    w = w + w[-7]		*/ \
	CMP_K##k##_LOOP							\
	add		rT2,rT2,rT3;	/* 1: temp2 = S0 + maj		*/ \
	evaddw		rT1,rT1,w0;	/*    wk = w + k		*/ \
	xor		rT3,a,b;	/* 1: maj = a xor b		*/ \
	evmergehi	rT0,rT1,rT1;	/*    wk1/wk2			*/ \
	and		rT3,rT3,c;	/* 1: maj = maj and c		*/ \
	add		h,h,rT0;	/* 1: temp1 = temp1 + wk	*/ \
	add		rT2,rT2,rT3;	/* 1: temp2 = temp2 + maj	*/ \
	add		g,g,rT1;	/* 2: temp1 = temp1 + wk	*/ \
	add		d,d,h;		/* 1: d = d + temp1		*/ \
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
	add		h,h,rT2;	/* 1: h = temp1 + temp2		*/ \
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
	and		rT3,d,e;	/* 2: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
	add		g,g,rT0;	/* 2: temp1 = h + S1		*/ \
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
	add		g,g,rT3;	/* 2: temp1 = temp1 + ch	*/ \
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
	or		rT2,h,a;	/* 2: maj = a or b		*/ \
	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
	add		c,c,g;		/* 2: d = d + temp1		*/ \
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
	add		g,g,rT3		/* 2: h = temp1 + temp2		*/
/*
 * void ppc_spe_sha256_transform(u32 *hash, const u8 *src, u32 blocks)
 *
 * In:	r3 (rHP) = pointer to 8-word hash state
 *	r4 (rWP) = pointer to input data
 *	r5       = number of 64-byte blocks to process (used as CTR)
 *
 * Per block: 16 R_LOAD_W rounds consume the input, then the 16-round
 * R_CALC_W loop runs three more times.  The loop is exited via the
 * CMP_KC_LOOP compare: "bt gt" keeps looping while the last round
 * constant loaded was positive; the final constant (0xc67178f2) is
 * negative as a signed word, ending the 64 rounds.  rKP is reloaded
 * per block and pre-incremented by 64 before each 16-round group.
 */
_GLOBAL(ppc_spe_sha256_transform)
	INITIALIZE

	mtctr		r5
	lwz		rH0,0(rHP)
	lwz		rH1,4(rHP)
	lwz		rH2,8(rHP)
	lwz		rH3,12(rHP)
	lwz		rH4,16(rHP)
	lwz		rH5,20(rHP)
	lwz		rH6,24(rHP)
	lwz		rH7,28(rHP)

ppc_spe_sha256_main:
	lis		rKP,PPC_SPE_SHA256_K@ha
	addi		rKP,rKP,PPC_SPE_SHA256_K@l

	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)
ppc_spe_sha256_16_rounds:
	addi		rKP,rKP,64
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW0, rW1, rW4, rW5, rW7, N, 0)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW1, rW2, rW5, rW6, rW0, N, 8)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW2, rW3, rW6, rW7, rW1, N, 16)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW3, rW4, rW7, rW0, rW2, N, 24)
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW4, rW5, rW0, rW1, rW3, N, 32)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW5, rW6, rW1, rW2, rW4, N, 40)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW6, rW7, rW2, rW3, rW5, N, 48)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW7, rW0, rW3, rW4, rW6, C, 56)
	bt		gt,ppc_spe_sha256_16_rounds

	/* feed-forward: add this block's result into the hash state */
	lwz		rW0,0(rHP)
	NEXT_BLOCK
	lwz		rW1,4(rHP)
	lwz		rW2,8(rHP)
	lwz		rW3,12(rHP)
	lwz		rW4,16(rHP)
	lwz		rW5,20(rHP)
	lwz		rW6,24(rHP)
	lwz		rW7,28(rHP)

	add		rH0,rH0,rW0
	stw		rH0,0(rHP)
	add		rH1,rH1,rW1
	stw		rH1,4(rHP)
	add		rH2,rH2,rW2
	stw		rH2,8(rHP)
	add		rH3,rH3,rW3
	stw		rH3,12(rHP)
	add		rH4,rH4,rW4
	stw		rH4,16(rHP)
	add		rH5,rH5,rW5
	stw		rH5,20(rHP)
	add		rH6,rH6,rW6
	stw		rH6,24(rHP)
	add		rH7,rH7,rW7
	stw		rH7,28(rHP)

	bdnz		ppc_spe_sha256_main

	FINALIZE
	blr
/*
 * SHA-256 round constants (FIPS 180-4 section 4.2.2).  These are only
 * ever read, so keep them in .rodata rather than writable .data.
 * 32-byte aligned so evldw (which needs 8-byte alignment) is safe for
 * every pair.
 */
.section .rodata
.align 5
PPC_SPE_SHA256_K:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2