Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0-or-later */
0002 /*
0003  * Fast AES implementation for SPE instruction set (PPC)
0004  *
0005  * This code makes use of the SPE SIMD instruction set as defined in
0006  * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
0007  * Implementation is based on optimization guide notes from
0008  * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
0009  *
0010  * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
0011  */
0012 
0013 #include <asm/ppc_asm.h>
0014 #include "aes-spe-regs.h"
0015 
/*
 * EAD(in, bpos): compute a table slot address in rT0.
 * Rotates `in` so that its byte number `bpos` (0 = least significant,
 * 3 = most significant) lands in bits 20-27 (PowerPC bit numbering, i.e.
 * mask 0x00000ff0) and inserts only those bits into rT0:
 *     rT0 = (rT0 & ~0xff0) | (byte << 4)
 * All other bits of rT0 are preserved, so consecutive byte values select
 * slots that are 16 bytes apart.
 * NOTE(review): for the result to equal "base + byte * 16" the base held in
 * rT0 must have bits 20-27 clear; presumably guaranteed by the caller/table
 * alignment -- confirm.
 */
0016 #define EAD(in, bpos) \
0017     rlwimi      rT0,in,28-((bpos+3)%4)*8,20,27;
0018 
/*
 * DAD(in, bpos): compute a byte-table address in rT1.
 * Rotates `in` so that its byte number `bpos` (0 = least significant,
 * 3 = most significant) lands in bits 24-31 (PowerPC bit numbering, i.e.
 * mask 0x000000ff) and inserts only those bits into rT1:
 *     rT1 = (rT1 & ~0xff) | byte
 * The upper 24 bits of rT1 are preserved, so the byte acts as an unscaled
 * index into whatever rT1 already points at.
 * NOTE(review): requires the base in rT1 to have its low 8 bits clear for
 * the result to equal "base + byte" -- presumably a 256-byte aligned table;
 * confirm against the caller.
 */
0019 #define DAD(in, bpos) \
0020     rlwimi      rT1,in,24-((bpos+3)%4)*8,24,31;
0021 
/*
 * LWH(out, off): load the 32-bit word at off(rT0) with evlwwsplat. Per the
 * SPE instruction definition the word is placed in both the upper and the
 * lower half of the 64-bit register `out`; the code in this file follows it
 * with LWL on the same register when a different low word is wanted.
 */
0022 #define LWH(out, off) \
0023     evlwwsplat  out,off(rT0);   /* load word high       */
0024 
/*
 * LWL(out, off): load the 32-bit word at off(rT0) with a plain lwz.
 * Used in this file after LWH on the same register to replace the low word.
 * NOTE(review): relies on lwz leaving the upper half of the 64-bit SPE
 * register untouched -- confirm against the SPE programming manual.
 */
0025 #define LWL(out, off) \
0026     lwz     out,off(rT0);   /* load word low        */
0027 
/*
 * LBZ(out, tab, off): load the single byte at off(tab), zero-extended by
 * lbz, into `out`. `tab` is the address register to use (rT0 or rT1 in the
 * macros built on top of this one).
 */
0028 #define LBZ(out, tab, off) \
0029     lbz     out,off(tab);   /* load byte            */
0030 
/*
 * LAH(out, in, bpos, off): EAD followed by LWH. Points rT0 at the table
 * slot selected by byte `bpos` of `in`, then loads the word at offset `off`
 * inside that slot into `out` via evlwwsplat. Modifies rT0.
 */
0031 #define LAH(out, in, bpos, off) \
0032     EAD(in, bpos)           /* calc addr + load word high   */ \
0033     LWH(out, off)
0034 
/*
 * LAL(out, in, bpos, off): EAD followed by LWL. Points rT0 at the table
 * slot selected by byte `bpos` of `in`, then loads the word at offset `off`
 * inside that slot into `out` via lwz. Modifies rT0.
 */
0035 #define LAL(out, in, bpos, off) \
0036     EAD(in, bpos)           /* calc addr + load word low    */ \
0037     LWL(out, off)
0038 
/*
 * LAE(out, in, bpos): EAD followed by a byte load. Points rT0 at the table
 * slot selected by byte `bpos` of `in`, then loads the single byte at
 * offset 8 of that slot into `out`. Modifies rT0.
 */
0039 #define LAE(out, in, bpos) \
0040     EAD(in, bpos)           /* calc addr + load enc byte    */ \
0041     LBZ(out, rT0, 8)
0042 
/*
 * LBE(out): load the byte at offset 8 of the slot rT0 currently points at.
 * Does no address calculation itself; it is paired with a separate EAD
 * issued earlier so that other instructions can be scheduled in between.
 */
0043 #define LBE(out) \
0044     LBZ(out, rT0, 8)        /* load enc byte        */
0045 
/*
 * LAD(out, in, bpos): DAD followed by a byte load. Puts byte `bpos` of `in`
 * into the low 8 bits of rT1, then loads the single byte at 0(rT1) into
 * `out`. Modifies rT1.
 */
0046 #define LAD(out, in, bpos) \
0047     DAD(in, bpos)           /* calc addr + load dec byte    */ \
0048     LBZ(out, rT1, 0)
0049 
/*
 * LBD(out): load the byte rT1 currently points at (offset 0). Does no
 * address calculation itself; it is paired with a separate DAD issued
 * earlier so that other instructions can be scheduled in between.
 */
0050 #define LBD(out) \
0051     LBZ(out, rT1, 0)        /* load dec byte        */
0052 
0053 /*
0054  * ppc_encrypt_block: The central encryption function for a single 16 bytes
0055  * block. It does no stack handling or register saving to support fast calls
0056  * via bl/blr. It expects that caller has pre-xored input data with first
0057  * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
0058  * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
0059  * and rW0-rW3 and caller must execute a final xor on the output registers.
0060  * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
0061  *
      * Structure as visible in the code below:
      *  - Three table lookups are issued before the loop. Every copy of the
      *    round body re-issues the same three lookups at its end (split into
      *    EAD + LWH so that evxor instructions can be placed in between), so
      *    the next copy always starts with rW4, rW6 and rW3 preloaded.
      *  - Each loop iteration runs two unrolled copies of the round body. The
      *    first reads key material at 16..31(rKP), the second at 32..47(rKP).
      *    rKP is then advanced by 32 and bdnz decrements CTR and loops while
      *    it is non-zero.
      *  - After the loop one more copy of the round body runs with key
      *    material at 16..31(rKP). Its tail switches from word lookups to
      *    single-byte lookups at offset 8 of the table slots (LBE/LAE), and
      *    the bytes are packed into rW0-rW3 with rlwimi.
      *  - On return rW0-rW3 hold the packed bytes and rD0-rD3 hold the four
      *    words at 32..44(rKP) (presumably the last round key for the
      *    caller's final xor). rT0 has been modified by the address
      *    calculations and rKP has been advanced by 32 per loop iteration.
0062  */
0063 _GLOBAL(ppc_encrypt_block)
0064     LAH(rW4, rD1, 2, 4)     /* pre-loop: preload the three lookups ... */
0065     LAH(rW6, rD0, 3, 0)     /* ... that each round body otherwise ...   */
0066     LAH(rW3, rD0, 1, 8)     /* ... issues at its end for the next one   */
0067 ppc_encrypt_block_loop:     /* one iteration = two copies of round body */
0068     LAH(rW0, rD3, 0, 12)
0069     LAL(rW0, rD0, 0, 12)
0070     LAH(rW1, rD1, 0, 12)
0071     LAH(rW2, rD2, 1, 8)
0072     LAL(rW2, rD3, 1, 8)
0073     LAL(rW3, rD1, 1, 8)
0074     LAL(rW4, rD2, 2, 4)
0075     LAL(rW6, rD1, 3, 0)
0076     LAH(rW5, rD3, 2, 4)
0077     LAL(rW5, rD0, 2, 4)
0078     LAH(rW7, rD2, 3, 0)
0079     evldw       rD1,16(rKP)     /* two key words from rKP+16 / rKP+20 */
0080     EAD(rD3, 3)
0081     evxor       rW2,rW2,rW4
0082     LWL(rW7, 0)
0083     evxor       rW2,rW2,rW6
0084     EAD(rD2, 0)
0085     evxor       rD1,rD1,rW2
0086     LWL(rW1, 12)
0087     evxor       rD1,rD1,rW0
0088     evldw       rD3,24(rKP)     /* two key words from rKP+24 / rKP+28 */
0089     evmergehi   rD0,rD0,rD1     /* low word of rD0 = high word of rD1 */
0090     EAD(rD1, 2)
0091     evxor       rW3,rW3,rW5
0092     LWH(rW4, 4)
0093     evxor       rW3,rW3,rW7
0094     EAD(rD0, 3)
0095     evxor       rD3,rD3,rW3
0096     LWH(rW6, 0)
0097     evxor       rD3,rD3,rW1
0098     EAD(rD0, 1)
0099     evmergehi   rD2,rD2,rD3     /* low word of rD2 = high word of rD3 */
0100     LWH(rW3, 8)
0101     LAH(rW0, rD3, 0, 12)    /* second unrolled copy of the round body */
0102     LAL(rW0, rD0, 0, 12)
0103     LAH(rW1, rD1, 0, 12)
0104     LAH(rW2, rD2, 1, 8)
0105     LAL(rW2, rD3, 1, 8)
0106     LAL(rW3, rD1, 1, 8)
0107     LAL(rW4, rD2, 2, 4)
0108     LAL(rW6, rD1, 3, 0)
0109     LAH(rW5, rD3, 2, 4)
0110     LAL(rW5, rD0, 2, 4)
0111     LAH(rW7, rD2, 3, 0)
0112     evldw       rD1,32(rKP)     /* two key words from rKP+32 / rKP+36 */
0113     EAD(rD3, 3)
0114     evxor       rW2,rW2,rW4
0115     LWL(rW7, 0)
0116     evxor       rW2,rW2,rW6
0117     EAD(rD2, 0)
0118     evxor       rD1,rD1,rW2
0119     LWL(rW1, 12)
0120     evxor       rD1,rD1,rW0
0121     evldw       rD3,40(rKP)     /* two key words from rKP+40 / rKP+44 */
0122     evmergehi   rD0,rD0,rD1
0123     EAD(rD1, 2)
0124     evxor       rW3,rW3,rW5
0125     LWH(rW4, 4)
0126     evxor       rW3,rW3,rW7
0127     EAD(rD0, 3)
0128     evxor       rD3,rD3,rW3
0129     LWH(rW6, 0)
0130     evxor       rD3,rD3,rW1
0131     EAD(rD0, 1)
0132     evmergehi   rD2,rD2,rD3
0133     LWH(rW3, 8)
0134     addi        rKP,rKP,32      /* step key pointer past both copies  */
0135     bdnz        ppc_encrypt_block_loop  /* CTR--, loop while CTR != 0 */
0136     LAH(rW0, rD3, 0, 12)    /* post-loop: one more copy of the round body */
0137     LAL(rW0, rD0, 0, 12)
0138     LAH(rW1, rD1, 0, 12)
0139     LAH(rW2, rD2, 1, 8)
0140     LAL(rW2, rD3, 1, 8)
0141     LAL(rW3, rD1, 1, 8)
0142     LAL(rW4, rD2, 2, 4)
0143     LAH(rW5, rD3, 2, 4)
0144     LAL(rW6, rD1, 3, 0)
0145     LAL(rW5, rD0, 2, 4)
0146     LAH(rW7, rD2, 3, 0)
0147     evldw       rD1,16(rKP)
0148     EAD(rD3, 3)
0149     evxor       rW2,rW2,rW4
0150     LWL(rW7, 0)
0151     evxor       rW2,rW2,rW6
0152     EAD(rD2, 0)
0153     evxor       rD1,rD1,rW2
0154     LWL(rW1, 12)
0155     evxor       rD1,rD1,rW0
0156     evldw       rD3,24(rKP)
0157     evmergehi   rD0,rD0,rD1
0158     EAD(rD1, 0)             /* from here on: single-byte lookups at    */
0159     evxor       rW3,rW3,rW5
0160     LBE(rW2)                /* offset 8 of the slot instead of words   */
0161     evxor       rW3,rW3,rW7
0162     EAD(rD0, 1)
0163     evxor       rD3,rD3,rW3
0164     LBE(rW6)
0165     evxor       rD3,rD3,rW1
0166     EAD(rD0, 0)
0167     evmergehi   rD2,rD2,rD3
0168     LBE(rW1)
0169     LAE(rW0, rD3, 0)
0170     LAE(rW1, rD0, 0)
0171     LAE(rW4, rD2, 1)
0172     LAE(rW5, rD3, 1)
0173     LAE(rW3, rD2, 0)
0174     LAE(rW7, rD1, 1)
0175     rlwimi      rW0,rW4,8,16,23     /* insert byte into bits 16-23 */
0176     rlwimi      rW1,rW5,8,16,23
0177     LAE(rW4, rD1, 2)
0178     LAE(rW5, rD2, 2)
0179     rlwimi      rW2,rW6,8,16,23
0180     rlwimi      rW3,rW7,8,16,23
0181     LAE(rW6, rD3, 2)
0182     LAE(rW7, rD0, 2)
0183     rlwimi      rW0,rW4,16,8,15     /* insert byte into bits 8-15  */
0184     rlwimi      rW1,rW5,16,8,15
0185     LAE(rW4, rD0, 3)
0186     LAE(rW5, rD1, 3)
0187     rlwimi      rW2,rW6,16,8,15
0188     lwz     rD0,32(rKP)     /* rD0-rD3 = words at 32..44(rKP), left    */
0189     rlwimi      rW3,rW7,16,8,15
0190     lwz     rD1,36(rKP)     /* for the caller's final xor with rW0-rW3 */
0191     LAE(rW6, rD2, 3)
0192     LAE(rW7, rD3, 3)
0193     rlwimi      rW0,rW4,24,0,7      /* insert byte into bits 0-7   */
0194     lwz     rD2,40(rKP)
0195     rlwimi      rW1,rW5,24,0,7
0196     lwz     rD3,44(rKP)
0197     rlwimi      rW2,rW6,24,0,7
0198     rlwimi      rW3,rW7,24,0,7
0199     blr
0200 
0201 /*
0202  * ppc_decrypt_block: The central decryption function for a single 16 bytes
0203  * block. It does no stack handling or register saving to support fast calls
0204  * via bl/blr. It expects that caller has pre-xored input data with first
0205  * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
0206  * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
0207  * and rW0-rW3 and caller must execute a final xor on the output registers.
0208  * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
0209  *
      * NOTE(review): the final step below addresses its byte table through
      * rT1 (DAD/LAD/LBD) and nothing in this function initialises rT1, so
      * rT1 apparently has to be set up by the caller as well, in addition to
      * the registers listed above -- confirm against the callers.
      * NOTE(review): "encryption key" above presumably means the key schedule
      * used for decryption that rKP points at -- confirm.
      *
      * Structure as visible in the code below:
      *  - Three table lookups are issued before the loop. Every copy of the
      *    round body re-issues the same three lookups at its end (split into
      *    EAD + LWH so that evxor instructions can be placed in between), so
      *    the next copy always starts with rW0, rW6 and rW3 preloaded.
      *  - Each loop iteration runs two unrolled copies of the round body. The
      *    first reads key material at 16..31(rKP), the second at 32..47(rKP).
      *    rKP is then advanced by 32 and bdnz decrements CTR and loops while
      *    it is non-zero.
      *  - After the loop one more copy of the round body runs with key
      *    material at 16..31(rKP). Its tail switches from word lookups via
      *    rT0 to single-byte lookups via rT1, and the bytes are packed into
      *    rW0-rW3 with rlwimi.
      *  - On return rW0-rW3 hold the packed bytes and rD0-rD3 hold the four
      *    words at 32..44(rKP) (presumably the last round key for the
      *    caller's final xor). rT0 and rT1 have been modified by the address
      *    calculations and rKP has been advanced by 32 per loop iteration.
0210  */
0211 _GLOBAL(ppc_decrypt_block)
0212     LAH(rW0, rD1, 0, 12)    /* pre-loop: preload the three lookups ... */
0213     LAH(rW6, rD0, 3, 0)     /* ... that each round body otherwise ...   */
0214     LAH(rW3, rD0, 1, 8)     /* ... issues at its end for the next one   */
0215 ppc_decrypt_block_loop:     /* one iteration = two copies of round body */
0216     LAH(rW1, rD3, 0, 12)
0217     LAL(rW0, rD2, 0, 12)
0218     LAH(rW2, rD2, 1, 8)
0219     LAL(rW2, rD3, 1, 8)
0220     LAH(rW4, rD3, 2, 4)
0221     LAL(rW4, rD0, 2, 4)
0222     LAL(rW6, rD1, 3, 0)
0223     LAH(rW5, rD1, 2, 4)
0224     LAH(rW7, rD2, 3, 0)
0225     LAL(rW7, rD3, 3, 0)
0226     LAL(rW3, rD1, 1, 8)
0227     evldw       rD1,16(rKP)     /* two key words from rKP+16 / rKP+20 */
0228     EAD(rD0, 0)
0229     evxor       rW4,rW4,rW6
0230     LWL(rW1, 12)
0231     evxor       rW0,rW0,rW4
0232     EAD(rD2, 2)
0233     evxor       rW0,rW0,rW2
0234     LWL(rW5, 4)
0235     evxor       rD1,rD1,rW0
0236     evldw       rD3,24(rKP)     /* two key words from rKP+24 / rKP+28 */
0237     evmergehi   rD0,rD0,rD1     /* low word of rD0 = high word of rD1 */
0238     EAD(rD1, 0)
0239     evxor       rW3,rW3,rW7
0240     LWH(rW0, 12)
0241     evxor       rW3,rW3,rW1
0242     EAD(rD0, 3)
0243     evxor       rD3,rD3,rW3
0244     LWH(rW6, 0)
0245     evxor       rD3,rD3,rW5
0246     EAD(rD0, 1)
0247     evmergehi   rD2,rD2,rD3     /* low word of rD2 = high word of rD3 */
0248     LWH(rW3, 8)
0249     LAH(rW1, rD3, 0, 12)    /* second unrolled copy of the round body */
0250     LAL(rW0, rD2, 0, 12)
0251     LAH(rW2, rD2, 1, 8)
0252     LAL(rW2, rD3, 1, 8)
0253     LAH(rW4, rD3, 2, 4)
0254     LAL(rW4, rD0, 2, 4)
0255     LAL(rW6, rD1, 3, 0)
0256     LAH(rW5, rD1, 2, 4)
0257     LAH(rW7, rD2, 3, 0)
0258     LAL(rW7, rD3, 3, 0)
0259     LAL(rW3, rD1, 1, 8)
0260     evldw        rD1,32(rKP)    /* two key words from rKP+32 / rKP+36 */
0261     EAD(rD0, 0)
0262     evxor       rW4,rW4,rW6
0263     LWL(rW1, 12)
0264     evxor       rW0,rW0,rW4
0265     EAD(rD2, 2)
0266     evxor       rW0,rW0,rW2
0267     LWL(rW5, 4)
0268     evxor       rD1,rD1,rW0
0269     evldw       rD3,40(rKP)     /* two key words from rKP+40 / rKP+44 */
0270     evmergehi   rD0,rD0,rD1
0271     EAD(rD1, 0)
0272     evxor       rW3,rW3,rW7
0273     LWH(rW0, 12)
0274     evxor       rW3,rW3,rW1
0275     EAD(rD0, 3)
0276     evxor       rD3,rD3,rW3
0277     LWH(rW6, 0)
0278     evxor       rD3,rD3,rW5
0279     EAD(rD0, 1)
0280     evmergehi   rD2,rD2,rD3
0281     LWH(rW3, 8)
0282     addi        rKP,rKP,32      /* step key pointer past both copies  */
0283     bdnz        ppc_decrypt_block_loop  /* CTR--, loop while CTR != 0 */
0284     LAH(rW1, rD3, 0, 12)    /* post-loop: one more copy of the round body */
0285     LAL(rW0, rD2, 0, 12)
0286     LAH(rW2, rD2, 1, 8)
0287     LAL(rW2, rD3, 1, 8)
0288     LAH(rW4, rD3, 2, 4)
0289     LAL(rW4, rD0, 2, 4)
0290     LAL(rW6, rD1, 3, 0)
0291     LAH(rW5, rD1, 2, 4)
0292     LAH(rW7, rD2, 3, 0)
0293     LAL(rW7, rD3, 3, 0)
0294     LAL(rW3, rD1, 1, 8)
0295     evldw        rD1,16(rKP)
0296     EAD(rD0, 0)
0297     evxor       rW4,rW4,rW6
0298     LWL(rW1, 12)
0299     evxor       rW0,rW0,rW4
0300     EAD(rD2, 2)
0301     evxor       rW0,rW0,rW2
0302     LWL(rW5, 4)
0303     evxor       rD1,rD1,rW0
0304     evldw       rD3,24(rKP)
0305     evmergehi   rD0,rD0,rD1
0306     DAD(rD1, 0)             /* from here on: single-byte lookups via   */
0307     evxor       rW3,rW3,rW7
0308     LBD(rW0)                /* rT1 instead of word lookups via rT0     */
0309     evxor       rW3,rW3,rW1
0310     DAD(rD0, 1)
0311     evxor       rD3,rD3,rW3
0312     LBD(rW6)
0313     evxor       rD3,rD3,rW5
0314     DAD(rD0, 0)
0315     evmergehi   rD2,rD2,rD3
0316     LBD(rW3)
0317     LAD(rW2, rD3, 0)
0318     LAD(rW1, rD2, 0)
0319     LAD(rW4, rD2, 1)
0320     LAD(rW5, rD3, 1)
0321     LAD(rW7, rD1, 1)
0322     rlwimi      rW0,rW4,8,16,23     /* insert byte into bits 16-23 */
0323     rlwimi      rW1,rW5,8,16,23
0324     LAD(rW4, rD3, 2)
0325     LAD(rW5, rD0, 2)
0326     rlwimi      rW2,rW6,8,16,23
0327     rlwimi      rW3,rW7,8,16,23
0328     LAD(rW6, rD1, 2)
0329     LAD(rW7, rD2, 2)
0330     rlwimi      rW0,rW4,16,8,15     /* insert byte into bits 8-15  */
0331     rlwimi      rW1,rW5,16,8,15
0332     LAD(rW4, rD0, 3)
0333     LAD(rW5, rD1, 3)
0334     rlwimi      rW2,rW6,16,8,15
0335     lwz     rD0,32(rKP)     /* rD0-rD3 = words at 32..44(rKP), left    */
0336     rlwimi      rW3,rW7,16,8,15
0337     lwz     rD1,36(rKP)     /* for the caller's final xor with rW0-rW3 */
0338     LAD(rW6, rD2, 3)
0339     LAD(rW7, rD3, 3)
0340     rlwimi      rW0,rW4,24,0,7      /* insert byte into bits 0-7   */
0341     lwz     rD2,40(rKP)
0342     rlwimi      rW1,rW5,24,0,7
0343     lwz     rD3,44(rKP)
0344     rlwimi      rW2,rW6,24,0,7
0345     rlwimi      rW3,rW7,24,0,7
0346     blr