// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 */

#ifdef CONFIG_ARM64
#include <asm/neon-intrinsics.h>

#define AES_ROUND   "aese %0.16b, %1.16b \n\t aesmc %0.16b, %0.16b"
#else
#include <arm_neon.h>

#define AES_ROUND   "aese.8 %q0, %q1 \n\t aesmc.8 %q0, %q0"
#endif

#define AEGIS_BLOCK_SIZE    16

#include <stddef.h>

extern int aegis128_have_aes_insn;

void *memcpy(void *dest, const void *src, size_t n);

struct aegis128_state {
    uint8x16_t v[5];
};

extern const uint8_t crypto_aes_sbox[];

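/*
 * Load the 80-byte AEGIS-128 state (five 16-byte words) from memory into
 * NEON registers.
 */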
static struct aegis128_state aegis128_load_state_neon(const void *state)
{
    return (struct aegis128_state){ {
        vld1q_u8(state),
        vld1q_u8(state + 16),
        vld1q_u8(state + 32),
        vld1q_u8(state + 48),
        vld1q_u8(state + 64)
    } };
}

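/* Store the five 16-byte state words back to memory. */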
static void aegis128_save_state_neon(struct aegis128_state st, void *state)
{
    vst1q_u8(state, st.v[0]);
    vst1q_u8(state + 16, st.v[1]);
    vst1q_u8(state + 32, st.v[2]);
    vst1q_u8(state + 48, st.v[3]);
    vst1q_u8(state + 64, st.v[4]);
}

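/*
 * Perform one AES encryption round (ShiftRows, SubBytes, MixColumns) on w
 * with an all-zero round key.  On arm64 cores that lack the AES instructions
 * (aegis128_have_aes_insn == 0), fall back to a plain NEON implementation
 * that uses tbl/tbx lookups into crypto_aes_sbox for SubBytes.
 */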
static inline __attribute__((always_inline))
uint8x16_t aegis_aes_round(uint8x16_t w)
{
    uint8x16_t z = {};

#ifdef CONFIG_ARM64
    if (!__builtin_expect(aegis128_have_aes_insn, 1)) {
        static const uint8_t shift_rows[] = {
            0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
            0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
        };
        static const uint8_t ror32by8[] = {
            0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
            0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
        };
        uint8x16_t v;

        // shift rows
        w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

        // sub bytes
#ifndef CONFIG_CC_IS_GCC
        v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), w);
        v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x40), w - 0x40);
        v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x80), w - 0x80);
        v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0xc0), w - 0xc0);
#else
        asm("tbl %0.16b, {v16.16b-v19.16b}, %1.16b" : "=w"(v) : "w"(w));
        w -= 0x40;
        asm("tbx %0.16b, {v20.16b-v23.16b}, %1.16b" : "+w"(v) : "w"(w));
        w -= 0x40;
        asm("tbx %0.16b, {v24.16b-v27.16b}, %1.16b" : "+w"(v) : "w"(w));
        w -= 0x40;
        asm("tbx %0.16b, {v28.16b-v31.16b}, %1.16b" : "+w"(v) : "w"(w));
#endif

        // mix columns
        w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b);
        w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v);
        w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

        return w;
    }
#endif

    /*
     * We use inline asm here instead of the vaeseq_u8/vaesmcq_u8 intrinsics
     * to force the compiler to issue the aese/aesmc instructions in pairs.
     * This is much faster on many cores, where the instruction pair can
     * execute in a single cycle.
     */
    asm(AES_ROUND : "+w"(w) : "w"(z));
    return w;
}

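/*
 * AEGIS-128 state update: every state word is XORed with the AES round
 * output of the preceding word, and the message block m is additionally
 * XORed into v[0].
 */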
static inline __attribute__((always_inline))
struct aegis128_state aegis128_update_neon(struct aegis128_state st,
                       uint8x16_t m)
{
    m       ^= aegis_aes_round(st.v[4]);
    st.v[4] ^= aegis_aes_round(st.v[3]);
    st.v[3] ^= aegis_aes_round(st.v[2]);
    st.v[2] ^= aegis_aes_round(st.v[1]);
    st.v[1] ^= aegis_aes_round(st.v[0]);
    st.v[0] ^= m;

    return st;
}

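/*
 * When building for arm64 with GCC, the tbl/tbx inline asm fallback in
 * aegis_aes_round expects the AES S-box to live in NEON registers v16-v31,
 * so load all 256 bytes of crypto_aes_sbox up front.  This is a no-op on
 * other configurations and on CPUs that have the AES instructions.
 */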
static inline __attribute__((always_inline))
void preload_sbox(void)
{
    if (!IS_ENABLED(CONFIG_ARM64) ||
        !IS_ENABLED(CONFIG_CC_IS_GCC) ||
        __builtin_expect(aegis128_have_aes_insn, 1))
        return;

    asm("ld1    {v16.16b-v19.16b}, [%0], #64    \n\t"
        "ld1    {v20.16b-v23.16b}, [%0], #64    \n\t"
        "ld1    {v24.16b-v27.16b}, [%0], #64    \n\t"
        "ld1    {v28.16b-v31.16b}, [%0]     \n\t"
        :: "r"(crypto_aes_sbox));
}

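/*
 * Initialize the AEGIS-128 state from the key and IV: the five state words
 * are seeded from the key, the IV and the two AEGIS constants (const0 and
 * const1), and the state is then mixed with ten update rounds alternating
 * between the key and key ^ IV.
 */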
void crypto_aegis128_init_neon(void *state, const void *key, const void *iv)
{
    static const uint8_t const0[] = {
        0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d,
        0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62,
    };
    static const uint8_t const1[] = {
        0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1,
        0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd,
    };
    uint8x16_t k = vld1q_u8(key);
    uint8x16_t kiv = k ^ vld1q_u8(iv);
    struct aegis128_state st = {{
        kiv,
        vld1q_u8(const1),
        vld1q_u8(const0),
        k ^ vld1q_u8(const0),
        k ^ vld1q_u8(const1),
    }};
    int i;

    preload_sbox();

    for (i = 0; i < 5; i++) {
        st = aegis128_update_neon(st, k);
        st = aegis128_update_neon(st, kiv);
    }
    aegis128_save_state_neon(st, state);
}

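/* Absorb a single 16-byte block (e.g. associated data) into the state. */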
void crypto_aegis128_update_neon(void *state, const void *msg)
{
    struct aegis128_state st = aegis128_load_state_neon(state);

    preload_sbox();

    st = aegis128_update_neon(st, vld1q_u8(msg));

    aegis128_save_state_neon(st, state);
}

#ifdef CONFIG_ARM
/*
 * AArch32 does not provide these intrinsics natively because it does not
 * implement the underlying instructions. AArch32 only provides the 64-bit
 * wide vtbl.8/vtbx.8 instructions, so use those instead.
 */
static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
{
    union {
        uint8x16_t  val;
        uint8x8x2_t pair;
    } __a = { a };

    return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
               vtbl2_u8(__a.pair, vget_high_u8(b)));
}

static uint8x16_t vqtbx1q_u8(uint8x16_t v, uint8x16_t a, uint8x16_t b)
{
    union {
        uint8x16_t  val;
        uint8x8x2_t pair;
    } __a = { a };

    return vcombine_u8(vtbx2_u8(vget_low_u8(v), __a.pair, vget_low_u8(b)),
               vtbx2_u8(vget_high_u8(v), __a.pair, vget_high_u8(b)));
}

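/* Emulate the AArch64 vminvq_s8 horizontal minimum using pairwise minimums. */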
static int8_t vminvq_s8(int8x16_t v)
{
    int8x8_t s = vpmin_s8(vget_low_s8(v), vget_high_s8(v));

    s = vpmin_s8(s, s);
    s = vpmin_s8(s, s);
    s = vpmin_s8(s, s);

    return vget_lane_s8(s, 0);
}
#endif

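/*
 * Index table used with tbl/tbx to shift a partial block into or out of
 * position: the -1 (0xff) entries select zero with tbl and leave the
 * destination lane untouched with tbx.
 */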
static const uint8_t permute[] __aligned(64) = {
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};

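/*
 * Encrypt a chunk of plaintext: each full block is XORed with the keystream
 * word v[1] ^ (v[2] & v[3]) ^ v[4] and the plaintext is absorbed into the
 * state.  A trailing partial block is zero-padded via the permute table, and
 * inputs shorter than one full block are bounced through a stack buffer so
 * that the 16-byte loads and stores stay within the caller's buffers.
 */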
void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
                    unsigned int size)
{
    struct aegis128_state st = aegis128_load_state_neon(state);
    const int short_input = size < AEGIS_BLOCK_SIZE;
    uint8x16_t msg;

    preload_sbox();

    while (size >= AEGIS_BLOCK_SIZE) {
        uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];

        msg = vld1q_u8(src);
        st = aegis128_update_neon(st, msg);
        msg ^= s;
        vst1q_u8(dst, msg);

        size -= AEGIS_BLOCK_SIZE;
        src += AEGIS_BLOCK_SIZE;
        dst += AEGIS_BLOCK_SIZE;
    }

    if (size > 0) {
        uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
        uint8_t buf[AEGIS_BLOCK_SIZE];
        const void *in = src;
        void *out = dst;
        uint8x16_t m;

        if (__builtin_expect(short_input, 0))
            in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);

        m = vqtbl1q_u8(vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
                   vld1q_u8(permute + 32 - size));

        st = aegis128_update_neon(st, m);

        vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
             vqtbl1q_u8(m ^ s, vld1q_u8(permute + size)));

        if (__builtin_expect(short_input, 0))
            memcpy(dst, out, size);
        else
            vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
    }

    aegis128_save_state_neon(st, state);
}

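/*
 * Decrypt a chunk of ciphertext: each block is XORed with the keystream word
 * v[1] ^ (v[2] & v[3]) ^ v[4] and the recovered plaintext is absorbed into
 * the state.  The partial-block handling mirrors the encrypt path; using s
 * as the tbx fallback value ensures that the padding bytes of the absorbed
 * block end up zero.
 */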
void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
                    unsigned int size)
{
    struct aegis128_state st = aegis128_load_state_neon(state);
    const int short_input = size < AEGIS_BLOCK_SIZE;
    uint8x16_t msg;

    preload_sbox();

    while (size >= AEGIS_BLOCK_SIZE) {
        msg = vld1q_u8(src) ^ st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
        st = aegis128_update_neon(st, msg);
        vst1q_u8(dst, msg);

        size -= AEGIS_BLOCK_SIZE;
        src += AEGIS_BLOCK_SIZE;
        dst += AEGIS_BLOCK_SIZE;
    }

    if (size > 0) {
        uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
        uint8_t buf[AEGIS_BLOCK_SIZE];
        const void *in = src;
        void *out = dst;
        uint8x16_t m;

        if (__builtin_expect(short_input, 0))
            in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);

        m = s ^ vqtbx1q_u8(s, vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
                   vld1q_u8(permute + 32 - size));

        st = aegis128_update_neon(st, m);

        vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
             vqtbl1q_u8(m, vld1q_u8(permute + size)));

        if (__builtin_expect(short_input, 0))
            memcpy(dst, out, size);
        else
            vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
    }

    aegis128_save_state_neon(st, state);
}

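/*
 * Finalize: fold the associated data and message lengths (in bits) into
 * v[3], run seven update rounds, and XOR the five state words into the tag.
 * When authsize > 0, compare the first authsize bytes of the computed tag
 * against tag_xor and return 0 on match or a negative value on mismatch;
 * otherwise write the tag to tag_xor.
 */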
int crypto_aegis128_final_neon(void *state, void *tag_xor,
                   unsigned int assoclen,
                   unsigned int cryptlen,
                   unsigned int authsize)
{
    struct aegis128_state st = aegis128_load_state_neon(state);
    uint8x16_t v;
    int i;

    preload_sbox();

    v = st.v[3] ^ (uint8x16_t)vcombine_u64(vmov_n_u64(8ULL * assoclen),
                           vmov_n_u64(8ULL * cryptlen));

    for (i = 0; i < 7; i++)
        st = aegis128_update_neon(st, v);

    v = st.v[0] ^ st.v[1] ^ st.v[2] ^ st.v[3] ^ st.v[4];

    if (authsize > 0) {
        v = vqtbl1q_u8(~vceqq_u8(v, vld1q_u8(tag_xor)),
                   vld1q_u8(permute + authsize));

        return vminvq_s8((int8x16_t)v);
    }

    vst1q_u8(tag_xor, v);
    return 0;
}