#ifdef CONFIG_ARM64
#include <asm/neon-intrinsics.h>

#define AES_ROUND	"aese %0.16b, %1.16b \n\t aesmc %0.16b, %0.16b"
#else
#include <arm_neon.h>

#define AES_ROUND	"aese.8 %q0, %q1 \n\t aesmc.8 %q0, %q0"
#endif

#define AEGIS_BLOCK_SIZE	16

#include <stddef.h>
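
/*
 * Non-zero when the CPU implements the ARMv8 Crypto Extensions AES
 * instructions; when it is zero, aegis_aes_round() below takes the plain
 * NEON table-based fallback path instead.
 */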
extern int aegis128_have_aes_insn;

/* no string.h is included here, so declare the one library call we make */
void *memcpy(void *dest, const void *src, size_t n);
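
/*
 * The 640-bit AEGIS-128 state: five 128-bit words, kept in NEON registers
 * for the duration of each call.
 */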
struct aegis128_state {
        uint8x16_t v[5];
};

extern const uint8_t crypto_aes_sbox[];

static struct aegis128_state aegis128_load_state_neon(const void *state)
{
        return (struct aegis128_state){ {
                vld1q_u8(state),
                vld1q_u8(state + 16),
                vld1q_u8(state + 32),
                vld1q_u8(state + 48),
                vld1q_u8(state + 64)
        } };
}

static void aegis128_save_state_neon(struct aegis128_state st, void *state)
{
        vst1q_u8(state, st.v[0]);
        vst1q_u8(state + 16, st.v[1]);
        vst1q_u8(state + 32, st.v[2]);
        vst1q_u8(state + 48, st.v[3]);
        vst1q_u8(state + 64, st.v[4]);
}
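
/*
 * One AES encryption round with an all-zero round key: performed with the
 * aese/aesmc instructions where available, or, on arm64 cores without the
 * Crypto Extensions, with a tbl/tbx based ShiftRows/SubBytes/MixColumns
 * fallback.
 */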
static inline __attribute__((always_inline))
uint8x16_t aegis_aes_round(uint8x16_t w)
{
        uint8x16_t z = {};

#ifdef CONFIG_ARM64
        if (!__builtin_expect(aegis128_have_aes_insn, 1)) {
                static const uint8_t shift_rows[] = {
                        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
                        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
                };
                static const uint8_t ror32by8[] = {
                        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
                        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
                };
                uint8x16_t v;

                // shift rows
                w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

                // sub bytes
#ifndef CONFIG_CC_IS_GCC
                v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), w);
                v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x40), w - 0x40);
                v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x80), w - 0x80);
                v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0xc0), w - 0xc0);
#else
                asm("tbl %0.16b, {v16.16b-v19.16b}, %1.16b" : "=w"(v) : "w"(w));
                w -= 0x40;
                asm("tbx %0.16b, {v20.16b-v23.16b}, %1.16b" : "+w"(v) : "w"(w));
                w -= 0x40;
                asm("tbx %0.16b, {v24.16b-v27.16b}, %1.16b" : "+w"(v) : "w"(w));
                w -= 0x40;
                asm("tbx %0.16b, {v28.16b-v31.16b}, %1.16b" : "+w"(v) : "w"(w));
#endif

                // mix columns
                w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b);
                w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v);
                w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

                return w;
        }
#endif

        /*
         * Use inline asm rather than the vaeseq_u8()/vaesmcq_u8() intrinsics
         * so that each aese/aesmc pair is emitted back to back; cores that
         * fuse the two instructions can then execute the pair as one op.
         */
        asm(AES_ROUND : "+w"(w) : "w"(z));
        return w;
}
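
/*
 * One AEGIS-128 state update: each state word is XORed with a single AES
 * round of the preceding word (wrapping around), and the input block m is
 * additionally folded into v[0].
 */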
static inline __attribute__((always_inline))
struct aegis128_state aegis128_update_neon(struct aegis128_state st,
                                           uint8x16_t m)
{
        m       ^= aegis_aes_round(st.v[4]);
        st.v[4] ^= aegis_aes_round(st.v[3]);
        st.v[3] ^= aegis_aes_round(st.v[2]);
        st.v[2] ^= aegis_aes_round(st.v[1]);
        st.v[1] ^= aegis_aes_round(st.v[0]);
        st.v[0] ^= m;

        return st;
}
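
/*
 * When the AArch64 fallback is built with GCC, aegis_aes_round() refers to
 * the AES S-box via fixed registers v16-v31 in its tbl/tbx asm, so load the
 * 256-byte S-box into those registers up front. In every other configuration
 * this is a no-op.
 */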
static inline __attribute__((always_inline))
void preload_sbox(void)
{
        if (!IS_ENABLED(CONFIG_ARM64) ||
            !IS_ENABLED(CONFIG_CC_IS_GCC) ||
            __builtin_expect(aegis128_have_aes_insn, 1))
                return;

        asm("ld1 {v16.16b-v19.16b}, [%0], #64 \n\t"
            "ld1 {v20.16b-v23.16b}, [%0], #64 \n\t"
            "ld1 {v24.16b-v27.16b}, [%0], #64 \n\t"
            "ld1 {v28.16b-v31.16b}, [%0]      \n\t"
            :: "r"(crypto_aes_sbox));
}
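
/*
 * Initialise the state from the key and IV: const0/const1 are the
 * Fibonacci-derived AEGIS constants, and the state is then mixed with ten
 * update rounds that alternate between the key and key ^ IV.
 */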
void crypto_aegis128_init_neon(void *state, const void *key, const void *iv)
{
        static const uint8_t const0[] = {
                0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d,
                0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62,
        };
        static const uint8_t const1[] = {
                0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1,
                0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd,
        };
        uint8x16_t k = vld1q_u8(key);
        uint8x16_t kiv = k ^ vld1q_u8(iv);
        struct aegis128_state st = {{
                kiv,
                vld1q_u8(const1),
                vld1q_u8(const0),
                k ^ vld1q_u8(const0),
                k ^ vld1q_u8(const1),
        }};
        int i;

        preload_sbox();

        for (i = 0; i < 5; i++) {
                st = aegis128_update_neon(st, k);
                st = aegis128_update_neon(st, kiv);
        }
        aegis128_save_state_neon(st, state);
}
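
/*
 * Absorb a single 16-byte block of input (e.g. associated data) into the
 * state without producing any output.
 */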
void crypto_aegis128_update_neon(void *state, const void *msg)
{
        struct aegis128_state st = aegis128_load_state_neon(state);

        preload_sbox();

        st = aegis128_update_neon(st, vld1q_u8(msg));

        aegis128_save_state_neon(st, state);
}

#ifdef CONFIG_ARM
/*
 * AArch32 does not implement the 128-bit wide TBL/TBX or the horizontal
 * minimum instructions, so provide equivalents of the vqtbl1q_u8(),
 * vqtbx1q_u8() and vminvq_s8() intrinsics built from the 64-bit
 * vtbl2/vtbx2 and pairwise-minimum operations it does have.
 */
static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
{
        union {
                uint8x16_t val;
                uint8x8x2_t pair;
        } __a = { a };

        return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
                           vtbl2_u8(__a.pair, vget_high_u8(b)));
}

static uint8x16_t vqtbx1q_u8(uint8x16_t v, uint8x16_t a, uint8x16_t b)
{
        union {
                uint8x16_t val;
                uint8x8x2_t pair;
        } __a = { a };

        return vcombine_u8(vtbx2_u8(vget_low_u8(v), __a.pair, vget_low_u8(b)),
                           vtbx2_u8(vget_high_u8(v), __a.pair, vget_high_u8(b)));
}

static int8_t vminvq_s8(int8x16_t v)
{
        int8x8_t s = vpmin_s8(vget_low_s8(v), vget_high_s8(v));

        s = vpmin_s8(s, s);
        s = vpmin_s8(s, s);
        s = vpmin_s8(s, s);

        return vget_lane_s8(s, 0);
}
#endif
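
/*
 * Index masks for tbl/tbx based handling of partial blocks: an out-of-range
 * index (0xff) yields zero with vqtbl1q_u8() and leaves the destination lane
 * unchanged with vqtbx1q_u8(), so a 16-byte window into this table both
 * shifts a partial block into position and pads the remaining lanes.
 */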
static const uint8_t permute[] __aligned(64) = {
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};
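
/*
 * Encrypt a chunk of data. Whole blocks are processed directly from the
 * source buffer; a trailing partial block is shifted into a full NEON
 * register using the permute table, and the ciphertext block written just
 * before it is re-stored afterwards to undo the overlap of the final
 * unaligned store.
 */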
void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
                                        unsigned int size)
{
        struct aegis128_state st = aegis128_load_state_neon(state);
        const int short_input = size < AEGIS_BLOCK_SIZE;
        uint8x16_t msg;

        preload_sbox();

        while (size >= AEGIS_BLOCK_SIZE) {
                uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];

                msg = vld1q_u8(src);
                st = aegis128_update_neon(st, msg);
                msg ^= s;
                vst1q_u8(dst, msg);

                size -= AEGIS_BLOCK_SIZE;
                src += AEGIS_BLOCK_SIZE;
                dst += AEGIS_BLOCK_SIZE;
        }

        if (size > 0) {
                uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
                uint8_t buf[AEGIS_BLOCK_SIZE];
                const void *in = src;
                void *out = dst;
                uint8x16_t m;

                /* stage short inputs through a stack buffer to avoid
                 * touching memory outside the caller's buffers */
                if (__builtin_expect(short_input, 0))
                        in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);

                m = vqtbl1q_u8(vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
                               vld1q_u8(permute + 32 - size));

                st = aegis128_update_neon(st, m);

                vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
                         vqtbl1q_u8(m ^ s, vld1q_u8(permute + size)));

                if (__builtin_expect(short_input, 0))
                        memcpy(dst, out, size);
                else
                        /* put back the previous ciphertext bytes clobbered
                         * by the overlapping store above */
                        vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
        }

        aegis128_save_state_neon(st, state);
}
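
/*
 * Decrypt a chunk of data. The structure mirrors the encrypt path, except
 * that the trailing partial block uses vqtbx1q_u8() so that the lanes beyond
 * the end of the message decrypt to zero, which is the padding the state
 * update expects.
 */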
void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
                                        unsigned int size)
{
        struct aegis128_state st = aegis128_load_state_neon(state);
        const int short_input = size < AEGIS_BLOCK_SIZE;
        uint8x16_t msg;

        preload_sbox();

        while (size >= AEGIS_BLOCK_SIZE) {
                msg = vld1q_u8(src) ^ st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
                st = aegis128_update_neon(st, msg);
                vst1q_u8(dst, msg);

                size -= AEGIS_BLOCK_SIZE;
                src += AEGIS_BLOCK_SIZE;
                dst += AEGIS_BLOCK_SIZE;
        }

        if (size > 0) {
                uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
                uint8_t buf[AEGIS_BLOCK_SIZE];
                const void *in = src;
                void *out = dst;
                uint8x16_t m;

                if (__builtin_expect(short_input, 0))
                        in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);

                m = s ^ vqtbx1q_u8(s, vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
                                   vld1q_u8(permute + 32 - size));

                st = aegis128_update_neon(st, m);

                vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
                         vqtbl1q_u8(m, vld1q_u8(permute + size)));

                if (__builtin_expect(short_input, 0))
                        memcpy(dst, out, size);
                else
                        vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
        }

        aegis128_save_state_neon(st, state);
}
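
/*
 * Finalise: fold the associated data and message bit lengths into the state,
 * run seven more update rounds, and XOR all five state words into the tag.
 * With a non-zero authsize the candidate tag in tag_xor is compared against
 * the computed one, returning 0 on match and a negative value on mismatch;
 * otherwise the computed tag is written back to tag_xor.
 */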
int crypto_aegis128_final_neon(void *state, void *tag_xor,
                               unsigned int assoclen,
                               unsigned int cryptlen,
                               unsigned int authsize)
{
        struct aegis128_state st = aegis128_load_state_neon(state);
        uint8x16_t v;
        int i;

        preload_sbox();

        v = st.v[3] ^ (uint8x16_t)vcombine_u64(vmov_n_u64(8ULL * assoclen),
                                               vmov_n_u64(8ULL * cryptlen));

        for (i = 0; i < 7; i++)
                st = aegis128_update_neon(st, v);

        v = st.v[0] ^ st.v[1] ^ st.v[2] ^ st.v[3] ^ st.v[4];

        if (authsize > 0) {
                v = vqtbl1q_u8(~vceqq_u8(v, vld1q_u8(tag_xor)),
                               vld1q_u8(permute + authsize));

                return vminvq_s8((int8x16_t)v);
        }

        vst1q_u8(tag_xor, v);
        return 0;
}