#include <linux/linkage.h>
#include <asm/assembler.h>

	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	XL2		.req	q5
	XM2		.req	q6
	XH2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text

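	//
	// 64x64 -> 128 bit carryless multiply using the ARMv8 Crypto
	// Extensions vmull.p64 instruction. The b1-b4 arguments are ignored
	// here; they exist only so that this macro and __pmull_p8 below can
	// be invoked with the same argument list.
	//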
	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm
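
	//
	// 64x64 -> 128 bit carryless multiply synthesised from 8x8 -> 16 bit
	// vmull.p8 multiplies, for CPUs that lack the 64-bit PMULL
	// instruction. Partial products of each operand with byte-rotated
	// copies of the other are masked, shifted into position and XORed
	// together with the plain product D = A*B.
	//
	// The b1-b4 arguments may be used to pass in pre-rotated copies of
	// the second operand (see pmull_ghash_update_p8 below); if they are
	// left at their defaults, the rotations are computed here.
	//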
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
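	// This folds the middle Karatsuba term (XM) into XH:XL and performs
	// the bulk of the reduction of the 256-bit product modulo the GHASH
	// polynomial, using the constant that the caller has loaded into
	// MASK; the caller applies the final combining XORs afterwards.
	//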
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
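	// The same reduction is carried out here with plain 64-bit shifts
	// and XORs instead of a polynomial multiply by the reduction
	// constant.
	//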
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

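	//
	// Main GHASH update loop, instantiated once per multiplier variant
	// (\pn is either p64 or p8). Register usage, as set up by the entry
	// points below:
	//	r0	number of blocks to process
	//	r1	digest (2 x 64 bits), updated in place
	//	r2	source data
	//	[sp]	optional partial 'head' block to process first
	//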
	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	@ process the optional head block first, if one was supplied
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		3f

0:	.ifc		\pn, p64
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	// 4-way aggregated processing: four blocks per iteration are
	// combined with H^4, H^3, H^2 and H (HH4, HH3, HH, SHASH)
	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T3-T2}, [r2]!
	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T3, T3
	vrev64.8	T1, T2

	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif

2:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1

3:	@ multiply XL by SHASH in GF(2^128)
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1 + a0)(b1 + b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm
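
	//
	// GHASH update using the 64-bit polynomial multiply (vmull.p64).
	// The C-side prototype, as expected from the register usage and the
	// accompanying glue code, is along the lines of:
	//
	//	void pmull_ghash_update_p64(int blocks, u64 dg[],
	//				    const char *src,
	//				    struct ghash_key const *k,
	//				    const char *head);
	//
	// r3 (the key) holds the precomputed powers H, H^2, H^3 and H^4 of
	// the hash key; the folded Karatsuba operands and the reduction
	// constant are derived below before entering the shared loop.
	//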
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
ENDPROC(pmull_ghash_update_p64)
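	//
	// GHASH update for CPUs without the 64-bit PMULL instruction. Only
	// H itself is needed from the key; byte-rotated copies of it and the
	// bit masks used by __pmull_p8 are prepared here.
	//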
ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
ENDPROC(pmull_ghash_update_p8)