0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027 #include <linux/linkage.h>
0028 #define STRIDE_BLOCKS 8
0029
0030 KEY_POWERS .req x0
0031 MSG .req x1
0032 BLOCKS_LEFT .req x2
0033 ACCUMULATOR .req x3
0034 KEY_START .req x10
0035 EXTRA_BYTES .req x11
0036 TMP .req x13
0037
0038 M0 .req v0
0039 M1 .req v1
0040 M2 .req v2
0041 M3 .req v3
0042 M4 .req v4
0043 M5 .req v5
0044 M6 .req v6
0045 M7 .req v7
0046 KEY8 .req v8
0047 KEY7 .req v9
0048 KEY6 .req v10
0049 KEY5 .req v11
0050 KEY4 .req v12
0051 KEY3 .req v13
0052 KEY2 .req v14
0053 KEY1 .req v15
0054 PL .req v16
0055 PH .req v17
0056 TMP_V .req v18
0057 LO .req v20
0058 MI .req v21
0059 HI .req v22
0060 SUM .req v23
0061 GSTAR .req v24
0062
0063 .text
0064
0065 .arch armv8-a+crypto
0066 .align 4
0067
0068 .Lgstar:
0069 .quad 0xc200000000000000, 0xc200000000000000
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093 .macro karatsuba1 X Y
0094 X .req \X
0095 Y .req \Y
0096 ext v25.16b, X.16b, X.16b, #8
0097 ext v26.16b, Y.16b, Y.16b, #8
0098 eor v25.16b, v25.16b, X.16b
0099 eor v26.16b, v26.16b, Y.16b
0100 pmull2 v28.1q, X.2d, Y.2d
0101 pmull v29.1q, X.1d, Y.1d
0102 pmull v27.1q, v25.1d, v26.1d
0103 eor HI.16b, HI.16b, v28.16b
0104 eor LO.16b, LO.16b, v29.16b
0105 eor MI.16b, MI.16b, v27.16b
0106 .unreq X
0107 .unreq Y
0108 .endm
0109
0110
0111
0112
0113
0114 .macro karatsuba1_store X Y
0115 X .req \X
0116 Y .req \Y
0117 ext v25.16b, X.16b, X.16b, #8
0118 ext v26.16b, Y.16b, Y.16b, #8
0119 eor v25.16b, v25.16b, X.16b
0120 eor v26.16b, v26.16b, Y.16b
0121 pmull2 HI.1q, X.2d, Y.2d
0122 pmull LO.1q, X.1d, Y.1d
0123 pmull MI.1q, v25.1d, v26.1d
0124 .unreq X
0125 .unreq Y
0126 .endm
0127
0128
0129
0130
0131
0132
0133
0134 .macro karatsuba2
0135 // v4 = [HI_1 + MI_1 : HI_0 + MI_0]
0136 eor v4.16b, HI.16b, MI.16b
0137 // v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
0138 eor v4.16b, v4.16b, LO.16b
0139 // v5 = [HI_0 : LO_1]
0140 ext v5.16b, LO.16b, HI.16b, #8
0141 // v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
0142 eor v4.16b, v4.16b, v5.16b
0143 // HI = [HI_0 : HI_1]
0144 ext HI.16b, HI.16b, HI.16b, #8
0145 // LO = [LO_0 : LO_1]
0146 ext LO.16b, LO.16b, LO.16b, #8
0147 // PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
0148 ext PH.16b, v4.16b, HI.16b, #8
0149 // PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
0150 ext PL.16b, LO.16b, v4.16b, #8
0151 .endm
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187
0188
0189
0190
0191
0192 .macro montgomery_reduction dest
0193 DEST .req \dest
0194 // TMP_V = T_1 : T_0 = P_0 * g*(x)
0195 pmull TMP_V.1q, PL.1d, GSTAR.1d
0196 // TMP_V = T_0 : T_1
0197 ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
0198 // TMP_V = P_1 + T_0 : P_0 + T_1
0199 eor TMP_V.16b, PL.16b, TMP_V.16b
0200 // PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
0201 eor PH.16b, PH.16b, TMP_V.16b
0202 // TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
0203 pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
0204 eor DEST.16b, PH.16b, TMP_V.16b
0205 .unreq DEST
0206 .endm
0207
0208
0209
0210
0211
0212
0213
0214
0215
0216
0217
0218 .macro full_stride reduce
0219 eor LO.16b, LO.16b, LO.16b
0220 eor MI.16b, MI.16b, MI.16b
0221 eor HI.16b, HI.16b, HI.16b
0222
0223 ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
0224 ld1 {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64
0225
0226 karatsuba1 M7 KEY1
0227 .if \reduce
0228 pmull TMP_V.1q, PL.1d, GSTAR.1d
0229 .endif
0230
0231 karatsuba1 M6 KEY2
0232 .if \reduce
0233 ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
0234 .endif
0235
0236 karatsuba1 M5 KEY3
0237 .if \reduce
0238 eor TMP_V.16b, PL.16b, TMP_V.16b
0239 .endif
0240
0241 karatsuba1 M4 KEY4
0242 .if \reduce
0243 eor PH.16b, PH.16b, TMP_V.16b
0244 .endif
0245
0246 karatsuba1 M3 KEY5
0247 .if \reduce
0248 pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
0249 .endif
0250
0251 karatsuba1 M2 KEY6
0252 .if \reduce
0253 eor SUM.16b, PH.16b, TMP_V.16b
0254 .endif
0255
0256 karatsuba1 M1 KEY7
0257 eor M0.16b, M0.16b, SUM.16b
0258
0259 karatsuba1 M0 KEY8
0260 karatsuba2
0261 .endm
0262
0263
0264
0265
0266 .macro partial_stride
0267 add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
0268 sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
0269 ld1 {KEY1.16b}, [KEY_POWERS], #16
0270
0271 ld1 {TMP_V.16b}, [MSG], #16
0272 eor SUM.16b, SUM.16b, TMP_V.16b
0273 karatsuba1_store KEY1 SUM
0274 sub BLOCKS_LEFT, BLOCKS_LEFT, #1
0275
0276 tst BLOCKS_LEFT, #4
0277 beq .Lpartial4BlocksDone
0278 ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
0279 ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
0280 karatsuba1 M0 KEY8
0281 karatsuba1 M1 KEY7
0282 karatsuba1 M2 KEY6
0283 karatsuba1 M3 KEY5
0284 .Lpartial4BlocksDone:
0285 tst BLOCKS_LEFT, #2
0286 beq .Lpartial2BlocksDone
0287 ld1 {M0.16b, M1.16b}, [MSG], #32
0288 ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
0289 karatsuba1 M0 KEY8
0290 karatsuba1 M1 KEY7
0291 .Lpartial2BlocksDone:
0292 tst BLOCKS_LEFT, #1
0293 beq .LpartialDone
0294 ld1 {M0.16b}, [MSG], #16
0295 ld1 {KEY8.16b}, [KEY_POWERS], #16
0296 karatsuba1 M0 KEY8
0297 .LpartialDone:
0298 karatsuba2
0299 montgomery_reduction SUM
0300 .endm
0301
0302
0303
0304
0305
0306
0307
0308
0309
0310
0311 SYM_FUNC_START(pmull_polyval_mul)
0312 adr TMP, .Lgstar
0313 ld1 {GSTAR.2d}, [TMP]
0314 ld1 {v0.16b}, [x0]
0315 ld1 {v1.16b}, [x1]
0316 karatsuba1_store v0 v1
0317 karatsuba2
0318 montgomery_reduction SUM
0319 st1 {SUM.16b}, [x0]
0320 ret
0321 SYM_FUNC_END(pmull_polyval_mul)
0322
0323
0324
0325
0326
0327
0328
0329
0330
0331
0332
0333
0334
0335
0336 SYM_FUNC_START(pmull_polyval_update)
0337 adr TMP, .Lgstar
0338 mov KEY_START, KEY_POWERS
0339 ld1 {GSTAR.2d}, [TMP]
0340 ld1 {SUM.16b}, [ACCUMULATOR]
0341 subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
0342 blt .LstrideLoopExit
0343 ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
0344 ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
0345 full_stride 0
0346 subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
0347 blt .LstrideLoopExitReduce
0348 .LstrideLoop:
0349 full_stride 1
0350 subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
0351 bge .LstrideLoop
0352 .LstrideLoopExitReduce:
0353 montgomery_reduction SUM
0354 .LstrideLoopExit:
0355 adds BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
0356 beq .LskipPartial
0357 partial_stride
0358 .LskipPartial:
0359 st1 {SUM.16b}, [ACCUMULATOR]
0360 ret
0361 SYM_FUNC_END(pmull_polyval_update)