0fc8348585aaa725ad6a30b17d2e302fdc381767
[openwrt/staging/ldir.git] /
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3 Date: Sun, 5 Jan 2020 22:40:48 -0500
4 Subject: [PATCH] crypto: x86/poly1305 - wire up faster implementations for
5 kernel
6
7 commit d7d7b853566254648df59f7ea27ea05952a6cfa8 upstream.
8
9 These x86_64 vectorized implementations support AVX, AVX-2, and AVX512F.
10 The AVX-512F implementation is disabled on Skylake, due to throttling,
11 but it is quite fast on >= Cannonlake.
12
13 On the left are cycle counts on a Core i7 6700HQ using the AVX-2
14 codepath, comparing this implementation ("new") to the implementation in
15 the current crypto API ("old"). On the right are benchmarks on a Xeon
16 Gold 5120 using the AVX-512 codepath. The new implementation is faster
17 on all benchmarks.
18
19 AVX-2 AVX-512
20 --------- -----------
21
22 size old new size old new
23 ---- ---- ---- ---- ---- ----
24 0 70 68 0 74 70
25 16 92 90 16 96 92
26 32 134 104 32 136 106
27 48 172 120 48 184 124
28 64 218 136 64 218 138
29 80 254 158 80 260 160
30 96 298 174 96 300 176
31 112 342 192 112 342 194
32 128 388 212 128 384 212
33 144 428 228 144 420 226
34 160 466 246 160 464 248
35 176 510 264 176 504 264
36 192 550 282 192 544 282
37 208 594 302 208 582 300
38 224 628 316 224 624 318
39 240 676 334 240 662 338
40 256 716 354 256 708 358
41 272 764 374 272 748 372
42 288 802 352 288 788 358
43 304 420 366 304 422 370
44 320 428 360 320 432 364
45 336 484 378 336 486 380
46 352 426 384 352 434 390
47 368 478 400 368 480 408
48 384 488 394 384 490 398
49 400 542 408 400 542 412
50 416 486 416 416 492 426
51 432 534 430 432 538 436
52 448 544 422 448 546 432
53 464 600 438 464 600 448
54 480 540 448 480 548 456
55 496 594 464 496 594 476
56 512 602 456 512 606 470
57 528 656 476 528 656 480
58 544 600 480 544 606 498
59 560 650 494 560 652 512
60 576 664 490 576 662 508
61 592 714 508 592 716 522
62 608 656 514 608 664 538
63 624 708 532 624 710 552
64 640 716 524 640 720 516
65 656 770 536 656 772 526
66 672 716 548 672 722 544
67 688 770 562 688 768 556
68 704 774 552 704 778 556
69 720 826 568 720 832 568
70 736 768 574 736 780 584
71 752 822 592 752 826 600
72 768 830 584 768 836 560
73 784 884 602 784 888 572
74 800 828 610 800 838 588
75 816 884 628 816 884 604
76 832 888 618 832 894 598
77 848 942 632 848 946 612
78 864 884 644 864 896 628
79 880 936 660 880 942 644
80 896 948 652 896 952 608
81 912 1000 664 912 1004 616
82 928 942 676 928 954 634
83 944 994 690 944 1000 646
84 960 1002 680 960 1008 646
85 976 1054 694 976 1062 658
86 992 1002 706 992 1012 674
87 1008 1052 720 1008 1058 690
88
89 This commit wires in the prior implementation from Andy, and makes the
90 following changes to be suitable for kernel land.
91
92 - Some cosmetic and structural changes, like renaming labels to
93 .Lname, constants, and other Linux conventions, as well as making
94 the code easy for us to maintain moving forward.
95
96 - CPU feature checking is done in C by the glue code.
97
98 - We avoid jumping into the middle of functions, to appease objtool,
99 and instead parameterize shared code.
100
101 - We maintain frame pointers so that stack traces make sense.
102
103 - We remove the dependency on the perl xlate code, which transforms
104 the output into forms used by assemblers we don't care about.
105
106 Importantly, none of our changes affect the arithmetic or core code, but
107 just involve the differing environment of kernel space.
108
109 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
110 Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
111 Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
112 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
113 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
114 ---
115 arch/x86/crypto/.gitignore | 1 +
116 arch/x86/crypto/Makefile | 11 +-
117 arch/x86/crypto/poly1305-avx2-x86_64.S | 390 ----------
118 arch/x86/crypto/poly1305-sse2-x86_64.S | 590 ---------------
119 arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 682 ++++++++++--------
120 arch/x86/crypto/poly1305_glue.c | 473 +++++-------
121 lib/crypto/Kconfig | 2 +-
122 7 files changed, 572 insertions(+), 1577 deletions(-)
123 create mode 100644 arch/x86/crypto/.gitignore
124 delete mode 100644 arch/x86/crypto/poly1305-avx2-x86_64.S
125 delete mode 100644 arch/x86/crypto/poly1305-sse2-x86_64.S
126
127 --- /dev/null
128 +++ b/arch/x86/crypto/.gitignore
129 @@ -0,0 +1 @@
130 +poly1305-x86_64-cryptogams.S
131 --- a/arch/x86/crypto/Makefile
132 +++ b/arch/x86/crypto/Makefile
133 @@ -73,6 +73,10 @@ aegis128-aesni-y := aegis128-aesni-asm.o
134
135 nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
136 blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
137 +poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
138 +ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
139 +targets += poly1305-x86_64-cryptogams.S
140 +endif
141
142 ifeq ($(avx_supported),yes)
143 camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
144 @@ -101,10 +105,8 @@ aesni-intel-y := aesni-intel_asm.o aesni
145 aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
146 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
147 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
148 -poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
149 ifeq ($(avx2_supported),yes)
150 sha1-ssse3-y += sha1_avx2_x86_64_asm.o
151 -poly1305-x86_64-y += poly1305-avx2-x86_64.o
152 endif
153 ifeq ($(sha1_ni_supported),yes)
154 sha1-ssse3-y += sha1_ni_asm.o
155 @@ -118,3 +120,8 @@ sha256-ssse3-y += sha256_ni_asm.o
156 endif
157 sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
158 crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
159 +
160 +quiet_cmd_perlasm = PERLASM $@
161 + cmd_perlasm = $(PERL) $< > $@
162 +$(obj)/%.S: $(src)/%.pl FORCE
163 + $(call if_changed,perlasm)
164 --- a/arch/x86/crypto/poly1305-avx2-x86_64.S
165 +++ /dev/null
166 @@ -1,390 +0,0 @@
167 -/* SPDX-License-Identifier: GPL-2.0-or-later */
168 -/*
169 - * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
170 - *
171 - * Copyright (C) 2015 Martin Willi
172 - */
173 -
174 -#include <linux/linkage.h>
175 -
176 -.section .rodata.cst32.ANMASK, "aM", @progbits, 32
177 -.align 32
178 -ANMASK: .octa 0x0000000003ffffff0000000003ffffff
179 - .octa 0x0000000003ffffff0000000003ffffff
180 -
181 -.section .rodata.cst32.ORMASK, "aM", @progbits, 32
182 -.align 32
183 -ORMASK: .octa 0x00000000010000000000000001000000
184 - .octa 0x00000000010000000000000001000000
185 -
186 -.text
187 -
188 -#define h0 0x00(%rdi)
189 -#define h1 0x04(%rdi)
190 -#define h2 0x08(%rdi)
191 -#define h3 0x0c(%rdi)
192 -#define h4 0x10(%rdi)
193 -#define r0 0x00(%rdx)
194 -#define r1 0x04(%rdx)
195 -#define r2 0x08(%rdx)
196 -#define r3 0x0c(%rdx)
197 -#define r4 0x10(%rdx)
198 -#define u0 0x00(%r8)
199 -#define u1 0x04(%r8)
200 -#define u2 0x08(%r8)
201 -#define u3 0x0c(%r8)
202 -#define u4 0x10(%r8)
203 -#define w0 0x18(%r8)
204 -#define w1 0x1c(%r8)
205 -#define w2 0x20(%r8)
206 -#define w3 0x24(%r8)
207 -#define w4 0x28(%r8)
208 -#define y0 0x30(%r8)
209 -#define y1 0x34(%r8)
210 -#define y2 0x38(%r8)
211 -#define y3 0x3c(%r8)
212 -#define y4 0x40(%r8)
213 -#define m %rsi
214 -#define hc0 %ymm0
215 -#define hc1 %ymm1
216 -#define hc2 %ymm2
217 -#define hc3 %ymm3
218 -#define hc4 %ymm4
219 -#define hc0x %xmm0
220 -#define hc1x %xmm1
221 -#define hc2x %xmm2
222 -#define hc3x %xmm3
223 -#define hc4x %xmm4
224 -#define t1 %ymm5
225 -#define t2 %ymm6
226 -#define t1x %xmm5
227 -#define t2x %xmm6
228 -#define ruwy0 %ymm7
229 -#define ruwy1 %ymm8
230 -#define ruwy2 %ymm9
231 -#define ruwy3 %ymm10
232 -#define ruwy4 %ymm11
233 -#define ruwy0x %xmm7
234 -#define ruwy1x %xmm8
235 -#define ruwy2x %xmm9
236 -#define ruwy3x %xmm10
237 -#define ruwy4x %xmm11
238 -#define svxz1 %ymm12
239 -#define svxz2 %ymm13
240 -#define svxz3 %ymm14
241 -#define svxz4 %ymm15
242 -#define d0 %r9
243 -#define d1 %r10
244 -#define d2 %r11
245 -#define d3 %r12
246 -#define d4 %r13
247 -
248 -ENTRY(poly1305_4block_avx2)
249 - # %rdi: Accumulator h[5]
250 - # %rsi: 64 byte input block m
251 - # %rdx: Poly1305 key r[5]
252 - # %rcx: Quadblock count
253 - # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5],
254 -
255 - # This four-block variant uses loop unrolled block processing. It
256 - # requires 4 Poly1305 keys: r, r^2, r^3 and r^4:
257 - # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
258 -
259 - vzeroupper
260 - push %rbx
261 - push %r12
262 - push %r13
263 -
264 - # combine r0,u0,w0,y0
265 - vmovd y0,ruwy0x
266 - vmovd w0,t1x
267 - vpunpcklqdq t1,ruwy0,ruwy0
268 - vmovd u0,t1x
269 - vmovd r0,t2x
270 - vpunpcklqdq t2,t1,t1
271 - vperm2i128 $0x20,t1,ruwy0,ruwy0
272 -
273 - # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5
274 - vmovd y1,ruwy1x
275 - vmovd w1,t1x
276 - vpunpcklqdq t1,ruwy1,ruwy1
277 - vmovd u1,t1x
278 - vmovd r1,t2x
279 - vpunpcklqdq t2,t1,t1
280 - vperm2i128 $0x20,t1,ruwy1,ruwy1
281 - vpslld $2,ruwy1,svxz1
282 - vpaddd ruwy1,svxz1,svxz1
283 -
284 - # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5
285 - vmovd y2,ruwy2x
286 - vmovd w2,t1x
287 - vpunpcklqdq t1,ruwy2,ruwy2
288 - vmovd u2,t1x
289 - vmovd r2,t2x
290 - vpunpcklqdq t2,t1,t1
291 - vperm2i128 $0x20,t1,ruwy2,ruwy2
292 - vpslld $2,ruwy2,svxz2
293 - vpaddd ruwy2,svxz2,svxz2
294 -
295 - # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5
296 - vmovd y3,ruwy3x
297 - vmovd w3,t1x
298 - vpunpcklqdq t1,ruwy3,ruwy3
299 - vmovd u3,t1x
300 - vmovd r3,t2x
301 - vpunpcklqdq t2,t1,t1
302 - vperm2i128 $0x20,t1,ruwy3,ruwy3
303 - vpslld $2,ruwy3,svxz3
304 - vpaddd ruwy3,svxz3,svxz3
305 -
306 - # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5
307 - vmovd y4,ruwy4x
308 - vmovd w4,t1x
309 - vpunpcklqdq t1,ruwy4,ruwy4
310 - vmovd u4,t1x
311 - vmovd r4,t2x
312 - vpunpcklqdq t2,t1,t1
313 - vperm2i128 $0x20,t1,ruwy4,ruwy4
314 - vpslld $2,ruwy4,svxz4
315 - vpaddd ruwy4,svxz4,svxz4
316 -
317 -.Ldoblock4:
318 - # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff,
319 - # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0]
320 - vmovd 0x00(m),hc0x
321 - vmovd 0x10(m),t1x
322 - vpunpcklqdq t1,hc0,hc0
323 - vmovd 0x20(m),t1x
324 - vmovd 0x30(m),t2x
325 - vpunpcklqdq t2,t1,t1
326 - vperm2i128 $0x20,t1,hc0,hc0
327 - vpand ANMASK(%rip),hc0,hc0
328 - vmovd h0,t1x
329 - vpaddd t1,hc0,hc0
330 - # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff,
331 - # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1]
332 - vmovd 0x03(m),hc1x
333 - vmovd 0x13(m),t1x
334 - vpunpcklqdq t1,hc1,hc1
335 - vmovd 0x23(m),t1x
336 - vmovd 0x33(m),t2x
337 - vpunpcklqdq t2,t1,t1
338 - vperm2i128 $0x20,t1,hc1,hc1
339 - vpsrld $2,hc1,hc1
340 - vpand ANMASK(%rip),hc1,hc1
341 - vmovd h1,t1x
342 - vpaddd t1,hc1,hc1
343 - # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff,
344 - # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2]
345 - vmovd 0x06(m),hc2x
346 - vmovd 0x16(m),t1x
347 - vpunpcklqdq t1,hc2,hc2
348 - vmovd 0x26(m),t1x
349 - vmovd 0x36(m),t2x
350 - vpunpcklqdq t2,t1,t1
351 - vperm2i128 $0x20,t1,hc2,hc2
352 - vpsrld $4,hc2,hc2
353 - vpand ANMASK(%rip),hc2,hc2
354 - vmovd h2,t1x
355 - vpaddd t1,hc2,hc2
356 - # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff,
357 - # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3]
358 - vmovd 0x09(m),hc3x
359 - vmovd 0x19(m),t1x
360 - vpunpcklqdq t1,hc3,hc3
361 - vmovd 0x29(m),t1x
362 - vmovd 0x39(m),t2x
363 - vpunpcklqdq t2,t1,t1
364 - vperm2i128 $0x20,t1,hc3,hc3
365 - vpsrld $6,hc3,hc3
366 - vpand ANMASK(%rip),hc3,hc3
367 - vmovd h3,t1x
368 - vpaddd t1,hc3,hc3
369 - # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24),
370 - # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4]
371 - vmovd 0x0c(m),hc4x
372 - vmovd 0x1c(m),t1x
373 - vpunpcklqdq t1,hc4,hc4
374 - vmovd 0x2c(m),t1x
375 - vmovd 0x3c(m),t2x
376 - vpunpcklqdq t2,t1,t1
377 - vperm2i128 $0x20,t1,hc4,hc4
378 - vpsrld $8,hc4,hc4
379 - vpor ORMASK(%rip),hc4,hc4
380 - vmovd h4,t1x
381 - vpaddd t1,hc4,hc4
382 -
383 - # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ]
384 - vpmuludq hc0,ruwy0,t1
385 - # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ]
386 - vpmuludq hc1,svxz4,t2
387 - vpaddq t2,t1,t1
388 - # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ]
389 - vpmuludq hc2,svxz3,t2
390 - vpaddq t2,t1,t1
391 - # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ]
392 - vpmuludq hc3,svxz2,t2
393 - vpaddq t2,t1,t1
394 - # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ]
395 - vpmuludq hc4,svxz1,t2
396 - vpaddq t2,t1,t1
397 - # d0 = t1[0] + t1[1] + t[2] + t[3]
398 - vpermq $0xee,t1,t2
399 - vpaddq t2,t1,t1
400 - vpsrldq $8,t1,t2
401 - vpaddq t2,t1,t1
402 - vmovq t1x,d0
403 -
404 - # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ]
405 - vpmuludq hc0,ruwy1,t1
406 - # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ]
407 - vpmuludq hc1,ruwy0,t2
408 - vpaddq t2,t1,t1
409 - # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ]
410 - vpmuludq hc2,svxz4,t2
411 - vpaddq t2,t1,t1
412 - # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ]
413 - vpmuludq hc3,svxz3,t2
414 - vpaddq t2,t1,t1
415 - # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ]
416 - vpmuludq hc4,svxz2,t2
417 - vpaddq t2,t1,t1
418 - # d1 = t1[0] + t1[1] + t1[3] + t1[4]
419 - vpermq $0xee,t1,t2
420 - vpaddq t2,t1,t1
421 - vpsrldq $8,t1,t2
422 - vpaddq t2,t1,t1
423 - vmovq t1x,d1
424 -
425 - # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ]
426 - vpmuludq hc0,ruwy2,t1
427 - # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ]
428 - vpmuludq hc1,ruwy1,t2
429 - vpaddq t2,t1,t1
430 - # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ]
431 - vpmuludq hc2,ruwy0,t2
432 - vpaddq t2,t1,t1
433 - # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ]
434 - vpmuludq hc3,svxz4,t2
435 - vpaddq t2,t1,t1
436 - # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ]
437 - vpmuludq hc4,svxz3,t2
438 - vpaddq t2,t1,t1
439 - # d2 = t1[0] + t1[1] + t1[2] + t1[3]
440 - vpermq $0xee,t1,t2
441 - vpaddq t2,t1,t1
442 - vpsrldq $8,t1,t2
443 - vpaddq t2,t1,t1
444 - vmovq t1x,d2
445 -
446 - # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ]
447 - vpmuludq hc0,ruwy3,t1
448 - # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ]
449 - vpmuludq hc1,ruwy2,t2
450 - vpaddq t2,t1,t1
451 - # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ]
452 - vpmuludq hc2,ruwy1,t2
453 - vpaddq t2,t1,t1
454 - # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ]
455 - vpmuludq hc3,ruwy0,t2
456 - vpaddq t2,t1,t1
457 - # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ]
458 - vpmuludq hc4,svxz4,t2
459 - vpaddq t2,t1,t1
460 - # d3 = t1[0] + t1[1] + t1[2] + t1[3]
461 - vpermq $0xee,t1,t2
462 - vpaddq t2,t1,t1
463 - vpsrldq $8,t1,t2
464 - vpaddq t2,t1,t1
465 - vmovq t1x,d3
466 -
467 - # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ]
468 - vpmuludq hc0,ruwy4,t1
469 - # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ]
470 - vpmuludq hc1,ruwy3,t2
471 - vpaddq t2,t1,t1
472 - # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ]
473 - vpmuludq hc2,ruwy2,t2
474 - vpaddq t2,t1,t1
475 - # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ]
476 - vpmuludq hc3,ruwy1,t2
477 - vpaddq t2,t1,t1
478 - # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ]
479 - vpmuludq hc4,ruwy0,t2
480 - vpaddq t2,t1,t1
481 - # d4 = t1[0] + t1[1] + t1[2] + t1[3]
482 - vpermq $0xee,t1,t2
483 - vpaddq t2,t1,t1
484 - vpsrldq $8,t1,t2
485 - vpaddq t2,t1,t1
486 - vmovq t1x,d4
487 -
488 - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
489 - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
490 - # amount. Careful: we must not assume the carry bits 'd0 >> 26',
491 - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
492 - # integers. It's true in a single-block implementation, but not here.
493 -
494 - # d1 += d0 >> 26
495 - mov d0,%rax
496 - shr $26,%rax
497 - add %rax,d1
498 - # h0 = d0 & 0x3ffffff
499 - mov d0,%rbx
500 - and $0x3ffffff,%ebx
501 -
502 - # d2 += d1 >> 26
503 - mov d1,%rax
504 - shr $26,%rax
505 - add %rax,d2
506 - # h1 = d1 & 0x3ffffff
507 - mov d1,%rax
508 - and $0x3ffffff,%eax
509 - mov %eax,h1
510 -
511 - # d3 += d2 >> 26
512 - mov d2,%rax
513 - shr $26,%rax
514 - add %rax,d3
515 - # h2 = d2 & 0x3ffffff
516 - mov d2,%rax
517 - and $0x3ffffff,%eax
518 - mov %eax,h2
519 -
520 - # d4 += d3 >> 26
521 - mov d3,%rax
522 - shr $26,%rax
523 - add %rax,d4
524 - # h3 = d3 & 0x3ffffff
525 - mov d3,%rax
526 - and $0x3ffffff,%eax
527 - mov %eax,h3
528 -
529 - # h0 += (d4 >> 26) * 5
530 - mov d4,%rax
531 - shr $26,%rax
532 - lea (%rax,%rax,4),%rax
533 - add %rax,%rbx
534 - # h4 = d4 & 0x3ffffff
535 - mov d4,%rax
536 - and $0x3ffffff,%eax
537 - mov %eax,h4
538 -
539 - # h1 += h0 >> 26
540 - mov %rbx,%rax
541 - shr $26,%rax
542 - add %eax,h1
543 - # h0 = h0 & 0x3ffffff
544 - andl $0x3ffffff,%ebx
545 - mov %ebx,h0
546 -
547 - add $0x40,m
548 - dec %rcx
549 - jnz .Ldoblock4
550 -
551 - vzeroupper
552 - pop %r13
553 - pop %r12
554 - pop %rbx
555 - ret
556 -ENDPROC(poly1305_4block_avx2)
557 --- a/arch/x86/crypto/poly1305-sse2-x86_64.S
558 +++ /dev/null
559 @@ -1,590 +0,0 @@
560 -/* SPDX-License-Identifier: GPL-2.0-or-later */
561 -/*
562 - * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
563 - *
564 - * Copyright (C) 2015 Martin Willi
565 - */
566 -
567 -#include <linux/linkage.h>
568 -
569 -.section .rodata.cst16.ANMASK, "aM", @progbits, 16
570 -.align 16
571 -ANMASK: .octa 0x0000000003ffffff0000000003ffffff
572 -
573 -.section .rodata.cst16.ORMASK, "aM", @progbits, 16
574 -.align 16
575 -ORMASK: .octa 0x00000000010000000000000001000000
576 -
577 -.text
578 -
579 -#define h0 0x00(%rdi)
580 -#define h1 0x04(%rdi)
581 -#define h2 0x08(%rdi)
582 -#define h3 0x0c(%rdi)
583 -#define h4 0x10(%rdi)
584 -#define r0 0x00(%rdx)
585 -#define r1 0x04(%rdx)
586 -#define r2 0x08(%rdx)
587 -#define r3 0x0c(%rdx)
588 -#define r4 0x10(%rdx)
589 -#define s1 0x00(%rsp)
590 -#define s2 0x04(%rsp)
591 -#define s3 0x08(%rsp)
592 -#define s4 0x0c(%rsp)
593 -#define m %rsi
594 -#define h01 %xmm0
595 -#define h23 %xmm1
596 -#define h44 %xmm2
597 -#define t1 %xmm3
598 -#define t2 %xmm4
599 -#define t3 %xmm5
600 -#define t4 %xmm6
601 -#define mask %xmm7
602 -#define d0 %r8
603 -#define d1 %r9
604 -#define d2 %r10
605 -#define d3 %r11
606 -#define d4 %r12
607 -
608 -ENTRY(poly1305_block_sse2)
609 - # %rdi: Accumulator h[5]
610 - # %rsi: 16 byte input block m
611 - # %rdx: Poly1305 key r[5]
612 - # %rcx: Block count
613 -
614 - # This single block variant tries to improve performance by doing two
615 - # multiplications in parallel using SSE instructions. There is quite
616 - # some quardword packing involved, hence the speedup is marginal.
617 -
618 - push %rbx
619 - push %r12
620 - sub $0x10,%rsp
621 -
622 - # s1..s4 = r1..r4 * 5
623 - mov r1,%eax
624 - lea (%eax,%eax,4),%eax
625 - mov %eax,s1
626 - mov r2,%eax
627 - lea (%eax,%eax,4),%eax
628 - mov %eax,s2
629 - mov r3,%eax
630 - lea (%eax,%eax,4),%eax
631 - mov %eax,s3
632 - mov r4,%eax
633 - lea (%eax,%eax,4),%eax
634 - mov %eax,s4
635 -
636 - movdqa ANMASK(%rip),mask
637 -
638 -.Ldoblock:
639 - # h01 = [0, h1, 0, h0]
640 - # h23 = [0, h3, 0, h2]
641 - # h44 = [0, h4, 0, h4]
642 - movd h0,h01
643 - movd h1,t1
644 - movd h2,h23
645 - movd h3,t2
646 - movd h4,h44
647 - punpcklqdq t1,h01
648 - punpcklqdq t2,h23
649 - punpcklqdq h44,h44
650 -
651 - # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
652 - movd 0x00(m),t1
653 - movd 0x03(m),t2
654 - psrld $2,t2
655 - punpcklqdq t2,t1
656 - pand mask,t1
657 - paddd t1,h01
658 - # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
659 - movd 0x06(m),t1
660 - movd 0x09(m),t2
661 - psrld $4,t1
662 - psrld $6,t2
663 - punpcklqdq t2,t1
664 - pand mask,t1
665 - paddd t1,h23
666 - # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
667 - mov 0x0c(m),%eax
668 - shr $8,%eax
669 - or $0x01000000,%eax
670 - movd %eax,t1
671 - pshufd $0xc4,t1,t1
672 - paddd t1,h44
673 -
674 - # t1[0] = h0 * r0 + h2 * s3
675 - # t1[1] = h1 * s4 + h3 * s2
676 - movd r0,t1
677 - movd s4,t2
678 - punpcklqdq t2,t1
679 - pmuludq h01,t1
680 - movd s3,t2
681 - movd s2,t3
682 - punpcklqdq t3,t2
683 - pmuludq h23,t2
684 - paddq t2,t1
685 - # t2[0] = h0 * r1 + h2 * s4
686 - # t2[1] = h1 * r0 + h3 * s3
687 - movd r1,t2
688 - movd r0,t3
689 - punpcklqdq t3,t2
690 - pmuludq h01,t2
691 - movd s4,t3
692 - movd s3,t4
693 - punpcklqdq t4,t3
694 - pmuludq h23,t3
695 - paddq t3,t2
696 - # t3[0] = h4 * s1
697 - # t3[1] = h4 * s2
698 - movd s1,t3
699 - movd s2,t4
700 - punpcklqdq t4,t3
701 - pmuludq h44,t3
702 - # d0 = t1[0] + t1[1] + t3[0]
703 - # d1 = t2[0] + t2[1] + t3[1]
704 - movdqa t1,t4
705 - punpcklqdq t2,t4
706 - punpckhqdq t2,t1
707 - paddq t4,t1
708 - paddq t3,t1
709 - movq t1,d0
710 - psrldq $8,t1
711 - movq t1,d1
712 -
713 - # t1[0] = h0 * r2 + h2 * r0
714 - # t1[1] = h1 * r1 + h3 * s4
715 - movd r2,t1
716 - movd r1,t2
717 - punpcklqdq t2,t1
718 - pmuludq h01,t1
719 - movd r0,t2
720 - movd s4,t3
721 - punpcklqdq t3,t2
722 - pmuludq h23,t2
723 - paddq t2,t1
724 - # t2[0] = h0 * r3 + h2 * r1
725 - # t2[1] = h1 * r2 + h3 * r0
726 - movd r3,t2
727 - movd r2,t3
728 - punpcklqdq t3,t2
729 - pmuludq h01,t2
730 - movd r1,t3
731 - movd r0,t4
732 - punpcklqdq t4,t3
733 - pmuludq h23,t3
734 - paddq t3,t2
735 - # t3[0] = h4 * s3
736 - # t3[1] = h4 * s4
737 - movd s3,t3
738 - movd s4,t4
739 - punpcklqdq t4,t3
740 - pmuludq h44,t3
741 - # d2 = t1[0] + t1[1] + t3[0]
742 - # d3 = t2[0] + t2[1] + t3[1]
743 - movdqa t1,t4
744 - punpcklqdq t2,t4
745 - punpckhqdq t2,t1
746 - paddq t4,t1
747 - paddq t3,t1
748 - movq t1,d2
749 - psrldq $8,t1
750 - movq t1,d3
751 -
752 - # t1[0] = h0 * r4 + h2 * r2
753 - # t1[1] = h1 * r3 + h3 * r1
754 - movd r4,t1
755 - movd r3,t2
756 - punpcklqdq t2,t1
757 - pmuludq h01,t1
758 - movd r2,t2
759 - movd r1,t3
760 - punpcklqdq t3,t2
761 - pmuludq h23,t2
762 - paddq t2,t1
763 - # t3[0] = h4 * r0
764 - movd r0,t3
765 - pmuludq h44,t3
766 - # d4 = t1[0] + t1[1] + t3[0]
767 - movdqa t1,t4
768 - psrldq $8,t4
769 - paddq t4,t1
770 - paddq t3,t1
771 - movq t1,d4
772 -
773 - # d1 += d0 >> 26
774 - mov d0,%rax
775 - shr $26,%rax
776 - add %rax,d1
777 - # h0 = d0 & 0x3ffffff
778 - mov d0,%rbx
779 - and $0x3ffffff,%ebx
780 -
781 - # d2 += d1 >> 26
782 - mov d1,%rax
783 - shr $26,%rax
784 - add %rax,d2
785 - # h1 = d1 & 0x3ffffff
786 - mov d1,%rax
787 - and $0x3ffffff,%eax
788 - mov %eax,h1
789 -
790 - # d3 += d2 >> 26
791 - mov d2,%rax
792 - shr $26,%rax
793 - add %rax,d3
794 - # h2 = d2 & 0x3ffffff
795 - mov d2,%rax
796 - and $0x3ffffff,%eax
797 - mov %eax,h2
798 -
799 - # d4 += d3 >> 26
800 - mov d3,%rax
801 - shr $26,%rax
802 - add %rax,d4
803 - # h3 = d3 & 0x3ffffff
804 - mov d3,%rax
805 - and $0x3ffffff,%eax
806 - mov %eax,h3
807 -
808 - # h0 += (d4 >> 26) * 5
809 - mov d4,%rax
810 - shr $26,%rax
811 - lea (%rax,%rax,4),%rax
812 - add %rax,%rbx
813 - # h4 = d4 & 0x3ffffff
814 - mov d4,%rax
815 - and $0x3ffffff,%eax
816 - mov %eax,h4
817 -
818 - # h1 += h0 >> 26
819 - mov %rbx,%rax
820 - shr $26,%rax
821 - add %eax,h1
822 - # h0 = h0 & 0x3ffffff
823 - andl $0x3ffffff,%ebx
824 - mov %ebx,h0
825 -
826 - add $0x10,m
827 - dec %rcx
828 - jnz .Ldoblock
829 -
830 - # Zeroing of key material
831 - mov %rcx,0x00(%rsp)
832 - mov %rcx,0x08(%rsp)
833 -
834 - add $0x10,%rsp
835 - pop %r12
836 - pop %rbx
837 - ret
838 -ENDPROC(poly1305_block_sse2)
839 -
840 -
841 -#define u0 0x00(%r8)
842 -#define u1 0x04(%r8)
843 -#define u2 0x08(%r8)
844 -#define u3 0x0c(%r8)
845 -#define u4 0x10(%r8)
846 -#define hc0 %xmm0
847 -#define hc1 %xmm1
848 -#define hc2 %xmm2
849 -#define hc3 %xmm5
850 -#define hc4 %xmm6
851 -#define ru0 %xmm7
852 -#define ru1 %xmm8
853 -#define ru2 %xmm9
854 -#define ru3 %xmm10
855 -#define ru4 %xmm11
856 -#define sv1 %xmm12
857 -#define sv2 %xmm13
858 -#define sv3 %xmm14
859 -#define sv4 %xmm15
860 -#undef d0
861 -#define d0 %r13
862 -
863 -ENTRY(poly1305_2block_sse2)
864 - # %rdi: Accumulator h[5]
865 - # %rsi: 16 byte input block m
866 - # %rdx: Poly1305 key r[5]
867 - # %rcx: Doubleblock count
868 - # %r8: Poly1305 derived key r^2 u[5]
869 -
870 - # This two-block variant further improves performance by using loop
871 - # unrolled block processing. This is more straight forward and does
872 - # less byte shuffling, but requires a second Poly1305 key r^2:
873 - # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r
874 -
875 - push %rbx
876 - push %r12
877 - push %r13
878 -
879 - # combine r0,u0
880 - movd u0,ru0
881 - movd r0,t1
882 - punpcklqdq t1,ru0
883 -
884 - # combine r1,u1 and s1=r1*5,v1=u1*5
885 - movd u1,ru1
886 - movd r1,t1
887 - punpcklqdq t1,ru1
888 - movdqa ru1,sv1
889 - pslld $2,sv1
890 - paddd ru1,sv1
891 -
892 - # combine r2,u2 and s2=r2*5,v2=u2*5
893 - movd u2,ru2
894 - movd r2,t1
895 - punpcklqdq t1,ru2
896 - movdqa ru2,sv2
897 - pslld $2,sv2
898 - paddd ru2,sv2
899 -
900 - # combine r3,u3 and s3=r3*5,v3=u3*5
901 - movd u3,ru3
902 - movd r3,t1
903 - punpcklqdq t1,ru3
904 - movdqa ru3,sv3
905 - pslld $2,sv3
906 - paddd ru3,sv3
907 -
908 - # combine r4,u4 and s4=r4*5,v4=u4*5
909 - movd u4,ru4
910 - movd r4,t1
911 - punpcklqdq t1,ru4
912 - movdqa ru4,sv4
913 - pslld $2,sv4
914 - paddd ru4,sv4
915 -
916 -.Ldoblock2:
917 - # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
918 - movd 0x00(m),hc0
919 - movd 0x10(m),t1
920 - punpcklqdq t1,hc0
921 - pand ANMASK(%rip),hc0
922 - movd h0,t1
923 - paddd t1,hc0
924 - # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
925 - movd 0x03(m),hc1
926 - movd 0x13(m),t1
927 - punpcklqdq t1,hc1
928 - psrld $2,hc1
929 - pand ANMASK(%rip),hc1
930 - movd h1,t1
931 - paddd t1,hc1
932 - # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
933 - movd 0x06(m),hc2
934 - movd 0x16(m),t1
935 - punpcklqdq t1,hc2
936 - psrld $4,hc2
937 - pand ANMASK(%rip),hc2
938 - movd h2,t1
939 - paddd t1,hc2
940 - # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
941 - movd 0x09(m),hc3
942 - movd 0x19(m),t1
943 - punpcklqdq t1,hc3
944 - psrld $6,hc3
945 - pand ANMASK(%rip),hc3
946 - movd h3,t1
947 - paddd t1,hc3
948 - # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
949 - movd 0x0c(m),hc4
950 - movd 0x1c(m),t1
951 - punpcklqdq t1,hc4
952 - psrld $8,hc4
953 - por ORMASK(%rip),hc4
954 - movd h4,t1
955 - paddd t1,hc4
956 -
957 - # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
958 - movdqa ru0,t1
959 - pmuludq hc0,t1
960 - # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
961 - movdqa sv4,t2
962 - pmuludq hc1,t2
963 - paddq t2,t1
964 - # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
965 - movdqa sv3,t2
966 - pmuludq hc2,t2
967 - paddq t2,t1
968 - # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
969 - movdqa sv2,t2
970 - pmuludq hc3,t2
971 - paddq t2,t1
972 - # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
973 - movdqa sv1,t2
974 - pmuludq hc4,t2
975 - paddq t2,t1
976 - # d0 = t1[0] + t1[1]
977 - movdqa t1,t2
978 - psrldq $8,t2
979 - paddq t2,t1
980 - movq t1,d0
981 -
982 - # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
983 - movdqa ru1,t1
984 - pmuludq hc0,t1
985 - # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
986 - movdqa ru0,t2
987 - pmuludq hc1,t2
988 - paddq t2,t1
989 - # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
990 - movdqa sv4,t2
991 - pmuludq hc2,t2
992 - paddq t2,t1
993 - # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
994 - movdqa sv3,t2
995 - pmuludq hc3,t2
996 - paddq t2,t1
997 - # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
998 - movdqa sv2,t2
999 - pmuludq hc4,t2
1000 - paddq t2,t1
1001 - # d1 = t1[0] + t1[1]
1002 - movdqa t1,t2
1003 - psrldq $8,t2
1004 - paddq t2,t1
1005 - movq t1,d1
1006 -
1007 - # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
1008 - movdqa ru2,t1
1009 - pmuludq hc0,t1
1010 - # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
1011 - movdqa ru1,t2
1012 - pmuludq hc1,t2
1013 - paddq t2,t1
1014 - # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
1015 - movdqa ru0,t2
1016 - pmuludq hc2,t2
1017 - paddq t2,t1
1018 - # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
1019 - movdqa sv4,t2
1020 - pmuludq hc3,t2
1021 - paddq t2,t1
1022 - # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
1023 - movdqa sv3,t2
1024 - pmuludq hc4,t2
1025 - paddq t2,t1
1026 - # d2 = t1[0] + t1[1]
1027 - movdqa t1,t2
1028 - psrldq $8,t2
1029 - paddq t2,t1
1030 - movq t1,d2
1031 -
1032 - # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
1033 - movdqa ru3,t1
1034 - pmuludq hc0,t1
1035 - # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
1036 - movdqa ru2,t2
1037 - pmuludq hc1,t2
1038 - paddq t2,t1
1039 - # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
1040 - movdqa ru1,t2
1041 - pmuludq hc2,t2
1042 - paddq t2,t1
1043 - # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
1044 - movdqa ru0,t2
1045 - pmuludq hc3,t2
1046 - paddq t2,t1
1047 - # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
1048 - movdqa sv4,t2
1049 - pmuludq hc4,t2
1050 - paddq t2,t1
1051 - # d3 = t1[0] + t1[1]
1052 - movdqa t1,t2
1053 - psrldq $8,t2
1054 - paddq t2,t1
1055 - movq t1,d3
1056 -
1057 - # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
1058 - movdqa ru4,t1
1059 - pmuludq hc0,t1
1060 - # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
1061 - movdqa ru3,t2
1062 - pmuludq hc1,t2
1063 - paddq t2,t1
1064 - # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
1065 - movdqa ru2,t2
1066 - pmuludq hc2,t2
1067 - paddq t2,t1
1068 - # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
1069 - movdqa ru1,t2
1070 - pmuludq hc3,t2
1071 - paddq t2,t1
1072 - # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
1073 - movdqa ru0,t2
1074 - pmuludq hc4,t2
1075 - paddq t2,t1
1076 - # d4 = t1[0] + t1[1]
1077 - movdqa t1,t2
1078 - psrldq $8,t2
1079 - paddq t2,t1
1080 - movq t1,d4
1081 -
1082 - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
1083 - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
1084 - # amount. Careful: we must not assume the carry bits 'd0 >> 26',
1085 - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
1086 - # integers. It's true in a single-block implementation, but not here.
1087 -
1088 - # d1 += d0 >> 26
1089 - mov d0,%rax
1090 - shr $26,%rax
1091 - add %rax,d1
1092 - # h0 = d0 & 0x3ffffff
1093 - mov d0,%rbx
1094 - and $0x3ffffff,%ebx
1095 -
1096 - # d2 += d1 >> 26
1097 - mov d1,%rax
1098 - shr $26,%rax
1099 - add %rax,d2
1100 - # h1 = d1 & 0x3ffffff
1101 - mov d1,%rax
1102 - and $0x3ffffff,%eax
1103 - mov %eax,h1
1104 -
1105 - # d3 += d2 >> 26
1106 - mov d2,%rax
1107 - shr $26,%rax
1108 - add %rax,d3
1109 - # h2 = d2 & 0x3ffffff
1110 - mov d2,%rax
1111 - and $0x3ffffff,%eax
1112 - mov %eax,h2
1113 -
1114 - # d4 += d3 >> 26
1115 - mov d3,%rax
1116 - shr $26,%rax
1117 - add %rax,d4
1118 - # h3 = d3 & 0x3ffffff
1119 - mov d3,%rax
1120 - and $0x3ffffff,%eax
1121 - mov %eax,h3
1122 -
1123 - # h0 += (d4 >> 26) * 5
1124 - mov d4,%rax
1125 - shr $26,%rax
1126 - lea (%rax,%rax,4),%rax
1127 - add %rax,%rbx
1128 - # h4 = d4 & 0x3ffffff
1129 - mov d4,%rax
1130 - and $0x3ffffff,%eax
1131 - mov %eax,h4
1132 -
1133 - # h1 += h0 >> 26
1134 - mov %rbx,%rax
1135 - shr $26,%rax
1136 - add %eax,h1
1137 - # h0 = h0 & 0x3ffffff
1138 - andl $0x3ffffff,%ebx
1139 - mov %ebx,h0
1140 -
1141 - add $0x20,m
1142 - dec %rcx
1143 - jnz .Ldoblock2
1144 -
1145 - pop %r13
1146 - pop %r12
1147 - pop %rbx
1148 - ret
1149 -ENDPROC(poly1305_2block_sse2)
1150 --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
1151 +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
1152 @@ -1,11 +1,14 @@
1153 -#! /usr/bin/env perl
1154 -# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
1155 +#!/usr/bin/env perl
1156 +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
1157 #
1158 -# Licensed under the OpenSSL license (the "License"). You may not use
1159 -# this file except in compliance with the License. You can obtain a copy
1160 -# in the file LICENSE in the source distribution or at
1161 -# https://www.openssl.org/source/license.html
1162 -
1163 +# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
1164 +# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
1165 +# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
1166 +#
1167 +# This code is taken from the OpenSSL project but the author, Andy Polyakov,
1168 +# has relicensed it under the licenses specified in the SPDX header above.
1169 +# The original headers, including the original license headers, are
1170 +# included below for completeness.
1171 #
1172 # ====================================================================
1173 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
1174 @@ -32,7 +35,7 @@
1175 # Skylake-X system performance. Since we are likely to suppress
1176 # AVX512F capability flag [at least on Skylake-X], conversion serves
1177 # as kind of "investment protection". Note that next *lake processor,
1178 -# Cannolake, has AVX512IFMA code path to execute...
1179 +# Cannonlake, has AVX512IFMA code path to execute...
1180 #
1181 # Numbers are cycles per processed byte with poly1305_blocks alone,
1182 # measured with rdtsc at fixed clock frequency.
1183 @@ -68,39 +71,114 @@ $output = shift;
1184 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
1185
1186 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
1187 +$kernel=0; $kernel=1 if (!$flavour && !$output);
1188
1189 -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
1190 -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
1191 -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
1192 -die "can't locate x86_64-xlate.pl";
1193 -
1194 -if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
1195 - =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
1196 - $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
1197 +if (!$kernel) {
1198 + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
1199 + ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
1200 + ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
1201 + die "can't locate x86_64-xlate.pl";
1202 +
1203 + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
1204 + *STDOUT=*OUT;
1205 +
1206 + if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
1207 + =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
1208 + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
1209 + }
1210 +
1211 + if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
1212 + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
1213 + $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
1214 + $avx += 1 if ($1==2.11 && $2>=8);
1215 + }
1216 +
1217 + if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
1218 + `ml64 2>&1` =~ /Version ([0-9]+)\./) {
1219 + $avx = ($1>=10) + ($1>=11);
1220 + }
1221 +
1222 + if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
1223 + $avx = ($2>=3.0) + ($2>3.0);
1224 + }
1225 +} else {
1226 + $avx = 4; # The kernel uses ifdefs for this.
1227 }
1228
1229 -if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
1230 - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
1231 - $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
1232 - $avx += 2 if ($1==2.11 && $2>=8);
1233 +sub declare_function() {
1234 + my ($name, $align, $nargs) = @_;
1235 + if($kernel) {
1236 + $code .= ".align $align\n";
1237 + $code .= "ENTRY($name)\n";
1238 + $code .= ".L$name:\n";
1239 + } else {
1240 + $code .= ".globl $name\n";
1241 + $code .= ".type $name,\@function,$nargs\n";
1242 + $code .= ".align $align\n";
1243 + $code .= "$name:\n";
1244 + }
1245 }
1246
1247 -if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
1248 - `ml64 2>&1` =~ /Version ([0-9]+)\./) {
1249 - $avx = ($1>=10) + ($1>=12);
1250 +sub end_function() {
1251 + my ($name) = @_;
1252 + if($kernel) {
1253 + $code .= "ENDPROC($name)\n";
1254 + } else {
1255 + $code .= ".size $name,.-$name\n";
1256 + }
1257 }
1258
1259 -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
1260 - $avx = ($2>=3.0) + ($2>3.0);
1261 -}
1262 +$code.=<<___ if $kernel;
1263 +#include <linux/linkage.h>
1264 +___
1265
1266 -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
1267 -*STDOUT=*OUT;
1268 +if ($avx) {
1269 +$code.=<<___ if $kernel;
1270 +.section .rodata
1271 +___
1272 +$code.=<<___;
1273 +.align 64
1274 +.Lconst:
1275 +.Lmask24:
1276 +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
1277 +.L129:
1278 +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
1279 +.Lmask26:
1280 +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
1281 +.Lpermd_avx2:
1282 +.long 2,2,2,3,2,0,2,1
1283 +.Lpermd_avx512:
1284 +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
1285 +
1286 +.L2_44_inp_permd:
1287 +.long 0,1,1,2,2,3,7,7
1288 +.L2_44_inp_shift:
1289 +.quad 0,12,24,64
1290 +.L2_44_mask:
1291 +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
1292 +.L2_44_shift_rgt:
1293 +.quad 44,44,42,64
1294 +.L2_44_shift_lft:
1295 +.quad 8,8,10,64
1296 +
1297 +.align 64
1298 +.Lx_mask44:
1299 +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
1300 +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
1301 +.Lx_mask42:
1302 +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
1303 +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
1304 +___
1305 +}
1306 +$code.=<<___ if (!$kernel);
1307 +.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1308 +.align 16
1309 +___
1310
1311 my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
1312 my ($mac,$nonce)=($inp,$len); # *_emit arguments
1313 -my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
1314 -my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
1315 +my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
1316 +my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
1317
1318 sub poly1305_iteration {
1319 # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
1320 @@ -155,19 +233,19 @@ ___
1321
1322 $code.=<<___;
1323 .text
1324 -
1325 +___
1326 +$code.=<<___ if (!$kernel);
1327 .extern OPENSSL_ia32cap_P
1328
1329 -.globl poly1305_init
1330 -.hidden poly1305_init
1331 -.globl poly1305_blocks
1332 -.hidden poly1305_blocks
1333 -.globl poly1305_emit
1334 -.hidden poly1305_emit
1335 -
1336 -.type poly1305_init,\@function,3
1337 -.align 32
1338 -poly1305_init:
1339 +.globl poly1305_init_x86_64
1340 +.hidden poly1305_init_x86_64
1341 +.globl poly1305_blocks_x86_64
1342 +.hidden poly1305_blocks_x86_64
1343 +.globl poly1305_emit_x86_64
1344 +.hidden poly1305_emit_x86_64
1345 +___
1346 +&declare_function("poly1305_init_x86_64", 32, 3);
1347 +$code.=<<___;
1348 xor %rax,%rax
1349 mov %rax,0($ctx) # initialize hash value
1350 mov %rax,8($ctx)
1351 @@ -175,11 +253,12 @@ poly1305_init:
1352
1353 cmp \$0,$inp
1354 je .Lno_key
1355 -
1356 - lea poly1305_blocks(%rip),%r10
1357 - lea poly1305_emit(%rip),%r11
1358 ___
1359 -$code.=<<___ if ($avx);
1360 +$code.=<<___ if (!$kernel);
1361 + lea poly1305_blocks_x86_64(%rip),%r10
1362 + lea poly1305_emit_x86_64(%rip),%r11
1363 +___
1364 +$code.=<<___ if (!$kernel && $avx);
1365 mov OPENSSL_ia32cap_P+4(%rip),%r9
1366 lea poly1305_blocks_avx(%rip),%rax
1367 lea poly1305_emit_avx(%rip),%rcx
1368 @@ -187,12 +266,12 @@ $code.=<<___ if ($avx);
1369 cmovc %rax,%r10
1370 cmovc %rcx,%r11
1371 ___
1372 -$code.=<<___ if ($avx>1);
1373 +$code.=<<___ if (!$kernel && $avx>1);
1374 lea poly1305_blocks_avx2(%rip),%rax
1375 bt \$`5+32`,%r9 # AVX2?
1376 cmovc %rax,%r10
1377 ___
1378 -$code.=<<___ if ($avx>3);
1379 +$code.=<<___ if (!$kernel && $avx>3);
1380 mov \$`(1<<31|1<<21|1<<16)`,%rax
1381 shr \$32,%r9
1382 and %rax,%r9
1383 @@ -207,11 +286,11 @@ $code.=<<___;
1384 mov %rax,24($ctx)
1385 mov %rcx,32($ctx)
1386 ___
1387 -$code.=<<___ if ($flavour !~ /elf32/);
1388 +$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
1389 mov %r10,0(%rdx)
1390 mov %r11,8(%rdx)
1391 ___
1392 -$code.=<<___ if ($flavour =~ /elf32/);
1393 +$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
1394 mov %r10d,0(%rdx)
1395 mov %r11d,4(%rdx)
1396 ___
1397 @@ -219,11 +298,11 @@ $code.=<<___;
1398 mov \$1,%eax
1399 .Lno_key:
1400 ret
1401 -.size poly1305_init,.-poly1305_init
1402 +___
1403 +&end_function("poly1305_init_x86_64");
1404
1405 -.type poly1305_blocks,\@function,4
1406 -.align 32
1407 -poly1305_blocks:
1408 +&declare_function("poly1305_blocks_x86_64", 32, 4);
1409 +$code.=<<___;
1410 .cfi_startproc
1411 .Lblocks:
1412 shr \$4,$len
1413 @@ -231,8 +310,6 @@ poly1305_blocks:
1414
1415 push %rbx
1416 .cfi_push %rbx
1417 - push %rbp
1418 -.cfi_push %rbp
1419 push %r12
1420 .cfi_push %r12
1421 push %r13
1422 @@ -241,6 +318,8 @@ poly1305_blocks:
1423 .cfi_push %r14
1424 push %r15
1425 .cfi_push %r15
1426 + push $ctx
1427 +.cfi_push $ctx
1428 .Lblocks_body:
1429
1430 mov $len,%r15 # reassign $len
1431 @@ -265,26 +344,29 @@ poly1305_blocks:
1432 lea 16($inp),$inp
1433 adc $padbit,$h2
1434 ___
1435 +
1436 &poly1305_iteration();
1437 +
1438 $code.=<<___;
1439 mov $r1,%rax
1440 dec %r15 # len-=16
1441 jnz .Loop
1442
1443 + mov 0(%rsp),$ctx
1444 +.cfi_restore $ctx
1445 +
1446 mov $h0,0($ctx) # store hash value
1447 mov $h1,8($ctx)
1448 mov $h2,16($ctx)
1449
1450 - mov 0(%rsp),%r15
1451 + mov 8(%rsp),%r15
1452 .cfi_restore %r15
1453 - mov 8(%rsp),%r14
1454 + mov 16(%rsp),%r14
1455 .cfi_restore %r14
1456 - mov 16(%rsp),%r13
1457 + mov 24(%rsp),%r13
1458 .cfi_restore %r13
1459 - mov 24(%rsp),%r12
1460 + mov 32(%rsp),%r12
1461 .cfi_restore %r12
1462 - mov 32(%rsp),%rbp
1463 -.cfi_restore %rbp
1464 mov 40(%rsp),%rbx
1465 .cfi_restore %rbx
1466 lea 48(%rsp),%rsp
1467 @@ -293,11 +375,11 @@ $code.=<<___;
1468 .Lblocks_epilogue:
1469 ret
1470 .cfi_endproc
1471 -.size poly1305_blocks,.-poly1305_blocks
1472 +___
1473 +&end_function("poly1305_blocks_x86_64");
1474
1475 -.type poly1305_emit,\@function,3
1476 -.align 32
1477 -poly1305_emit:
1478 +&declare_function("poly1305_emit_x86_64", 32, 3);
1479 +$code.=<<___;
1480 .Lemit:
1481 mov 0($ctx),%r8 # load hash value
1482 mov 8($ctx),%r9
1483 @@ -318,10 +400,14 @@ poly1305_emit:
1484 mov %rcx,8($mac)
1485
1486 ret
1487 -.size poly1305_emit,.-poly1305_emit
1488 ___
1489 +&end_function("poly1305_emit_x86_64");
1490 if ($avx) {
1491
1492 +if($kernel) {
1493 + $code .= "#ifdef CONFIG_AS_AVX\n";
1494 +}
1495 +
1496 ########################################################################
1497 # Layout of opaque area is following.
1498 #
1499 @@ -342,15 +428,19 @@ $code.=<<___;
1500 .type __poly1305_block,\@abi-omnipotent
1501 .align 32
1502 __poly1305_block:
1503 + push $ctx
1504 ___
1505 &poly1305_iteration();
1506 $code.=<<___;
1507 + pop $ctx
1508 ret
1509 .size __poly1305_block,.-__poly1305_block
1510
1511 .type __poly1305_init_avx,\@abi-omnipotent
1512 .align 32
1513 __poly1305_init_avx:
1514 + push %rbp
1515 + mov %rsp,%rbp
1516 mov $r0,$h0
1517 mov $r1,$h1
1518 xor $h2,$h2
1519 @@ -507,12 +597,13 @@ __poly1305_init_avx:
1520 mov $d1#d,`16*8+8-64`($ctx)
1521
1522 lea -48-64($ctx),$ctx # size [de-]optimization
1523 + pop %rbp
1524 ret
1525 .size __poly1305_init_avx,.-__poly1305_init_avx
1526 +___
1527
1528 -.type poly1305_blocks_avx,\@function,4
1529 -.align 32
1530 -poly1305_blocks_avx:
1531 +&declare_function("poly1305_blocks_avx", 32, 4);
1532 +$code.=<<___;
1533 .cfi_startproc
1534 mov 20($ctx),%r8d # is_base2_26
1535 cmp \$128,$len
1536 @@ -532,10 +623,11 @@ poly1305_blocks_avx:
1537 test \$31,$len
1538 jz .Leven_avx
1539
1540 - push %rbx
1541 -.cfi_push %rbx
1542 push %rbp
1543 .cfi_push %rbp
1544 + mov %rsp,%rbp
1545 + push %rbx
1546 +.cfi_push %rbx
1547 push %r12
1548 .cfi_push %r12
1549 push %r13
1550 @@ -645,20 +737,18 @@ poly1305_blocks_avx:
1551 mov $h2#d,16($ctx)
1552 .align 16
1553 .Ldone_avx:
1554 - mov 0(%rsp),%r15
1555 + pop %r15
1556 .cfi_restore %r15
1557 - mov 8(%rsp),%r14
1558 + pop %r14
1559 .cfi_restore %r14
1560 - mov 16(%rsp),%r13
1561 + pop %r13
1562 .cfi_restore %r13
1563 - mov 24(%rsp),%r12
1564 + pop %r12
1565 .cfi_restore %r12
1566 - mov 32(%rsp),%rbp
1567 -.cfi_restore %rbp
1568 - mov 40(%rsp),%rbx
1569 + pop %rbx
1570 .cfi_restore %rbx
1571 - lea 48(%rsp),%rsp
1572 -.cfi_adjust_cfa_offset -48
1573 + pop %rbp
1574 +.cfi_restore %rbp
1575 .Lno_data_avx:
1576 .Lblocks_avx_epilogue:
1577 ret
1578 @@ -667,10 +757,11 @@ poly1305_blocks_avx:
1579 .align 32
1580 .Lbase2_64_avx:
1581 .cfi_startproc
1582 - push %rbx
1583 -.cfi_push %rbx
1584 push %rbp
1585 .cfi_push %rbp
1586 + mov %rsp,%rbp
1587 + push %rbx
1588 +.cfi_push %rbx
1589 push %r12
1590 .cfi_push %r12
1591 push %r13
1592 @@ -736,22 +827,18 @@ poly1305_blocks_avx:
1593
1594 .Lproceed_avx:
1595 mov %r15,$len
1596 -
1597 - mov 0(%rsp),%r15
1598 + pop %r15
1599 .cfi_restore %r15
1600 - mov 8(%rsp),%r14
1601 + pop %r14
1602 .cfi_restore %r14
1603 - mov 16(%rsp),%r13
1604 + pop %r13
1605 .cfi_restore %r13
1606 - mov 24(%rsp),%r12
1607 + pop %r12
1608 .cfi_restore %r12
1609 - mov 32(%rsp),%rbp
1610 -.cfi_restore %rbp
1611 - mov 40(%rsp),%rbx
1612 + pop %rbx
1613 .cfi_restore %rbx
1614 - lea 48(%rsp),%rax
1615 - lea 48(%rsp),%rsp
1616 -.cfi_adjust_cfa_offset -48
1617 + pop %rbp
1618 +.cfi_restore %rbp
1619 .Lbase2_64_avx_epilogue:
1620 jmp .Ldo_avx
1621 .cfi_endproc
1622 @@ -768,8 +855,11 @@ poly1305_blocks_avx:
1623 .Ldo_avx:
1624 ___
1625 $code.=<<___ if (!$win64);
1626 + lea 8(%rsp),%r10
1627 +.cfi_def_cfa_register %r10
1628 + and \$-32,%rsp
1629 + sub \$-8,%rsp
1630 lea -0x58(%rsp),%r11
1631 -.cfi_def_cfa %r11,0x60
1632 sub \$0x178,%rsp
1633 ___
1634 $code.=<<___ if ($win64);
1635 @@ -1361,18 +1451,18 @@ $code.=<<___ if ($win64);
1636 .Ldo_avx_epilogue:
1637 ___
1638 $code.=<<___ if (!$win64);
1639 - lea 0x58(%r11),%rsp
1640 -.cfi_def_cfa %rsp,8
1641 + lea -8(%r10),%rsp
1642 +.cfi_def_cfa_register %rsp
1643 ___
1644 $code.=<<___;
1645 vzeroupper
1646 ret
1647 .cfi_endproc
1648 -.size poly1305_blocks_avx,.-poly1305_blocks_avx
1649 +___
1650 +&end_function("poly1305_blocks_avx");
1651
1652 -.type poly1305_emit_avx,\@function,3
1653 -.align 32
1654 -poly1305_emit_avx:
1655 +&declare_function("poly1305_emit_avx", 32, 3);
1656 +$code.=<<___;
1657 cmpl \$0,20($ctx) # is_base2_26?
1658 je .Lemit
1659
1660 @@ -1423,41 +1513,51 @@ poly1305_emit_avx:
1661 mov %rcx,8($mac)
1662
1663 ret
1664 -.size poly1305_emit_avx,.-poly1305_emit_avx
1665 ___
1666 +&end_function("poly1305_emit_avx");
1667 +
1668 +if ($kernel) {
1669 + $code .= "#endif\n";
1670 +}
1671
1672 if ($avx>1) {
1673 +
1674 +if ($kernel) {
1675 + $code .= "#ifdef CONFIG_AS_AVX2\n";
1676 +}
1677 +
1678 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1679 map("%ymm$_",(0..15));
1680 my $S4=$MASK;
1681
1682 +sub poly1305_blocks_avxN {
1683 + my ($avx512) = @_;
1684 + my $suffix = $avx512 ? "_avx512" : "";
1685 $code.=<<___;
1686 -.type poly1305_blocks_avx2,\@function,4
1687 -.align 32
1688 -poly1305_blocks_avx2:
1689 .cfi_startproc
1690 mov 20($ctx),%r8d # is_base2_26
1691 cmp \$128,$len
1692 - jae .Lblocks_avx2
1693 + jae .Lblocks_avx2$suffix
1694 test %r8d,%r8d
1695 jz .Lblocks
1696
1697 -.Lblocks_avx2:
1698 +.Lblocks_avx2$suffix:
1699 and \$-16,$len
1700 - jz .Lno_data_avx2
1701 + jz .Lno_data_avx2$suffix
1702
1703 vzeroupper
1704
1705 test %r8d,%r8d
1706 - jz .Lbase2_64_avx2
1707 + jz .Lbase2_64_avx2$suffix
1708
1709 test \$63,$len
1710 - jz .Leven_avx2
1711 + jz .Leven_avx2$suffix
1712
1713 - push %rbx
1714 -.cfi_push %rbx
1715 push %rbp
1716 .cfi_push %rbp
1717 + mov %rsp,%rbp
1718 + push %rbx
1719 +.cfi_push %rbx
1720 push %r12
1721 .cfi_push %r12
1722 push %r13
1723 @@ -1466,7 +1566,7 @@ poly1305_blocks_avx2:
1724 .cfi_push %r14
1725 push %r15
1726 .cfi_push %r15
1727 -.Lblocks_avx2_body:
1728 +.Lblocks_avx2_body$suffix:
1729
1730 mov $len,%r15 # reassign $len
1731
1732 @@ -1513,7 +1613,7 @@ poly1305_blocks_avx2:
1733 shr \$2,$s1
1734 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1735
1736 -.Lbase2_26_pre_avx2:
1737 +.Lbase2_26_pre_avx2$suffix:
1738 add 0($inp),$h0 # accumulate input
1739 adc 8($inp),$h1
1740 lea 16($inp),$inp
1741 @@ -1524,10 +1624,10 @@ poly1305_blocks_avx2:
1742 mov $r1,%rax
1743
1744 test \$63,%r15
1745 - jnz .Lbase2_26_pre_avx2
1746 + jnz .Lbase2_26_pre_avx2$suffix
1747
1748 test $padbit,$padbit # if $padbit is zero,
1749 - jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
1750 + jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
1751
1752 ################################# base 2^64 -> base 2^26
1753 mov $h0,%rax
1754 @@ -1548,57 +1648,56 @@ poly1305_blocks_avx2:
1755 or $r1,$h2 # h[4]
1756
1757 test %r15,%r15
1758 - jz .Lstore_base2_26_avx2
1759 + jz .Lstore_base2_26_avx2$suffix
1760
1761 vmovd %rax#d,%x#$H0
1762 vmovd %rdx#d,%x#$H1
1763 vmovd $h0#d,%x#$H2
1764 vmovd $h1#d,%x#$H3
1765 vmovd $h2#d,%x#$H4
1766 - jmp .Lproceed_avx2
1767 + jmp .Lproceed_avx2$suffix
1768
1769 .align 32
1770 -.Lstore_base2_64_avx2:
1771 +.Lstore_base2_64_avx2$suffix:
1772 mov $h0,0($ctx)
1773 mov $h1,8($ctx)
1774 mov $h2,16($ctx) # note that is_base2_26 is zeroed
1775 - jmp .Ldone_avx2
1776 + jmp .Ldone_avx2$suffix
1777
1778 .align 16
1779 -.Lstore_base2_26_avx2:
1780 +.Lstore_base2_26_avx2$suffix:
1781 mov %rax#d,0($ctx) # store hash value base 2^26
1782 mov %rdx#d,4($ctx)
1783 mov $h0#d,8($ctx)
1784 mov $h1#d,12($ctx)
1785 mov $h2#d,16($ctx)
1786 .align 16
1787 -.Ldone_avx2:
1788 - mov 0(%rsp),%r15
1789 +.Ldone_avx2$suffix:
1790 + pop %r15
1791 .cfi_restore %r15
1792 - mov 8(%rsp),%r14
1793 + pop %r14
1794 .cfi_restore %r14
1795 - mov 16(%rsp),%r13
1796 + pop %r13
1797 .cfi_restore %r13
1798 - mov 24(%rsp),%r12
1799 + pop %r12
1800 .cfi_restore %r12
1801 - mov 32(%rsp),%rbp
1802 -.cfi_restore %rbp
1803 - mov 40(%rsp),%rbx
1804 + pop %rbx
1805 .cfi_restore %rbx
1806 - lea 48(%rsp),%rsp
1807 -.cfi_adjust_cfa_offset -48
1808 -.Lno_data_avx2:
1809 -.Lblocks_avx2_epilogue:
1810 + pop %rbp
1811 +.cfi_restore %rbp
1812 +.Lno_data_avx2$suffix:
1813 +.Lblocks_avx2_epilogue$suffix:
1814 ret
1815 .cfi_endproc
1816
1817 .align 32
1818 -.Lbase2_64_avx2:
1819 +.Lbase2_64_avx2$suffix:
1820 .cfi_startproc
1821 - push %rbx
1822 -.cfi_push %rbx
1823 push %rbp
1824 .cfi_push %rbp
1825 + mov %rsp,%rbp
1826 + push %rbx
1827 +.cfi_push %rbx
1828 push %r12
1829 .cfi_push %r12
1830 push %r13
1831 @@ -1607,7 +1706,7 @@ poly1305_blocks_avx2:
1832 .cfi_push %r14
1833 push %r15
1834 .cfi_push %r15
1835 -.Lbase2_64_avx2_body:
1836 +.Lbase2_64_avx2_body$suffix:
1837
1838 mov $len,%r15 # reassign $len
1839
1840 @@ -1624,9 +1723,9 @@ poly1305_blocks_avx2:
1841 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1842
1843 test \$63,$len
1844 - jz .Linit_avx2
1845 + jz .Linit_avx2$suffix
1846
1847 -.Lbase2_64_pre_avx2:
1848 +.Lbase2_64_pre_avx2$suffix:
1849 add 0($inp),$h0 # accumulate input
1850 adc 8($inp),$h1
1851 lea 16($inp),$inp
1852 @@ -1637,9 +1736,9 @@ poly1305_blocks_avx2:
1853 mov $r1,%rax
1854
1855 test \$63,%r15
1856 - jnz .Lbase2_64_pre_avx2
1857 + jnz .Lbase2_64_pre_avx2$suffix
1858
1859 -.Linit_avx2:
1860 +.Linit_avx2$suffix:
1861 ################################# base 2^64 -> base 2^26
1862 mov $h0,%rax
1863 mov $h0,%rdx
1864 @@ -1667,69 +1766,77 @@ poly1305_blocks_avx2:
1865
1866 call __poly1305_init_avx
1867
1868 -.Lproceed_avx2:
1869 +.Lproceed_avx2$suffix:
1870 mov %r15,$len # restore $len
1871 - mov OPENSSL_ia32cap_P+8(%rip),%r10d
1872 +___
1873 +$code.=<<___ if (!$kernel);
1874 + mov OPENSSL_ia32cap_P+8(%rip),%r9d
1875 mov \$`(1<<31|1<<30|1<<16)`,%r11d
1876 -
1877 - mov 0(%rsp),%r15
1878 +___
1879 +$code.=<<___;
1880 + pop %r15
1881 .cfi_restore %r15
1882 - mov 8(%rsp),%r14
1883 + pop %r14
1884 .cfi_restore %r14
1885 - mov 16(%rsp),%r13
1886 + pop %r13
1887 .cfi_restore %r13
1888 - mov 24(%rsp),%r12
1889 + pop %r12
1890 .cfi_restore %r12
1891 - mov 32(%rsp),%rbp
1892 -.cfi_restore %rbp
1893 - mov 40(%rsp),%rbx
1894 + pop %rbx
1895 .cfi_restore %rbx
1896 - lea 48(%rsp),%rax
1897 - lea 48(%rsp),%rsp
1898 -.cfi_adjust_cfa_offset -48
1899 -.Lbase2_64_avx2_epilogue:
1900 - jmp .Ldo_avx2
1901 + pop %rbp
1902 +.cfi_restore %rbp
1903 +.Lbase2_64_avx2_epilogue$suffix:
1904 + jmp .Ldo_avx2$suffix
1905 .cfi_endproc
1906
1907 .align 32
1908 -.Leven_avx2:
1909 +.Leven_avx2$suffix:
1910 .cfi_startproc
1911 - mov OPENSSL_ia32cap_P+8(%rip),%r10d
1912 +___
1913 +$code.=<<___ if (!$kernel);
1914 + mov OPENSSL_ia32cap_P+8(%rip),%r9d
1915 +___
1916 +$code.=<<___;
1917 vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
1918 vmovd 4*1($ctx),%x#$H1
1919 vmovd 4*2($ctx),%x#$H2
1920 vmovd 4*3($ctx),%x#$H3
1921 vmovd 4*4($ctx),%x#$H4
1922
1923 -.Ldo_avx2:
1924 +.Ldo_avx2$suffix:
1925 ___
1926 -$code.=<<___ if ($avx>2);
1927 +$code.=<<___ if (!$kernel && $avx>2);
1928 cmp \$512,$len
1929 jb .Lskip_avx512
1930 - and %r11d,%r10d
1931 - test \$`1<<16`,%r10d # check for AVX512F
1932 + and %r11d,%r9d
1933 + test \$`1<<16`,%r9d # check for AVX512F
1934 jnz .Lblocks_avx512
1935 -.Lskip_avx512:
1936 +.Lskip_avx512$suffix:
1937 +___
1938 +$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
1939 + cmp \$512,$len
1940 + jae .Lblocks_avx512
1941 ___
1942 $code.=<<___ if (!$win64);
1943 - lea -8(%rsp),%r11
1944 -.cfi_def_cfa %r11,16
1945 + lea 8(%rsp),%r10
1946 +.cfi_def_cfa_register %r10
1947 sub \$0x128,%rsp
1948 ___
1949 $code.=<<___ if ($win64);
1950 - lea -0xf8(%rsp),%r11
1951 + lea 8(%rsp),%r10
1952 sub \$0x1c8,%rsp
1953 - vmovdqa %xmm6,0x50(%r11)
1954 - vmovdqa %xmm7,0x60(%r11)
1955 - vmovdqa %xmm8,0x70(%r11)
1956 - vmovdqa %xmm9,0x80(%r11)
1957 - vmovdqa %xmm10,0x90(%r11)
1958 - vmovdqa %xmm11,0xa0(%r11)
1959 - vmovdqa %xmm12,0xb0(%r11)
1960 - vmovdqa %xmm13,0xc0(%r11)
1961 - vmovdqa %xmm14,0xd0(%r11)
1962 - vmovdqa %xmm15,0xe0(%r11)
1963 -.Ldo_avx2_body:
1964 + vmovdqa %xmm6,-0xb0(%r10)
1965 + vmovdqa %xmm7,-0xa0(%r10)
1966 + vmovdqa %xmm8,-0x90(%r10)
1967 + vmovdqa %xmm9,-0x80(%r10)
1968 + vmovdqa %xmm10,-0x70(%r10)
1969 + vmovdqa %xmm11,-0x60(%r10)
1970 + vmovdqa %xmm12,-0x50(%r10)
1971 + vmovdqa %xmm13,-0x40(%r10)
1972 + vmovdqa %xmm14,-0x30(%r10)
1973 + vmovdqa %xmm15,-0x20(%r10)
1974 +.Ldo_avx2_body$suffix:
1975 ___
1976 $code.=<<___;
1977 lea .Lconst(%rip),%rcx
1978 @@ -1794,11 +1901,11 @@ $code.=<<___;
1979
1980 vpaddq $H2,$T2,$H2 # accumulate input
1981 sub \$64,$len
1982 - jz .Ltail_avx2
1983 - jmp .Loop_avx2
1984 + jz .Ltail_avx2$suffix
1985 + jmp .Loop_avx2$suffix
1986
1987 .align 32
1988 -.Loop_avx2:
1989 +.Loop_avx2$suffix:
1990 ################################################################
1991 # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1992 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1993 @@ -1946,10 +2053,10 @@ $code.=<<___;
1994 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1995
1996 sub \$64,$len
1997 - jnz .Loop_avx2
1998 + jnz .Loop_avx2$suffix
1999
2000 .byte 0x66,0x90
2001 -.Ltail_avx2:
2002 +.Ltail_avx2$suffix:
2003 ################################################################
2004 # while above multiplications were by r^4 in all lanes, in last
2005 # iteration we multiply least significant lane by r^4 and most
2006 @@ -2087,37 +2194,29 @@ $code.=<<___;
2007 vmovd %x#$H4,`4*4-48-64`($ctx)
2008 ___
2009 $code.=<<___ if ($win64);
2010 - vmovdqa 0x50(%r11),%xmm6
2011 - vmovdqa 0x60(%r11),%xmm7
2012 - vmovdqa 0x70(%r11),%xmm8
2013 - vmovdqa 0x80(%r11),%xmm9
2014 - vmovdqa 0x90(%r11),%xmm10
2015 - vmovdqa 0xa0(%r11),%xmm11
2016 - vmovdqa 0xb0(%r11),%xmm12
2017 - vmovdqa 0xc0(%r11),%xmm13
2018 - vmovdqa 0xd0(%r11),%xmm14
2019 - vmovdqa 0xe0(%r11),%xmm15
2020 - lea 0xf8(%r11),%rsp
2021 -.Ldo_avx2_epilogue:
2022 + vmovdqa -0xb0(%r10),%xmm6
2023 + vmovdqa -0xa0(%r10),%xmm7
2024 + vmovdqa -0x90(%r10),%xmm8
2025 + vmovdqa -0x80(%r10),%xmm9
2026 + vmovdqa -0x70(%r10),%xmm10
2027 + vmovdqa -0x60(%r10),%xmm11
2028 + vmovdqa -0x50(%r10),%xmm12
2029 + vmovdqa -0x40(%r10),%xmm13
2030 + vmovdqa -0x30(%r10),%xmm14
2031 + vmovdqa -0x20(%r10),%xmm15
2032 + lea -8(%r10),%rsp
2033 +.Ldo_avx2_epilogue$suffix:
2034 ___
2035 $code.=<<___ if (!$win64);
2036 - lea 8(%r11),%rsp
2037 -.cfi_def_cfa %rsp,8
2038 + lea -8(%r10),%rsp
2039 +.cfi_def_cfa_register %rsp
2040 ___
2041 $code.=<<___;
2042 vzeroupper
2043 ret
2044 .cfi_endproc
2045 -.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
2046 ___
2047 -#######################################################################
2048 -if ($avx>2) {
2049 -# On entry we have input length divisible by 64. But since inner loop
2050 -# processes 128 bytes per iteration, cases when length is not divisible
2051 -# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
2052 -# reason stack layout is kept identical to poly1305_blocks_avx2. If not
2053 -# for this tail, we wouldn't have to even allocate stack frame...
2054 -
2055 +if($avx > 2 && $avx512) {
2056 my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
2057 my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
2058 my $PADBIT="%zmm30";
2059 @@ -2128,32 +2227,29 @@ map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2060 map(s/%y/%z/,($MASK));
2061
2062 $code.=<<___;
2063 -.type poly1305_blocks_avx512,\@function,4
2064 -.align 32
2065 -poly1305_blocks_avx512:
2066 .cfi_startproc
2067 .Lblocks_avx512:
2068 mov \$15,%eax
2069 kmovw %eax,%k2
2070 ___
2071 $code.=<<___ if (!$win64);
2072 - lea -8(%rsp),%r11
2073 -.cfi_def_cfa %r11,16
2074 + lea 8(%rsp),%r10
2075 +.cfi_def_cfa_register %r10
2076 sub \$0x128,%rsp
2077 ___
2078 $code.=<<___ if ($win64);
2079 - lea -0xf8(%rsp),%r11
2080 + lea 8(%rsp),%r10
2081 sub \$0x1c8,%rsp
2082 - vmovdqa %xmm6,0x50(%r11)
2083 - vmovdqa %xmm7,0x60(%r11)
2084 - vmovdqa %xmm8,0x70(%r11)
2085 - vmovdqa %xmm9,0x80(%r11)
2086 - vmovdqa %xmm10,0x90(%r11)
2087 - vmovdqa %xmm11,0xa0(%r11)
2088 - vmovdqa %xmm12,0xb0(%r11)
2089 - vmovdqa %xmm13,0xc0(%r11)
2090 - vmovdqa %xmm14,0xd0(%r11)
2091 - vmovdqa %xmm15,0xe0(%r11)
2092 + vmovdqa %xmm6,-0xb0(%r10)
2093 + vmovdqa %xmm7,-0xa0(%r10)
2094 + vmovdqa %xmm8,-0x90(%r10)
2095 + vmovdqa %xmm9,-0x80(%r10)
2096 + vmovdqa %xmm10,-0x70(%r10)
2097 + vmovdqa %xmm11,-0x60(%r10)
2098 + vmovdqa %xmm12,-0x50(%r10)
2099 + vmovdqa %xmm13,-0x40(%r10)
2100 + vmovdqa %xmm14,-0x30(%r10)
2101 + vmovdqa %xmm15,-0x20(%r10)
2102 .Ldo_avx512_body:
2103 ___
2104 $code.=<<___;
2105 @@ -2679,7 +2775,7 @@ $code.=<<___;
2106
2107 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
2108 add \$64,$len
2109 - jnz .Ltail_avx2
2110 + jnz .Ltail_avx2$suffix
2111
2112 vpsubq $T2,$H2,$H2 # undo input accumulation
2113 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2114 @@ -2690,29 +2786,61 @@ $code.=<<___;
2115 vzeroall
2116 ___
2117 $code.=<<___ if ($win64);
2118 - movdqa 0x50(%r11),%xmm6
2119 - movdqa 0x60(%r11),%xmm7
2120 - movdqa 0x70(%r11),%xmm8
2121 - movdqa 0x80(%r11),%xmm9
2122 - movdqa 0x90(%r11),%xmm10
2123 - movdqa 0xa0(%r11),%xmm11
2124 - movdqa 0xb0(%r11),%xmm12
2125 - movdqa 0xc0(%r11),%xmm13
2126 - movdqa 0xd0(%r11),%xmm14
2127 - movdqa 0xe0(%r11),%xmm15
2128 - lea 0xf8(%r11),%rsp
2129 + movdqa -0xb0(%r10),%xmm6
2130 + movdqa -0xa0(%r10),%xmm7
2131 + movdqa -0x90(%r10),%xmm8
2132 + movdqa -0x80(%r10),%xmm9
2133 + movdqa -0x70(%r10),%xmm10
2134 + movdqa -0x60(%r10),%xmm11
2135 + movdqa -0x50(%r10),%xmm12
2136 + movdqa -0x40(%r10),%xmm13
2137 + movdqa -0x30(%r10),%xmm14
2138 + movdqa -0x20(%r10),%xmm15
2139 + lea -8(%r10),%rsp
2140 .Ldo_avx512_epilogue:
2141 ___
2142 $code.=<<___ if (!$win64);
2143 - lea 8(%r11),%rsp
2144 -.cfi_def_cfa %rsp,8
2145 + lea -8(%r10),%rsp
2146 +.cfi_def_cfa_register %rsp
2147 ___
2148 $code.=<<___;
2149 ret
2150 .cfi_endproc
2151 -.size poly1305_blocks_avx512,.-poly1305_blocks_avx512
2152 ___
2153 -if ($avx>3) {
2154 +
2155 +}
2156 +
2157 +}
2158 +
2159 +&declare_function("poly1305_blocks_avx2", 32, 4);
2160 +poly1305_blocks_avxN(0);
2161 +&end_function("poly1305_blocks_avx2");
2162 +
2163 +if($kernel) {
2164 + $code .= "#endif\n";
2165 +}
2166 +
2167 +#######################################################################
2168 +if ($avx>2) {
2169 +# On entry we have input length divisible by 64. But since inner loop
2170 +# processes 128 bytes per iteration, cases when length is not divisible
2171 +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
2172 +# reason stack layout is kept identical to poly1305_blocks_avx2. If not
2173 +# for this tail, we wouldn't have to even allocate stack frame...
2174 +
2175 +if($kernel) {
2176 + $code .= "#ifdef CONFIG_AS_AVX512\n";
2177 +}
2178 +
2179 +&declare_function("poly1305_blocks_avx512", 32, 4);
2180 +poly1305_blocks_avxN(1);
2181 +&end_function("poly1305_blocks_avx512");
2182 +
2183 +if ($kernel) {
2184 + $code .= "#endif\n";
2185 +}
2186 +
2187 +if (!$kernel && $avx>3) {
2188 ########################################################################
2189 # VPMADD52 version using 2^44 radix.
2190 #
2191 @@ -3753,45 +3881,9 @@ poly1305_emit_base2_44:
2192 .size poly1305_emit_base2_44,.-poly1305_emit_base2_44
2193 ___
2194 } } }
2195 -$code.=<<___;
2196 -.align 64
2197 -.Lconst:
2198 -.Lmask24:
2199 -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
2200 -.L129:
2201 -.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
2202 -.Lmask26:
2203 -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
2204 -.Lpermd_avx2:
2205 -.long 2,2,2,3,2,0,2,1
2206 -.Lpermd_avx512:
2207 -.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
2208 -
2209 -.L2_44_inp_permd:
2210 -.long 0,1,1,2,2,3,7,7
2211 -.L2_44_inp_shift:
2212 -.quad 0,12,24,64
2213 -.L2_44_mask:
2214 -.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
2215 -.L2_44_shift_rgt:
2216 -.quad 44,44,42,64
2217 -.L2_44_shift_lft:
2218 -.quad 8,8,10,64
2219 -
2220 -.align 64
2221 -.Lx_mask44:
2222 -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
2223 -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
2224 -.Lx_mask42:
2225 -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
2226 -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
2227 -___
2228 }
2229 -$code.=<<___;
2230 -.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2231 -.align 16
2232 -___
2233
2234 +if (!$kernel)
2235 { # chacha20-poly1305 helpers
2236 my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
2237 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
2238 @@ -4038,17 +4130,17 @@ avx_handler:
2239
2240 .section .pdata
2241 .align 4
2242 - .rva .LSEH_begin_poly1305_init
2243 - .rva .LSEH_end_poly1305_init
2244 - .rva .LSEH_info_poly1305_init
2245 -
2246 - .rva .LSEH_begin_poly1305_blocks
2247 - .rva .LSEH_end_poly1305_blocks
2248 - .rva .LSEH_info_poly1305_blocks
2249 -
2250 - .rva .LSEH_begin_poly1305_emit
2251 - .rva .LSEH_end_poly1305_emit
2252 - .rva .LSEH_info_poly1305_emit
2253 + .rva .LSEH_begin_poly1305_init_x86_64
2254 + .rva .LSEH_end_poly1305_init_x86_64
2255 + .rva .LSEH_info_poly1305_init_x86_64
2256 +
2257 + .rva .LSEH_begin_poly1305_blocks_x86_64
2258 + .rva .LSEH_end_poly1305_blocks_x86_64
2259 + .rva .LSEH_info_poly1305_blocks_x86_64
2260 +
2261 + .rva .LSEH_begin_poly1305_emit_x86_64
2262 + .rva .LSEH_end_poly1305_emit_x86_64
2263 + .rva .LSEH_info_poly1305_emit_x86_64
2264 ___
2265 $code.=<<___ if ($avx);
2266 .rva .LSEH_begin_poly1305_blocks_avx
2267 @@ -4088,20 +4180,20 @@ ___
2268 $code.=<<___;
2269 .section .xdata
2270 .align 8
2271 -.LSEH_info_poly1305_init:
2272 +.LSEH_info_poly1305_init_x86_64:
2273 .byte 9,0,0,0
2274 .rva se_handler
2275 - .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
2276 + .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
2277
2278 -.LSEH_info_poly1305_blocks:
2279 +.LSEH_info_poly1305_blocks_x86_64:
2280 .byte 9,0,0,0
2281 .rva se_handler
2282 .rva .Lblocks_body,.Lblocks_epilogue
2283
2284 -.LSEH_info_poly1305_emit:
2285 +.LSEH_info_poly1305_emit_x86_64:
2286 .byte 9,0,0,0
2287 .rva se_handler
2288 - .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
2289 + .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
2290 ___
2291 $code.=<<___ if ($avx);
2292 .LSEH_info_poly1305_blocks_avx_1:
2293 @@ -4148,12 +4240,26 @@ $code.=<<___ if ($avx>2);
2294 ___
2295 }
2296
2297 +open SELF,$0;
2298 +while(<SELF>) {
2299 + next if (/^#!/);
2300 + last if (!s/^#/\/\// and !/^$/);
2301 + print;
2302 +}
2303 +close SELF;
2304 +
2305 foreach (split('\n',$code)) {
2306 s/\`([^\`]*)\`/eval($1)/ge;
2307 s/%r([a-z]+)#d/%e$1/g;
2308 s/%r([0-9]+)#d/%r$1d/g;
2309 s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
2310
2311 + if ($kernel) {
2312 + s/(^\.type.*),[0-9]+$/\1/;
2313 + s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
2314 + next if /^\.cfi.*/;
2315 + }
2316 +
2317 print $_,"\n";
2318 }
2319 close STDOUT;
2320 --- a/arch/x86/crypto/poly1305_glue.c
2321 +++ b/arch/x86/crypto/poly1305_glue.c
2322 @@ -1,8 +1,6 @@
2323 -// SPDX-License-Identifier: GPL-2.0-or-later
2324 +// SPDX-License-Identifier: GPL-2.0 OR MIT
2325 /*
2326 - * Poly1305 authenticator algorithm, RFC7539, SIMD glue code
2327 - *
2328 - * Copyright (C) 2015 Martin Willi
2329 + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
2330 */
2331
2332 #include <crypto/algapi.h>
2333 @@ -13,279 +11,170 @@
2334 #include <linux/jump_label.h>
2335 #include <linux/kernel.h>
2336 #include <linux/module.h>
2337 +#include <asm/intel-family.h>
2338 #include <asm/simd.h>
2339
2340 -asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
2341 - const u32 *r, unsigned int blocks);
2342 -asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r,
2343 - unsigned int blocks, const u32 *u);
2344 -asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
2345 - unsigned int blocks, const u32 *u);
2346 +asmlinkage void poly1305_init_x86_64(void *ctx,
2347 + const u8 key[POLY1305_KEY_SIZE]);
2348 +asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
2349 + const size_t len, const u32 padbit);
2350 +asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
2351 + const u32 nonce[4]);
2352 +asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
2353 + const u32 nonce[4]);
2354 +asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
2355 + const u32 padbit);
2356 +asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
2357 + const u32 padbit);
2358 +asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
2359 + const size_t len, const u32 padbit);
2360
2361 -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd);
2362 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
2363 static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
2364 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
2365
2366 -static inline u64 mlt(u64 a, u64 b)
2367 -{
2368 - return a * b;
2369 -}
2370 -
2371 -static inline u32 sr(u64 v, u_char n)
2372 -{
2373 - return v >> n;
2374 -}
2375 -
2376 -static inline u32 and(u32 v, u32 mask)
2377 -{
2378 - return v & mask;
2379 -}
2380 -
2381 -static void poly1305_simd_mult(u32 *a, const u32 *b)
2382 -{
2383 - u8 m[POLY1305_BLOCK_SIZE];
2384 -
2385 - memset(m, 0, sizeof(m));
2386 - /* The poly1305 block function adds a hi-bit to the accumulator which
2387 - * we don't need for key multiplication; compensate for it. */
2388 - a[4] -= 1 << 24;
2389 - poly1305_block_sse2(a, m, b, 1);
2390 -}
2391 -
2392 -static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key)
2393 -{
2394 - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
2395 - key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff;
2396 - key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03;
2397 - key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff;
2398 - key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff;
2399 - key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
2400 -}
2401 +struct poly1305_arch_internal {
2402 + union {
2403 + struct {
2404 + u32 h[5];
2405 + u32 is_base2_26;
2406 + };
2407 + u64 hs[3];
2408 + };
2409 + u64 r[2];
2410 + u64 pad;
2411 + struct { u32 r2, r1, r4, r3; } rn[9];
2412 +};
2413
2414 -static void poly1305_integer_blocks(struct poly1305_state *state,
2415 - const struct poly1305_key *key,
2416 - const void *src,
2417 - unsigned int nblocks, u32 hibit)
2418 +/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
2419 + * the unfortunate situation of using AVX and then having to go back to scalar
2420 + * -- because the user is silly and has called the update function from two
2421 + * separate contexts -- then we need to convert back to the original base before
2422 + * proceeding. It is possible to reason that the initial reduction below is
2423 + * sufficient given the implementation invariants. However, for an avoidance of
2424 + * doubt and because this is not performance critical, we do the full reduction
2425 + * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
2426 + */
2427 +static void convert_to_base2_64(void *ctx)
2428 {
2429 - u32 r0, r1, r2, r3, r4;
2430 - u32 s1, s2, s3, s4;
2431 - u32 h0, h1, h2, h3, h4;
2432 - u64 d0, d1, d2, d3, d4;
2433 + struct poly1305_arch_internal *state = ctx;
2434 + u32 cy;
2435
2436 - if (!nblocks)
2437 + if (!state->is_base2_26)
2438 return;
2439
2440 - r0 = key->r[0];
2441 - r1 = key->r[1];
2442 - r2 = key->r[2];
2443 - r3 = key->r[3];
2444 - r4 = key->r[4];
2445 -
2446 - s1 = r1 * 5;
2447 - s2 = r2 * 5;
2448 - s3 = r3 * 5;
2449 - s4 = r4 * 5;
2450 -
2451 - h0 = state->h[0];
2452 - h1 = state->h[1];
2453 - h2 = state->h[2];
2454 - h3 = state->h[3];
2455 - h4 = state->h[4];
2456 -
2457 - do {
2458 - /* h += m[i] */
2459 - h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff;
2460 - h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff;
2461 - h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff;
2462 - h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff;
2463 - h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24);
2464 -
2465 - /* h *= r */
2466 - d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
2467 - mlt(h3, s2) + mlt(h4, s1);
2468 - d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
2469 - mlt(h3, s3) + mlt(h4, s2);
2470 - d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
2471 - mlt(h3, s4) + mlt(h4, s3);
2472 - d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
2473 - mlt(h3, r0) + mlt(h4, s4);
2474 - d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
2475 - mlt(h3, r1) + mlt(h4, r0);
2476 -
2477 - /* (partial) h %= p */
2478 - d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff);
2479 - d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff);
2480 - d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff);
2481 - d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff);
2482 - h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
2483 - h1 += h0 >> 26; h0 = h0 & 0x3ffffff;
2484 -
2485 - src += POLY1305_BLOCK_SIZE;
2486 - } while (--nblocks);
2487 -
2488 - state->h[0] = h0;
2489 - state->h[1] = h1;
2490 - state->h[2] = h2;
2491 - state->h[3] = h3;
2492 - state->h[4] = h4;
2493 -}
2494 -
2495 -static void poly1305_integer_emit(const struct poly1305_state *state, void *dst)
2496 -{
2497 - u32 h0, h1, h2, h3, h4;
2498 - u32 g0, g1, g2, g3, g4;
2499 - u32 mask;
2500 -
2501 - /* fully carry h */
2502 - h0 = state->h[0];
2503 - h1 = state->h[1];
2504 - h2 = state->h[2];
2505 - h3 = state->h[3];
2506 - h4 = state->h[4];
2507 -
2508 - h2 += (h1 >> 26); h1 = h1 & 0x3ffffff;
2509 - h3 += (h2 >> 26); h2 = h2 & 0x3ffffff;
2510 - h4 += (h3 >> 26); h3 = h3 & 0x3ffffff;
2511 - h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
2512 - h1 += (h0 >> 26); h0 = h0 & 0x3ffffff;
2513 -
2514 - /* compute h + -p */
2515 - g0 = h0 + 5;
2516 - g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff;
2517 - g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff;
2518 - g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff;
2519 - g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
2520 -
2521 - /* select h if h < p, or h + -p if h >= p */
2522 - mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
2523 - g0 &= mask;
2524 - g1 &= mask;
2525 - g2 &= mask;
2526 - g3 &= mask;
2527 - g4 &= mask;
2528 - mask = ~mask;
2529 - h0 = (h0 & mask) | g0;
2530 - h1 = (h1 & mask) | g1;
2531 - h2 = (h2 & mask) | g2;
2532 - h3 = (h3 & mask) | g3;
2533 - h4 = (h4 & mask) | g4;
2534 -
2535 - /* h = h % (2^128) */
2536 - put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0);
2537 - put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4);
2538 - put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8);
2539 - put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12);
2540 -}
2541 -
2542 -void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
2543 -{
2544 - poly1305_integer_setkey(desc->opaque_r, key);
2545 - desc->s[0] = get_unaligned_le32(key + 16);
2546 - desc->s[1] = get_unaligned_le32(key + 20);
2547 - desc->s[2] = get_unaligned_le32(key + 24);
2548 - desc->s[3] = get_unaligned_le32(key + 28);
2549 - poly1305_core_init(&desc->h);
2550 - desc->buflen = 0;
2551 - desc->sset = true;
2552 - desc->rset = 1;
2553 -}
2554 -EXPORT_SYMBOL_GPL(poly1305_init_arch);
2555 -
2556 -static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
2557 - const u8 *src, unsigned int srclen)
2558 -{
2559 - if (!dctx->sset) {
2560 - if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
2561 - poly1305_integer_setkey(dctx->r, src);
2562 - src += POLY1305_BLOCK_SIZE;
2563 - srclen -= POLY1305_BLOCK_SIZE;
2564 - dctx->rset = 1;
2565 - }
2566 - if (srclen >= POLY1305_BLOCK_SIZE) {
2567 - dctx->s[0] = get_unaligned_le32(src + 0);
2568 - dctx->s[1] = get_unaligned_le32(src + 4);
2569 - dctx->s[2] = get_unaligned_le32(src + 8);
2570 - dctx->s[3] = get_unaligned_le32(src + 12);
2571 - src += POLY1305_BLOCK_SIZE;
2572 - srclen -= POLY1305_BLOCK_SIZE;
2573 - dctx->sset = true;
2574 - }
2575 + cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
2576 + cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
2577 + cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
2578 + cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
2579 + state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
2580 + state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
2581 + state->hs[2] = state->h[4] >> 24;
2582 +#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
2583 + cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
2584 + state->hs[2] &= 3;
2585 + state->hs[0] += cy;
2586 + state->hs[1] += (cy = ULT(state->hs[0], cy));
2587 + state->hs[2] += ULT(state->hs[1], cy);
2588 +#undef ULT
2589 + state->is_base2_26 = 0;
2590 +}
2591 +
2592 +static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE])
2593 +{
2594 + poly1305_init_x86_64(ctx, key);
2595 +}
2596 +
2597 +static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
2598 + const u32 padbit)
2599 +{
2600 + struct poly1305_arch_internal *state = ctx;
2601 +
2602 + /* SIMD disables preemption, so relax after processing each page. */
2603 + BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
2604 + PAGE_SIZE % POLY1305_BLOCK_SIZE);
2605 +
2606 + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
2607 + (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
2608 + !crypto_simd_usable()) {
2609 + convert_to_base2_64(ctx);
2610 + poly1305_blocks_x86_64(ctx, inp, len, padbit);
2611 + return;
2612 }
2613 - return srclen;
2614 -}
2615
2616 -static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx,
2617 - const u8 *src, unsigned int srclen)
2618 -{
2619 - unsigned int datalen;
2620 + for (;;) {
2621 + const size_t bytes = min_t(size_t, len, PAGE_SIZE);
2622
2623 - if (unlikely(!dctx->sset)) {
2624 - datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
2625 - src += srclen - datalen;
2626 - srclen = datalen;
2627 - }
2628 - if (srclen >= POLY1305_BLOCK_SIZE) {
2629 - poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src,
2630 - srclen / POLY1305_BLOCK_SIZE, 1);
2631 - srclen %= POLY1305_BLOCK_SIZE;
2632 + kernel_fpu_begin();
2633 + if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
2634 + poly1305_blocks_avx512(ctx, inp, bytes, padbit);
2635 + else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2))
2636 + poly1305_blocks_avx2(ctx, inp, bytes, padbit);
2637 + else
2638 + poly1305_blocks_avx(ctx, inp, bytes, padbit);
2639 + kernel_fpu_end();
2640 + len -= bytes;
2641 + if (!len)
2642 + break;
2643 + inp += bytes;
2644 }
2645 - return srclen;
2646 }
2647
2648 -static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
2649 - const u8 *src, unsigned int srclen)
2650 -{
2651 - unsigned int blocks, datalen;
2652 +static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
2653 + const u32 nonce[4])
2654 +{
2655 + struct poly1305_arch_internal *state = ctx;
2656 +
2657 + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
2658 + !state->is_base2_26 || !crypto_simd_usable()) {
2659 + convert_to_base2_64(ctx);
2660 + poly1305_emit_x86_64(ctx, mac, nonce);
2661 + } else
2662 + poly1305_emit_avx(ctx, mac, nonce);
2663 +}
2664 +
2665 +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
2666 +{
2667 + poly1305_simd_init(&dctx->h, key);
2668 + dctx->s[0] = get_unaligned_le32(&key[16]);
2669 + dctx->s[1] = get_unaligned_le32(&key[20]);
2670 + dctx->s[2] = get_unaligned_le32(&key[24]);
2671 + dctx->s[3] = get_unaligned_le32(&key[28]);
2672 + dctx->buflen = 0;
2673 + dctx->sset = true;
2674 +}
2675 +EXPORT_SYMBOL(poly1305_init_arch);
2676
2677 +static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
2678 + const u8 *inp, unsigned int len)
2679 +{
2680 + unsigned int acc = 0;
2681 if (unlikely(!dctx->sset)) {
2682 - datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
2683 - src += srclen - datalen;
2684 - srclen = datalen;
2685 - }
2686 -
2687 - if (IS_ENABLED(CONFIG_AS_AVX2) &&
2688 - static_branch_likely(&poly1305_use_avx2) &&
2689 - srclen >= POLY1305_BLOCK_SIZE * 4) {
2690 - if (unlikely(dctx->rset < 4)) {
2691 - if (dctx->rset < 2) {
2692 - dctx->r[1] = dctx->r[0];
2693 - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
2694 - }
2695 - dctx->r[2] = dctx->r[1];
2696 - poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r);
2697 - dctx->r[3] = dctx->r[2];
2698 - poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r);
2699 - dctx->rset = 4;
2700 + if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
2701 + poly1305_simd_init(&dctx->h, inp);
2702 + inp += POLY1305_BLOCK_SIZE;
2703 + len -= POLY1305_BLOCK_SIZE;
2704 + acc += POLY1305_BLOCK_SIZE;
2705 + dctx->rset = 1;
2706 }
2707 - blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
2708 - poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks,
2709 - dctx->r[1].r);
2710 - src += POLY1305_BLOCK_SIZE * 4 * blocks;
2711 - srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
2712 - }
2713 -
2714 - if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
2715 - if (unlikely(dctx->rset < 2)) {
2716 - dctx->r[1] = dctx->r[0];
2717 - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
2718 - dctx->rset = 2;
2719 + if (len >= POLY1305_BLOCK_SIZE) {
2720 + dctx->s[0] = get_unaligned_le32(&inp[0]);
2721 + dctx->s[1] = get_unaligned_le32(&inp[4]);
2722 + dctx->s[2] = get_unaligned_le32(&inp[8]);
2723 + dctx->s[3] = get_unaligned_le32(&inp[12]);
2724 + inp += POLY1305_BLOCK_SIZE;
2725 + len -= POLY1305_BLOCK_SIZE;
2726 + acc += POLY1305_BLOCK_SIZE;
2727 + dctx->sset = true;
2728 }
2729 - blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
2730 - poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r,
2731 - blocks, dctx->r[1].r);
2732 - src += POLY1305_BLOCK_SIZE * 2 * blocks;
2733 - srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
2734 - }
2735 - if (srclen >= POLY1305_BLOCK_SIZE) {
2736 - poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1);
2737 - srclen -= POLY1305_BLOCK_SIZE;
2738 }
2739 - return srclen;
2740 + return acc;
2741 }
2742
2743 void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
2744 unsigned int srclen)
2745 {
2746 - unsigned int bytes;
2747 + unsigned int bytes, used;
2748
2749 if (unlikely(dctx->buflen)) {
2750 bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
2751 @@ -295,31 +184,19 @@ void poly1305_update_arch(struct poly130
2752 dctx->buflen += bytes;
2753
2754 if (dctx->buflen == POLY1305_BLOCK_SIZE) {
2755 - if (static_branch_likely(&poly1305_use_simd) &&
2756 - likely(crypto_simd_usable())) {
2757 - kernel_fpu_begin();
2758 - poly1305_simd_blocks(dctx, dctx->buf,
2759 - POLY1305_BLOCK_SIZE);
2760 - kernel_fpu_end();
2761 - } else {
2762 - poly1305_scalar_blocks(dctx, dctx->buf,
2763 - POLY1305_BLOCK_SIZE);
2764 - }
2765 + if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE)))
2766 + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
2767 dctx->buflen = 0;
2768 }
2769 }
2770
2771 if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
2772 - if (static_branch_likely(&poly1305_use_simd) &&
2773 - likely(crypto_simd_usable())) {
2774 - kernel_fpu_begin();
2775 - bytes = poly1305_simd_blocks(dctx, src, srclen);
2776 - kernel_fpu_end();
2777 - } else {
2778 - bytes = poly1305_scalar_blocks(dctx, src, srclen);
2779 - }
2780 - src += srclen - bytes;
2781 - srclen = bytes;
2782 + bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
2783 + srclen -= bytes;
2784 + used = crypto_poly1305_setdctxkey(dctx, src, bytes);
2785 + if (likely(bytes - used))
2786 + poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1);
2787 + src += bytes;
2788 }
2789
2790 if (unlikely(srclen)) {
2791 @@ -329,31 +206,17 @@ void poly1305_update_arch(struct poly130
2792 }
2793 EXPORT_SYMBOL(poly1305_update_arch);
2794
2795 -void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst)
2796 +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
2797 {
2798 - __le32 digest[4];
2799 - u64 f = 0;
2800 -
2801 - if (unlikely(desc->buflen)) {
2802 - desc->buf[desc->buflen++] = 1;
2803 - memset(desc->buf + desc->buflen, 0,
2804 - POLY1305_BLOCK_SIZE - desc->buflen);
2805 - poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0);
2806 + if (unlikely(dctx->buflen)) {
2807 + dctx->buf[dctx->buflen++] = 1;
2808 + memset(dctx->buf + dctx->buflen, 0,
2809 + POLY1305_BLOCK_SIZE - dctx->buflen);
2810 + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
2811 }
2812
2813 - poly1305_integer_emit(&desc->h, digest);
2814 -
2815 - /* mac = (h + s) % (2^128) */
2816 - f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0];
2817 - put_unaligned_le32(f, dst + 0);
2818 - f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1];
2819 - put_unaligned_le32(f, dst + 4);
2820 - f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2];
2821 - put_unaligned_le32(f, dst + 8);
2822 - f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3];
2823 - put_unaligned_le32(f, dst + 12);
2824 -
2825 - *desc = (struct poly1305_desc_ctx){};
2826 + poly1305_simd_emit(&dctx->h, dst, dctx->s);
2827 + *dctx = (struct poly1305_desc_ctx){};
2828 }
2829 EXPORT_SYMBOL(poly1305_final_arch);
2830
2831 @@ -361,38 +224,34 @@ static int crypto_poly1305_init(struct s
2832 {
2833 struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2834
2835 - poly1305_core_init(&dctx->h);
2836 - dctx->buflen = 0;
2837 - dctx->rset = 0;
2838 - dctx->sset = false;
2839 -
2840 + *dctx = (struct poly1305_desc_ctx){};
2841 return 0;
2842 }
2843
2844 -static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
2845 +static int crypto_poly1305_update(struct shash_desc *desc,
2846 + const u8 *src, unsigned int srclen)
2847 {
2848 struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2849
2850 - if (unlikely(!dctx->sset))
2851 - return -ENOKEY;
2852 -
2853 - poly1305_final_arch(dctx, dst);
2854 + poly1305_update_arch(dctx, src, srclen);
2855 return 0;
2856 }
2857
2858 -static int poly1305_simd_update(struct shash_desc *desc,
2859 - const u8 *src, unsigned int srclen)
2860 +static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
2861 {
2862 struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2863
2864 - poly1305_update_arch(dctx, src, srclen);
2865 + if (unlikely(!dctx->sset))
2866 + return -ENOKEY;
2867 +
2868 + poly1305_final_arch(dctx, dst);
2869 return 0;
2870 }
2871
2872 static struct shash_alg alg = {
2873 .digestsize = POLY1305_DIGEST_SIZE,
2874 .init = crypto_poly1305_init,
2875 - .update = poly1305_simd_update,
2876 + .update = crypto_poly1305_update,
2877 .final = crypto_poly1305_final,
2878 .descsize = sizeof(struct poly1305_desc_ctx),
2879 .base = {
2880 @@ -406,17 +265,19 @@ static struct shash_alg alg = {
2881
2882 static int __init poly1305_simd_mod_init(void)
2883 {
2884 - if (!boot_cpu_has(X86_FEATURE_XMM2))
2885 - return 0;
2886 -
2887 - static_branch_enable(&poly1305_use_simd);
2888 -
2889 - if (IS_ENABLED(CONFIG_AS_AVX2) &&
2890 - boot_cpu_has(X86_FEATURE_AVX) &&
2891 + if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) &&
2892 + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
2893 + static_branch_enable(&poly1305_use_avx);
2894 + if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
2895 boot_cpu_has(X86_FEATURE_AVX2) &&
2896 cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
2897 static_branch_enable(&poly1305_use_avx2);
2898 -
2899 + if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) &&
2900 + boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
2901 + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
2902 + /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
2903 + boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X)
2904 + static_branch_enable(&poly1305_use_avx512);
2905 return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
2906 }
2907
2908 @@ -430,7 +291,7 @@ module_init(poly1305_simd_mod_init);
2909 module_exit(poly1305_simd_mod_exit);
2910
2911 MODULE_LICENSE("GPL");
2912 -MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
2913 +MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
2914 MODULE_DESCRIPTION("Poly1305 authenticator");
2915 MODULE_ALIAS_CRYPTO("poly1305");
2916 MODULE_ALIAS_CRYPTO("poly1305-simd");
2917 --- a/lib/crypto/Kconfig
2918 +++ b/lib/crypto/Kconfig
2919 @@ -90,7 +90,7 @@ config CRYPTO_LIB_DES
2920 config CRYPTO_LIB_POLY1305_RSIZE
2921 int
2922 default 2 if MIPS
2923 - default 4 if X86_64
2924 + default 11 if X86_64
2925 default 9 if ARM || ARM64
2926 default 1
2927