1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3 Date: Sun, 5 Jan 2020 22:40:48 -0500
4 Subject: [PATCH] crypto: x86/poly1305 - wire up faster implementations for
7 commit d7d7b853566254648df59f7ea27ea05952a6cfa8 upstream.
9 These x86_64 vectorized implementations support AVX, AVX-2, and AVX512F.
10 The AVX-512F implementation is disabled on Skylake, due to throttling,
11 but it is quite fast on >= Cannonlake.
13 On the left are cycle counts on a Core i7 6700HQ using the AVX-2
14 codepath, comparing this implementation ("new") to the implementation in
15 the current crypto api ("old"). On the right are benchmarks on a Xeon
16 Gold 5120 using the AVX-512 codepath. The new implementation is faster
22 size old new size old new
23 ---- ---- ---- ---- ---- ----
31 112 342 192 112 342 194
32 128 388 212 128 384 212
33 144 428 228 144 420 226
34 160 466 246 160 464 248
35 176 510 264 176 504 264
36 192 550 282 192 544 282
37 208 594 302 208 582 300
38 224 628 316 224 624 318
39 240 676 334 240 662 338
40 256 716 354 256 708 358
41 272 764 374 272 748 372
42 288 802 352 288 788 358
43 304 420 366 304 422 370
44 320 428 360 320 432 364
45 336 484 378 336 486 380
46 352 426 384 352 434 390
47 368 478 400 368 480 408
48 384 488 394 384 490 398
49 400 542 408 400 542 412
50 416 486 416 416 492 426
51 432 534 430 432 538 436
52 448 544 422 448 546 432
53 464 600 438 464 600 448
54 480 540 448 480 548 456
55 496 594 464 496 594 476
56 512 602 456 512 606 470
57 528 656 476 528 656 480
58 544 600 480 544 606 498
59 560 650 494 560 652 512
60 576 664 490 576 662 508
61 592 714 508 592 716 522
62 608 656 514 608 664 538
63 624 708 532 624 710 552
64 640 716 524 640 720 516
65 656 770 536 656 772 526
66 672 716 548 672 722 544
67 688 770 562 688 768 556
68 704 774 552 704 778 556
69 720 826 568 720 832 568
70 736 768 574 736 780 584
71 752 822 592 752 826 600
72 768 830 584 768 836 560
73 784 884 602 784 888 572
74 800 828 610 800 838 588
75 816 884 628 816 884 604
76 832 888 618 832 894 598
77 848 942 632 848 946 612
78 864 884 644 864 896 628
79 880 936 660 880 942 644
80 896 948 652 896 952 608
81 912 1000 664 912 1004 616
82 928 942 676 928 954 634
83 944 994 690 944 1000 646
84 960 1002 680 960 1008 646
85 976 1054 694 976 1062 658
86 992 1002 706 992 1012 674
87 1008 1052 720 1008 1058 690
89 This commit wires in the prior implementation from Andy, and makes the
90 following changes so that it is suitable for kernel land.
92 - Some cosmetic and structural changes, like renaming labels to
93 .Lname, constants, and other Linux conventions, as well as making
94 the code easy for us to maintain moving forward.
96 - CPU feature checking is done in C by the glue code.
98 - We avoid jumping into the middle of functions, to appease objtool,
99 and instead parameterize shared code.
101 - We maintain frame pointers so that stack traces make sense.
103 - We remove the dependency on the perl xlate code, which transforms
104 the output into forms used by assemblers we don't care about.
106 Importantly, none of our changes affect the arithmetic or core code, but
107 just involve the differing environment of kernel space.
109 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
110 Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
111 Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
112 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
113 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
115 arch/x86/crypto/.gitignore | 1 +
116 arch/x86/crypto/Makefile | 11 +-
117 arch/x86/crypto/poly1305-avx2-x86_64.S | 390 ----------
118 arch/x86/crypto/poly1305-sse2-x86_64.S | 590 ---------------
119 arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 682 ++++++++++--------
120 arch/x86/crypto/poly1305_glue.c | 473 +++++-------
121 lib/crypto/Kconfig | 2 +-
122 7 files changed, 572 insertions(+), 1577 deletions(-)
123 create mode 100644 arch/x86/crypto/.gitignore
124 delete mode 100644 arch/x86/crypto/poly1305-avx2-x86_64.S
125 delete mode 100644 arch/x86/crypto/poly1305-sse2-x86_64.S
128 +++ b/arch/x86/crypto/.gitignore
131 --- a/arch/x86/crypto/Makefile
132 +++ b/arch/x86/crypto/Makefile
133 @@ -73,6 +73,10 @@ aegis128-aesni-y := aegis128-aesni-asm.o
135 nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
136 blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
137 +poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
138 +ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
139 +targets += poly1305-x86_64-cryptogams.S
142 ifeq ($(avx_supported),yes)
143 camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
144 @@ -101,10 +105,8 @@ aesni-intel-y := aesni-intel_asm.o aesni
145 aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
146 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
147 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
148 -poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
149 ifeq ($(avx2_supported),yes)
150 sha1-ssse3-y += sha1_avx2_x86_64_asm.o
151 -poly1305-x86_64-y += poly1305-avx2-x86_64.o
153 ifeq ($(sha1_ni_supported),yes)
154 sha1-ssse3-y += sha1_ni_asm.o
155 @@ -118,3 +120,8 @@ sha256-ssse3-y += sha256_ni_asm.o
157 sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
158 crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
160 +quiet_cmd_perlasm = PERLASM $@
161 + cmd_perlasm = $(PERL) $< > $@
162 +$(obj)/%.S: $(src)/%.pl FORCE
163 + $(call if_changed,perlasm)
164 --- a/arch/x86/crypto/poly1305-avx2-x86_64.S
167 -/* SPDX-License-Identifier: GPL-2.0-or-later */
169 - * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
171 - * Copyright (C) 2015 Martin Willi
174 -#include <linux/linkage.h>
176 -.section .rodata.cst32.ANMASK, "aM", @progbits, 32
178 -ANMASK: .octa 0x0000000003ffffff0000000003ffffff
179 - .octa 0x0000000003ffffff0000000003ffffff
181 -.section .rodata.cst32.ORMASK, "aM", @progbits, 32
183 -ORMASK: .octa 0x00000000010000000000000001000000
184 - .octa 0x00000000010000000000000001000000
188 -#define h0 0x00(%rdi)
189 -#define h1 0x04(%rdi)
190 -#define h2 0x08(%rdi)
191 -#define h3 0x0c(%rdi)
192 -#define h4 0x10(%rdi)
193 -#define r0 0x00(%rdx)
194 -#define r1 0x04(%rdx)
195 -#define r2 0x08(%rdx)
196 -#define r3 0x0c(%rdx)
197 -#define r4 0x10(%rdx)
198 -#define u0 0x00(%r8)
199 -#define u1 0x04(%r8)
200 -#define u2 0x08(%r8)
201 -#define u3 0x0c(%r8)
202 -#define u4 0x10(%r8)
203 -#define w0 0x18(%r8)
204 -#define w1 0x1c(%r8)
205 -#define w2 0x20(%r8)
206 -#define w3 0x24(%r8)
207 -#define w4 0x28(%r8)
208 -#define y0 0x30(%r8)
209 -#define y1 0x34(%r8)
210 -#define y2 0x38(%r8)
211 -#define y3 0x3c(%r8)
212 -#define y4 0x40(%r8)
231 -#define ruwy3 %ymm10
232 -#define ruwy4 %ymm11
233 -#define ruwy0x %xmm7
234 -#define ruwy1x %xmm8
235 -#define ruwy2x %xmm9
236 -#define ruwy3x %xmm10
237 -#define ruwy4x %xmm11
238 -#define svxz1 %ymm12
239 -#define svxz2 %ymm13
240 -#define svxz3 %ymm14
241 -#define svxz4 %ymm15
248 -ENTRY(poly1305_4block_avx2)
249 - # %rdi: Accumulator h[5]
250 - # %rsi: 64 byte input block m
251 - # %rdx: Poly1305 key r[5]
252 - # %rcx: Quadblock count
253 - # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5],
255 - # This four-block variant uses loop unrolled block processing. It
256 - # requires 4 Poly1305 keys: r, r^2, r^3 and r^4:
257 - # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
264 - # combine r0,u0,w0,y0
267 - vpunpcklqdq t1,ruwy0,ruwy0
270 - vpunpcklqdq t2,t1,t1
271 - vperm2i128 $0x20,t1,ruwy0,ruwy0
273 - # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5
276 - vpunpcklqdq t1,ruwy1,ruwy1
279 - vpunpcklqdq t2,t1,t1
280 - vperm2i128 $0x20,t1,ruwy1,ruwy1
281 - vpslld $2,ruwy1,svxz1
282 - vpaddd ruwy1,svxz1,svxz1
284 - # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5
287 - vpunpcklqdq t1,ruwy2,ruwy2
290 - vpunpcklqdq t2,t1,t1
291 - vperm2i128 $0x20,t1,ruwy2,ruwy2
292 - vpslld $2,ruwy2,svxz2
293 - vpaddd ruwy2,svxz2,svxz2
295 - # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5
298 - vpunpcklqdq t1,ruwy3,ruwy3
301 - vpunpcklqdq t2,t1,t1
302 - vperm2i128 $0x20,t1,ruwy3,ruwy3
303 - vpslld $2,ruwy3,svxz3
304 - vpaddd ruwy3,svxz3,svxz3
306 - # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5
309 - vpunpcklqdq t1,ruwy4,ruwy4
312 - vpunpcklqdq t2,t1,t1
313 - vperm2i128 $0x20,t1,ruwy4,ruwy4
314 - vpslld $2,ruwy4,svxz4
315 - vpaddd ruwy4,svxz4,svxz4
318 - # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff,
319 - # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0]
322 - vpunpcklqdq t1,hc0,hc0
325 - vpunpcklqdq t2,t1,t1
326 - vperm2i128 $0x20,t1,hc0,hc0
327 - vpand ANMASK(%rip),hc0,hc0
330 - # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff,
331 - # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1]
334 - vpunpcklqdq t1,hc1,hc1
337 - vpunpcklqdq t2,t1,t1
338 - vperm2i128 $0x20,t1,hc1,hc1
340 - vpand ANMASK(%rip),hc1,hc1
343 - # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff,
344 - # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2]
347 - vpunpcklqdq t1,hc2,hc2
350 - vpunpcklqdq t2,t1,t1
351 - vperm2i128 $0x20,t1,hc2,hc2
353 - vpand ANMASK(%rip),hc2,hc2
356 - # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff,
357 - # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3]
360 - vpunpcklqdq t1,hc3,hc3
363 - vpunpcklqdq t2,t1,t1
364 - vperm2i128 $0x20,t1,hc3,hc3
366 - vpand ANMASK(%rip),hc3,hc3
369 - # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24),
370 - # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4]
373 - vpunpcklqdq t1,hc4,hc4
376 - vpunpcklqdq t2,t1,t1
377 - vperm2i128 $0x20,t1,hc4,hc4
379 - vpor ORMASK(%rip),hc4,hc4
383 - # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ]
384 - vpmuludq hc0,ruwy0,t1
385 - # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ]
386 - vpmuludq hc1,svxz4,t2
388 - # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ]
389 - vpmuludq hc2,svxz3,t2
391 - # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ]
392 - vpmuludq hc3,svxz2,t2
394 - # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ]
395 - vpmuludq hc4,svxz1,t2
397 - # d0 = t1[0] + t1[1] + t[2] + t[3]
404 - # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ]
405 - vpmuludq hc0,ruwy1,t1
406 - # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ]
407 - vpmuludq hc1,ruwy0,t2
409 - # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ]
410 - vpmuludq hc2,svxz4,t2
412 - # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ]
413 - vpmuludq hc3,svxz3,t2
415 - # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ]
416 - vpmuludq hc4,svxz2,t2
418 - # d1 = t1[0] + t1[1] + t1[3] + t1[4]
425 - # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ]
426 - vpmuludq hc0,ruwy2,t1
427 - # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ]
428 - vpmuludq hc1,ruwy1,t2
430 - # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ]
431 - vpmuludq hc2,ruwy0,t2
433 - # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ]
434 - vpmuludq hc3,svxz4,t2
436 - # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ]
437 - vpmuludq hc4,svxz3,t2
439 - # d2 = t1[0] + t1[1] + t1[2] + t1[3]
446 - # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ]
447 - vpmuludq hc0,ruwy3,t1
448 - # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ]
449 - vpmuludq hc1,ruwy2,t2
451 - # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ]
452 - vpmuludq hc2,ruwy1,t2
454 - # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ]
455 - vpmuludq hc3,ruwy0,t2
457 - # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ]
458 - vpmuludq hc4,svxz4,t2
460 - # d3 = t1[0] + t1[1] + t1[2] + t1[3]
467 - # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ]
468 - vpmuludq hc0,ruwy4,t1
469 - # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ]
470 - vpmuludq hc1,ruwy3,t2
472 - # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ]
473 - vpmuludq hc2,ruwy2,t2
475 - # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ]
476 - vpmuludq hc3,ruwy1,t2
478 - # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ]
479 - vpmuludq hc4,ruwy0,t2
481 - # d4 = t1[0] + t1[1] + t1[2] + t1[3]
488 - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
489 - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
490 - # amount. Careful: we must not assume the carry bits 'd0 >> 26',
491 - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
492 - # integers. It's true in a single-block implementation, but not here.
498 - # h0 = d0 & 0x3ffffff
500 - and $0x3ffffff,%ebx
506 - # h1 = d1 & 0x3ffffff
508 - and $0x3ffffff,%eax
515 - # h2 = d2 & 0x3ffffff
517 - and $0x3ffffff,%eax
524 - # h3 = d3 & 0x3ffffff
526 - and $0x3ffffff,%eax
529 - # h0 += (d4 >> 26) * 5
532 - lea (%rax,%rax,4),%rax
534 - # h4 = d4 & 0x3ffffff
536 - and $0x3ffffff,%eax
543 - # h0 = h0 & 0x3ffffff
544 - andl $0x3ffffff,%ebx
556 -ENDPROC(poly1305_4block_avx2)
557 --- a/arch/x86/crypto/poly1305-sse2-x86_64.S
560 -/* SPDX-License-Identifier: GPL-2.0-or-later */
562 - * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
564 - * Copyright (C) 2015 Martin Willi
567 -#include <linux/linkage.h>
569 -.section .rodata.cst16.ANMASK, "aM", @progbits, 16
571 -ANMASK: .octa 0x0000000003ffffff0000000003ffffff
573 -.section .rodata.cst16.ORMASK, "aM", @progbits, 16
575 -ORMASK: .octa 0x00000000010000000000000001000000
579 -#define h0 0x00(%rdi)
580 -#define h1 0x04(%rdi)
581 -#define h2 0x08(%rdi)
582 -#define h3 0x0c(%rdi)
583 -#define h4 0x10(%rdi)
584 -#define r0 0x00(%rdx)
585 -#define r1 0x04(%rdx)
586 -#define r2 0x08(%rdx)
587 -#define r3 0x0c(%rdx)
588 -#define r4 0x10(%rdx)
589 -#define s1 0x00(%rsp)
590 -#define s2 0x04(%rsp)
591 -#define s3 0x08(%rsp)
592 -#define s4 0x0c(%rsp)
608 -ENTRY(poly1305_block_sse2)
609 - # %rdi: Accumulator h[5]
610 - # %rsi: 16 byte input block m
611 - # %rdx: Poly1305 key r[5]
612 - # %rcx: Block count
614 - # This single block variant tries to improve performance by doing two
615 - # multiplications in parallel using SSE instructions. There is quite
616 - # some quardword packing involved, hence the speedup is marginal.
622 - # s1..s4 = r1..r4 * 5
624 - lea (%eax,%eax,4),%eax
627 - lea (%eax,%eax,4),%eax
630 - lea (%eax,%eax,4),%eax
633 - lea (%eax,%eax,4),%eax
636 - movdqa ANMASK(%rip),mask
639 - # h01 = [0, h1, 0, h0]
640 - # h23 = [0, h3, 0, h2]
641 - # h44 = [0, h4, 0, h4]
651 - # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
658 - # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
666 - # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
669 - or $0x01000000,%eax
674 - # t1[0] = h0 * r0 + h2 * s3
675 - # t1[1] = h1 * s4 + h3 * s2
685 - # t2[0] = h0 * r1 + h2 * s4
686 - # t2[1] = h1 * r0 + h3 * s3
702 - # d0 = t1[0] + t1[1] + t3[0]
703 - # d1 = t2[0] + t2[1] + t3[1]
713 - # t1[0] = h0 * r2 + h2 * r0
714 - # t1[1] = h1 * r1 + h3 * s4
724 - # t2[0] = h0 * r3 + h2 * r1
725 - # t2[1] = h1 * r2 + h3 * r0
741 - # d2 = t1[0] + t1[1] + t3[0]
742 - # d3 = t2[0] + t2[1] + t3[1]
752 - # t1[0] = h0 * r4 + h2 * r2
753 - # t1[1] = h1 * r3 + h3 * r1
766 - # d4 = t1[0] + t1[1] + t3[0]
777 - # h0 = d0 & 0x3ffffff
779 - and $0x3ffffff,%ebx
785 - # h1 = d1 & 0x3ffffff
787 - and $0x3ffffff,%eax
794 - # h2 = d2 & 0x3ffffff
796 - and $0x3ffffff,%eax
803 - # h3 = d3 & 0x3ffffff
805 - and $0x3ffffff,%eax
808 - # h0 += (d4 >> 26) * 5
811 - lea (%rax,%rax,4),%rax
813 - # h4 = d4 & 0x3ffffff
815 - and $0x3ffffff,%eax
822 - # h0 = h0 & 0x3ffffff
823 - andl $0x3ffffff,%ebx
830 - # Zeroing of key material
831 - mov %rcx,0x00(%rsp)
832 - mov %rcx,0x08(%rsp)
838 -ENDPROC(poly1305_block_sse2)
841 -#define u0 0x00(%r8)
842 -#define u1 0x04(%r8)
843 -#define u2 0x08(%r8)
844 -#define u3 0x0c(%r8)
845 -#define u4 0x10(%r8)
863 -ENTRY(poly1305_2block_sse2)
864 - # %rdi: Accumulator h[5]
865 - # %rsi: 16 byte input block m
866 - # %rdx: Poly1305 key r[5]
867 - # %rcx: Doubleblock count
868 - # %r8: Poly1305 derived key r^2 u[5]
870 - # This two-block variant further improves performance by using loop
871 - # unrolled block processing. This is more straight forward and does
872 - # less byte shuffling, but requires a second Poly1305 key r^2:
873 - # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r
884 - # combine r1,u1 and s1=r1*5,v1=u1*5
892 - # combine r2,u2 and s2=r2*5,v2=u2*5
900 - # combine r3,u3 and s3=r3*5,v3=u3*5
908 - # combine r4,u4 and s4=r4*5,v4=u4*5
917 - # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
921 - pand ANMASK(%rip),hc0
924 - # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
929 - pand ANMASK(%rip),hc1
932 - # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
937 - pand ANMASK(%rip),hc2
940 - # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
945 - pand ANMASK(%rip),hc3
948 - # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
953 - por ORMASK(%rip),hc4
957 - # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
960 - # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
964 - # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
968 - # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
972 - # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
976 - # d0 = t1[0] + t1[1]
982 - # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
985 - # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
989 - # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
993 - # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
997 - # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
1001 - # d1 = t1[0] + t1[1]
1007 - # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
1010 - # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
1014 - # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
1018 - # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
1022 - # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
1026 - # d2 = t1[0] + t1[1]
1032 - # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
1035 - # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
1039 - # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
1043 - # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
1047 - # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
1051 - # d3 = t1[0] + t1[1]
1057 - # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
1060 - # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
1064 - # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
1068 - # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
1072 - # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
1076 - # d4 = t1[0] + t1[1]
1082 - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
1083 - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
1084 - # amount. Careful: we must not assume the carry bits 'd0 >> 26',
1085 - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
1086 - # integers. It's true in a single-block implementation, but not here.
1092 - # h0 = d0 & 0x3ffffff
1094 - and $0x3ffffff,%ebx
1100 - # h1 = d1 & 0x3ffffff
1102 - and $0x3ffffff,%eax
1109 - # h2 = d2 & 0x3ffffff
1111 - and $0x3ffffff,%eax
1118 - # h3 = d3 & 0x3ffffff
1120 - and $0x3ffffff,%eax
1123 - # h0 += (d4 >> 26) * 5
1126 - lea (%rax,%rax,4),%rax
1128 - # h4 = d4 & 0x3ffffff
1130 - and $0x3ffffff,%eax
1137 - # h0 = h0 & 0x3ffffff
1138 - andl $0x3ffffff,%ebx
1149 -ENDPROC(poly1305_2block_sse2)
1150 --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
1151 +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
1153 -#! /usr/bin/env perl
1154 -# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
1155 +#!/usr/bin/env perl
1156 +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
1158 -# Licensed under the OpenSSL license (the "License"). You may not use
1159 -# this file except in compliance with the License. You can obtain a copy
1160 -# in the file LICENSE in the source distribution or at
1161 -# https://www.openssl.org/source/license.html
1163 +# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
1164 +# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
1165 +# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
1167 +# This code is taken from the OpenSSL project but the author, Andy Polyakov,
1168 +# has relicensed it under the licenses specified in the SPDX header above.
1169 +# The original headers, including the original license headers, are
1170 +# included below for completeness.
1172 # ====================================================================
1173 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
1175 # Skylake-X system performance. Since we are likely to suppress
1176 # AVX512F capability flag [at least on Skylake-X], conversion serves
1177 # as kind of "investment protection". Note that next *lake processor,
1178 -# Cannolake, has AVX512IFMA code path to execute...
1179 +# Cannonlake, has AVX512IFMA code path to execute...
1181 # Numbers are cycles per processed byte with poly1305_blocks alone,
1182 # measured with rdtsc at fixed clock frequency.
1183 @@ -68,39 +71,114 @@ $output = shift;
1184 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
1186 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
1187 +$kernel=0; $kernel=1 if (!$flavour && !$output);
1189 -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
1190 -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
1191 -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
1192 -die "can't locate x86_64-xlate.pl";
1194 -if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
1195 - =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
1196 - $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
1198 + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
1199 + ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
1200 + ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
1201 + die "can't locate x86_64-xlate.pl";
1203 + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
1206 + if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
1207 + =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
1208 + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
1211 + if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
1212 + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
1213 + $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
1214 + $avx += 1 if ($1==2.11 && $2>=8);
1217 + if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
1218 + `ml64 2>&1` =~ /Version ([0-9]+)\./) {
1219 + $avx = ($1>=10) + ($1>=11);
1222 + if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
1223 + $avx = ($2>=3.0) + ($2>3.0);
1226 + $avx = 4; # The kernel uses ifdefs for this.
1229 -if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
1230 - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
1231 - $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
1232 - $avx += 2 if ($1==2.11 && $2>=8);
1233 +sub declare_function() {
1234 + my ($name, $align, $nargs) = @_;
1236 + $code .= ".align $align\n";
1237 + $code .= "ENTRY($name)\n";
1238 + $code .= ".L$name:\n";
1240 + $code .= ".globl $name\n";
1241 + $code .= ".type $name,\@function,$nargs\n";
1242 + $code .= ".align $align\n";
1243 + $code .= "$name:\n";
1247 -if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
1248 - `ml64 2>&1` =~ /Version ([0-9]+)\./) {
1249 - $avx = ($1>=10) + ($1>=12);
1250 +sub end_function() {
1253 + $code .= "ENDPROC($name)\n";
1255 + $code .= ".size $name,.-$name\n";
1259 -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
1260 - $avx = ($2>=3.0) + ($2>3.0);
1262 +$code.=<<___ if $kernel;
1263 +#include <linux/linkage.h>
1266 -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
1269 +$code.=<<___ if $kernel;
1276 +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
1278 +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
1280 +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
1282 +.long 2,2,2,3,2,0,2,1
1284 +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
1287 +.long 0,1,1,2,2,3,7,7
1291 +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
1299 +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
1300 +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
1302 +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
1303 +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
1306 +$code.=<<___ if (!$kernel);
1307 +.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1311 my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
1312 my ($mac,$nonce)=($inp,$len); # *_emit arguments
1313 -my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
1314 -my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
1315 +my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
1316 +my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
1318 sub poly1305_iteration {
1319 # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
1320 @@ -155,19 +233,19 @@ ___
1326 +$code.=<<___ if (!$kernel);
1327 .extern OPENSSL_ia32cap_P
1329 -.globl poly1305_init
1330 -.hidden poly1305_init
1331 -.globl poly1305_blocks
1332 -.hidden poly1305_blocks
1333 -.globl poly1305_emit
1334 -.hidden poly1305_emit
1336 -.type poly1305_init,\@function,3
1339 +.globl poly1305_init_x86_64
1340 +.hidden poly1305_init_x86_64
1341 +.globl poly1305_blocks_x86_64
1342 +.hidden poly1305_blocks_x86_64
1343 +.globl poly1305_emit_x86_64
1344 +.hidden poly1305_emit_x86_64
1346 +&declare_function("poly1305_init_x86_64", 32, 3);
1349 mov %rax,0($ctx) # initialize hash value
1351 @@ -175,11 +253,12 @@ poly1305_init:
1356 - lea poly1305_blocks(%rip),%r10
1357 - lea poly1305_emit(%rip),%r11
1359 -$code.=<<___ if ($avx);
1360 +$code.=<<___ if (!$kernel);
1361 + lea poly1305_blocks_x86_64(%rip),%r10
1362 + lea poly1305_emit_x86_64(%rip),%r11
1364 +$code.=<<___ if (!$kernel && $avx);
1365 mov OPENSSL_ia32cap_P+4(%rip),%r9
1366 lea poly1305_blocks_avx(%rip),%rax
1367 lea poly1305_emit_avx(%rip),%rcx
1368 @@ -187,12 +266,12 @@ $code.=<<___ if ($avx);
1372 -$code.=<<___ if ($avx>1);
1373 +$code.=<<___ if (!$kernel && $avx>1);
1374 lea poly1305_blocks_avx2(%rip),%rax
1375 bt \$`5+32`,%r9 # AVX2?
1378 -$code.=<<___ if ($avx>3);
1379 +$code.=<<___ if (!$kernel && $avx>3);
1380 mov \$`(1<<31|1<<21|1<<16)`,%rax
1383 @@ -207,11 +286,11 @@ $code.=<<___;
1387 -$code.=<<___ if ($flavour !~ /elf32/);
1388 +$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
1392 -$code.=<<___ if ($flavour =~ /elf32/);
1393 +$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
1397 @@ -219,11 +298,11 @@ $code.=<<___;
1401 -.size poly1305_init,.-poly1305_init
1403 +&end_function("poly1305_init_x86_64");
1405 -.type poly1305_blocks,\@function,4
1408 +&declare_function("poly1305_blocks_x86_64", 32, 4);
1413 @@ -231,8 +310,6 @@ poly1305_blocks:
1422 @@ -241,6 +318,8 @@ poly1305_blocks:
1430 mov $len,%r15 # reassign $len
1431 @@ -265,26 +344,29 @@ poly1305_blocks:
1436 &poly1305_iteration();
1446 mov $h0,0($ctx) # store hash value
1467 @@ -293,11 +375,11 @@ $code.=<<___;
1471 -.size poly1305_blocks,.-poly1305_blocks
1473 +&end_function("poly1305_blocks_x86_64");
1475 -.type poly1305_emit,\@function,3
1478 +&declare_function("poly1305_emit_x86_64", 32, 3);
1481 mov 0($ctx),%r8 # load hash value
1483 @@ -318,10 +400,14 @@ poly1305_emit:
1487 -.size poly1305_emit,.-poly1305_emit
1489 +&end_function("poly1305_emit_x86_64");
1493 + $code .= "#ifdef CONFIG_AS_AVX\n";
1496 ########################################################################
1497 # Layout of opaque area is following.
1499 @@ -342,15 +428,19 @@ $code.=<<___;
1500 .type __poly1305_block,\@abi-omnipotent
1505 &poly1305_iteration();
1509 .size __poly1305_block,.-__poly1305_block
1511 .type __poly1305_init_avx,\@abi-omnipotent
1513 __poly1305_init_avx:
1519 @@ -507,12 +597,13 @@ __poly1305_init_avx:
1520 mov $d1#d,`16*8+8-64`($ctx)
1522 lea -48-64($ctx),$ctx # size [de-]optimization
1525 .size __poly1305_init_avx,.-__poly1305_init_avx
1528 -.type poly1305_blocks_avx,\@function,4
1530 -poly1305_blocks_avx:
1531 +&declare_function("poly1305_blocks_avx", 32, 4);
1534 mov 20($ctx),%r8d # is_base2_26
1536 @@ -532,10 +623,11 @@ poly1305_blocks_avx:
1550 @@ -645,20 +737,18 @@ poly1305_blocks_avx:
1572 -.cfi_adjust_cfa_offset -48
1576 .Lblocks_avx_epilogue:
1578 @@ -667,10 +757,11 @@ poly1305_blocks_avx:
1592 @@ -736,22 +827,18 @@ poly1305_blocks_avx:
1616 -.cfi_adjust_cfa_offset -48
1619 .Lbase2_64_avx_epilogue:
1622 @@ -768,8 +855,11 @@ poly1305_blocks_avx:
1625 $code.=<<___ if (!$win64);
1627 +.cfi_def_cfa_register %r10
1630 lea -0x58(%rsp),%r11
1631 -.cfi_def_cfa %r11,0x60
1634 $code.=<<___ if ($win64);
1635 @@ -1361,18 +1451,18 @@ $code.=<<___ if ($win64);
1638 $code.=<<___ if (!$win64);
1639 - lea 0x58(%r11),%rsp
1640 -.cfi_def_cfa %rsp,8
1642 +.cfi_def_cfa_register %rsp
1648 -.size poly1305_blocks_avx,.-poly1305_blocks_avx
1650 +&end_function("poly1305_blocks_avx");
1652 -.type poly1305_emit_avx,\@function,3
1655 +&declare_function("poly1305_emit_avx", 32, 3);
1657 cmpl \$0,20($ctx) # is_base2_26?
1660 @@ -1423,41 +1513,51 @@ poly1305_emit_avx:
1664 -.size poly1305_emit_avx,.-poly1305_emit_avx
1666 +&end_function("poly1305_emit_avx");
1669 + $code .= "#endif\n";
1675 + $code .= "#ifdef CONFIG_AS_AVX2\n";
1678 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1679 map("%ymm$_",(0..15));
1682 +sub poly1305_blocks_avxN {
1683 + my ($avx512) = @_;
1684 + my $suffix = $avx512 ? "_avx512" : "";
1686 -.type poly1305_blocks_avx2,\@function,4
1688 -poly1305_blocks_avx2:
1690 mov 20($ctx),%r8d # is_base2_26
1693 + jae .Lblocks_avx2$suffix
1698 +.Lblocks_avx2$suffix:
1701 + jz .Lno_data_avx2$suffix
1706 - jz .Lbase2_64_avx2
1707 + jz .Lbase2_64_avx2$suffix
1711 + jz .Leven_avx2$suffix
1723 @@ -1466,7 +1566,7 @@ poly1305_blocks_avx2:
1727 -.Lblocks_avx2_body:
1728 +.Lblocks_avx2_body$suffix:
1730 mov $len,%r15 # reassign $len
1732 @@ -1513,7 +1613,7 @@ poly1305_blocks_avx2:
1734 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1736 -.Lbase2_26_pre_avx2:
1737 +.Lbase2_26_pre_avx2$suffix:
1738 add 0($inp),$h0 # accumulate input
1741 @@ -1524,10 +1624,10 @@ poly1305_blocks_avx2:
1745 - jnz .Lbase2_26_pre_avx2
1746 + jnz .Lbase2_26_pre_avx2$suffix
1748 test $padbit,$padbit # if $padbit is zero,
1749 - jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
1750 + jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
1752 ################################# base 2^64 -> base 2^26
1754 @@ -1548,57 +1648,56 @@ poly1305_blocks_avx2:
1758 - jz .Lstore_base2_26_avx2
1759 + jz .Lstore_base2_26_avx2$suffix
1766 - jmp .Lproceed_avx2
1767 + jmp .Lproceed_avx2$suffix
1770 -.Lstore_base2_64_avx2:
1771 +.Lstore_base2_64_avx2$suffix:
1774 mov $h2,16($ctx) # note that is_base2_26 is zeroed
1776 + jmp .Ldone_avx2$suffix
1779 -.Lstore_base2_26_avx2:
1780 +.Lstore_base2_26_avx2$suffix:
1781 mov %rax#d,0($ctx) # store hash value base 2^26
1789 +.Ldone_avx2$suffix:
1807 -.cfi_adjust_cfa_offset -48
1809 -.Lblocks_avx2_epilogue:
1812 +.Lno_data_avx2$suffix:
1813 +.Lblocks_avx2_epilogue$suffix:
1819 +.Lbase2_64_avx2$suffix:
1831 @@ -1607,7 +1706,7 @@ poly1305_blocks_avx2:
1835 -.Lbase2_64_avx2_body:
1836 +.Lbase2_64_avx2_body$suffix:
1838 mov $len,%r15 # reassign $len
1840 @@ -1624,9 +1723,9 @@ poly1305_blocks_avx2:
1841 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1845 + jz .Linit_avx2$suffix
1847 -.Lbase2_64_pre_avx2:
1848 +.Lbase2_64_pre_avx2$suffix:
1849 add 0($inp),$h0 # accumulate input
1852 @@ -1637,9 +1736,9 @@ poly1305_blocks_avx2:
1856 - jnz .Lbase2_64_pre_avx2
1857 + jnz .Lbase2_64_pre_avx2$suffix
1860 +.Linit_avx2$suffix:
1861 ################################# base 2^64 -> base 2^26
1864 @@ -1667,69 +1766,77 @@ poly1305_blocks_avx2:
1866 call __poly1305_init_avx
1869 +.Lproceed_avx2$suffix:
1870 mov %r15,$len # restore $len
1871 - mov OPENSSL_ia32cap_P+8(%rip),%r10d
1873 +$code.=<<___ if (!$kernel);
1874 + mov OPENSSL_ia32cap_P+8(%rip),%r9d
1875 mov \$`(1<<31|1<<30|1<<16)`,%r11d
1898 -.cfi_adjust_cfa_offset -48
1899 -.Lbase2_64_avx2_epilogue:
1903 +.Lbase2_64_avx2_epilogue$suffix:
1904 + jmp .Ldo_avx2$suffix
1909 +.Leven_avx2$suffix:
1911 - mov OPENSSL_ia32cap_P+8(%rip),%r10d
1913 +$code.=<<___ if (!$kernel);
1914 + mov OPENSSL_ia32cap_P+8(%rip),%r9d
1917 vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
1918 vmovd 4*1($ctx),%x#$H1
1919 vmovd 4*2($ctx),%x#$H2
1920 vmovd 4*3($ctx),%x#$H3
1921 vmovd 4*4($ctx),%x#$H4
1926 -$code.=<<___ if ($avx>2);
1927 +$code.=<<___ if (!$kernel && $avx>2);
1931 - test \$`1<<16`,%r10d # check for AVX512F
1933 + test \$`1<<16`,%r9d # check for AVX512F
1936 +.Lskip_avx512$suffix:
1938 +$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
1940 + jae .Lblocks_avx512
1942 $code.=<<___ if (!$win64);
1944 -.cfi_def_cfa %r11,16
1946 +.cfi_def_cfa_register %r10
1949 $code.=<<___ if ($win64);
1950 - lea -0xf8(%rsp),%r11
1953 - vmovdqa %xmm6,0x50(%r11)
1954 - vmovdqa %xmm7,0x60(%r11)
1955 - vmovdqa %xmm8,0x70(%r11)
1956 - vmovdqa %xmm9,0x80(%r11)
1957 - vmovdqa %xmm10,0x90(%r11)
1958 - vmovdqa %xmm11,0xa0(%r11)
1959 - vmovdqa %xmm12,0xb0(%r11)
1960 - vmovdqa %xmm13,0xc0(%r11)
1961 - vmovdqa %xmm14,0xd0(%r11)
1962 - vmovdqa %xmm15,0xe0(%r11)
1964 + vmovdqa %xmm6,-0xb0(%r10)
1965 + vmovdqa %xmm7,-0xa0(%r10)
1966 + vmovdqa %xmm8,-0x90(%r10)
1967 + vmovdqa %xmm9,-0x80(%r10)
1968 + vmovdqa %xmm10,-0x70(%r10)
1969 + vmovdqa %xmm11,-0x60(%r10)
1970 + vmovdqa %xmm12,-0x50(%r10)
1971 + vmovdqa %xmm13,-0x40(%r10)
1972 + vmovdqa %xmm14,-0x30(%r10)
1973 + vmovdqa %xmm15,-0x20(%r10)
1974 +.Ldo_avx2_body$suffix:
1977 lea .Lconst(%rip),%rcx
1978 @@ -1794,11 +1901,11 @@ $code.=<<___;
1980 vpaddq $H2,$T2,$H2 # accumulate input
1984 + jz .Ltail_avx2$suffix
1985 + jmp .Loop_avx2$suffix
1990 ################################################################
1991 # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1992 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1993 @@ -1946,10 +2053,10 @@ $code.=<<___;
1994 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1998 + jnz .Loop_avx2$suffix
2002 +.Ltail_avx2$suffix:
2003 ################################################################
2004 # while above multiplications were by r^4 in all lanes, in last
2005 # iteration we multiply least significant lane by r^4 and most
2006 @@ -2087,37 +2194,29 @@ $code.=<<___;
2007 vmovd %x#$H4,`4*4-48-64`($ctx)
2009 $code.=<<___ if ($win64);
2010 - vmovdqa 0x50(%r11),%xmm6
2011 - vmovdqa 0x60(%r11),%xmm7
2012 - vmovdqa 0x70(%r11),%xmm8
2013 - vmovdqa 0x80(%r11),%xmm9
2014 - vmovdqa 0x90(%r11),%xmm10
2015 - vmovdqa 0xa0(%r11),%xmm11
2016 - vmovdqa 0xb0(%r11),%xmm12
2017 - vmovdqa 0xc0(%r11),%xmm13
2018 - vmovdqa 0xd0(%r11),%xmm14
2019 - vmovdqa 0xe0(%r11),%xmm15
2020 - lea 0xf8(%r11),%rsp
2021 -.Ldo_avx2_epilogue:
2022 + vmovdqa -0xb0(%r10),%xmm6
2023 + vmovdqa -0xa0(%r10),%xmm7
2024 + vmovdqa -0x90(%r10),%xmm8
2025 + vmovdqa -0x80(%r10),%xmm9
2026 + vmovdqa -0x70(%r10),%xmm10
2027 + vmovdqa -0x60(%r10),%xmm11
2028 + vmovdqa -0x50(%r10),%xmm12
2029 + vmovdqa -0x40(%r10),%xmm13
2030 + vmovdqa -0x30(%r10),%xmm14
2031 + vmovdqa -0x20(%r10),%xmm15
2033 +.Ldo_avx2_epilogue$suffix:
2035 $code.=<<___ if (!$win64);
2037 -.cfi_def_cfa %rsp,8
2039 +.cfi_def_cfa_register %rsp
2045 -.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
2047 -#######################################################################
2049 -# On entry we have input length divisible by 64. But since inner loop
2050 -# processes 128 bytes per iteration, cases when length is not divisible
2051 -# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
2052 -# reason stack layout is kept identical to poly1305_blocks_avx2. If not
2053 -# for this tail, we wouldn't have to even allocate stack frame...
2055 +if($avx > 2 && $avx512) {
2056 my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
2057 my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
2058 my $PADBIT="%zmm30";
2059 @@ -2128,32 +2227,29 @@ map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2060 map(s/%y/%z/,($MASK));
2063 -.type poly1305_blocks_avx512,\@function,4
2065 -poly1305_blocks_avx512:
2071 $code.=<<___ if (!$win64);
2073 -.cfi_def_cfa %r11,16
2075 +.cfi_def_cfa_register %r10
2078 $code.=<<___ if ($win64);
2079 - lea -0xf8(%rsp),%r11
2082 - vmovdqa %xmm6,0x50(%r11)
2083 - vmovdqa %xmm7,0x60(%r11)
2084 - vmovdqa %xmm8,0x70(%r11)
2085 - vmovdqa %xmm9,0x80(%r11)
2086 - vmovdqa %xmm10,0x90(%r11)
2087 - vmovdqa %xmm11,0xa0(%r11)
2088 - vmovdqa %xmm12,0xb0(%r11)
2089 - vmovdqa %xmm13,0xc0(%r11)
2090 - vmovdqa %xmm14,0xd0(%r11)
2091 - vmovdqa %xmm15,0xe0(%r11)
2092 + vmovdqa %xmm6,-0xb0(%r10)
2093 + vmovdqa %xmm7,-0xa0(%r10)
2094 + vmovdqa %xmm8,-0x90(%r10)
2095 + vmovdqa %xmm9,-0x80(%r10)
2096 + vmovdqa %xmm10,-0x70(%r10)
2097 + vmovdqa %xmm11,-0x60(%r10)
2098 + vmovdqa %xmm12,-0x50(%r10)
2099 + vmovdqa %xmm13,-0x40(%r10)
2100 + vmovdqa %xmm14,-0x30(%r10)
2101 + vmovdqa %xmm15,-0x20(%r10)
2105 @@ -2679,7 +2775,7 @@ $code.=<<___;
2107 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
2110 + jnz .Ltail_avx2$suffix
2112 vpsubq $T2,$H2,$H2 # undo input accumulation
2113 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2114 @@ -2690,29 +2786,61 @@ $code.=<<___;
2117 $code.=<<___ if ($win64);
2118 - movdqa 0x50(%r11),%xmm6
2119 - movdqa 0x60(%r11),%xmm7
2120 - movdqa 0x70(%r11),%xmm8
2121 - movdqa 0x80(%r11),%xmm9
2122 - movdqa 0x90(%r11),%xmm10
2123 - movdqa 0xa0(%r11),%xmm11
2124 - movdqa 0xb0(%r11),%xmm12
2125 - movdqa 0xc0(%r11),%xmm13
2126 - movdqa 0xd0(%r11),%xmm14
2127 - movdqa 0xe0(%r11),%xmm15
2128 - lea 0xf8(%r11),%rsp
2129 + movdqa -0xb0(%r10),%xmm6
2130 + movdqa -0xa0(%r10),%xmm7
2131 + movdqa -0x90(%r10),%xmm8
2132 + movdqa -0x80(%r10),%xmm9
2133 + movdqa -0x70(%r10),%xmm10
2134 + movdqa -0x60(%r10),%xmm11
2135 + movdqa -0x50(%r10),%xmm12
2136 + movdqa -0x40(%r10),%xmm13
2137 + movdqa -0x30(%r10),%xmm14
2138 + movdqa -0x20(%r10),%xmm15
2140 .Ldo_avx512_epilogue:
2142 $code.=<<___ if (!$win64);
2144 -.cfi_def_cfa %rsp,8
2146 +.cfi_def_cfa_register %rsp
2151 -.size poly1305_blocks_avx512,.-poly1305_blocks_avx512
2159 +&declare_function("poly1305_blocks_avx2", 32, 4);
2160 +poly1305_blocks_avxN(0);
2161 +&end_function("poly1305_blocks_avx2");
2164 + $code .= "#endif\n";
2167 +#######################################################################
2169 +# On entry we have input length divisible by 64. But since inner loop
2170 +# processes 128 bytes per iteration, cases when length is not divisible
2171 +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
2172 +# reason stack layout is kept identical to poly1305_blocks_avx2. If not
2173 +# for this tail, we wouldn't have to even allocate stack frame...
2176 + $code .= "#ifdef CONFIG_AS_AVX512\n";
2179 +&declare_function("poly1305_blocks_avx512", 32, 4);
2180 +poly1305_blocks_avxN(1);
2181 +&end_function("poly1305_blocks_avx512");
2184 + $code .= "#endif\n";
2187 +if (!$kernel && $avx>3) {
2188 ########################################################################
2189 # VPMADD52 version using 2^44 radix.
2191 @@ -3753,45 +3881,9 @@ poly1305_emit_base2_44:
2192 .size poly1305_emit_base2_44,.-poly1305_emit_base2_44
2199 -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
2201 -.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
2203 -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
2205 -.long 2,2,2,3,2,0,2,1
2207 -.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
2210 -.long 0,1,1,2,2,3,7,7
2214 -.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
2222 -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
2223 -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
2225 -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
2226 -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
2230 -.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2235 { # chacha20-poly1305 helpers
2236 my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
2237 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
2238 @@ -4038,17 +4130,17 @@ avx_handler:
2242 - .rva .LSEH_begin_poly1305_init
2243 - .rva .LSEH_end_poly1305_init
2244 - .rva .LSEH_info_poly1305_init
2246 - .rva .LSEH_begin_poly1305_blocks
2247 - .rva .LSEH_end_poly1305_blocks
2248 - .rva .LSEH_info_poly1305_blocks
2250 - .rva .LSEH_begin_poly1305_emit
2251 - .rva .LSEH_end_poly1305_emit
2252 - .rva .LSEH_info_poly1305_emit
2253 + .rva .LSEH_begin_poly1305_init_x86_64
2254 + .rva .LSEH_end_poly1305_init_x86_64
2255 + .rva .LSEH_info_poly1305_init_x86_64
2257 + .rva .LSEH_begin_poly1305_blocks_x86_64
2258 + .rva .LSEH_end_poly1305_blocks_x86_64
2259 + .rva .LSEH_info_poly1305_blocks_x86_64
2261 + .rva .LSEH_begin_poly1305_emit_x86_64
2262 + .rva .LSEH_end_poly1305_emit_x86_64
2263 + .rva .LSEH_info_poly1305_emit_x86_64
2265 $code.=<<___ if ($avx);
2266 .rva .LSEH_begin_poly1305_blocks_avx
2267 @@ -4088,20 +4180,20 @@ ___
2271 -.LSEH_info_poly1305_init:
2272 +.LSEH_info_poly1305_init_x86_64:
2275 - .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
2276 + .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
2278 -.LSEH_info_poly1305_blocks:
2279 +.LSEH_info_poly1305_blocks_x86_64:
2282 .rva .Lblocks_body,.Lblocks_epilogue
2284 -.LSEH_info_poly1305_emit:
2285 +.LSEH_info_poly1305_emit_x86_64:
2288 - .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
2289 + .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
2291 $code.=<<___ if ($avx);
2292 .LSEH_info_poly1305_blocks_avx_1:
2293 @@ -4148,12 +4240,26 @@ $code.=<<___ if ($avx>2);
2300 + last if (!s/^#/\/\// and !/^$/);
2305 foreach (split('\n',$code)) {
2306 s/\`([^\`]*)\`/eval($1)/ge;
2307 s/%r([a-z]+)#d/%e$1/g;
2308 s/%r([0-9]+)#d/%r$1d/g;
2309 s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
2312 + s/(^\.type.*),[0-9]+$/\1/;
2313 + s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
2314 + next if /^\.cfi.*/;
2320 --- a/arch/x86/crypto/poly1305_glue.c
2321 +++ b/arch/x86/crypto/poly1305_glue.c
2323 -// SPDX-License-Identifier: GPL-2.0-or-later
2324 +// SPDX-License-Identifier: GPL-2.0 OR MIT
2326 - * Poly1305 authenticator algorithm, RFC7539, SIMD glue code
2328 - * Copyright (C) 2015 Martin Willi
2329 + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
2332 #include <crypto/algapi.h>
2333 @@ -13,279 +11,170 @@
2334 #include <linux/jump_label.h>
2335 #include <linux/kernel.h>
2336 #include <linux/module.h>
2337 +#include <asm/intel-family.h>
2338 #include <asm/simd.h>
2340 -asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
2341 - const u32 *r, unsigned int blocks);
2342 -asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r,
2343 - unsigned int blocks, const u32 *u);
2344 -asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
2345 - unsigned int blocks, const u32 *u);
2346 +asmlinkage void poly1305_init_x86_64(void *ctx,
2347 + const u8 key[POLY1305_KEY_SIZE]);
2348 +asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
2349 + const size_t len, const u32 padbit);
2350 +asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
2351 + const u32 nonce[4]);
2352 +asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
2353 + const u32 nonce[4]);
2354 +asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
2355 + const u32 padbit);
2356 +asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
2357 + const u32 padbit);
2358 +asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
2359 + const size_t len, const u32 padbit);
2361 -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd);
2362 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
2363 static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
2364 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
2366 -static inline u64 mlt(u64 a, u64 b)
2371 -static inline u32 sr(u64 v, u_char n)
2376 -static inline u32 and(u32 v, u32 mask)
2381 -static void poly1305_simd_mult(u32 *a, const u32 *b)
2383 - u8 m[POLY1305_BLOCK_SIZE];
2385 - memset(m, 0, sizeof(m));
2386 - /* The poly1305 block function adds a hi-bit to the accumulator which
2387 - * we don't need for key multiplication; compensate for it. */
2389 - poly1305_block_sse2(a, m, b, 1);
2392 -static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key)
2394 - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
2395 - key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff;
2396 - key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03;
2397 - key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff;
2398 - key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff;
2399 - key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
2401 +struct poly1305_arch_internal {
2411 + struct { u32 r2, r1, r4, r3; } rn[9];
2414 -static void poly1305_integer_blocks(struct poly1305_state *state,
2415 - const struct poly1305_key *key,
2417 - unsigned int nblocks, u32 hibit)
2418 +/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
2419 + * the unfortunate situation of using AVX and then having to go back to scalar
2420 + * -- because the user is silly and has called the update function from two
2421 + * separate contexts -- then we need to convert back to the original base before
2422 + * proceeding. It is possible to reason that the initial reduction below is
2423 + * sufficient given the implementation invariants. However, for the avoidance of
2424 + * doubt and because this is not performance critical, we do the full reduction
2425 + * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
2427 +static void convert_to_base2_64(void *ctx)
2429 - u32 r0, r1, r2, r3, r4;
2430 - u32 s1, s2, s3, s4;
2431 - u32 h0, h1, h2, h3, h4;
2432 - u64 d0, d1, d2, d3, d4;
2433 + struct poly1305_arch_internal *state = ctx;
2437 + if (!state->is_base2_26)
2459 - h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff;
2460 - h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff;
2461 - h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff;
2462 - h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff;
2463 - h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24);
2466 - d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
2467 - mlt(h3, s2) + mlt(h4, s1);
2468 - d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
2469 - mlt(h3, s3) + mlt(h4, s2);
2470 - d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
2471 - mlt(h3, s4) + mlt(h4, s3);
2472 - d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
2473 - mlt(h3, r0) + mlt(h4, s4);
2474 - d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
2475 - mlt(h3, r1) + mlt(h4, r0);
2477 - /* (partial) h %= p */
2478 - d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff);
2479 - d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff);
2480 - d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff);
2481 - d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff);
2482 - h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
2483 - h1 += h0 >> 26; h0 = h0 & 0x3ffffff;
2485 - src += POLY1305_BLOCK_SIZE;
2486 - } while (--nblocks);
2495 -static void poly1305_integer_emit(const struct poly1305_state *state, void *dst)
2497 - u32 h0, h1, h2, h3, h4;
2498 - u32 g0, g1, g2, g3, g4;
2501 - /* fully carry h */
2508 - h2 += (h1 >> 26); h1 = h1 & 0x3ffffff;
2509 - h3 += (h2 >> 26); h2 = h2 & 0x3ffffff;
2510 - h4 += (h3 >> 26); h3 = h3 & 0x3ffffff;
2511 - h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
2512 - h1 += (h0 >> 26); h0 = h0 & 0x3ffffff;
2514 - /* compute h + -p */
2516 - g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff;
2517 - g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff;
2518 - g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff;
2519 - g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
2521 - /* select h if h < p, or h + -p if h >= p */
2522 - mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
2529 - h0 = (h0 & mask) | g0;
2530 - h1 = (h1 & mask) | g1;
2531 - h2 = (h2 & mask) | g2;
2532 - h3 = (h3 & mask) | g3;
2533 - h4 = (h4 & mask) | g4;
2535 - /* h = h % (2^128) */
2536 - put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0);
2537 - put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4);
2538 - put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8);
2539 - put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12);
2542 -void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
2544 - poly1305_integer_setkey(desc->opaque_r, key);
2545 - desc->s[0] = get_unaligned_le32(key + 16);
2546 - desc->s[1] = get_unaligned_le32(key + 20);
2547 - desc->s[2] = get_unaligned_le32(key + 24);
2548 - desc->s[3] = get_unaligned_le32(key + 28);
2549 - poly1305_core_init(&desc->h);
2551 - desc->sset = true;
2554 -EXPORT_SYMBOL_GPL(poly1305_init_arch);
2556 -static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
2557 - const u8 *src, unsigned int srclen)
2559 - if (!dctx->sset) {
2560 - if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
2561 - poly1305_integer_setkey(dctx->r, src);
2562 - src += POLY1305_BLOCK_SIZE;
2563 - srclen -= POLY1305_BLOCK_SIZE;
2566 - if (srclen >= POLY1305_BLOCK_SIZE) {
2567 - dctx->s[0] = get_unaligned_le32(src + 0);
2568 - dctx->s[1] = get_unaligned_le32(src + 4);
2569 - dctx->s[2] = get_unaligned_le32(src + 8);
2570 - dctx->s[3] = get_unaligned_le32(src + 12);
2571 - src += POLY1305_BLOCK_SIZE;
2572 - srclen -= POLY1305_BLOCK_SIZE;
2573 - dctx->sset = true;
2575 + cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
2576 + cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
2577 + cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
2578 + cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
2579 + state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
2580 + state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
2581 + state->hs[2] = state->h[4] >> 24;
2582 +#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
2583 + cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
2584 + state->hs[2] &= 3;
2585 + state->hs[0] += cy;
2586 + state->hs[1] += (cy = ULT(state->hs[0], cy));
2587 + state->hs[2] += ULT(state->hs[1], cy);
2589 + state->is_base2_26 = 0;
2592 +static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE])
2594 + poly1305_init_x86_64(ctx, key);
2597 +static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
2600 + struct poly1305_arch_internal *state = ctx;
2602 + /* SIMD disables preemption, so relax after processing each page. */
2603 + BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
2604 + PAGE_SIZE % POLY1305_BLOCK_SIZE);
2606 + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
2607 + (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
2608 + !crypto_simd_usable()) {
2609 + convert_to_base2_64(ctx);
2610 + poly1305_blocks_x86_64(ctx, inp, len, padbit);
2616 -static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx,
2617 - const u8 *src, unsigned int srclen)
2619 - unsigned int datalen;
2621 + const size_t bytes = min_t(size_t, len, PAGE_SIZE);
2623 - if (unlikely(!dctx->sset)) {
2624 - datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
2625 - src += srclen - datalen;
2628 - if (srclen >= POLY1305_BLOCK_SIZE) {
2629 - poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src,
2630 - srclen / POLY1305_BLOCK_SIZE, 1);
2631 - srclen %= POLY1305_BLOCK_SIZE;
2632 + kernel_fpu_begin();
2633 + if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
2634 + poly1305_blocks_avx512(ctx, inp, bytes, padbit);
2635 + else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2))
2636 + poly1305_blocks_avx2(ctx, inp, bytes, padbit);
2638 + poly1305_blocks_avx(ctx, inp, bytes, padbit);
2648 -static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
2649 - const u8 *src, unsigned int srclen)
2651 - unsigned int blocks, datalen;
2652 +static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
2653 + const u32 nonce[4])
2655 + struct poly1305_arch_internal *state = ctx;
2657 + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
2658 + !state->is_base2_26 || !crypto_simd_usable()) {
2659 + convert_to_base2_64(ctx);
2660 + poly1305_emit_x86_64(ctx, mac, nonce);
2662 + poly1305_emit_avx(ctx, mac, nonce);
2665 +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
2667 + poly1305_simd_init(&dctx->h, key);
2668 + dctx->s[0] = get_unaligned_le32(&key[16]);
2669 + dctx->s[1] = get_unaligned_le32(&key[20]);
2670 + dctx->s[2] = get_unaligned_le32(&key[24]);
2671 + dctx->s[3] = get_unaligned_le32(&key[28]);
2673 + dctx->sset = true;
2675 +EXPORT_SYMBOL(poly1305_init_arch);
2677 +static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
2678 + const u8 *inp, unsigned int len)
2680 + unsigned int acc = 0;
2681 if (unlikely(!dctx->sset)) {
2682 - datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
2683 - src += srclen - datalen;
2687 - if (IS_ENABLED(CONFIG_AS_AVX2) &&
2688 - static_branch_likely(&poly1305_use_avx2) &&
2689 - srclen >= POLY1305_BLOCK_SIZE * 4) {
2690 - if (unlikely(dctx->rset < 4)) {
2691 - if (dctx->rset < 2) {
2692 - dctx->r[1] = dctx->r[0];
2693 - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
2695 - dctx->r[2] = dctx->r[1];
2696 - poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r);
2697 - dctx->r[3] = dctx->r[2];
2698 - poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r);
2700 + if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
2701 + poly1305_simd_init(&dctx->h, inp);
2702 + inp += POLY1305_BLOCK_SIZE;
2703 + len -= POLY1305_BLOCK_SIZE;
2704 + acc += POLY1305_BLOCK_SIZE;
2707 - blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
2708 - poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks,
2710 - src += POLY1305_BLOCK_SIZE * 4 * blocks;
2711 - srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
2714 - if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
2715 - if (unlikely(dctx->rset < 2)) {
2716 - dctx->r[1] = dctx->r[0];
2717 - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
2719 + if (len >= POLY1305_BLOCK_SIZE) {
2720 + dctx->s[0] = get_unaligned_le32(&inp[0]);
2721 + dctx->s[1] = get_unaligned_le32(&inp[4]);
2722 + dctx->s[2] = get_unaligned_le32(&inp[8]);
2723 + dctx->s[3] = get_unaligned_le32(&inp[12]);
2724 + inp += POLY1305_BLOCK_SIZE;
2725 + len -= POLY1305_BLOCK_SIZE;
2726 + acc += POLY1305_BLOCK_SIZE;
2727 + dctx->sset = true;
2729 - blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
2730 - poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r,
2731 - blocks, dctx->r[1].r);
2732 - src += POLY1305_BLOCK_SIZE * 2 * blocks;
2733 - srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
2735 - if (srclen >= POLY1305_BLOCK_SIZE) {
2736 - poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1);
2737 - srclen -= POLY1305_BLOCK_SIZE;
2743 void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
2744 unsigned int srclen)
2746 - unsigned int bytes;
2747 + unsigned int bytes, used;
2749 if (unlikely(dctx->buflen)) {
2750 bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
2751 @@ -295,31 +184,19 @@ void poly1305_update_arch(struct poly130
2752 dctx->buflen += bytes;
2754 if (dctx->buflen == POLY1305_BLOCK_SIZE) {
2755 - if (static_branch_likely(&poly1305_use_simd) &&
2756 - likely(crypto_simd_usable())) {
2757 - kernel_fpu_begin();
2758 - poly1305_simd_blocks(dctx, dctx->buf,
2759 - POLY1305_BLOCK_SIZE);
2762 - poly1305_scalar_blocks(dctx, dctx->buf,
2763 - POLY1305_BLOCK_SIZE);
2765 + if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE)))
2766 + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
2771 if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
2772 - if (static_branch_likely(&poly1305_use_simd) &&
2773 - likely(crypto_simd_usable())) {
2774 - kernel_fpu_begin();
2775 - bytes = poly1305_simd_blocks(dctx, src, srclen);
2778 - bytes = poly1305_scalar_blocks(dctx, src, srclen);
2780 - src += srclen - bytes;
2782 + bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
2784 + used = crypto_poly1305_setdctxkey(dctx, src, bytes);
2785 + if (likely(bytes - used))
2786 + poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1);
2790 if (unlikely(srclen)) {
2791 @@ -329,31 +206,17 @@ void poly1305_update_arch(struct poly130
2793 EXPORT_SYMBOL(poly1305_update_arch);
2795 -void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst)
2796 +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
2801 - if (unlikely(desc->buflen)) {
2802 - desc->buf[desc->buflen++] = 1;
2803 - memset(desc->buf + desc->buflen, 0,
2804 - POLY1305_BLOCK_SIZE - desc->buflen);
2805 - poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0);
2806 + if (unlikely(dctx->buflen)) {
2807 + dctx->buf[dctx->buflen++] = 1;
2808 + memset(dctx->buf + dctx->buflen, 0,
2809 + POLY1305_BLOCK_SIZE - dctx->buflen);
2810 + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
2813 - poly1305_integer_emit(&desc->h, digest);
2815 - /* mac = (h + s) % (2^128) */
2816 - f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0];
2817 - put_unaligned_le32(f, dst + 0);
2818 - f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1];
2819 - put_unaligned_le32(f, dst + 4);
2820 - f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2];
2821 - put_unaligned_le32(f, dst + 8);
2822 - f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3];
2823 - put_unaligned_le32(f, dst + 12);
2825 - *desc = (struct poly1305_desc_ctx){};
2826 + poly1305_simd_emit(&dctx->h, dst, dctx->s);
2827 + *dctx = (struct poly1305_desc_ctx){};
2829 EXPORT_SYMBOL(poly1305_final_arch);
2831 @@ -361,38 +224,34 @@ static int crypto_poly1305_init(struct s
2833 struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2835 - poly1305_core_init(&dctx->h);
2838 - dctx->sset = false;
2840 + *dctx = (struct poly1305_desc_ctx){};
2844 -static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
2845 +static int crypto_poly1305_update(struct shash_desc *desc,
2846 + const u8 *src, unsigned int srclen)
2848 struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2850 - if (unlikely(!dctx->sset))
2853 - poly1305_final_arch(dctx, dst);
2854 + poly1305_update_arch(dctx, src, srclen);
2858 -static int poly1305_simd_update(struct shash_desc *desc,
2859 - const u8 *src, unsigned int srclen)
2860 +static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
2862 struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2864 - poly1305_update_arch(dctx, src, srclen);
2865 + if (unlikely(!dctx->sset))
2868 + poly1305_final_arch(dctx, dst);
2872 static struct shash_alg alg = {
2873 .digestsize = POLY1305_DIGEST_SIZE,
2874 .init = crypto_poly1305_init,
2875 - .update = poly1305_simd_update,
2876 + .update = crypto_poly1305_update,
2877 .final = crypto_poly1305_final,
2878 .descsize = sizeof(struct poly1305_desc_ctx),
2880 @@ -406,17 +265,19 @@ static struct shash_alg alg = {
2882 static int __init poly1305_simd_mod_init(void)
2884 - if (!boot_cpu_has(X86_FEATURE_XMM2))
2887 - static_branch_enable(&poly1305_use_simd);
2889 - if (IS_ENABLED(CONFIG_AS_AVX2) &&
2890 - boot_cpu_has(X86_FEATURE_AVX) &&
2891 + if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) &&
2892 + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
2893 + static_branch_enable(&poly1305_use_avx);
2894 + if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
2895 boot_cpu_has(X86_FEATURE_AVX2) &&
2896 cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
2897 static_branch_enable(&poly1305_use_avx2);
2899 + if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) &&
2900 + boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
2901 + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
2902 + /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
2903 + boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X)
2904 + static_branch_enable(&poly1305_use_avx512);
2905 return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
2908 @@ -430,7 +291,7 @@ module_init(poly1305_simd_mod_init);
2909 module_exit(poly1305_simd_mod_exit);
2911 MODULE_LICENSE("GPL");
2912 -MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
2913 +MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
2914 MODULE_DESCRIPTION("Poly1305 authenticator");
2915 MODULE_ALIAS_CRYPTO("poly1305");
2916 MODULE_ALIAS_CRYPTO("poly1305-simd");
2917 --- a/lib/crypto/Kconfig
2918 +++ b/lib/crypto/Kconfig
2919 @@ -90,7 +90,7 @@ config CRYPTO_LIB_DES
2920 config CRYPTO_LIB_POLY1305_RSIZE
2923 - default 4 if X86_64
2924 + default 11 if X86_64
2925 default 9 if ARM || ARM64