1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Ard Biesheuvel <ardb@kernel.org>
3 Date: Fri, 8 Nov 2019 13:22:24 +0100
4 Subject: [PATCH] crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON
5  implementation
7 commit f569ca16475155013525686d0f73bc379c67e635 upstream.
9 This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation
10 for NEON authored by Andy Polyakov, and contributed by him to the OpenSSL
11 project. The file 'poly1305-armv8.pl' is taken straight from this upstream
12 GitHub repository [0] at commit ec55a08dc0244ce570c4fc7cade330c60798952f,
13 and already contains all the changes required to build it as part of a
14 Linux kernel in GIT format.
16 [0] https://github.com/dot-asm/cryptogams
18 Co-developed-by: Andy Polyakov <appro@cryptogams.org>
19 Signed-off-by: Andy Polyakov <appro@cryptogams.org>
20 Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
21 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
22 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
24 arch/arm64/crypto/Kconfig | 6 +
25 arch/arm64/crypto/Makefile | 10 +-
26 arch/arm64/crypto/poly1305-armv8.pl | 913 ++++++++++++++++++++++
27 arch/arm64/crypto/poly1305-core.S_shipped | 835 ++++++++++++++++++++
28 arch/arm64/crypto/poly1305-glue.c | 237 ++++++
29 lib/crypto/Kconfig | 1 +
30 6 files changed, 2001 insertions(+), 1 deletion(-)
31 create mode 100644 arch/arm64/crypto/poly1305-armv8.pl
32 create mode 100644 arch/arm64/crypto/poly1305-core.S_shipped
33 create mode 100644 arch/arm64/crypto/poly1305-glue.c
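For orientation (a reference formula, not from the patch itself): everything added below computes the Poly1305 one-time authenticator of RFC 8439. With the clamped key half r, the message split into 16-byte blocks m_1..m_n (each read little-endian and extended by a high pad bit), and the second key half s, the tag is

    tag = ((m_1*r^n + m_2*r^(n-1) + ... + m_n*r) mod (2^130 - 5) + s) mod 2^128

The scalar code evaluates this Horner-style, one block at a time, with the accumulator held in three 64-bit words; the NEON code keeps it in five 26-bit limbs and interleaves several blocks per iteration using precomputed powers r^1..r^4.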
35 --- a/arch/arm64/crypto/Kconfig
36 +++ b/arch/arm64/crypto/Kconfig
37 @@ -106,6 +106,12 @@ config CRYPTO_CHACHA20_NEON
38 select CRYPTO_LIB_CHACHA_GENERIC
39 select CRYPTO_ARCH_HAVE_LIB_CHACHA
41 +config CRYPTO_POLY1305_NEON
42 + tristate "Poly1305 hash function using scalar or NEON instructions"
43 + depends on KERNEL_MODE_NEON
45 + select CRYPTO_ARCH_HAVE_LIB_POLY1305
47 config CRYPTO_NHPOLY1305_NEON
48 tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)"
49 depends on KERNEL_MODE_NEON
50 --- a/arch/arm64/crypto/Makefile
51 +++ b/arch/arm64/crypto/Makefile
52 @@ -50,6 +50,10 @@ sha512-arm64-y := sha512-glue.o sha512-c
53 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
54 chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
56 +obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
57 +poly1305-neon-y := poly1305-core.o poly1305-glue.o
58 +AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64
60 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
61 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
63 @@ -68,11 +72,15 @@ ifdef REGENERATE_ARM64_CRYPTO
64 quiet_cmd_perlasm = PERLASM $@
65 cmd_perlasm = $(PERL) $(<) void $(@)
67 +$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv8.pl
70 $(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
73 $(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl
78 -clean-files += sha256-core.S sha512-core.S
79 +clean-files += poly1305-core.S sha256-core.S sha512-core.S
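A note on the AFLAGS define above: the core assembly exports a function named poly1305_init, which would clash with the poly1305_init() name already used by the kernel's Poly1305 library interface, so it is renamed to poly1305_init_arm64 at assembly time; the C glue added below declares it with that name (asmlinkage void poly1305_init_arm64(...)) and calls it from poly1305_init_arch().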
81 +++ b/arch/arm64/crypto/poly1305-armv8.pl
84 +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
86 +# ====================================================================
87 +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
89 +# ====================================================================
91 +# This module implements Poly1305 hash for ARMv8.
95 +# Numbers are cycles per processed byte with poly1305_blocks alone.
96 +#
97 +#		IALU/gcc-4.9	NEON
98 +#
99 +# Apple A7 1.86/+5% 0.72
100 +# Cortex-A53 2.69/+58% 1.47
101 +# Cortex-A57 2.70/+7% 1.14
102 +# Denver 1.64/+50% 1.18(*)
103 +# X-Gene 2.13/+68% 2.27
104 +# Mongoose 1.77/+75% 1.12
105 +# Kryo 2.70/+55% 1.13
106 +# ThunderX2 1.17/+95% 1.36
108 +# (*) estimate based on resources availability is less than 1.0,
109 +# i.e. measured result is worse than expected, presumably binary
110 +# translator is not almighty;
115 +if ($flavour && $flavour ne "void") {
116 + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
117 + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
118 + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
119 + die "can't locate arm-xlate.pl";
121 + open STDOUT,"| \"$^X\" $xlate $flavour $output";
123 + open STDOUT,">$output";
126 +my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
127 +my ($mac,$nonce)=($inp,$len);
129 +my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
133 +# include "arm_arch.h"
134 +.extern OPENSSL_armcap_P
139 +// forward "declarations" are required for Apple
140 +.globl poly1305_blocks
141 +.globl poly1305_emit
143 +.globl poly1305_init
144 +.type poly1305_init,%function
148 + stp xzr,xzr,[$ctx] // zero hash value
149 + stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
155 + adrp x17,OPENSSL_armcap_P
156 + ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
159 + ldp $r0,$r1,[$inp] // load key
160 + mov $s1,#0xfffffffc0fffffff
161 + movk $s1,#0x0fff,lsl#48
162 +#ifdef __AARCH64EB__
163 + rev $r0,$r0 // flip bytes
166 + and $r0,$r0,$s1 // &=0ffffffc0fffffff
168 + and $r1,$r1,$s1 // &=0ffffffc0ffffffc
170 + stp $r0,$r1,[$ctx,#32] // save key value
171 + str w#$s1,[$ctx,#48] // impossible key power value
174 + tst w17,#ARMV7_NEON
176 + adr $d0,.Lpoly1305_blocks
177 + adr $r0,.Lpoly1305_blocks_neon
178 + adr $d1,.Lpoly1305_emit
180 + csel $d0,$d0,$r0,eq
183 + stp w#$d0,w#$d1,[$len]
191 +.size poly1305_init,.-poly1305_init
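What poly1305_init does with the key reduces to a small amount of C. The sketch below is illustrative only (the function and variable names are not from the patch, and a little-endian host is assumed); it shows the clamping of the first key half into r. The routine above additionally zeroes the hash accumulator and stores an "impossible key power value" in the slot where the first element of the r^n table will later be built, so the NEON entry point can tell whether that table has been initialised.

#include <stdint.h>
#include <string.h>

/* hedged sketch of the clamping performed by poly1305_init above */
static void poly1305_clamp_sketch(uint64_t r[2], const uint8_t key[16])
{
        uint64_t lo, hi;

        memcpy(&lo, key, 8);                    /* little-endian host assumed */
        memcpy(&hi, key + 8, 8);
        r[0] = lo & 0x0ffffffc0fffffffULL;      /* "&=0ffffffc0fffffff" */
        r[1] = hi & 0x0ffffffc0ffffffcULL;      /* "&=0ffffffc0ffffffc" */
}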
193 +.type poly1305_blocks,%function
197 + ands $len,$len,#-16
200 + ldp $h0,$h1,[$ctx] // load hash value
201 + ldp $h2,x17,[$ctx,#16] // [along with is_base2_26]
202 + ldp $r0,$r1,[$ctx,#32] // load key value
204 +#ifdef __AARCH64EB__
218 + add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
220 + adds $d0,$d0,$d2,lsl#52
221 + add $d1,$d1,x15,lsl#14
224 + adds $d1,$d1,x16,lsl#40
227 + cmp x17,#0 // is_base2_26?
228 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
229 + csel $h0,$h0,$d0,eq // choose between radixes
230 + csel $h1,$h1,$d1,eq
231 + csel $h2,$h2,$d2,eq
234 + ldp $t0,$t1,[$inp],#16 // load input
236 +#ifdef __AARCH64EB__
240 + adds $h0,$h0,$t0 // accumulate input
243 + mul $d0,$h0,$r0 // h0*r0
244 + adc $h2,$h2,$padbit
247 + mul $t0,$h1,$s1 // h1*5*r1
251 + mul $t0,$h0,$r1 // h0*r1
256 + mul $t0,$h1,$r0 // h1*r0
261 + mul $t0,$h2,$s1 // h2*5*r1
263 + mul $t1,$h2,$r0 // h2*r0
268 + and $t0,$d2,#-4 // final reduction
270 + add $t0,$t0,$d2,lsr#2
277 + stp $h0,$h1,[$ctx] // store hash value
278 + stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26]
282 +.size poly1305_blocks,.-poly1305_blocks
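On entry, poly1305_blocks (and later poly1305_emit) may find the hash left behind by the NEON code in five 26-bit limbs; the is_base2_26 flag records that, and the csel sequence above chooses between the stored 64-bit words and a freshly repacked value. A hedged C rendering of that repacking, with illustrative names (the loop that follows then simply does h += block with the pad bit added at 2^128, followed by the same multiply that poly1305_mult further down performs):

#include <stdint.h>

/* t[0..4] are 26-bit limbs that may hold a few extra bits
 * ("can be partially reduced"), so carries must be propagated.
 * Shifts that overflow 64 bits drop exactly the bits already
 * accounted for in h1/h2, matching the lsl/lsr pairs in the asm. */
static void base26_to_base64_sketch(uint64_t h[3], const uint32_t t[5])
{
        uint64_t h0 = (uint64_t)t[0] + ((uint64_t)t[1] << 26);
        uint64_t h1 = ((uint64_t)t[2] >> 12) + ((uint64_t)t[3] << 14);
        uint64_t h2 = (uint64_t)t[4] >> 24;
        uint64_t c;

        c = (uint64_t)t[2] << 52;
        h0 += c;
        h1 += (h0 < c);                 /* carry across bit 64 */

        c = (uint64_t)t[4] << 40;
        h1 += c;
        h2 += (h1 < c);                 /* carry across bit 128 */

        h[0] = h0;
        h[1] = h1;
        h[2] = h2;
}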
284 +.type poly1305_emit,%function
288 + ldp $h0,$h1,[$ctx] // load hash base 2^64
289 + ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26]
290 + ldp $t0,$t1,[$nonce] // load nonce
292 +#ifdef __AARCH64EB__
306 + add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
308 + adds $d0,$d0,$d2,lsl#52
309 + add $d1,$d1,x15,lsl#14
312 + adds $d1,$d1,x16,lsl#40
315 + cmp $r0,#0 // is_base2_26?
316 + csel $h0,$h0,$d0,eq // choose between radixes
317 + csel $h1,$h1,$d1,eq
318 + csel $h2,$h2,$d2,eq
320 + adds $d0,$h0,#5 // compare to modulus
324 + tst $d2,#-4 // see if it's carried/borrowed
326 + csel $h0,$h0,$d0,eq
327 + csel $h1,$h1,$d1,eq
329 +#ifdef __AARCH64EB__
330 + ror $t0,$t0,#32 // flip nonce words
333 + adds $h0,$h0,$t0 // accumulate nonce
335 +#ifdef __AARCH64EB__
336 + rev $h0,$h0 // flip output bytes
339 + stp $h0,$h1,[$mac] // write result
342 +.size poly1305_emit,.-poly1305_emit
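poly1305_emit finishes the MAC: after the same radix check as above, the accumulator (which may still be up to one modulus too high) is conditionally reduced by comparing against 2^130 - 5, the 128-bit nonce value is added, and the low 128 bits are written out. A hedged C equivalent, assuming unsigned __int128 support and a little-endian host; names are illustrative:

#include <stdint.h>
#include <string.h>

static void poly1305_emit_sketch(uint8_t mac[16], uint64_t h0, uint64_t h1,
                                 uint64_t h2, const uint64_t nonce[2])
{
        unsigned __int128 t;
        uint64_t g0, g1, g2;

        /* compute h + 5 and see whether that carries past 2^130 */
        t = (unsigned __int128)h0 + 5;
        g0 = (uint64_t)t;
        t = (unsigned __int128)h1 + (t >> 64);
        g1 = (uint64_t)t;
        g2 = h2 + (uint64_t)(t >> 64);

        if (g2 & ~3ULL) {               /* h >= 2^130 - 5: take the reduced value */
                h0 = g0;
                h1 = g1;
        }

        /* mac = (h + nonce) mod 2^128 */
        t = (unsigned __int128)h0 + nonce[0];
        h0 = (uint64_t)t;
        h1 = h1 + nonce[1] + (uint64_t)(t >> 64);

        memcpy(mac, &h0, 8);
        memcpy(mac + 8, &h1, 8);
}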
344 +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
345 +my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
346 +my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
347 +my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
348 +my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
349 +my ($T0,$T1,$MASK) = map("v$_",(29..31));
351 +my ($in2,$zeros)=("x16","x17");
352 +my $is_base2_26 = $zeros; # borrow
355 +.type poly1305_mult,%function
358 + mul $d0,$h0,$r0 // h0*r0
361 + mul $t0,$h1,$s1 // h1*5*r1
365 + mul $t0,$h0,$r1 // h0*r1
370 + mul $t0,$h1,$r0 // h1*r0
375 + mul $t0,$h2,$s1 // h2*5*r1
377 + mul $t1,$h2,$r0 // h2*r0
382 + and $t0,$d2,#-4 // final reduction
384 + add $t0,$t0,$d2,lsr#2
390 +.size poly1305_mult,.-poly1305_mult
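poly1305_mult is the Horner step h = h * r mod 2^130 - 5 on the 64-bit representation. The s1 = r1 + (r1 >> 2) value its callers prepare equals 5*r1/4 exactly (clamping forces the low two bits of r1 to zero), which lets the cross terms that would otherwise land at 2^128 and above be folded straight back down, as the h1*5*r1 and h2*5*r1 comments indicate. A hedged C equivalent, assuming unsigned __int128 support; names are illustrative:

#include <stdint.h>

static void poly1305_mult_sketch(uint64_t h[3], uint64_t r0, uint64_t r1)
{
        uint64_t s1 = r1 + (r1 >> 2);   /* = 5*r1/4, exact because r1's low 2 bits are 0 */
        unsigned __int128 d0, d1;
        uint64_t d2, c;

        /* schoolbook product with the >= 2^128 cross terms pre-folded via s1 */
        d0 = (unsigned __int128)h[0] * r0 + (unsigned __int128)h[1] * s1;
        d1 = (unsigned __int128)h[0] * r1 + (unsigned __int128)h[1] * r0
           + (unsigned __int128)h[2] * s1 + (uint64_t)(d0 >> 64);
        d2 = h[2] * r0 + (uint64_t)(d1 >> 64);

        h[0] = (uint64_t)d0;
        h[1] = (uint64_t)d1;

        /* final reduction: bits of d2 above bit 1 are multiples of 2^130 == 5 (mod p) */
        c = (d2 & ~3ULL) + (d2 >> 2);
        h[2] = d2 & 3;

        d0 = (unsigned __int128)h[0] + c;
        h[0] = (uint64_t)d0;
        d1 = (unsigned __int128)h[1] + (uint64_t)(d0 >> 64);
        h[1] = (uint64_t)d1;
        h[2] += (uint64_t)(d1 >> 64);
}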
392 +.type poly1305_splat,%function
395 + and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
396 + ubfx x13,$h0,#26,#26
397 + extr x14,$h1,$h0,#52
398 + and x14,x14,#0x03ffffff
399 + ubfx x15,$h1,#14,#26
400 + extr x16,$h2,$h1,#40
402 + str w12,[$ctx,#16*0] // r0
403 + add w12,w13,w13,lsl#2 // r1*5
404 + str w13,[$ctx,#16*1] // r1
405 + add w13,w14,w14,lsl#2 // r2*5
406 + str w12,[$ctx,#16*2] // s1
407 + str w14,[$ctx,#16*3] // r2
408 + add w14,w15,w15,lsl#2 // r3*5
409 + str w13,[$ctx,#16*4] // s2
410 + str w15,[$ctx,#16*5] // r3
411 + add w15,w16,w16,lsl#2 // r4*5
412 + str w14,[$ctx,#16*6] // s3
413 + str w16,[$ctx,#16*7] // r4
414 + str w15,[$ctx,#16*8] // s4
417 +.size poly1305_splat,.-poly1305_splat
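poly1305_splat converts a key power from the 64-bit representation into the five 26-bit limbs the NEON code multiplies with, storing each limb next to its 5x multiple (needed because 2^130 == 5 mod 2^130 - 5). In the real table every element sits at a 16-byte stride so that r, r^2, r^3 and r^4 end up in the four 32-bit lanes of one vector; the hedged sketch below flattens that layout and uses illustrative names:

#include <stdint.h>

static void poly1305_splat_sketch(uint32_t tbl[9], uint64_t h0, uint64_t h1, uint64_t h2)
{
        uint32_t r0 = h0 & 0x03ffffff;                                    /* bits   0..25  */
        uint32_t r1 = (h0 >> 26) & 0x03ffffff;                            /* bits  26..51  */
        uint32_t r2 = (uint32_t)(((h0 >> 52) | (h1 << 12)) & 0x03ffffff); /* bits  52..77  */
        uint32_t r3 = (h1 >> 14) & 0x03ffffff;                            /* bits  78..103 */
        uint32_t r4 = (uint32_t)((h1 >> 40) | (h2 << 24));                /* bits 104..129 */

        tbl[0] = r0;
        tbl[1] = r1;    tbl[2] = 5 * r1;
        tbl[3] = r2;    tbl[4] = 5 * r2;
        tbl[5] = r3;    tbl[6] = 5 * r3;
        tbl[7] = r4;    tbl[8] = 5 * r4;
}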
420 +.globl poly1305_blocks_neon
422 +.type poly1305_blocks_neon,%function
424 +poly1305_blocks_neon:
425 +.Lpoly1305_blocks_neon:
426 + ldr $is_base2_26,[$ctx,#24]
428 + b.lo .Lpoly1305_blocks
430 + .inst 0xd503233f // paciasp
431 + stp x29,x30,[sp,#-80]!
434 + stp d8,d9,[sp,#16] // meet ABI requirements
435 + stp d10,d11,[sp,#32]
436 + stp d12,d13,[sp,#48]
437 + stp d14,d15,[sp,#64]
439 + cbz $is_base2_26,.Lbase2_64_neon
441 + ldp w10,w11,[$ctx] // load hash value base 2^26
442 + ldp w12,w13,[$ctx,#8]
448 + ldp $r0,$r1,[$ctx,#32] // load key value
450 + add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
452 + adds $h0,$h0,x12,lsl#52
453 + add $h1,$h1,x13,lsl#14
456 + adds $h1,$h1,x14,lsl#40
457 + adc $d2,$h2,xzr // can be partially reduced...
459 + ldp $d0,$d1,[$inp],#16 // load input
461 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
463 +#ifdef __AARCH64EB__
467 + adds $h0,$h0,$d0 // accumulate input
469 + adc $h2,$h2,$padbit
473 + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
474 + ubfx x11,$h0,#26,#26
475 + extr x12,$h1,$h0,#52
476 + and x12,x12,#0x03ffffff
477 + ubfx x13,$h1,#14,#26
478 + extr x14,$h2,$h1,#40
484 + ldp $r0,$r1,[$ctx,#32] // load key value
486 + ldp $h0,$h1,[$ctx] // load hash value base 2^64
492 + ldp $d0,$d1,[$inp],#16 // load input
494 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
495 +#ifdef __AARCH64EB__
499 + adds $h0,$h0,$d0 // accumulate input
501 + adc $h2,$h2,$padbit
506 + ldr w17,[$ctx,#48] // first table element
507 + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
508 + ubfx x11,$h0,#26,#26
509 + extr x12,$h1,$h0,#52
510 + and x12,x12,#0x03ffffff
511 + ubfx x13,$h1,#14,#26
512 + extr x14,$h2,$h1,#40
514 + cmp w17,#-1 // is value impossible?
523 + ////////////////////////////////// initialize r^n table
525 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
528 + add $ctx,$ctx,#48+12
531 + bl poly1305_mult // r^2
535 + bl poly1305_mult // r^3
539 + bl poly1305_mult // r^4
542 + sub $ctx,$ctx,#48 // restore original $ctx
554 + ldp x8,x12,[$inp,#32] // inp[2:3]
556 + ldp x9,x13,[$inp,#48]
560 + lsl $padbit,$padbit,#24
563 +#ifdef __AARCH64EB__
569 + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
570 + and x5,x9,#0x03ffffff
573 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
576 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
578 + and x8,x8,#0x03ffffff
579 + and x9,x9,#0x03ffffff
580 + ubfx x10,x12,#14,#26
581 + ubfx x11,x13,#14,#26
582 + add x12,$padbit,x12,lsr#40
583 + add x13,$padbit,x13,lsr#40
584 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
586 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
587 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
592 + ldp x8,x12,[$inp],#16 // inp[0:1]
593 + ldp x9,x13,[$inp],#48
595 + ld1 {$R0,$R1,$S1,$R2},[x15],#64
596 + ld1 {$S2,$R3,$S3,$R4},[x15],#64
599 +#ifdef __AARCH64EB__
605 + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
606 + and x5,x9,#0x03ffffff
609 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
612 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
614 + and x8,x8,#0x03ffffff
615 + and x9,x9,#0x03ffffff
616 + ubfx x10,x12,#14,#26
617 + ubfx x11,x13,#14,#26
618 + add x12,$padbit,x12,lsr#40
619 + add x13,$padbit,x13,lsr#40
620 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
622 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
623 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
628 + ushr $MASK.2d,$MASK.2d,#38
634 + ////////////////////////////////////////////////////////////////
635 + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
636 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
637 + // \___________________/
638 + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
639 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
640 + // \___________________/ \____________________/
642 + // Note that we start with inp[2:3]*r^2. This is because it
643 + // doesn't depend on reduction in previous iteration.
644 + ////////////////////////////////////////////////////////////////
645 + // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
646 + // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
647 + // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
648 + // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
649 + // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
652 + umull $ACC4,$IN23_0,${R4}[2]
653 + csel $in2,$zeros,$in2,lo
654 + umull $ACC3,$IN23_0,${R3}[2]
655 + umull $ACC2,$IN23_0,${R2}[2]
656 + ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
657 + umull $ACC1,$IN23_0,${R1}[2]
658 + ldp x9,x13,[$in2],#48
659 + umull $ACC0,$IN23_0,${R0}[2]
660 +#ifdef __AARCH64EB__
667 + umlal $ACC4,$IN23_1,${R3}[2]
668 + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
669 + umlal $ACC3,$IN23_1,${R2}[2]
670 + and x5,x9,#0x03ffffff
671 + umlal $ACC2,$IN23_1,${R1}[2]
673 + umlal $ACC1,$IN23_1,${R0}[2]
675 + umlal $ACC0,$IN23_1,${S4}[2]
676 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
678 + umlal $ACC4,$IN23_2,${R2}[2]
680 + umlal $ACC3,$IN23_2,${R1}[2]
682 + umlal $ACC2,$IN23_2,${R0}[2]
683 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
684 + umlal $ACC1,$IN23_2,${S4}[2]
686 + umlal $ACC0,$IN23_2,${S3}[2]
687 + and x8,x8,#0x03ffffff
689 + umlal $ACC4,$IN23_3,${R1}[2]
690 + and x9,x9,#0x03ffffff
691 + umlal $ACC3,$IN23_3,${R0}[2]
692 + ubfx x10,x12,#14,#26
693 + umlal $ACC2,$IN23_3,${S4}[2]
694 + ubfx x11,x13,#14,#26
695 + umlal $ACC1,$IN23_3,${S3}[2]
696 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
697 + umlal $ACC0,$IN23_3,${S2}[2]
700 + add $IN01_2,$IN01_2,$H2
701 + add x12,$padbit,x12,lsr#40
702 + umlal $ACC4,$IN23_4,${R0}[2]
703 + add x13,$padbit,x13,lsr#40
704 + umlal $ACC3,$IN23_4,${S4}[2]
705 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
706 + umlal $ACC2,$IN23_4,${S3}[2]
707 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
708 + umlal $ACC1,$IN23_4,${S2}[2]
710 + umlal $ACC0,$IN23_4,${S1}[2]
713 + ////////////////////////////////////////////////////////////////
714 + // (hash+inp[0:1])*r^4 and accumulate
716 + add $IN01_0,$IN01_0,$H0
718 + umlal $ACC3,$IN01_2,${R1}[0]
719 + ldp x8,x12,[$inp],#16 // inp[0:1]
720 + umlal $ACC0,$IN01_2,${S3}[0]
721 + ldp x9,x13,[$inp],#48
722 + umlal $ACC4,$IN01_2,${R2}[0]
723 + umlal $ACC1,$IN01_2,${S4}[0]
724 + umlal $ACC2,$IN01_2,${R0}[0]
725 +#ifdef __AARCH64EB__
732 + add $IN01_1,$IN01_1,$H1
733 + umlal $ACC3,$IN01_0,${R3}[0]
734 + umlal $ACC4,$IN01_0,${R4}[0]
735 + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
736 + umlal $ACC2,$IN01_0,${R2}[0]
737 + and x5,x9,#0x03ffffff
738 + umlal $ACC0,$IN01_0,${R0}[0]
740 + umlal $ACC1,$IN01_0,${R1}[0]
743 + add $IN01_3,$IN01_3,$H3
744 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
745 + umlal $ACC3,$IN01_1,${R2}[0]
747 + umlal $ACC4,$IN01_1,${R3}[0]
749 + umlal $ACC0,$IN01_1,${S4}[0]
750 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
751 + umlal $ACC2,$IN01_1,${R1}[0]
753 + umlal $ACC1,$IN01_1,${R0}[0]
754 + and x8,x8,#0x03ffffff
756 + add $IN01_4,$IN01_4,$H4
757 + and x9,x9,#0x03ffffff
758 + umlal $ACC3,$IN01_3,${R0}[0]
759 + ubfx x10,x12,#14,#26
760 + umlal $ACC0,$IN01_3,${S2}[0]
761 + ubfx x11,x13,#14,#26
762 + umlal $ACC4,$IN01_3,${R1}[0]
763 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
764 + umlal $ACC1,$IN01_3,${S3}[0]
766 + umlal $ACC2,$IN01_3,${S4}[0]
767 + add x12,$padbit,x12,lsr#40
769 + umlal $ACC3,$IN01_4,${S4}[0]
770 + add x13,$padbit,x13,lsr#40
771 + umlal $ACC0,$IN01_4,${S1}[0]
772 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
773 + umlal $ACC4,$IN01_4,${R0}[0]
774 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
775 + umlal $ACC1,$IN01_4,${S2}[0]
777 + umlal $ACC2,$IN01_4,${S3}[0]
781 + /////////////////////////////////////////////////////////////////
782 + // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
785 + // [see discussion in poly1305-armv4 module]
787 + ushr $T0.2d,$ACC3,#26
789 + ushr $T1.2d,$ACC0,#26
790 + and $ACC0,$ACC0,$MASK.2d
791 + add $ACC4,$ACC4,$T0.2d // h3 -> h4
792 + bic $H3,#0xfc,lsl#24 // &=0x03ffffff
793 + add $ACC1,$ACC1,$T1.2d // h0 -> h1
795 + ushr $T0.2d,$ACC4,#26
797 + ushr $T1.2d,$ACC1,#26
799 + bic $H4,#0xfc,lsl#24
800 + add $ACC2,$ACC2,$T1.2d // h1 -> h2
802 + add $ACC0,$ACC0,$T0.2d
803 + shl $T0.2d,$T0.2d,#2
804 + shrn $T1.2s,$ACC2,#26
806 + add $ACC0,$ACC0,$T0.2d // h4 -> h0
807 + bic $H1,#0xfc,lsl#24
808 + add $H3,$H3,$T1.2s // h2 -> h3
809 + bic $H2,#0xfc,lsl#24
811 + shrn $T0.2s,$ACC0,#26
813 + ushr $T1.2s,$H3,#26
814 + bic $H3,#0xfc,lsl#24
815 + bic $H0,#0xfc,lsl#24
816 + add $H1,$H1,$T0.2s // h0 -> h1
817 + add $H4,$H4,$T1.2s // h3 -> h4
822 + dup $IN23_2,${IN23_2}[0]
823 + add $IN01_2,$IN01_2,$H2
825 + ////////////////////////////////////////////////////////////////
826 + // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
831 + dup $IN23_2,${IN01_2}[0]
832 + add $IN23_0,$IN01_0,$H0
833 + add $IN23_3,$IN01_3,$H3
834 + add $IN23_1,$IN01_1,$H1
835 + add $IN23_4,$IN01_4,$H4
838 + dup $IN23_0,${IN23_0}[0]
839 + umull2 $ACC0,$IN23_2,${S3}
840 + umull2 $ACC3,$IN23_2,${R1}
841 + umull2 $ACC4,$IN23_2,${R2}
842 + umull2 $ACC2,$IN23_2,${R0}
843 + umull2 $ACC1,$IN23_2,${S4}
845 + dup $IN23_1,${IN23_1}[0]
846 + umlal2 $ACC0,$IN23_0,${R0}
847 + umlal2 $ACC2,$IN23_0,${R2}
848 + umlal2 $ACC3,$IN23_0,${R3}
849 + umlal2 $ACC4,$IN23_0,${R4}
850 + umlal2 $ACC1,$IN23_0,${R1}
852 + dup $IN23_3,${IN23_3}[0]
853 + umlal2 $ACC0,$IN23_1,${S4}
854 + umlal2 $ACC3,$IN23_1,${R2}
855 + umlal2 $ACC2,$IN23_1,${R1}
856 + umlal2 $ACC4,$IN23_1,${R3}
857 + umlal2 $ACC1,$IN23_1,${R0}
859 + dup $IN23_4,${IN23_4}[0]
860 + umlal2 $ACC3,$IN23_3,${R0}
861 + umlal2 $ACC4,$IN23_3,${R1}
862 + umlal2 $ACC0,$IN23_3,${S2}
863 + umlal2 $ACC1,$IN23_3,${S3}
864 + umlal2 $ACC2,$IN23_3,${S4}
866 + umlal2 $ACC3,$IN23_4,${S4}
867 + umlal2 $ACC0,$IN23_4,${S1}
868 + umlal2 $ACC4,$IN23_4,${R0}
869 + umlal2 $ACC1,$IN23_4,${S2}
870 + umlal2 $ACC2,$IN23_4,${S3}
874 + ////////////////////////////////////////////////////////////////
875 + // (hash+inp[0:1])*r^4:r^3 and accumulate
877 + add $IN01_0,$IN01_0,$H0
878 + umlal $ACC3,$IN01_2,${R1}
879 + umlal $ACC0,$IN01_2,${S3}
880 + umlal $ACC4,$IN01_2,${R2}
881 + umlal $ACC1,$IN01_2,${S4}
882 + umlal $ACC2,$IN01_2,${R0}
884 + add $IN01_1,$IN01_1,$H1
885 + umlal $ACC3,$IN01_0,${R3}
886 + umlal $ACC0,$IN01_0,${R0}
887 + umlal $ACC4,$IN01_0,${R4}
888 + umlal $ACC1,$IN01_0,${R1}
889 + umlal $ACC2,$IN01_0,${R2}
891 + add $IN01_3,$IN01_3,$H3
892 + umlal $ACC3,$IN01_1,${R2}
893 + umlal $ACC0,$IN01_1,${S4}
894 + umlal $ACC4,$IN01_1,${R3}
895 + umlal $ACC1,$IN01_1,${R0}
896 + umlal $ACC2,$IN01_1,${R1}
898 + add $IN01_4,$IN01_4,$H4
899 + umlal $ACC3,$IN01_3,${R0}
900 + umlal $ACC0,$IN01_3,${S2}
901 + umlal $ACC4,$IN01_3,${R1}
902 + umlal $ACC1,$IN01_3,${S3}
903 + umlal $ACC2,$IN01_3,${S4}
905 + umlal $ACC3,$IN01_4,${S4}
906 + umlal $ACC0,$IN01_4,${S1}
907 + umlal $ACC4,$IN01_4,${R0}
908 + umlal $ACC1,$IN01_4,${S2}
909 + umlal $ACC2,$IN01_4,${S3}
912 + ////////////////////////////////////////////////////////////////
915 + addp $ACC3,$ACC3,$ACC3
916 + ldp d8,d9,[sp,#16] // meet ABI requirements
917 + addp $ACC0,$ACC0,$ACC0
918 + ldp d10,d11,[sp,#32]
919 + addp $ACC4,$ACC4,$ACC4
920 + ldp d12,d13,[sp,#48]
921 + addp $ACC1,$ACC1,$ACC1
922 + ldp d14,d15,[sp,#64]
923 + addp $ACC2,$ACC2,$ACC2
925 + .inst 0xd50323bf // autiasp
927 + ////////////////////////////////////////////////////////////////
928 + // lazy reduction, but without narrowing
930 + ushr $T0.2d,$ACC3,#26
931 + and $ACC3,$ACC3,$MASK.2d
932 + ushr $T1.2d,$ACC0,#26
933 + and $ACC0,$ACC0,$MASK.2d
935 + add $ACC4,$ACC4,$T0.2d // h3 -> h4
936 + add $ACC1,$ACC1,$T1.2d // h0 -> h1
938 + ushr $T0.2d,$ACC4,#26
939 + and $ACC4,$ACC4,$MASK.2d
940 + ushr $T1.2d,$ACC1,#26
941 + and $ACC1,$ACC1,$MASK.2d
942 + add $ACC2,$ACC2,$T1.2d // h1 -> h2
944 + add $ACC0,$ACC0,$T0.2d
945 + shl $T0.2d,$T0.2d,#2
946 + ushr $T1.2d,$ACC2,#26
947 + and $ACC2,$ACC2,$MASK.2d
948 + add $ACC0,$ACC0,$T0.2d // h4 -> h0
949 + add $ACC3,$ACC3,$T1.2d // h2 -> h3
951 + ushr $T0.2d,$ACC0,#26
952 + and $ACC0,$ACC0,$MASK.2d
953 + ushr $T1.2d,$ACC3,#26
954 + and $ACC3,$ACC3,$MASK.2d
955 + add $ACC1,$ACC1,$T0.2d // h0 -> h1
956 + add $ACC4,$ACC4,$T1.2d // h3 -> h4
958 + ////////////////////////////////////////////////////////////////
959 + // write the result, can be partially reduced
961 + st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
963 + st1 {$ACC4}[0],[$ctx]
964 + str x4,[$ctx,#8] // set is_base2_26
968 +.size poly1305_blocks_neon,.-poly1305_blocks_neon
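Throughout the NEON path the accumulator lives in five 26-bit limbs spread across vector lanes, and the "lazy reduction" passes above only knock each limb back to roughly 26 bits rather than producing a canonical value; that is enough to keep the 64-bit lane products of the next iteration from overflowing. A hedged scalar rendering of the carry chain marked "lazy reduction, but without narrowing", with illustrative names:

#include <stdint.h>

static void lazy_reduce_sketch(uint64_t h[5])
{
        uint64_t c;

        c = h[3] >> 26;  h[3] &= 0x03ffffff;  h[4] += c;        /* h3 -> h4 */
        c = h[0] >> 26;  h[0] &= 0x03ffffff;  h[1] += c;        /* h0 -> h1 */
        c = h[4] >> 26;  h[4] &= 0x03ffffff;  h[0] += c * 5;    /* h4 -> h0: 2^130 == 5 (mod p) */
        c = h[1] >> 26;  h[1] &= 0x03ffffff;  h[2] += c;        /* h1 -> h2 */
        c = h[2] >> 26;  h[2] &= 0x03ffffff;  h[3] += c;        /* h2 -> h3 */
        c = h[0] >> 26;  h[0] &= 0x03ffffff;  h[1] += c;        /* h0 -> h1 */
        c = h[3] >> 26;  h[3] &= 0x03ffffff;  h[4] += c;        /* h3 -> h4 */
}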
972 +.long 0,0,0,0,0,0,0,0
973 +.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
975 +#if !defined(__KERNEL__) && !defined(_WIN64)
976 +.comm OPENSSL_armcap_P,4,4
977 +.hidden OPENSSL_armcap_P
981 +foreach (split("\n",$code)) {
982 + s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
983 + s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
984 + (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
985 + (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
986 + (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
987 + (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
988 + (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
990 + s/\.[124]([sd])\[/.$1\[/;
991 + s/w#x([0-9]+)/w$1/g;
997 +++ b/arch/arm64/crypto/poly1305-core.S_shipped
1000 +# include "arm_arch.h"
1001 +.extern OPENSSL_armcap_P
1006 +// forward "declarations" are required for Apple
1007 +.globl poly1305_blocks
1008 +.globl poly1305_emit
1010 +.globl poly1305_init
1011 +.type poly1305_init,%function
1015 + stp xzr,xzr,[x0] // zero hash value
1016 + stp xzr,xzr,[x0,#16] // [along with is_base2_26]
1022 + adrp x17,OPENSSL_armcap_P
1023 + ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
1026 + ldp x7,x8,[x1] // load key
1027 + mov x9,#0xfffffffc0fffffff
1028 + movk x9,#0x0fff,lsl#48
1029 +#ifdef __AARCH64EB__
1030 + rev x7,x7 // flip bytes
1033 + and x7,x7,x9 // &=0ffffffc0fffffff
1035 + and x8,x8,x9 // &=0ffffffc0ffffffc
1037 + stp x7,x8,[x0,#32] // save key value
1038 + str w9,[x0,#48] // impossible key power value
1041 + tst w17,#ARMV7_NEON
1043 + adr x12,.Lpoly1305_blocks
1044 + adr x7,.Lpoly1305_blocks_neon
1045 + adr x13,.Lpoly1305_emit
1047 + csel x12,x12,x7,eq
1058 +.size poly1305_init,.-poly1305_init
1060 +.type poly1305_blocks,%function
1067 + ldp x4,x5,[x0] // load hash value
1068 + ldp x6,x17,[x0,#16] // [along with is_base2_26]
1069 + ldp x7,x8,[x0,#32] // load key value
1071 +#ifdef __AARCH64EB__
1085 + add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
1087 + adds x12,x12,x14,lsl#52
1088 + add x13,x13,x15,lsl#14
1091 + adds x13,x13,x16,lsl#40
1094 + cmp x17,#0 // is_base2_26?
1095 + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
1096 + csel x4,x4,x12,eq // choose between radixes
1101 + ldp x10,x11,[x1],#16 // load input
1103 +#ifdef __AARCH64EB__
1107 + adds x4,x4,x10 // accumulate input
1110 + mul x12,x4,x7 // h0*r0
1114 + mul x10,x5,x9 // h1*5*r1
1118 + mul x10,x4,x8 // h0*r1
1123 + mul x10,x5,x7 // h1*r0
1128 + mul x10,x6,x9 // h2*5*r1
1130 + mul x11,x6,x7 // h2*r0
1135 + and x10,x14,#-4 // final reduction
1137 + add x10,x10,x14,lsr#2
1144 + stp x4,x5,[x0] // store hash value
1145 + stp x6,xzr,[x0,#16] // [and clear is_base2_26]
1149 +.size poly1305_blocks,.-poly1305_blocks
1151 +.type poly1305_emit,%function
1155 + ldp x4,x5,[x0] // load hash base 2^64
1156 + ldp x6,x7,[x0,#16] // [along with is_base2_26]
1157 + ldp x10,x11,[x2] // load nonce
1159 +#ifdef __AARCH64EB__
1173 + add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
1175 + adds x12,x12,x14,lsl#52
1176 + add x13,x13,x15,lsl#14
1179 + adds x13,x13,x16,lsl#40
1182 + cmp x7,#0 // is_base2_26?
1183 + csel x4,x4,x12,eq // choose between radixes
1187 + adds x12,x4,#5 // compare to modulus
1191 + tst x14,#-4 // see if it's carried/borrowed
1196 +#ifdef __AARCH64EB__
1197 + ror x10,x10,#32 // flip nonce words
1200 + adds x4,x4,x10 // accumulate nonce
1202 +#ifdef __AARCH64EB__
1203 + rev x4,x4 // flip output bytes
1206 + stp x4,x5,[x1] // write result
1209 +.size poly1305_emit,.-poly1305_emit
1210 +.type poly1305_mult,%function
1213 + mul x12,x4,x7 // h0*r0
1216 + mul x10,x5,x9 // h1*5*r1
1220 + mul x10,x4,x8 // h0*r1
1225 + mul x10,x5,x7 // h1*r0
1230 + mul x10,x6,x9 // h2*5*r1
1232 + mul x11,x6,x7 // h2*r0
1237 + and x10,x14,#-4 // final reduction
1239 + add x10,x10,x14,lsr#2
1245 +.size poly1305_mult,.-poly1305_mult
1247 +.type poly1305_splat,%function
1250 + and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
1251 + ubfx x13,x4,#26,#26
1252 + extr x14,x5,x4,#52
1253 + and x14,x14,#0x03ffffff
1254 + ubfx x15,x5,#14,#26
1255 + extr x16,x6,x5,#40
1257 + str w12,[x0,#16*0] // r0
1258 + add w12,w13,w13,lsl#2 // r1*5
1259 + str w13,[x0,#16*1] // r1
1260 + add w13,w14,w14,lsl#2 // r2*5
1261 + str w12,[x0,#16*2] // s1
1262 + str w14,[x0,#16*3] // r2
1263 + add w14,w15,w15,lsl#2 // r3*5
1264 + str w13,[x0,#16*4] // s2
1265 + str w15,[x0,#16*5] // r3
1266 + add w15,w16,w16,lsl#2 // r4*5
1267 + str w14,[x0,#16*6] // s3
1268 + str w16,[x0,#16*7] // r4
1269 + str w15,[x0,#16*8] // s4
1272 +.size poly1305_splat,.-poly1305_splat
1275 +.globl poly1305_blocks_neon
1277 +.type poly1305_blocks_neon,%function
1279 +poly1305_blocks_neon:
1280 +.Lpoly1305_blocks_neon:
1283 + b.lo .Lpoly1305_blocks
1285 + .inst 0xd503233f // paciasp
1286 + stp x29,x30,[sp,#-80]!
1289 + stp d8,d9,[sp,#16] // meet ABI requirements
1290 + stp d10,d11,[sp,#32]
1291 + stp d12,d13,[sp,#48]
1292 + stp d14,d15,[sp,#64]
1294 + cbz x17,.Lbase2_64_neon
1296 + ldp w10,w11,[x0] // load hash value base 2^26
1297 + ldp w12,w13,[x0,#8]
1303 + ldp x7,x8,[x0,#32] // load key value
1305 + add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
1307 + adds x4,x4,x12,lsl#52
1308 + add x5,x5,x13,lsl#14
1311 + adds x5,x5,x14,lsl#40
1312 + adc x14,x6,xzr // can be partially reduced...
1314 + ldp x12,x13,[x1],#16 // load input
1316 + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
1318 +#ifdef __AARCH64EB__
1322 + adds x4,x4,x12 // accumulate input
1328 + and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
1329 + ubfx x11,x4,#26,#26
1330 + extr x12,x5,x4,#52
1331 + and x12,x12,#0x03ffffff
1332 + ubfx x13,x5,#14,#26
1333 + extr x14,x6,x5,#40
1339 + ldp x7,x8,[x0,#32] // load key value
1341 + ldp x4,x5,[x0] // load hash value base 2^64
1347 + ldp x12,x13,[x1],#16 // load input
1349 + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
1350 +#ifdef __AARCH64EB__
1354 + adds x4,x4,x12 // accumulate input
1361 + ldr w17,[x0,#48] // first table element
1362 + and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
1363 + ubfx x11,x4,#26,#26
1364 + extr x12,x5,x4,#52
1365 + and x12,x12,#0x03ffffff
1366 + ubfx x13,x5,#14,#26
1367 + extr x14,x6,x5,#40
1369 + cmp w17,#-1 // is value impossible?
1378 + ////////////////////////////////// initialize r^n table
1380 + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
1386 + bl poly1305_mult // r^2
1390 + bl poly1305_mult // r^3
1394 + bl poly1305_mult // r^4
1397 + sub x0,x0,#48 // restore original x0
1409 + ldp x8,x12,[x1,#32] // inp[2:3]
1411 + ldp x9,x13,[x1,#48]
1418 +#ifdef __AARCH64EB__
1424 + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
1425 + and x5,x9,#0x03ffffff
1426 + ubfx x6,x8,#26,#26
1427 + ubfx x7,x9,#26,#26
1428 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
1429 + extr x8,x12,x8,#52
1430 + extr x9,x13,x9,#52
1431 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
1433 + and x8,x8,#0x03ffffff
1434 + and x9,x9,#0x03ffffff
1435 + ubfx x10,x12,#14,#26
1436 + ubfx x11,x13,#14,#26
1437 + add x12,x3,x12,lsr#40
1438 + add x13,x3,x13,lsr#40
1439 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
1441 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
1442 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
1447 + ldp x8,x12,[x1],#16 // inp[0:1]
1448 + ldp x9,x13,[x1],#48
1450 + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
1451 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
1454 +#ifdef __AARCH64EB__
1460 + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
1461 + and x5,x9,#0x03ffffff
1462 + ubfx x6,x8,#26,#26
1463 + ubfx x7,x9,#26,#26
1464 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
1465 + extr x8,x12,x8,#52
1466 + extr x9,x13,x9,#52
1467 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
1469 + and x8,x8,#0x03ffffff
1470 + and x9,x9,#0x03ffffff
1471 + ubfx x10,x12,#14,#26
1472 + ubfx x11,x13,#14,#26
1473 + add x12,x3,x12,lsr#40
1474 + add x13,x3,x13,lsr#40
1475 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
1477 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
1478 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
1483 + ushr v31.2d,v31.2d,#38
1489 + ////////////////////////////////////////////////////////////////
1490 + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
1491 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
1492 + // ___________________/
1493 + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
1494 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
1495 + // ___________________/ ____________________/
1497 + // Note that we start with inp[2:3]*r^2. This is because it
1498 + // doesn't depend on reduction in previous iteration.
1499 + ////////////////////////////////////////////////////////////////
1500 + // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
1501 + // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
1502 + // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
1503 + // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
1504 + // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
1507 + umull v23.2d,v14.2s,v7.s[2]
1508 + csel x16,x17,x16,lo
1509 + umull v22.2d,v14.2s,v5.s[2]
1510 + umull v21.2d,v14.2s,v3.s[2]
1511 + ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
1512 + umull v20.2d,v14.2s,v1.s[2]
1513 + ldp x9,x13,[x16],#48
1514 + umull v19.2d,v14.2s,v0.s[2]
1515 +#ifdef __AARCH64EB__
1522 + umlal v23.2d,v15.2s,v5.s[2]
1523 + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
1524 + umlal v22.2d,v15.2s,v3.s[2]
1525 + and x5,x9,#0x03ffffff
1526 + umlal v21.2d,v15.2s,v1.s[2]
1527 + ubfx x6,x8,#26,#26
1528 + umlal v20.2d,v15.2s,v0.s[2]
1529 + ubfx x7,x9,#26,#26
1530 + umlal v19.2d,v15.2s,v8.s[2]
1531 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
1533 + umlal v23.2d,v16.2s,v3.s[2]
1534 + extr x8,x12,x8,#52
1535 + umlal v22.2d,v16.2s,v1.s[2]
1536 + extr x9,x13,x9,#52
1537 + umlal v21.2d,v16.2s,v0.s[2]
1538 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
1539 + umlal v20.2d,v16.2s,v8.s[2]
1541 + umlal v19.2d,v16.2s,v6.s[2]
1542 + and x8,x8,#0x03ffffff
1544 + umlal v23.2d,v17.2s,v1.s[2]
1545 + and x9,x9,#0x03ffffff
1546 + umlal v22.2d,v17.2s,v0.s[2]
1547 + ubfx x10,x12,#14,#26
1548 + umlal v21.2d,v17.2s,v8.s[2]
1549 + ubfx x11,x13,#14,#26
1550 + umlal v20.2d,v17.2s,v6.s[2]
1551 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
1552 + umlal v19.2d,v17.2s,v4.s[2]
1555 + add v11.2s,v11.2s,v26.2s
1556 + add x12,x3,x12,lsr#40
1557 + umlal v23.2d,v18.2s,v0.s[2]
1558 + add x13,x3,x13,lsr#40
1559 + umlal v22.2d,v18.2s,v8.s[2]
1560 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
1561 + umlal v21.2d,v18.2s,v6.s[2]
1562 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
1563 + umlal v20.2d,v18.2s,v4.s[2]
1565 + umlal v19.2d,v18.2s,v2.s[2]
1568 + ////////////////////////////////////////////////////////////////
1569 + // (hash+inp[0:1])*r^4 and accumulate
1571 + add v9.2s,v9.2s,v24.2s
1573 + umlal v22.2d,v11.2s,v1.s[0]
1574 + ldp x8,x12,[x1],#16 // inp[0:1]
1575 + umlal v19.2d,v11.2s,v6.s[0]
1576 + ldp x9,x13,[x1],#48
1577 + umlal v23.2d,v11.2s,v3.s[0]
1578 + umlal v20.2d,v11.2s,v8.s[0]
1579 + umlal v21.2d,v11.2s,v0.s[0]
1580 +#ifdef __AARCH64EB__
1587 + add v10.2s,v10.2s,v25.2s
1588 + umlal v22.2d,v9.2s,v5.s[0]
1589 + umlal v23.2d,v9.2s,v7.s[0]
1590 + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
1591 + umlal v21.2d,v9.2s,v3.s[0]
1592 + and x5,x9,#0x03ffffff
1593 + umlal v19.2d,v9.2s,v0.s[0]
1594 + ubfx x6,x8,#26,#26
1595 + umlal v20.2d,v9.2s,v1.s[0]
1596 + ubfx x7,x9,#26,#26
1598 + add v12.2s,v12.2s,v27.2s
1599 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
1600 + umlal v22.2d,v10.2s,v3.s[0]
1601 + extr x8,x12,x8,#52
1602 + umlal v23.2d,v10.2s,v5.s[0]
1603 + extr x9,x13,x9,#52
1604 + umlal v19.2d,v10.2s,v8.s[0]
1605 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
1606 + umlal v21.2d,v10.2s,v1.s[0]
1608 + umlal v20.2d,v10.2s,v0.s[0]
1609 + and x8,x8,#0x03ffffff
1611 + add v13.2s,v13.2s,v28.2s
1612 + and x9,x9,#0x03ffffff
1613 + umlal v22.2d,v12.2s,v0.s[0]
1614 + ubfx x10,x12,#14,#26
1615 + umlal v19.2d,v12.2s,v4.s[0]
1616 + ubfx x11,x13,#14,#26
1617 + umlal v23.2d,v12.2s,v1.s[0]
1618 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
1619 + umlal v20.2d,v12.2s,v6.s[0]
1621 + umlal v21.2d,v12.2s,v8.s[0]
1622 + add x12,x3,x12,lsr#40
1624 + umlal v22.2d,v13.2s,v8.s[0]
1625 + add x13,x3,x13,lsr#40
1626 + umlal v19.2d,v13.2s,v2.s[0]
1627 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
1628 + umlal v23.2d,v13.2s,v0.s[0]
1629 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
1630 + umlal v20.2d,v13.2s,v4.s[0]
1632 + umlal v21.2d,v13.2s,v6.s[0]
1636 + /////////////////////////////////////////////////////////////////
1637 + // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1640 + // [see discussion in poly1305-armv4 module]
1642 + ushr v29.2d,v22.2d,#26
1644 + ushr v30.2d,v19.2d,#26
1645 + and v19.16b,v19.16b,v31.16b
1646 + add v23.2d,v23.2d,v29.2d // h3 -> h4
1647 + bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
1648 + add v20.2d,v20.2d,v30.2d // h0 -> h1
1650 + ushr v29.2d,v23.2d,#26
1652 + ushr v30.2d,v20.2d,#26
1654 + bic v28.2s,#0xfc,lsl#24
1655 + add v21.2d,v21.2d,v30.2d // h1 -> h2
1657 + add v19.2d,v19.2d,v29.2d
1658 + shl v29.2d,v29.2d,#2
1659 + shrn v30.2s,v21.2d,#26
1661 + add v19.2d,v19.2d,v29.2d // h4 -> h0
1662 + bic v25.2s,#0xfc,lsl#24
1663 + add v27.2s,v27.2s,v30.2s // h2 -> h3
1664 + bic v26.2s,#0xfc,lsl#24
1666 + shrn v29.2s,v19.2d,#26
1668 + ushr v30.2s,v27.2s,#26
1669 + bic v27.2s,#0xfc,lsl#24
1670 + bic v24.2s,#0xfc,lsl#24
1671 + add v25.2s,v25.2s,v29.2s // h0 -> h1
1672 + add v28.2s,v28.2s,v30.2s // h3 -> h4
1677 + dup v16.2d,v16.d[0]
1678 + add v11.2s,v11.2s,v26.2s
1680 + ////////////////////////////////////////////////////////////////
1681 + // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1686 + dup v16.2d,v11.d[0]
1687 + add v14.2s,v9.2s,v24.2s
1688 + add v17.2s,v12.2s,v27.2s
1689 + add v15.2s,v10.2s,v25.2s
1690 + add v18.2s,v13.2s,v28.2s
1693 + dup v14.2d,v14.d[0]
1694 + umull2 v19.2d,v16.4s,v6.4s
1695 + umull2 v22.2d,v16.4s,v1.4s
1696 + umull2 v23.2d,v16.4s,v3.4s
1697 + umull2 v21.2d,v16.4s,v0.4s
1698 + umull2 v20.2d,v16.4s,v8.4s
1700 + dup v15.2d,v15.d[0]
1701 + umlal2 v19.2d,v14.4s,v0.4s
1702 + umlal2 v21.2d,v14.4s,v3.4s
1703 + umlal2 v22.2d,v14.4s,v5.4s
1704 + umlal2 v23.2d,v14.4s,v7.4s
1705 + umlal2 v20.2d,v14.4s,v1.4s
1707 + dup v17.2d,v17.d[0]
1708 + umlal2 v19.2d,v15.4s,v8.4s
1709 + umlal2 v22.2d,v15.4s,v3.4s
1710 + umlal2 v21.2d,v15.4s,v1.4s
1711 + umlal2 v23.2d,v15.4s,v5.4s
1712 + umlal2 v20.2d,v15.4s,v0.4s
1714 + dup v18.2d,v18.d[0]
1715 + umlal2 v22.2d,v17.4s,v0.4s
1716 + umlal2 v23.2d,v17.4s,v1.4s
1717 + umlal2 v19.2d,v17.4s,v4.4s
1718 + umlal2 v20.2d,v17.4s,v6.4s
1719 + umlal2 v21.2d,v17.4s,v8.4s
1721 + umlal2 v22.2d,v18.4s,v8.4s
1722 + umlal2 v19.2d,v18.4s,v2.4s
1723 + umlal2 v23.2d,v18.4s,v0.4s
1724 + umlal2 v20.2d,v18.4s,v4.4s
1725 + umlal2 v21.2d,v18.4s,v6.4s
1729 + ////////////////////////////////////////////////////////////////
1730 + // (hash+inp[0:1])*r^4:r^3 and accumulate
1732 + add v9.2s,v9.2s,v24.2s
1733 + umlal v22.2d,v11.2s,v1.2s
1734 + umlal v19.2d,v11.2s,v6.2s
1735 + umlal v23.2d,v11.2s,v3.2s
1736 + umlal v20.2d,v11.2s,v8.2s
1737 + umlal v21.2d,v11.2s,v0.2s
1739 + add v10.2s,v10.2s,v25.2s
1740 + umlal v22.2d,v9.2s,v5.2s
1741 + umlal v19.2d,v9.2s,v0.2s
1742 + umlal v23.2d,v9.2s,v7.2s
1743 + umlal v20.2d,v9.2s,v1.2s
1744 + umlal v21.2d,v9.2s,v3.2s
1746 + add v12.2s,v12.2s,v27.2s
1747 + umlal v22.2d,v10.2s,v3.2s
1748 + umlal v19.2d,v10.2s,v8.2s
1749 + umlal v23.2d,v10.2s,v5.2s
1750 + umlal v20.2d,v10.2s,v0.2s
1751 + umlal v21.2d,v10.2s,v1.2s
1753 + add v13.2s,v13.2s,v28.2s
1754 + umlal v22.2d,v12.2s,v0.2s
1755 + umlal v19.2d,v12.2s,v4.2s
1756 + umlal v23.2d,v12.2s,v1.2s
1757 + umlal v20.2d,v12.2s,v6.2s
1758 + umlal v21.2d,v12.2s,v8.2s
1760 + umlal v22.2d,v13.2s,v8.2s
1761 + umlal v19.2d,v13.2s,v2.2s
1762 + umlal v23.2d,v13.2s,v0.2s
1763 + umlal v20.2d,v13.2s,v4.2s
1764 + umlal v21.2d,v13.2s,v6.2s
1767 + ////////////////////////////////////////////////////////////////
1770 + addp v22.2d,v22.2d,v22.2d
1771 + ldp d8,d9,[sp,#16] // meet ABI requirements
1772 + addp v19.2d,v19.2d,v19.2d
1773 + ldp d10,d11,[sp,#32]
1774 + addp v23.2d,v23.2d,v23.2d
1775 + ldp d12,d13,[sp,#48]
1776 + addp v20.2d,v20.2d,v20.2d
1777 + ldp d14,d15,[sp,#64]
1778 + addp v21.2d,v21.2d,v21.2d
1780 + .inst 0xd50323bf // autiasp
1782 + ////////////////////////////////////////////////////////////////
1783 + // lazy reduction, but without narrowing
1785 + ushr v29.2d,v22.2d,#26
1786 + and v22.16b,v22.16b,v31.16b
1787 + ushr v30.2d,v19.2d,#26
1788 + and v19.16b,v19.16b,v31.16b
1790 + add v23.2d,v23.2d,v29.2d // h3 -> h4
1791 + add v20.2d,v20.2d,v30.2d // h0 -> h1
1793 + ushr v29.2d,v23.2d,#26
1794 + and v23.16b,v23.16b,v31.16b
1795 + ushr v30.2d,v20.2d,#26
1796 + and v20.16b,v20.16b,v31.16b
1797 + add v21.2d,v21.2d,v30.2d // h1 -> h2
1799 + add v19.2d,v19.2d,v29.2d
1800 + shl v29.2d,v29.2d,#2
1801 + ushr v30.2d,v21.2d,#26
1802 + and v21.16b,v21.16b,v31.16b
1803 + add v19.2d,v19.2d,v29.2d // h4 -> h0
1804 + add v22.2d,v22.2d,v30.2d // h2 -> h3
1806 + ushr v29.2d,v19.2d,#26
1807 + and v19.16b,v19.16b,v31.16b
1808 + ushr v30.2d,v22.2d,#26
1809 + and v22.16b,v22.16b,v31.16b
1810 + add v20.2d,v20.2d,v29.2d // h0 -> h1
1811 + add v23.2d,v23.2d,v30.2d // h3 -> h4
1813 + ////////////////////////////////////////////////////////////////
1814 + // write the result, can be partially reduced
1816 + st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
1818 + st1 {v23.s}[0],[x0]
1819 + str x4,[x0,#8] // set is_base2_26
1823 +.size poly1305_blocks_neon,.-poly1305_blocks_neon
1827 +.long 0,0,0,0,0,0,0,0
1828 +.asciz "Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
1830 +#if !defined(__KERNEL__) && !defined(_WIN64)
1831 +.comm OPENSSL_armcap_P,4,4
1832 +.hidden OPENSSL_armcap_P
1835 +++ b/arch/arm64/crypto/poly1305-glue.c
1837 +// SPDX-License-Identifier: GPL-2.0
1839 + * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
1841 + * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
1844 +#include <asm/hwcap.h>
1845 +#include <asm/neon.h>
1846 +#include <asm/simd.h>
1847 +#include <asm/unaligned.h>
1848 +#include <crypto/algapi.h>
1849 +#include <crypto/internal/hash.h>
1850 +#include <crypto/internal/poly1305.h>
1851 +#include <crypto/internal/simd.h>
1852 +#include <linux/cpufeature.h>
1853 +#include <linux/crypto.h>
1854 +#include <linux/jump_label.h>
1855 +#include <linux/module.h>
1857 +asmlinkage void poly1305_init_arm64(void *state, const u8 *key);
1858 +asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
1859 +asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
1860 +asmlinkage void poly1305_emit(void *state, __le32 *digest, const u32 *nonce);
1862 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
1864 +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
1866 + poly1305_init_arm64(&dctx->h, key);
1867 + dctx->s[0] = get_unaligned_le32(key + 16);
1868 + dctx->s[1] = get_unaligned_le32(key + 20);
1869 + dctx->s[2] = get_unaligned_le32(key + 24);
1870 + dctx->s[3] = get_unaligned_le32(key + 28);
1873 +EXPORT_SYMBOL(poly1305_init_arch);
1875 +static int neon_poly1305_init(struct shash_desc *desc)
1877 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
1881 + dctx->sset = false;
1886 +static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
1887 + u32 len, u32 hibit, bool do_neon)
1889 + if (unlikely(!dctx->sset)) {
1890 + if (!dctx->rset) {
1891 + poly1305_init_arch(dctx, src);
1892 + src += POLY1305_BLOCK_SIZE;
1893 + len -= POLY1305_BLOCK_SIZE;
1896 + if (len >= POLY1305_BLOCK_SIZE) {
1897 + dctx->s[0] = get_unaligned_le32(src + 0);
1898 + dctx->s[1] = get_unaligned_le32(src + 4);
1899 + dctx->s[2] = get_unaligned_le32(src + 8);
1900 + dctx->s[3] = get_unaligned_le32(src + 12);
1901 + src += POLY1305_BLOCK_SIZE;
1902 + len -= POLY1305_BLOCK_SIZE;
1903 + dctx->sset = true;
1905 + if (len < POLY1305_BLOCK_SIZE)
1909 + len &= ~(POLY1305_BLOCK_SIZE - 1);
1911 + if (static_branch_likely(&have_neon) && likely(do_neon))
1912 + poly1305_blocks_neon(&dctx->h, src, len, hibit);
1913 + else
1914 + poly1305_blocks(&dctx->h, src, len, hibit);
1917 +static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx,
1918 + const u8 *src, u32 len, bool do_neon)
1920 + if (unlikely(dctx->buflen)) {
1921 + u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
1923 + memcpy(dctx->buf + dctx->buflen, src, bytes);
1926 + dctx->buflen += bytes;
1928 + if (dctx->buflen == POLY1305_BLOCK_SIZE) {
1929 + neon_poly1305_blocks(dctx, dctx->buf,
1930 + POLY1305_BLOCK_SIZE, 1, false);
1935 + if (likely(len >= POLY1305_BLOCK_SIZE)) {
1936 + neon_poly1305_blocks(dctx, src, len, 1, do_neon);
1937 + src += round_down(len, POLY1305_BLOCK_SIZE);
1938 + len %= POLY1305_BLOCK_SIZE;
1941 + if (unlikely(len)) {
1942 + dctx->buflen = len;
1943 + memcpy(dctx->buf, src, len);
1947 +static int neon_poly1305_update(struct shash_desc *desc,
1948 + const u8 *src, unsigned int srclen)
1950 + bool do_neon = crypto_simd_usable() && srclen > 128;
1951 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
1953 + if (static_branch_likely(&have_neon) && do_neon)
1954 + kernel_neon_begin();
1955 + neon_poly1305_do_update(dctx, src, srclen, do_neon);
1956 + if (static_branch_likely(&have_neon) && do_neon)
1957 + kernel_neon_end();
1961 +void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
1962 + unsigned int nbytes)
1964 + if (unlikely(dctx->buflen)) {
1965 + u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
1967 + memcpy(dctx->buf + dctx->buflen, src, bytes);
1970 + dctx->buflen += bytes;
1972 + if (dctx->buflen == POLY1305_BLOCK_SIZE) {
1973 + poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
1978 + if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
1979 + unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
1981 + if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
1982 + kernel_neon_begin();
1983 + poly1305_blocks_neon(&dctx->h, src, len, 1);
1984 + kernel_neon_end();
1985 + } else {
1986 + poly1305_blocks(&dctx->h, src, len, 1);
1987 + }
1988 + src += len;
1989 + nbytes %= POLY1305_BLOCK_SIZE;
1992 + if (unlikely(nbytes)) {
1993 + dctx->buflen = nbytes;
1994 + memcpy(dctx->buf, src, nbytes);
1997 +EXPORT_SYMBOL(poly1305_update_arch);
1999 +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
2004 + if (unlikely(dctx->buflen)) {
2005 + dctx->buf[dctx->buflen++] = 1;
2006 + memset(dctx->buf + dctx->buflen, 0,
2007 + POLY1305_BLOCK_SIZE - dctx->buflen);
2008 + poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
2011 + poly1305_emit(&dctx->h, digest, dctx->s);
2013 + /* mac = (h + s) % (2^128) */
2014 + f = (f >> 32) + le32_to_cpu(digest[0]);
2015 + put_unaligned_le32(f, dst);
2016 + f = (f >> 32) + le32_to_cpu(digest[1]);
2017 + put_unaligned_le32(f, dst + 4);
2018 + f = (f >> 32) + le32_to_cpu(digest[2]);
2019 + put_unaligned_le32(f, dst + 8);
2020 + f = (f >> 32) + le32_to_cpu(digest[3]);
2021 + put_unaligned_le32(f, dst + 12);
2023 + *dctx = (struct poly1305_desc_ctx){};
2025 +EXPORT_SYMBOL(poly1305_final_arch);
2027 +static int neon_poly1305_final(struct shash_desc *desc, u8 *dst)
2029 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2031 + if (unlikely(!dctx->sset))
2034 + poly1305_final_arch(dctx, dst);
2038 +static struct shash_alg neon_poly1305_alg = {
2039 + .init = neon_poly1305_init,
2040 + .update = neon_poly1305_update,
2041 + .final = neon_poly1305_final,
2042 + .digestsize = POLY1305_DIGEST_SIZE,
2043 + .descsize = sizeof(struct poly1305_desc_ctx),
2045 + .base.cra_name = "poly1305",
2046 + .base.cra_driver_name = "poly1305-neon",
2047 + .base.cra_priority = 200,
2048 + .base.cra_blocksize = POLY1305_BLOCK_SIZE,
2049 + .base.cra_module = THIS_MODULE,
2052 +static int __init neon_poly1305_mod_init(void)
2054 + if (!cpu_have_named_feature(ASIMD))
2057 + static_branch_enable(&have_neon);
2059 + return crypto_register_shash(&neon_poly1305_alg);
2062 +static void __exit neon_poly1305_mod_exit(void)
2064 + if (cpu_have_named_feature(ASIMD))
2065 + crypto_unregister_shash(&neon_poly1305_alg);
2068 +module_init(neon_poly1305_mod_init);
2069 +module_exit(neon_poly1305_mod_exit);
2071 +MODULE_LICENSE("GPL v2");
2072 +MODULE_ALIAS_CRYPTO("poly1305");
2073 +MODULE_ALIAS_CRYPTO("poly1305-neon");
2074 --- a/lib/crypto/Kconfig
2075 +++ b/lib/crypto/Kconfig
2076 @@ -40,6 +40,7 @@ config CRYPTO_LIB_DES
2077 config CRYPTO_LIB_POLY1305_RSIZE
2080 + default 9 if ARM64
2083 config CRYPTO_ARCH_HAVE_LIB_POLY1305