1 From 588765ccad76f9f65f09e1dcadc464d22441c889 Mon Sep 17 00:00:00 2001
2 From: Ard Biesheuvel <ardb@kernel.org>
3 Date: Fri, 8 Nov 2019 13:22:25 +0100
4 Subject: [PATCH 019/124] crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS
7 commit a6b803b3ddc793d6db0c16f12fc12d30d20fa9cc upstream.
9 This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation
10 for NEON authored by Andy Polyakov, and contributed by him to the OpenSSL
11 project. The file 'poly1305-armv4.pl' is taken straight from this upstream
12 GitHub repository [0] at commit ec55a08dc0244ce570c4fc7cade330c60798952f,
13 and already contains all the changes required to build it as part of a
16 [0] https://github.com/dot-asm/cryptogams
18 Co-developed-by: Andy Polyakov <appro@cryptogams.org>
19 Signed-off-by: Andy Polyakov <appro@cryptogams.org>
20 Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
21 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
22 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
24 arch/arm/crypto/Kconfig | 5 +
25 arch/arm/crypto/Makefile | 12 +-
26 arch/arm/crypto/poly1305-armv4.pl | 1236 +++++++++++++++++++++++
27 arch/arm/crypto/poly1305-core.S_shipped | 1158 +++++++++++++++++++++
28 arch/arm/crypto/poly1305-glue.c | 276 +++++
29 lib/crypto/Kconfig | 2 +-
30 6 files changed, 2687 insertions(+), 2 deletions(-)
31 create mode 100644 arch/arm/crypto/poly1305-armv4.pl
32 create mode 100644 arch/arm/crypto/poly1305-core.S_shipped
33 create mode 100644 arch/arm/crypto/poly1305-glue.c
35 --- a/arch/arm/crypto/Kconfig
36 +++ b/arch/arm/crypto/Kconfig
37 @@ -131,6 +131,11 @@ config CRYPTO_CHACHA20_NEON
38 select CRYPTO_BLKCIPHER
39 select CRYPTO_ARCH_HAVE_LIB_CHACHA
41 +config CRYPTO_POLY1305_ARM
42 + tristate "Accelerated scalar and SIMD Poly1305 hash implementations"
44 + select CRYPTO_ARCH_HAVE_LIB_POLY1305
46 config CRYPTO_NHPOLY1305_NEON
47 tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
48 depends on KERNEL_MODE_NEON
49 --- a/arch/arm/crypto/Makefile
50 +++ b/arch/arm/crypto/Makefile
51 @@ -10,6 +10,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sh
52 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
53 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
54 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
55 +obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
56 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
58 ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
59 @@ -55,12 +56,16 @@ crct10dif-arm-ce-y := crct10dif-ce-core.
60 crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
61 chacha-neon-y := chacha-scalar-core.o chacha-glue.o
62 chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
63 +poly1305-arm-y := poly1305-core.o poly1305-glue.o
64 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
66 ifdef REGENERATE_ARM_CRYPTO
67 quiet_cmd_perl = PERL $@
68 cmd_perl = $(PERL) $(<) > $(@)
70 +$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv4.pl
73 $(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
76 @@ -68,4 +73,9 @@ $(src)/sha512-core.S_shipped: $(src)/sha
80 -clean-files += sha256-core.S sha512-core.S
81 +clean-files += poly1305-core.S sha256-core.S sha512-core.S
83 +# massage the perlasm code a bit so we only get the NEON routine if we need it
84 +poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
85 +poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
86 +AFLAGS_poly1305-core.o += $(poly1305-aflags-y)
88 +++ b/arch/arm/crypto/poly1305-armv4.pl
91 +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
93 +# ====================================================================
94 +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
96 +# ====================================================================
98 +# IALU(*)/gcc-4.4 NEON
100 +# ARM11xx(ARMv6) 7.78/+100% -
101 +# Cortex-A5 6.35/+130% 3.00
102 +# Cortex-A8 6.25/+115% 2.36
103 +# Cortex-A9 5.10/+95% 2.55
104 +# Cortex-A15 3.85/+85% 1.25(**)
105 +# Snapdragon S4 5.70/+100% 1.48(**)
107 +# (*) this is for -march=armv6, i.e. with a bunch of ldrb loading data;
108 +# (**) these are trade-off results, they can be improved by ~8% but at
109 +# the cost of 15/12% regression on Cortex-A5/A7, it's even possible
110 +# to improve Cortex-A9 result, but then A5/A7 lose more than 20%;
113 +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
114 +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
116 +if ($flavour && $flavour ne "void") {
117 + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
118 + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
119 + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
120 + die "can't locate arm-xlate.pl";
122 + open STDOUT,"| \"$^X\" $xlate $flavour $output";
124 + open STDOUT,">$output";
127 +($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
131 +# include "arm_arch.h"
133 +# define __ARM_ARCH__ __LINUX_ARM_ARCH__
134 +# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
135 +# define poly1305_init poly1305_init_arm
136 +# define poly1305_blocks poly1305_blocks_arm
137 +# define poly1305_emit poly1305_emit_arm
138 +.globl poly1305_blocks_neon
141 +#if defined(__thumb2__)
150 +.globl poly1305_emit
151 +.globl poly1305_blocks
152 +.globl poly1305_init
153 +.type poly1305_init,%function
161 + str r3,[$ctx,#0] @ zero hash value
166 + str r3,[$ctx,#36] @ clear is_base2_26
175 +#if __ARM_MAX_ARCH__>=7
177 + str r3,[$ctx,#28] @ impossible key power value
179 + adr r11,.Lpoly1305_init
180 + ldr r12,.LOPENSSL_armcap
184 + mov r10,#0x0fffffff
186 + and r3,r10,#-4 @ 0x0ffffffc
191 + orr r4,r4,r6,lsl#16
193 + orr r4,r4,r7,lsl#24
197 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
198 +# if !defined(_WIN32)
199 + ldr r12,[r11,r12] @ OPENSSL_armcap_P
201 +# if defined(__APPLE__) || defined(_WIN32)
208 + orr r5,r5,r7,lsl#16
210 + orr r5,r5,r8,lsl#24
214 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
215 + tst r12,#ARMV7_NEON @ check for NEON
217 + adr r9,.Lpoly1305_blocks_neon
218 + adr r11,.Lpoly1305_blocks
221 + adr r12,.Lpoly1305_emit
222 + orr r11,r11,#1 @ thumb-ify addresses
225 + add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
227 + addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
228 + addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
234 + orr r6,r6,r8,lsl#16
236 + orr r6,r6,r9,lsl#24
240 + ldrb r10,[$inp,#15]
243 + orr r7,r7,r9,lsl#16
245 + orr r7,r7,r10,lsl#24
249 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
250 + stmia r2,{r11,r12} @ fill functions table
261 + moveq pc,lr @ be binary compatible with V4, yet
262 + bx lr @ interoperable with Thumb ISA:-)
264 +.size poly1305_init,.-poly1305_init
267 +my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
268 +my ($s1,$s2,$s3)=($r1,$r2,$r3);
271 +.type poly1305_blocks,%function
275 + stmdb sp!,{r3-r11,lr}
277 + ands $len,$len,#-16
280 + add $len,$len,$inp @ end pointer
284 + ldmia $ctx,{$h0-$r3} @ load context
286 + str $len,[sp,#16] @ offload stuff
289 + ldr lr,[$ctx,#36] @ is_base2_26
290 + ldmia $ctx!,{$h0-$h4} @ load hash value
291 + str $len,[sp,#16] @ offload stuff
294 + adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
296 + adcs $r1,$r1,$h2,lsl#20
298 + adcs $r2,$r2,$h3,lsl#14
300 + adcs $r3,$r3,$h4,lsl#8
303 + str $len,[$ctx,#16] @ clear is_base2_26
304 + adc $len,$len,$h4,lsr#24
307 + movne $h0,$r0 @ choose between radixes
311 + ldmia $ctx,{$r0-$r3} @ load key
326 + ldrb r0,[lr],#16 @ load input
330 + addhi $h4,$h4,#1 @ 1<<128
336 + orr r2,r1,r2,lsl#16
338 + orr r3,r2,r3,lsl#24
340 + adds $h0,$h0,r3 @ accumulate input
345 + orr r2,r1,r2,lsl#16
347 + orr r3,r2,r3,lsl#24
354 + orr r2,r1,r2,lsl#16
356 + orr r3,r2,r3,lsl#24
362 + str lr,[sp,#8] @ offload input pointer
363 + orr r2,r1,r2,lsl#16
364 + add $s1,$r1,$r1,lsr#2
365 + orr r3,r2,r3,lsl#24
367 + ldr r0,[lr],#16 @ load input
369 + addhi $h4,$h4,#1 @ padbit
379 + adds $h0,$h0,r0 @ accumulate input
380 + str lr,[sp,#8] @ offload input pointer
382 + add $s1,$r1,$r1,lsr#2
385 + add $s2,$r2,$r2,lsr#2
387 + add $s3,$r3,$r3,lsr#2
389 + umull r2,r3,$h1,$r0
391 + umull r0,r1,$h0,$r0
392 + umlal r2,r3,$h4,$s1
393 + umlal r0,r1,$h3,$s1
394 + ldr $r1,[sp,#20] @ reload $r1
395 + umlal r2,r3,$h2,$s3
396 + umlal r0,r1,$h1,$s3
397 + umlal r2,r3,$h3,$s2
398 + umlal r0,r1,$h2,$s2
399 + umlal r2,r3,$h0,$r1
400 + str r0,[sp,#0] @ future $h0
402 + ldr $r2,[sp,#24] @ reload $r2
403 + adds r2,r2,r1 @ d1+=d0>>32
405 + adc lr,r3,#0 @ future $h2
406 + str r2,[sp,#4] @ future $h1
410 + umlal r0,r1,$h3,$s3
411 + ldr $r3,[sp,#28] @ reload $r3
412 + umlal r2,r3,$h3,$r0
413 + umlal r0,r1,$h2,$r0
414 + umlal r2,r3,$h2,$r1
415 + umlal r0,r1,$h1,$r1
416 + umlal r2,r3,$h1,$r2
417 + umlal r0,r1,$h0,$r2
418 + umlal r2,r3,$h0,$r3
423 + adds $h2,lr,r0 @ d2+=d1>>32
424 + ldr lr,[sp,#8] @ reload input pointer
426 + adds $h3,r2,r1 @ d3+=d2>>32
427 + ldr r0,[sp,#16] @ reload end pointer
429 + add $h4,$h4,r3 @ h4+=d3>>32
433 + add r1,r1,r1,lsr#2 @ *=5
440 + cmp r0,lr @ done yet?
445 + stmdb $ctx,{$h0-$h4} @ store the result
449 + ldmia sp!,{r3-r11,pc}
451 + ldmia sp!,{r3-r11,lr}
453 + moveq pc,lr @ be binary compatible with V4, yet
454 + bx lr @ interoperable with Thumb ISA:-)
456 +.size poly1305_blocks,.-poly1305_blocks
460 +my ($ctx,$mac,$nonce)=map("r$_",(0..2));
461 +my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
465 +.type poly1305_emit,%function
471 + ldmia $ctx,{$h0-$h4}
474 + ldr ip,[$ctx,#36] @ is_base2_26
476 + adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
478 + adcs $g1,$g1,$h2,lsl#20
480 + adcs $g2,$g2,$h3,lsl#14
482 + adcs $g3,$g3,$h4,lsl#8
484 + adc $g4,$g4,$h4,lsr#24
496 + adds $g0,$h0,#5 @ compare to modulus
501 + tst $g4,#4 @ did it carry/borrow?
507 + ldr $g0,[$nonce,#0]
512 + ldr $g1,[$nonce,#4]
517 + ldr $g2,[$nonce,#8]
522 + ldr $g3,[$nonce,#12]
547 + strb $h3,[$mac,#12]
556 + strb $h3,[$mac,#13]
563 + strb $h2,[$mac,#10]
565 + strb $h3,[$mac,#14]
570 + strb $h2,[$mac,#11]
571 + strb $h3,[$mac,#15]
578 + moveq pc,lr @ be binary compatible with V4, yet
579 + bx lr @ interoperable with Thumb ISA:-)
581 +.size poly1305_emit,.-poly1305_emit
584 +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
585 +my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
586 +my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
588 +my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
591 +#if __ARM_MAX_ARCH__>=7
594 +.type poly1305_init_neon,%function
597 +.Lpoly1305_init_neon:
598 + ldr r3,[$ctx,#48] @ first table element
599 + cmp r3,#-1 @ is value impossible?
602 + ldr r4,[$ctx,#20] @ load key base 2^32
607 + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
612 + orr r4,r4,r6,lsl#12
614 + orr r5,r5,r7,lsl#18
615 + and r3,r3,#0x03ffffff
616 + and r4,r4,#0x03ffffff
617 + and r5,r5,#0x03ffffff
619 + vdup.32 $R0,r2 @ r^1 in both lanes
620 + add r2,r3,r3,lsl#2 @ *5
633 + mov $zeros,#2 @ counter
636 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
637 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
638 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
639 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
640 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
641 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
643 + vmull.u32 $D0,$R0,${R0}[1]
644 + vmull.u32 $D1,$R1,${R0}[1]
645 + vmull.u32 $D2,$R2,${R0}[1]
646 + vmull.u32 $D3,$R3,${R0}[1]
647 + vmull.u32 $D4,$R4,${R0}[1]
649 + vmlal.u32 $D0,$R4,${S1}[1]
650 + vmlal.u32 $D1,$R0,${R1}[1]
651 + vmlal.u32 $D2,$R1,${R1}[1]
652 + vmlal.u32 $D3,$R2,${R1}[1]
653 + vmlal.u32 $D4,$R3,${R1}[1]
655 + vmlal.u32 $D0,$R3,${S2}[1]
656 + vmlal.u32 $D1,$R4,${S2}[1]
657 + vmlal.u32 $D3,$R1,${R2}[1]
658 + vmlal.u32 $D2,$R0,${R2}[1]
659 + vmlal.u32 $D4,$R2,${R2}[1]
661 + vmlal.u32 $D0,$R2,${S3}[1]
662 + vmlal.u32 $D3,$R0,${R3}[1]
663 + vmlal.u32 $D1,$R3,${S3}[1]
664 + vmlal.u32 $D2,$R4,${S3}[1]
665 + vmlal.u32 $D4,$R1,${R3}[1]
667 + vmlal.u32 $D3,$R4,${S4}[1]
668 + vmlal.u32 $D0,$R1,${S4}[1]
669 + vmlal.u32 $D1,$R2,${S4}[1]
670 + vmlal.u32 $D2,$R3,${S4}[1]
671 + vmlal.u32 $D4,$R0,${R4}[1]
673 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
674 + @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
677 + @ H0>>+H1>>+H2>>+H3>>+H4
678 + @ H3>>+H4>>*5+H0>>+H1
682 + @ Result of multiplication of n-bit number by m-bit number is
683 + @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
684 + @ m-bit number multiplied by 2^n is still n+m bits wide.
686 + @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
687 + @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
688 + @ one is n+1 bits wide.
690 + @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
691 + @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
692 + @ can be 27. However! In cases when their width exceeds 26 bits
693 + @ they are limited by 2^26+2^6. This in turn means that *sum*
694 + @ of the products with these values can still be viewed as sum
695 + @ of 52-bit numbers as long as the amount of addends is not a
696 + @ power of 2. For example,
698 + @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
700 + @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
701 + @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
702 + @ 8 * (2^52) or 2^55. However, the value is then multiplied by
703 + @ 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
704 + @ which is less than 32 * (2^52) or 2^57. And when processing
705 + @ data we are looking at triple as many addends...
707 + @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
708 + @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
709 + @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
710 + @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
711 + @ instruction accepts 2x32-bit input and writes 2x64-bit result.
712 + @ This means that result of reduction has to be compressed upon
713 + @ loop wrap-around. This can be done in the process of reduction
714 + @ to minimize amount of instructions [as well as amount of
715 + @ 128-bit instructions, which benefits low-end processors], but
716 + @ one has to watch for H2 (which is narrower than H0) and 5*H4
717 + @ not being wider than 58 bits, so that result of right shift
718 + @ by 26 bits fits in 32 bits. This is also useful on x86,
719 + @ because it allows using paddd in place of paddq, which
720 + @ benefits Atom, where paddq is ridiculously slow.
722 + vshr.u64 $T0,$D3,#26
723 + vmovn.i64 $D3#lo,$D3
724 + vshr.u64 $T1,$D0,#26
725 + vmovn.i64 $D0#lo,$D0
726 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4
727 + vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
728 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1
729 + vbic.i32 $D0#lo,#0xfc000000
731 + vshrn.u64 $T0#lo,$D4,#26
732 + vmovn.i64 $D4#lo,$D4
733 + vshr.u64 $T1,$D1,#26
734 + vmovn.i64 $D1#lo,$D1
735 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2
736 + vbic.i32 $D4#lo,#0xfc000000
737 + vbic.i32 $D1#lo,#0xfc000000
739 + vadd.i32 $D0#lo,$D0#lo,$T0#lo
740 + vshl.u32 $T0#lo,$T0#lo,#2
741 + vshrn.u64 $T1#lo,$D2,#26
742 + vmovn.i64 $D2#lo,$D2
743 + vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
744 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
745 + vbic.i32 $D2#lo,#0xfc000000
747 + vshr.u32 $T0#lo,$D0#lo,#26
748 + vbic.i32 $D0#lo,#0xfc000000
749 + vshr.u32 $T1#lo,$D3#lo,#26
750 + vbic.i32 $D3#lo,#0xfc000000
751 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
752 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
754 + subs $zeros,$zeros,#1
755 + beq .Lsquare_break_neon
757 + add $tbl0,$ctx,#(48+0*9*4)
758 + add $tbl1,$ctx,#(48+1*9*4)
760 + vtrn.32 $R0,$D0#lo @ r^2:r^1
766 + vshl.u32 $S2,$R2,#2 @ *5
767 + vshl.u32 $S3,$R3,#2
768 + vshl.u32 $S1,$R1,#2
769 + vshl.u32 $S4,$R4,#2
770 + vadd.i32 $S2,$S2,$R2
771 + vadd.i32 $S1,$S1,$R1
772 + vadd.i32 $S3,$S3,$R3
773 + vadd.i32 $S4,$S4,$R4
775 + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
776 + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
777 + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
778 + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
779 + vst1.32 {${S4}[0]},[$tbl0,:32]
780 + vst1.32 {${S4}[1]},[$tbl1,:32]
785 +.Lsquare_break_neon:
786 + add $tbl0,$ctx,#(48+2*4*9)
787 + add $tbl1,$ctx,#(48+3*4*9)
789 + vmov $R0,$D0#lo @ r^4:r^3
790 + vshl.u32 $S1,$D1#lo,#2 @ *5
792 + vshl.u32 $S2,$D2#lo,#2
794 + vshl.u32 $S3,$D3#lo,#2
796 + vshl.u32 $S4,$D4#lo,#2
798 + vadd.i32 $S1,$S1,$D1#lo
799 + vadd.i32 $S2,$S2,$D2#lo
800 + vadd.i32 $S3,$S3,$D3#lo
801 + vadd.i32 $S4,$S4,$D4#lo
803 + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
804 + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
805 + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
806 + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
807 + vst1.32 {${S4}[0]},[$tbl0]
808 + vst1.32 {${S4}[1]},[$tbl1]
812 +.size poly1305_init_neon,.-poly1305_init_neon
814 +.type poly1305_blocks_neon,%function
816 +poly1305_blocks_neon:
817 +.Lpoly1305_blocks_neon:
818 + ldr ip,[$ctx,#36] @ is_base2_26
821 + blo .Lpoly1305_blocks
824 + vstmdb sp!,{d8-d15} @ ABI specification says so
826 + tst ip,ip @ is_base2_26?
827 + bne .Lbase2_26_neon
829 + stmdb sp!,{r1-r3,lr}
830 + bl .Lpoly1305_init_neon
832 + ldr r4,[$ctx,#0] @ load hash value base 2^32
838 + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
840 + veor $D0#lo,$D0#lo,$D0#lo
843 + veor $D1#lo,$D1#lo,$D1#lo
845 + orr r4,r4,r6,lsl#12
846 + veor $D2#lo,$D2#lo,$D2#lo
848 + orr r5,r5,r7,lsl#18
849 + veor $D3#lo,$D3#lo,$D3#lo
850 + and r3,r3,#0x03ffffff
851 + orr r6,r6,ip,lsl#24
852 + veor $D4#lo,$D4#lo,$D4#lo
853 + and r4,r4,#0x03ffffff
855 + and r5,r5,#0x03ffffff
856 + str r1,[$ctx,#36] @ set is_base2_26
858 + vmov.32 $D0#lo[0],r2
859 + vmov.32 $D1#lo[0],r3
860 + vmov.32 $D2#lo[0],r4
861 + vmov.32 $D3#lo[0],r5
862 + vmov.32 $D4#lo[0],r6
865 + ldmia sp!,{r1-r3,lr}
870 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
873 + veor $D0#lo,$D0#lo,$D0#lo
874 + veor $D1#lo,$D1#lo,$D1#lo
875 + veor $D2#lo,$D2#lo,$D2#lo
876 + veor $D3#lo,$D3#lo,$D3#lo
877 + veor $D4#lo,$D4#lo,$D4#lo
878 + vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
880 + vld1.32 {$D4#lo[0]},[$ctx]
881 + sub $ctx,$ctx,#16 @ rewind
885 + mov $padbit,$padbit,lsl#24
889 + vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
890 + vmov.32 $H4#lo[0],$padbit
900 + vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
901 + vshl.u32 $H3#lo,$H3#lo,#18
903 + vsri.u32 $H3#lo,$H2#lo,#14
904 + vshl.u32 $H2#lo,$H2#lo,#12
905 + vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
907 + vbic.i32 $H3#lo,#0xfc000000
908 + vsri.u32 $H2#lo,$H1#lo,#20
909 + vshl.u32 $H1#lo,$H1#lo,#6
911 + vbic.i32 $H2#lo,#0xfc000000
912 + vsri.u32 $H1#lo,$H0#lo,#26
913 + vadd.i32 $H3#hi,$H3#lo,$D3#lo
915 + vbic.i32 $H0#lo,#0xfc000000
916 + vbic.i32 $H1#lo,#0xfc000000
917 + vadd.i32 $H2#hi,$H2#lo,$D2#lo
919 + vadd.i32 $H0#hi,$H0#lo,$D0#lo
920 + vadd.i32 $H1#hi,$H1#lo,$D1#lo
934 + vmov.i32 $H4,#1<<24 @ padbit, yes, always
935 + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
937 + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
940 + addhi $tbl1,$ctx,#(48+1*9*4)
941 + addhi $tbl0,$ctx,#(48+3*9*4)
949 + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
950 + vshl.u32 $H3,$H3,#18
952 + vsri.u32 $H3,$H2,#14
953 + vshl.u32 $H2,$H2,#12
955 + vbic.i32 $H3,#0xfc000000
956 + vsri.u32 $H2,$H1,#20
957 + vshl.u32 $H1,$H1,#6
959 + vbic.i32 $H2,#0xfc000000
960 + vsri.u32 $H1,$H0,#26
962 + vbic.i32 $H0,#0xfc000000
963 + vbic.i32 $H1,#0xfc000000
967 + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
968 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
969 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
970 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
975 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
976 + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
977 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
978 + @ \___________________/
979 + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
980 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
981 + @ \___________________/ \____________________/
983 + @ Note that we start with inp[2:3]*r^2. This is because it
984 + @ doesn't depend on reduction in previous iteration.
985 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
986 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
987 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
988 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
989 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
990 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
992 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
995 + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
996 + vmull.u32 $D2,$H2#hi,${R0}[1]
997 + vadd.i32 $H0#lo,$H0#lo,$D0#lo
998 + vmull.u32 $D0,$H0#hi,${R0}[1]
999 + vadd.i32 $H3#lo,$H3#lo,$D3#lo
1000 + vmull.u32 $D3,$H3#hi,${R0}[1]
1001 + vmlal.u32 $D2,$H1#hi,${R1}[1]
1002 + vadd.i32 $H1#lo,$H1#lo,$D1#lo
1003 + vmull.u32 $D1,$H1#hi,${R0}[1]
1005 + vadd.i32 $H4#lo,$H4#lo,$D4#lo
1006 + vmull.u32 $D4,$H4#hi,${R0}[1]
1007 + subs $len,$len,#64
1008 + vmlal.u32 $D0,$H4#hi,${S1}[1]
1011 + vmlal.u32 $D3,$H2#hi,${R1}[1]
1012 + vld1.32 ${S4}[1],[$tbl1,:32]
1013 + vmlal.u32 $D1,$H0#hi,${R1}[1]
1014 + vmlal.u32 $D4,$H3#hi,${R1}[1]
1016 + vmlal.u32 $D0,$H3#hi,${S2}[1]
1017 + vmlal.u32 $D3,$H1#hi,${R2}[1]
1018 + vmlal.u32 $D4,$H2#hi,${R2}[1]
1019 + vmlal.u32 $D1,$H4#hi,${S2}[1]
1020 + vmlal.u32 $D2,$H0#hi,${R2}[1]
1022 + vmlal.u32 $D3,$H0#hi,${R3}[1]
1023 + vmlal.u32 $D0,$H2#hi,${S3}[1]
1024 + vmlal.u32 $D4,$H1#hi,${R3}[1]
1025 + vmlal.u32 $D1,$H3#hi,${S3}[1]
1026 + vmlal.u32 $D2,$H4#hi,${S3}[1]
1028 + vmlal.u32 $D3,$H4#hi,${S4}[1]
1029 + vmlal.u32 $D0,$H1#hi,${S4}[1]
1030 + vmlal.u32 $D4,$H0#hi,${R4}[1]
1031 + vmlal.u32 $D1,$H2#hi,${S4}[1]
1032 + vmlal.u32 $D2,$H3#hi,${S4}[1]
1034 + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
1037 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1038 + @ (hash+inp[0:1])*r^4 and accumulate
1040 + vmlal.u32 $D3,$H3#lo,${R0}[0]
1041 + vmlal.u32 $D0,$H0#lo,${R0}[0]
1042 + vmlal.u32 $D4,$H4#lo,${R0}[0]
1043 + vmlal.u32 $D1,$H1#lo,${R0}[0]
1044 + vmlal.u32 $D2,$H2#lo,${R0}[0]
1045 + vld1.32 ${S4}[0],[$tbl0,:32]
1047 + vmlal.u32 $D3,$H2#lo,${R1}[0]
1048 + vmlal.u32 $D0,$H4#lo,${S1}[0]
1049 + vmlal.u32 $D4,$H3#lo,${R1}[0]
1050 + vmlal.u32 $D1,$H0#lo,${R1}[0]
1051 + vmlal.u32 $D2,$H1#lo,${R1}[0]
1053 + vmlal.u32 $D3,$H1#lo,${R2}[0]
1054 + vmlal.u32 $D0,$H3#lo,${S2}[0]
1055 + vmlal.u32 $D4,$H2#lo,${R2}[0]
1056 + vmlal.u32 $D1,$H4#lo,${S2}[0]
1057 + vmlal.u32 $D2,$H0#lo,${R2}[0]
1059 + vmlal.u32 $D3,$H0#lo,${R3}[0]
1060 + vmlal.u32 $D0,$H2#lo,${S3}[0]
1061 + vmlal.u32 $D4,$H1#lo,${R3}[0]
1062 + vmlal.u32 $D1,$H3#lo,${S3}[0]
1063 + vmlal.u32 $D3,$H4#lo,${S4}[0]
1065 + vmlal.u32 $D2,$H4#lo,${S3}[0]
1066 + vmlal.u32 $D0,$H1#lo,${S4}[0]
1067 + vmlal.u32 $D4,$H0#lo,${R4}[0]
1068 + vmov.i32 $H4,#1<<24 @ padbit, yes, always
1069 + vmlal.u32 $D1,$H2#lo,${S4}[0]
1070 + vmlal.u32 $D2,$H3#lo,${S4}[0]
1072 + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
1081 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1082 + @ lazy reduction interleaved with base 2^32 -> base 2^26 of
1083 + @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
1085 + vshr.u64 $T0,$D3,#26
1086 + vmovn.i64 $D3#lo,$D3
1087 + vshr.u64 $T1,$D0,#26
1088 + vmovn.i64 $D0#lo,$D0
1089 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4
1090 + vbic.i32 $D3#lo,#0xfc000000
1091 + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
1092 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1
1093 + vshl.u32 $H3,$H3,#18
1094 + vbic.i32 $D0#lo,#0xfc000000
1096 + vshrn.u64 $T0#lo,$D4,#26
1097 + vmovn.i64 $D4#lo,$D4
1098 + vshr.u64 $T1,$D1,#26
1099 + vmovn.i64 $D1#lo,$D1
1100 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2
1101 + vsri.u32 $H3,$H2,#14
1102 + vbic.i32 $D4#lo,#0xfc000000
1103 + vshl.u32 $H2,$H2,#12
1104 + vbic.i32 $D1#lo,#0xfc000000
1106 + vadd.i32 $D0#lo,$D0#lo,$T0#lo
1107 + vshl.u32 $T0#lo,$T0#lo,#2
1108 + vbic.i32 $H3,#0xfc000000
1109 + vshrn.u64 $T1#lo,$D2,#26
1110 + vmovn.i64 $D2#lo,$D2
1111 + vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
1112 + vsri.u32 $H2,$H1,#20
1113 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
1114 + vshl.u32 $H1,$H1,#6
1115 + vbic.i32 $D2#lo,#0xfc000000
1116 + vbic.i32 $H2,#0xfc000000
1118 + vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
1119 + vmovn.i64 $D0#lo,$D0
1120 + vsri.u32 $H1,$H0,#26
1121 + vbic.i32 $H0,#0xfc000000
1122 + vshr.u32 $T1#lo,$D3#lo,#26
1123 + vbic.i32 $D3#lo,#0xfc000000
1124 + vbic.i32 $D0#lo,#0xfc000000
1125 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
1126 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
1127 + vbic.i32 $H1,#0xfc000000
1132 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1133 + @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1135 + add $tbl1,$ctx,#(48+0*9*4)
1136 + add $tbl0,$ctx,#(48+1*9*4)
1137 + adds $len,$len,#32
1142 + vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
1143 + vadd.i32 $H0#hi,$H0#lo,$D0#lo
1144 + vadd.i32 $H3#hi,$H3#lo,$D3#lo
1145 + vadd.i32 $H1#hi,$H1#lo,$D1#lo
1146 + vadd.i32 $H4#hi,$H4#lo,$D4#lo
1149 + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
1150 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
1152 + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
1153 + vmull.u32 $D2,$H2#hi,$R0
1154 + vadd.i32 $H0#lo,$H0#lo,$D0#lo
1155 + vmull.u32 $D0,$H0#hi,$R0
1156 + vadd.i32 $H3#lo,$H3#lo,$D3#lo
1157 + vmull.u32 $D3,$H3#hi,$R0
1158 + vadd.i32 $H1#lo,$H1#lo,$D1#lo
1159 + vmull.u32 $D1,$H1#hi,$R0
1160 + vadd.i32 $H4#lo,$H4#lo,$D4#lo
1161 + vmull.u32 $D4,$H4#hi,$R0
1163 + vmlal.u32 $D0,$H4#hi,$S1
1164 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
1165 + vmlal.u32 $D3,$H2#hi,$R1
1166 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
1167 + vmlal.u32 $D1,$H0#hi,$R1
1168 + vmlal.u32 $D4,$H3#hi,$R1
1169 + vmlal.u32 $D2,$H1#hi,$R1
1171 + vmlal.u32 $D3,$H1#hi,$R2
1172 + vld1.32 ${S4}[1],[$tbl1,:32]
1173 + vmlal.u32 $D0,$H3#hi,$S2
1174 + vld1.32 ${S4}[0],[$tbl0,:32]
1175 + vmlal.u32 $D4,$H2#hi,$R2
1176 + vmlal.u32 $D1,$H4#hi,$S2
1177 + vmlal.u32 $D2,$H0#hi,$R2
1179 + vmlal.u32 $D3,$H0#hi,$R3
1181 + addne $tbl1,$ctx,#(48+2*9*4)
1182 + vmlal.u32 $D0,$H2#hi,$S3
1184 + addne $tbl0,$ctx,#(48+3*9*4)
1185 + vmlal.u32 $D4,$H1#hi,$R3
1186 + vmlal.u32 $D1,$H3#hi,$S3
1187 + vmlal.u32 $D2,$H4#hi,$S3
1189 + vmlal.u32 $D3,$H4#hi,$S4
1190 + vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
1191 + vmlal.u32 $D0,$H1#hi,$S4
1192 + vshr.u64 $MASK,$MASK,#38
1193 + vmlal.u32 $D4,$H0#hi,$R4
1194 + vmlal.u32 $D1,$H2#hi,$S4
1195 + vmlal.u32 $D2,$H3#hi,$S4
1199 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1200 + @ (hash+inp[0:1])*r^4:r^3 and accumulate
1202 + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
1203 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
1205 + vmlal.u32 $D2,$H2#lo,$R0
1206 + vmlal.u32 $D0,$H0#lo,$R0
1207 + vmlal.u32 $D3,$H3#lo,$R0
1208 + vmlal.u32 $D1,$H1#lo,$R0
1209 + vmlal.u32 $D4,$H4#lo,$R0
1211 + vmlal.u32 $D0,$H4#lo,$S1
1212 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
1213 + vmlal.u32 $D3,$H2#lo,$R1
1214 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
1215 + vmlal.u32 $D1,$H0#lo,$R1
1216 + vmlal.u32 $D4,$H3#lo,$R1
1217 + vmlal.u32 $D2,$H1#lo,$R1
1219 + vmlal.u32 $D3,$H1#lo,$R2
1220 + vld1.32 ${S4}[1],[$tbl1,:32]
1221 + vmlal.u32 $D0,$H3#lo,$S2
1222 + vld1.32 ${S4}[0],[$tbl0,:32]
1223 + vmlal.u32 $D4,$H2#lo,$R2
1224 + vmlal.u32 $D1,$H4#lo,$S2
1225 + vmlal.u32 $D2,$H0#lo,$R2
1227 + vmlal.u32 $D3,$H0#lo,$R3
1228 + vmlal.u32 $D0,$H2#lo,$S3
1229 + vmlal.u32 $D4,$H1#lo,$R3
1230 + vmlal.u32 $D1,$H3#lo,$S3
1231 + vmlal.u32 $D2,$H4#lo,$S3
1233 + vmlal.u32 $D3,$H4#lo,$S4
1234 + vorn $MASK,$MASK,$MASK @ all-ones
1235 + vmlal.u32 $D0,$H1#lo,$S4
1236 + vshr.u64 $MASK,$MASK,#38
1237 + vmlal.u32 $D4,$H0#lo,$R4
1238 + vmlal.u32 $D1,$H2#lo,$S4
1239 + vmlal.u32 $D2,$H3#lo,$S4
1242 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1243 + @ horizontal addition
1245 + vadd.i64 $D3#lo,$D3#lo,$D3#hi
1246 + vadd.i64 $D0#lo,$D0#lo,$D0#hi
1247 + vadd.i64 $D4#lo,$D4#lo,$D4#hi
1248 + vadd.i64 $D1#lo,$D1#lo,$D1#hi
1249 + vadd.i64 $D2#lo,$D2#lo,$D2#hi
1251 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1252 + @ lazy reduction, but without narrowing
1254 + vshr.u64 $T0,$D3,#26
1255 + vand.i64 $D3,$D3,$MASK
1256 + vshr.u64 $T1,$D0,#26
1257 + vand.i64 $D0,$D0,$MASK
1258 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4
1259 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1
1261 + vshr.u64 $T0,$D4,#26
1262 + vand.i64 $D4,$D4,$MASK
1263 + vshr.u64 $T1,$D1,#26
1264 + vand.i64 $D1,$D1,$MASK
1265 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2
1267 + vadd.i64 $D0,$D0,$T0
1268 + vshl.u64 $T0,$T0,#2
1269 + vshr.u64 $T1,$D2,#26
1270 + vand.i64 $D2,$D2,$MASK
1271 + vadd.i64 $D0,$D0,$T0 @ h4 -> h0
1272 + vadd.i64 $D3,$D3,$T1 @ h2 -> h3
1274 + vshr.u64 $T0,$D0,#26
1275 + vand.i64 $D0,$D0,$MASK
1276 + vshr.u64 $T1,$D3,#26
1277 + vand.i64 $D3,$D3,$MASK
1278 + vadd.i64 $D1,$D1,$T0 @ h0 -> h1
1279 + vadd.i64 $D4,$D4,$T1 @ h3 -> h4
1284 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1285 + @ store hash value
1287 + vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
1288 + vst1.32 {$D4#lo[0]},[$ctx]
1290 + vldmia sp!,{d8-d15} @ epilogue
1293 +.size poly1305_blocks_neon,.-poly1305_blocks_neon
1297 +.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1301 +.word OPENSSL_armcap_P
1303 +.word OPENSSL_armcap_P-.Lpoly1305_init
1305 +.comm OPENSSL_armcap_P,4,4
1306 +.hidden OPENSSL_armcap_P
1312 +.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
1316 +foreach (split("\n",$code)) {
1317 + s/\`([^\`]*)\`/eval $1/geo;
1319 + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
1320 + s/\bret\b/bx lr/go or
1321 + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
1325 +close STDOUT; # enforce flush
1327 +++ b/arch/arm/crypto/poly1305-core.S_shipped
1330 +# include "arm_arch.h"
1332 +# define __ARM_ARCH__ __LINUX_ARM_ARCH__
1333 +# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
1334 +# define poly1305_init poly1305_init_arm
1335 +# define poly1305_blocks poly1305_blocks_arm
1336 +# define poly1305_emit poly1305_emit_arm
1337 +.globl poly1305_blocks_neon
1340 +#if defined(__thumb2__)
1349 +.globl poly1305_emit
1350 +.globl poly1305_blocks
1351 +.globl poly1305_init
1352 +.type poly1305_init,%function
1356 + stmdb sp!,{r4-r11}
1360 + str r3,[r0,#0] @ zero hash value
1365 + str r3,[r0,#36] @ clear is_base2_26
1374 +#if __ARM_MAX_ARCH__>=7
1376 + str r3,[r0,#28] @ impossible key power value
1377 +# ifndef __KERNEL__
1378 + adr r11,.Lpoly1305_init
1379 + ldr r12,.LOPENSSL_armcap
1383 + mov r10,#0x0fffffff
1385 + and r3,r10,#-4 @ 0x0ffffffc
1388 + orr r4,r4,r5,lsl#8
1390 + orr r4,r4,r6,lsl#16
1392 + orr r4,r4,r7,lsl#24
1396 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
1397 +# if !defined(_WIN32)
1398 + ldr r12,[r11,r12] @ OPENSSL_armcap_P
1400 +# if defined(__APPLE__) || defined(_WIN32)
1405 + orr r5,r5,r6,lsl#8
1407 + orr r5,r5,r7,lsl#16
1409 + orr r5,r5,r8,lsl#24
1413 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
1414 + tst r12,#ARMV7_NEON @ check for NEON
1416 + adr r9,.Lpoly1305_blocks_neon
1417 + adr r11,.Lpoly1305_blocks
1420 + adr r12,.Lpoly1305_emit
1421 + orr r11,r11,#1 @ thumb-ify addresses
1424 + add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
1426 + addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
1427 + addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
1431 + orr r6,r6,r7,lsl#8
1433 + orr r6,r6,r8,lsl#16
1435 + orr r6,r6,r9,lsl#24
1440 + orr r7,r7,r8,lsl#8
1442 + orr r7,r7,r9,lsl#16
1444 + orr r7,r7,r10,lsl#24
1448 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
1449 + stmia r2,{r11,r12} @ fill functions table
1455 + ldmia sp!,{r4-r11}
1456 +#if __ARM_ARCH__>=5
1460 + moveq pc,lr @ be binary compatible with V4, yet
1461 + .word 0xe12fff1e @ interoperable with Thumb ISA:-)
1463 +.size poly1305_init,.-poly1305_init
1464 +.type poly1305_blocks,%function
1468 + stmdb sp!,{r3-r11,lr}
1473 + add r2,r2,r1 @ end pointer
1477 + ldmia r0,{r4-r12} @ load context
1479 + str r2,[sp,#16] @ offload stuff
1482 + ldr lr,[r0,#36] @ is_base2_26
1483 + ldmia r0!,{r4-r8} @ load hash value
1484 + str r2,[sp,#16] @ offload stuff
1487 + adds r9,r4,r5,lsl#26 @ base 2^26 -> base 2^32
1489 + adcs r10,r10,r6,lsl#20
1491 + adcs r11,r11,r7,lsl#14
1493 + adcs r12,r12,r8,lsl#8
1496 + str r2,[r0,#16] @ clear is_base2_26
1497 + adc r2,r2,r8,lsr#24
1500 + movne r4,r9 @ choose between radixes
1504 + ldmia r0,{r9-r12} @ load key
1519 + ldrb r0,[lr],#16 @ load input
1523 + addhi r8,r8,#1 @ 1<<128
1527 + orr r1,r0,r1,lsl#8
1529 + orr r2,r1,r2,lsl#16
1531 + orr r3,r2,r3,lsl#24
1533 + adds r4,r4,r3 @ accumulate input
1536 + orr r1,r0,r1,lsl#8
1538 + orr r2,r1,r2,lsl#16
1540 + orr r3,r2,r3,lsl#24
1545 + orr r1,r0,r1,lsl#8
1547 + orr r2,r1,r2,lsl#16
1549 + orr r3,r2,r3,lsl#24
1554 + orr r1,r0,r1,lsl#8
1555 + str lr,[sp,#8] @ offload input pointer
1556 + orr r2,r1,r2,lsl#16
1557 + add r10,r10,r10,lsr#2
1558 + orr r3,r2,r3,lsl#24
1560 + ldr r0,[lr],#16 @ load input
1562 + addhi r8,r8,#1 @ padbit
1572 + adds r4,r4,r0 @ accumulate input
1573 + str lr,[sp,#8] @ offload input pointer
1575 + add r10,r10,r10,lsr#2
1578 + add r11,r11,r11,lsr#2
1580 + add r12,r12,r12,lsr#2
1585 + umlal r2,r3,r8,r10
1586 + umlal r0,r1,r7,r10
1587 + ldr r10,[sp,#20] @ reload r10
1588 + umlal r2,r3,r6,r12
1589 + umlal r0,r1,r5,r12
1590 + umlal r2,r3,r7,r11
1591 + umlal r0,r1,r6,r11
1592 + umlal r2,r3,r4,r10
1593 + str r0,[sp,#0] @ future r4
1595 + ldr r11,[sp,#24] @ reload r11
1596 + adds r2,r2,r1 @ d1+=d0>>32
1598 + adc lr,r3,#0 @ future r6
1599 + str r2,[sp,#4] @ future r5
1603 + umlal r0,r1,r7,r12
1604 + ldr r12,[sp,#28] @ reload r12
1607 + umlal r2,r3,r6,r10
1608 + umlal r0,r1,r5,r10
1609 + umlal r2,r3,r5,r11
1610 + umlal r0,r1,r4,r11
1611 + umlal r2,r3,r4,r12
1616 + adds r6,lr,r0 @ d2+=d1>>32
1617 + ldr lr,[sp,#8] @ reload input pointer
1619 + adds r7,r2,r1 @ d3+=d2>>32
1620 + ldr r0,[sp,#16] @ reload end pointer
1622 + add r8,r8,r3 @ h4+=d3>>32
1626 + add r1,r1,r1,lsr#2 @ *=5
1633 + cmp r0,lr @ done yet?
1638 + stmdb r0,{r4-r8} @ store the result
1641 +#if __ARM_ARCH__>=5
1642 + ldmia sp!,{r3-r11,pc}
1644 + ldmia sp!,{r3-r11,lr}
1646 + moveq pc,lr @ be binary compatible with V4, yet
1647 + .word 0xe12fff1e @ interoperable with Thumb ISA:-)
1649 +.size poly1305_blocks,.-poly1305_blocks
1650 +.type poly1305_emit,%function
1654 + stmdb sp!,{r4-r11}
1658 +#if __ARM_ARCH__>=7
1659 + ldr ip,[r0,#36] @ is_base2_26
1661 + adds r8,r3,r4,lsl#26 @ base 2^26 -> base 2^32
1663 + adcs r9,r9,r5,lsl#20
1665 + adcs r10,r10,r6,lsl#14
1667 + adcs r11,r11,r7,lsl#8
1669 + adc r0,r0,r7,lsr#24
1681 + adds r8,r3,#5 @ compare to modulus
1686 + tst r0,#4 @ did it carry/borrow?
1714 +#if __ARM_ARCH__>=7
1758 + ldmia sp!,{r4-r11}
1759 +#if __ARM_ARCH__>=5
1763 + moveq pc,lr @ be binary compatible with V4, yet
1764 + .word 0xe12fff1e @ interoperable with Thumb ISA:-)
1766 +.size poly1305_emit,.-poly1305_emit
1767 +#if __ARM_MAX_ARCH__>=7
1770 +.type poly1305_init_neon,%function
1772 +poly1305_init_neon:
1773 +.Lpoly1305_init_neon:
1774 + ldr r3,[r0,#48] @ first table element
1775 + cmp r3,#-1 @ is value impossible?
1776 + bne .Lno_init_neon
1778 + ldr r4,[r0,#20] @ load key base 2^32
1783 + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
1786 + orr r3,r3,r5,lsl#6
1788 + orr r4,r4,r6,lsl#12
1790 + orr r5,r5,r7,lsl#18
1791 + and r3,r3,#0x03ffffff
1792 + and r4,r4,#0x03ffffff
1793 + and r5,r5,#0x03ffffff
1795 + vdup.32 d0,r2 @ r^1 in both lanes
1796 + add r2,r3,r3,lsl#2 @ *5
1798 + add r3,r4,r4,lsl#2
1801 + add r4,r5,r5,lsl#2
1804 + add r5,r6,r6,lsl#2
1809 + mov r5,#2 @ counter
1812 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1813 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1814 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1815 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1816 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1817 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1819 + vmull.u32 q5,d0,d0[1]
1820 + vmull.u32 q6,d1,d0[1]
1821 + vmull.u32 q7,d3,d0[1]
1822 + vmull.u32 q8,d5,d0[1]
1823 + vmull.u32 q9,d7,d0[1]
1825 + vmlal.u32 q5,d7,d2[1]
1826 + vmlal.u32 q6,d0,d1[1]
1827 + vmlal.u32 q7,d1,d1[1]
1828 + vmlal.u32 q8,d3,d1[1]
1829 + vmlal.u32 q9,d5,d1[1]
1831 + vmlal.u32 q5,d5,d4[1]
1832 + vmlal.u32 q6,d7,d4[1]
1833 + vmlal.u32 q8,d1,d3[1]
1834 + vmlal.u32 q7,d0,d3[1]
1835 + vmlal.u32 q9,d3,d3[1]
1837 + vmlal.u32 q5,d3,d6[1]
1838 + vmlal.u32 q8,d0,d5[1]
1839 + vmlal.u32 q6,d5,d6[1]
1840 + vmlal.u32 q7,d7,d6[1]
1841 + vmlal.u32 q9,d1,d5[1]
1843 + vmlal.u32 q8,d7,d8[1]
1844 + vmlal.u32 q5,d1,d8[1]
1845 + vmlal.u32 q6,d3,d8[1]
1846 + vmlal.u32 q7,d5,d8[1]
1847 + vmlal.u32 q9,d0,d7[1]
1849 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1850 + @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1853 + @ H0>>+H1>>+H2>>+H3>>+H4
1854 + @ H3>>+H4>>*5+H0>>+H1
1858 + @ Result of multiplication of n-bit number by m-bit number is
1859 + @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
1860 + @ m-bit number multiplied by 2^n is still n+m bits wide.
1862 + @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
1863 + @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
1864 + @ one is n+1 bits wide.
1866 + @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
1867 + @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
1868 + @ can be 27. However! In cases when their width exceeds 26 bits
1869 + @ they are limited by 2^26+2^6. This in turn means that *sum*
1870 + @ of the products with these values can still be viewed as sum
1871 + @ of 52-bit numbers as long as the amount of addends is not a
1872 + @ power of 2. For example,
1874 + @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
1876 + @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
1877 + @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
1878 + @ 8 * (2^52) or 2^55. However, the value is then multiplied
1879 + @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
1880 + @ which is less than 32 * (2^52) or 2^57. And when processing
1881 + @ data we are looking at triple as many addends...
1883 + @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
1884 + @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
1885 + @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
1886 + @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
1887 + @ instruction accepts 2x32-bit input and writes 2x64-bit result.
1888 + @ This means that result of reduction has to be compressed upon
1889 + @ loop wrap-around. This can be done in the process of reduction
1890 + @ to minimize amount of instructions [as well as amount of
1891 + @ 128-bit instructions, which benefits low-end processors], but
1892 + @ one has to watch for H2 (which is narrower than H0) and 5*H4
1893 + @ not being wider than 58 bits, so that result of right shift
1894 + @ by 26 bits fits in 32 bits. This is also useful on x86,
1895 + @ because it allows using paddd in place of paddq, which
1896 + @ benefits Atom, where paddq is ridiculously slow.
1898 + vshr.u64 q15,q8,#26
1900 + vshr.u64 q4,q5,#26
1902 + vadd.i64 q9,q9,q15 @ h3 -> h4
1903 + vbic.i32 d16,#0xfc000000 @ &=0x03ffffff
1904 + vadd.i64 q6,q6,q4 @ h0 -> h1
1905 + vbic.i32 d10,#0xfc000000
1907 + vshrn.u64 d30,q9,#26
1909 + vshr.u64 q4,q6,#26
1911 + vadd.i64 q7,q7,q4 @ h1 -> h2
1912 + vbic.i32 d18,#0xfc000000
1913 + vbic.i32 d12,#0xfc000000
1915 + vadd.i32 d10,d10,d30
1916 + vshl.u32 d30,d30,#2
1917 + vshrn.u64 d8,q7,#26
1919 + vadd.i32 d10,d10,d30 @ h4 -> h0
1920 + vadd.i32 d16,d16,d8 @ h2 -> h3
1921 + vbic.i32 d14,#0xfc000000
1923 + vshr.u32 d30,d10,#26
1924 + vbic.i32 d10,#0xfc000000
1925 + vshr.u32 d8,d16,#26
1926 + vbic.i32 d16,#0xfc000000
1927 + vadd.i32 d12,d12,d30 @ h0 -> h1
1928 + vadd.i32 d18,d18,d8 @ h3 -> h4
1931 + beq .Lsquare_break_neon
1933 + add r6,r0,#(48+0*9*4)
1934 + add r7,r0,#(48+1*9*4)
1936 + vtrn.32 d0,d10 @ r^2:r^1
1942 + vshl.u32 d4,d3,#2 @ *5
1951 + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
1952 + vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
1953 + vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
1954 + vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
1955 + vst1.32 {d8[0]},[r6,:32]
1956 + vst1.32 {d8[1]},[r7,:32]
1961 +.Lsquare_break_neon:
1962 + add r6,r0,#(48+2*4*9)
1963 + add r7,r0,#(48+3*4*9)
1965 + vmov d0,d10 @ r^4:r^3
1966 + vshl.u32 d2,d12,#2 @ *5
1968 + vshl.u32 d4,d14,#2
1970 + vshl.u32 d6,d16,#2
1972 + vshl.u32 d8,d18,#2
1974 + vadd.i32 d2,d2,d12
1975 + vadd.i32 d4,d4,d14
1976 + vadd.i32 d6,d6,d16
1977 + vadd.i32 d8,d8,d18
1979 + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
1980 + vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
1981 + vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
1982 + vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
1983 + vst1.32 {d8[0]},[r6]
1984 + vst1.32 {d8[1]},[r7]
1988 +.size poly1305_init_neon,.-poly1305_init_neon
1990 +.type poly1305_blocks_neon,%function
1992 +poly1305_blocks_neon:
1993 +.Lpoly1305_blocks_neon:
1994 + ldr ip,[r0,#36] @ is_base2_26
1997 + blo .Lpoly1305_blocks
2000 + vstmdb sp!,{d8-d15} @ ABI specification says so
2002 + tst ip,ip @ is_base2_26?
2003 + bne .Lbase2_26_neon
2005 + stmdb sp!,{r1-r3,lr}
2006 + bl .Lpoly1305_init_neon
2008 + ldr r4,[r0,#0] @ load hash value base 2^32
2014 + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
2018 + orr r3,r3,r5,lsl#6
2021 + orr r4,r4,r6,lsl#12
2024 + orr r5,r5,r7,lsl#18
2026 + and r3,r3,#0x03ffffff
2027 + orr r6,r6,ip,lsl#24
2029 + and r4,r4,#0x03ffffff
2031 + and r5,r5,#0x03ffffff
2032 + str r1,[r0,#36] @ set is_base2_26
2041 + ldmia sp!,{r1-r3,lr}
2046 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2054 + vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
2056 + vld1.32 {d18[0]},[r0]
2057 + sub r0,r0,#16 @ rewind
2065 + vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
2076 + vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26
2077 + vshl.u32 d26,d26,#18
2079 + vsri.u32 d26,d24,#14
2080 + vshl.u32 d24,d24,#12
2081 + vadd.i32 d29,d28,d18 @ add hash value and move to #hi
2083 + vbic.i32 d26,#0xfc000000
2084 + vsri.u32 d24,d22,#20
2085 + vshl.u32 d22,d22,#6
2087 + vbic.i32 d24,#0xfc000000
2088 + vsri.u32 d22,d20,#26
2089 + vadd.i32 d27,d26,d16
2091 + vbic.i32 d20,#0xfc000000
2092 + vbic.i32 d22,#0xfc000000
2093 + vadd.i32 d25,d24,d14
2095 + vadd.i32 d21,d20,d10
2096 + vadd.i32 d23,d22,d12
2110 + vmov.i32 q14,#1<<24 @ padbit, yes, always
2111 + vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
2113 + vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
2116 + addhi r7,r0,#(48+1*9*4)
2117 + addhi r6,r0,#(48+3*9*4)
2125 + vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
2126 + vshl.u32 q13,q13,#18
2128 + vsri.u32 q13,q12,#14
2129 + vshl.u32 q12,q12,#12
2131 + vbic.i32 q13,#0xfc000000
2132 + vsri.u32 q12,q11,#20
2133 + vshl.u32 q11,q11,#6
2135 + vbic.i32 q12,#0xfc000000
2136 + vsri.u32 q11,q10,#26
2138 + vbic.i32 q10,#0xfc000000
2139 + vbic.i32 q11,#0xfc000000
2143 + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
2144 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
2145 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
2146 + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
2151 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2152 + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
2153 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
2154 + @ ___________________/
2155 + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
2156 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
2157 + @ ___________________/ ____________________/
2159 + @ Note that we start with inp[2:3]*r^2. This is because it
2160 + @ doesn't depend on reduction in previous iteration.
2161 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2162 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
2163 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
2164 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
2165 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
2166 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2168 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2171 + vadd.i32 d24,d24,d14 @ accumulate inp[0:1]
2172 + vmull.u32 q7,d25,d0[1]
2173 + vadd.i32 d20,d20,d10
2174 + vmull.u32 q5,d21,d0[1]
2175 + vadd.i32 d26,d26,d16
2176 + vmull.u32 q8,d27,d0[1]
2177 + vmlal.u32 q7,d23,d1[1]
2178 + vadd.i32 d22,d22,d12
2179 + vmull.u32 q6,d23,d0[1]
2181 + vadd.i32 d28,d28,d18
2182 + vmull.u32 q9,d29,d0[1]
2184 + vmlal.u32 q5,d29,d2[1]
2187 + vmlal.u32 q8,d25,d1[1]
2188 + vld1.32 d8[1],[r7,:32]
2189 + vmlal.u32 q6,d21,d1[1]
2190 + vmlal.u32 q9,d27,d1[1]
2192 + vmlal.u32 q5,d27,d4[1]
2193 + vmlal.u32 q8,d23,d3[1]
2194 + vmlal.u32 q9,d25,d3[1]
2195 + vmlal.u32 q6,d29,d4[1]
2196 + vmlal.u32 q7,d21,d3[1]
2198 + vmlal.u32 q8,d21,d5[1]
2199 + vmlal.u32 q5,d25,d6[1]
2200 + vmlal.u32 q9,d23,d5[1]
2201 + vmlal.u32 q6,d27,d6[1]
2202 + vmlal.u32 q7,d29,d6[1]
2204 + vmlal.u32 q8,d29,d8[1]
2205 + vmlal.u32 q5,d23,d8[1]
2206 + vmlal.u32 q9,d21,d7[1]
2207 + vmlal.u32 q6,d25,d8[1]
2208 + vmlal.u32 q7,d27,d8[1]
2210 + vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
2213 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2214 + @ (hash+inp[0:1])*r^4 and accumulate
2216 + vmlal.u32 q8,d26,d0[0]
2217 + vmlal.u32 q5,d20,d0[0]
2218 + vmlal.u32 q9,d28,d0[0]
2219 + vmlal.u32 q6,d22,d0[0]
2220 + vmlal.u32 q7,d24,d0[0]
2221 + vld1.32 d8[0],[r6,:32]
2223 + vmlal.u32 q8,d24,d1[0]
2224 + vmlal.u32 q5,d28,d2[0]
2225 + vmlal.u32 q9,d26,d1[0]
2226 + vmlal.u32 q6,d20,d1[0]
2227 + vmlal.u32 q7,d22,d1[0]
2229 + vmlal.u32 q8,d22,d3[0]
2230 + vmlal.u32 q5,d26,d4[0]
2231 + vmlal.u32 q9,d24,d3[0]
2232 + vmlal.u32 q6,d28,d4[0]
2233 + vmlal.u32 q7,d20,d3[0]
2235 + vmlal.u32 q8,d20,d5[0]
2236 + vmlal.u32 q5,d24,d6[0]
2237 + vmlal.u32 q9,d22,d5[0]
2238 + vmlal.u32 q6,d26,d6[0]
2239 + vmlal.u32 q8,d28,d8[0]
2241 + vmlal.u32 q7,d28,d6[0]
2242 + vmlal.u32 q5,d22,d8[0]
2243 + vmlal.u32 q9,d20,d7[0]
2244 + vmov.i32 q14,#1<<24 @ padbit, yes, always
2245 + vmlal.u32 q6,d24,d8[0]
2246 + vmlal.u32 q7,d26,d8[0]
2248 + vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
2257 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2258 + @ lazy reduction interleaved with base 2^32 -> base 2^26 of
2259 + @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
2261 + vshr.u64 q15,q8,#26
2263 + vshr.u64 q4,q5,#26
2265 + vadd.i64 q9,q9,q15 @ h3 -> h4
2266 + vbic.i32 d16,#0xfc000000
2267 + vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
2268 + vadd.i64 q6,q6,q4 @ h0 -> h1
2269 + vshl.u32 q13,q13,#18
2270 + vbic.i32 d10,#0xfc000000
2272 + vshrn.u64 d30,q9,#26
2274 + vshr.u64 q4,q6,#26
2276 + vadd.i64 q7,q7,q4 @ h1 -> h2
2277 + vsri.u32 q13,q12,#14
2278 + vbic.i32 d18,#0xfc000000
2279 + vshl.u32 q12,q12,#12
2280 + vbic.i32 d12,#0xfc000000
2282 + vadd.i32 d10,d10,d30
2283 + vshl.u32 d30,d30,#2
2284 + vbic.i32 q13,#0xfc000000
2285 + vshrn.u64 d8,q7,#26
2287 + vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec]
2288 + vsri.u32 q12,q11,#20
2289 + vadd.i32 d16,d16,d8 @ h2 -> h3
2290 + vshl.u32 q11,q11,#6
2291 + vbic.i32 d14,#0xfc000000
2292 + vbic.i32 q12,#0xfc000000
2294 + vshrn.u64 d30,q5,#26 @ re-narrow
2296 + vsri.u32 q11,q10,#26
2297 + vbic.i32 q10,#0xfc000000
2298 + vshr.u32 d8,d16,#26
2299 + vbic.i32 d16,#0xfc000000
2300 + vbic.i32 d10,#0xfc000000
2301 + vadd.i32 d12,d12,d30 @ h0 -> h1
2302 + vadd.i32 d18,d18,d8 @ h3 -> h4
2303 + vbic.i32 q11,#0xfc000000
2308 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2309 + @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
2311 + add r7,r0,#(48+0*9*4)
2312 + add r6,r0,#(48+1*9*4)
2318 + vadd.i32 d25,d24,d14 @ add hash value and move to #hi
2319 + vadd.i32 d21,d20,d10
2320 + vadd.i32 d27,d26,d16
2321 + vadd.i32 d23,d22,d12
2322 + vadd.i32 d29,d28,d18
2325 + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
2326 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2
2328 + vadd.i32 d24,d24,d14 @ can be redundant
2329 + vmull.u32 q7,d25,d0
2330 + vadd.i32 d20,d20,d10
2331 + vmull.u32 q5,d21,d0
2332 + vadd.i32 d26,d26,d16
2333 + vmull.u32 q8,d27,d0
2334 + vadd.i32 d22,d22,d12
2335 + vmull.u32 q6,d23,d0
2336 + vadd.i32 d28,d28,d18
2337 + vmull.u32 q9,d29,d0
2339 + vmlal.u32 q5,d29,d2
2340 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
2341 + vmlal.u32 q8,d25,d1
2342 + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
2343 + vmlal.u32 q6,d21,d1
2344 + vmlal.u32 q9,d27,d1
2345 + vmlal.u32 q7,d23,d1
2347 + vmlal.u32 q8,d23,d3
2348 + vld1.32 d8[1],[r7,:32]
2349 + vmlal.u32 q5,d27,d4
2350 + vld1.32 d8[0],[r6,:32]
2351 + vmlal.u32 q9,d25,d3
2352 + vmlal.u32 q6,d29,d4
2353 + vmlal.u32 q7,d21,d3
2355 + vmlal.u32 q8,d21,d5
2357 + addne r7,r0,#(48+2*9*4)
2358 + vmlal.u32 q5,d25,d6
2360 + addne r6,r0,#(48+3*9*4)
2361 + vmlal.u32 q9,d23,d5
2362 + vmlal.u32 q6,d27,d6
2363 + vmlal.u32 q7,d29,d6
2365 + vmlal.u32 q8,d29,d8
2366 + vorn q0,q0,q0 @ all-ones, can be redundant
2367 + vmlal.u32 q5,d23,d8
2368 + vshr.u64 q0,q0,#38
2369 + vmlal.u32 q9,d21,d7
2370 + vmlal.u32 q6,d25,d8
2371 + vmlal.u32 q7,d27,d8
2375 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2376 + @ (hash+inp[0:1])*r^4:r^3 and accumulate
2378 + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
2379 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
2381 + vmlal.u32 q7,d24,d0
2382 + vmlal.u32 q5,d20,d0
2383 + vmlal.u32 q8,d26,d0
2384 + vmlal.u32 q6,d22,d0
2385 + vmlal.u32 q9,d28,d0
2387 + vmlal.u32 q5,d28,d2
2388 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
2389 + vmlal.u32 q8,d24,d1
2390 + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
2391 + vmlal.u32 q6,d20,d1
2392 + vmlal.u32 q9,d26,d1
2393 + vmlal.u32 q7,d22,d1
2395 + vmlal.u32 q8,d22,d3
2396 + vld1.32 d8[1],[r7,:32]
2397 + vmlal.u32 q5,d26,d4
2398 + vld1.32 d8[0],[r6,:32]
2399 + vmlal.u32 q9,d24,d3
2400 + vmlal.u32 q6,d28,d4
2401 + vmlal.u32 q7,d20,d3
2403 + vmlal.u32 q8,d20,d5
2404 + vmlal.u32 q5,d24,d6
2405 + vmlal.u32 q9,d22,d5
2406 + vmlal.u32 q6,d26,d6
2407 + vmlal.u32 q7,d28,d6
2409 + vmlal.u32 q8,d28,d8
2410 + vorn q0,q0,q0 @ all-ones
2411 + vmlal.u32 q5,d22,d8
2412 + vshr.u64 q0,q0,#38
2413 + vmlal.u32 q9,d20,d7
2414 + vmlal.u32 q6,d24,d8
2415 + vmlal.u32 q7,d26,d8
2418 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2419 + @ horizontal addition
2421 + vadd.i64 d16,d16,d17
2422 + vadd.i64 d10,d10,d11
2423 + vadd.i64 d18,d18,d19
2424 + vadd.i64 d12,d12,d13
2425 + vadd.i64 d14,d14,d15
2427 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2428 + @ lazy reduction, but without narrowing
2430 + vshr.u64 q15,q8,#26
2432 + vshr.u64 q4,q5,#26
2434 + vadd.i64 q9,q9,q15 @ h3 -> h4
2435 + vadd.i64 q6,q6,q4 @ h0 -> h1
2437 + vshr.u64 q15,q9,#26
2439 + vshr.u64 q4,q6,#26
2441 + vadd.i64 q7,q7,q4 @ h1 -> h2
2443 + vadd.i64 q5,q5,q15
2444 + vshl.u64 q15,q15,#2
2445 + vshr.u64 q4,q7,#26
2447 + vadd.i64 q5,q5,q15 @ h4 -> h0
2448 + vadd.i64 q8,q8,q4 @ h2 -> h3
2450 + vshr.u64 q15,q5,#26
2452 + vshr.u64 q4,q8,#26
2454 + vadd.i64 q6,q6,q15 @ h0 -> h1
2455 + vadd.i64 q9,q9,q4 @ h3 -> h4
2460 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2461 + @ store hash value
2463 + vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
2464 + vst1.32 {d18[0]},[r0]
2466 + vldmia sp!,{d8-d15} @ epilogue
2469 +.size poly1305_blocks_neon,.-poly1305_blocks_neon
2473 +.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2477 +.word OPENSSL_armcap_P
2479 +.word OPENSSL_armcap_P-.Lpoly1305_init
2481 +.comm OPENSSL_armcap_P,4,4
2482 +.hidden OPENSSL_armcap_P
2485 +.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by @dot-asm"
2488 +++ b/arch/arm/crypto/poly1305-glue.c
2490 +// SPDX-License-Identifier: GPL-2.0
2492 + * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
2494 + * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
2497 +#include <asm/hwcap.h>
2498 +#include <asm/neon.h>
2499 +#include <asm/simd.h>
2500 +#include <asm/unaligned.h>
2501 +#include <crypto/algapi.h>
2502 +#include <crypto/internal/hash.h>
2503 +#include <crypto/internal/poly1305.h>
2504 +#include <crypto/internal/simd.h>
2505 +#include <linux/cpufeature.h>
2506 +#include <linux/crypto.h>
2507 +#include <linux/jump_label.h>
2508 +#include <linux/module.h>
2510 +void poly1305_init_arm(void *state, const u8 *key);
2511 +void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
2512 +void poly1305_emit_arm(void *state, __le32 *digest, const u32 *nonce);
2514 +void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
2518 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
2520 +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
2522 + poly1305_init_arm(&dctx->h, key);
2523 + dctx->s[0] = get_unaligned_le32(key + 16);
2524 + dctx->s[1] = get_unaligned_le32(key + 20);
2525 + dctx->s[2] = get_unaligned_le32(key + 24);
2526 + dctx->s[3] = get_unaligned_le32(key + 28);
2529 +EXPORT_SYMBOL(poly1305_init_arch);
2531 +static int arm_poly1305_init(struct shash_desc *desc)
2533 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2537 + dctx->sset = false;
2542 +static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
2543 + u32 len, u32 hibit, bool do_neon)
2545 + if (unlikely(!dctx->sset)) {
2546 + if (!dctx->rset) {
2547 + poly1305_init_arm(&dctx->h, src);
2548 + src += POLY1305_BLOCK_SIZE;
2549 + len -= POLY1305_BLOCK_SIZE;
2552 + if (len >= POLY1305_BLOCK_SIZE) {
2553 + dctx->s[0] = get_unaligned_le32(src + 0);
2554 + dctx->s[1] = get_unaligned_le32(src + 4);
2555 + dctx->s[2] = get_unaligned_le32(src + 8);
2556 + dctx->s[3] = get_unaligned_le32(src + 12);
2557 + src += POLY1305_BLOCK_SIZE;
2558 + len -= POLY1305_BLOCK_SIZE;
2559 + dctx->sset = true;
2561 + if (len < POLY1305_BLOCK_SIZE)
2565 + len &= ~(POLY1305_BLOCK_SIZE - 1);
2567 + if (static_branch_likely(&have_neon) && likely(do_neon))
2568 + poly1305_blocks_neon(&dctx->h, src, len, hibit);
2570 + poly1305_blocks_arm(&dctx->h, src, len, hibit);
2573 +static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx,
2574 + const u8 *src, u32 len, bool do_neon)
2576 + if (unlikely(dctx->buflen)) {
2577 + u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
2579 + memcpy(dctx->buf + dctx->buflen, src, bytes);
2582 + dctx->buflen += bytes;
2584 + if (dctx->buflen == POLY1305_BLOCK_SIZE) {
2585 + arm_poly1305_blocks(dctx, dctx->buf,
2586 + POLY1305_BLOCK_SIZE, 1, false);
2591 + if (likely(len >= POLY1305_BLOCK_SIZE)) {
2592 + arm_poly1305_blocks(dctx, src, len, 1, do_neon);
2593 + src += round_down(len, POLY1305_BLOCK_SIZE);
2594 + len %= POLY1305_BLOCK_SIZE;
2597 + if (unlikely(len)) {
2598 + dctx->buflen = len;
2599 + memcpy(dctx->buf, src, len);
2603 +static int arm_poly1305_update(struct shash_desc *desc,
2604 + const u8 *src, unsigned int srclen)
2606 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2608 + arm_poly1305_do_update(dctx, src, srclen, false);
2612 +static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc,
2614 + unsigned int srclen)
2616 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2617 + bool do_neon = crypto_simd_usable() && srclen > 128;
2619 + if (static_branch_likely(&have_neon) && do_neon)
2620 + kernel_neon_begin();
2621 + arm_poly1305_do_update(dctx, src, srclen, do_neon);
2622 + if (static_branch_likely(&have_neon) && do_neon)
2623 + kernel_neon_end();
2627 +void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
2628 + unsigned int nbytes)
2630 + bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
2631 + crypto_simd_usable();
2633 + if (unlikely(dctx->buflen)) {
2634 + u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
2636 + memcpy(dctx->buf + dctx->buflen, src, bytes);
2639 + dctx->buflen += bytes;
2641 + if (dctx->buflen == POLY1305_BLOCK_SIZE) {
2642 + poly1305_blocks_arm(&dctx->h, dctx->buf,
2643 + POLY1305_BLOCK_SIZE, 1);
2648 + if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
2649 + unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
2651 + if (static_branch_likely(&have_neon) && do_neon) {
2652 + kernel_neon_begin();
2653 + poly1305_blocks_neon(&dctx->h, src, len, 1);
2654 + kernel_neon_end();
2656 + poly1305_blocks_arm(&dctx->h, src, len, 1);
2659 + nbytes %= POLY1305_BLOCK_SIZE;
2662 + if (unlikely(nbytes)) {
2663 + dctx->buflen = nbytes;
2664 + memcpy(dctx->buf, src, nbytes);
2667 +EXPORT_SYMBOL(poly1305_update_arch);
2669 +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
2674 + if (unlikely(dctx->buflen)) {
2675 + dctx->buf[dctx->buflen++] = 1;
2676 + memset(dctx->buf + dctx->buflen, 0,
2677 + POLY1305_BLOCK_SIZE - dctx->buflen);
2678 + poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
2681 + poly1305_emit_arm(&dctx->h, digest, dctx->s);
2683 + /* mac = (h + s) % (2^128) */
2684 + f = (f >> 32) + le32_to_cpu(digest[0]);
2685 + put_unaligned_le32(f, dst);
2686 + f = (f >> 32) + le32_to_cpu(digest[1]);
2687 + put_unaligned_le32(f, dst + 4);
2688 + f = (f >> 32) + le32_to_cpu(digest[2]);
2689 + put_unaligned_le32(f, dst + 8);
2690 + f = (f >> 32) + le32_to_cpu(digest[3]);
2691 + put_unaligned_le32(f, dst + 12);
2693 + *dctx = (struct poly1305_desc_ctx){};
2695 +EXPORT_SYMBOL(poly1305_final_arch);
2697 +static int arm_poly1305_final(struct shash_desc *desc, u8 *dst)
2699 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2701 + if (unlikely(!dctx->sset))
2704 + poly1305_final_arch(dctx, dst);
2708 +static struct shash_alg arm_poly1305_algs[] = {{
2709 + .init = arm_poly1305_init,
2710 + .update = arm_poly1305_update,
2711 + .final = arm_poly1305_final,
2712 + .digestsize = POLY1305_DIGEST_SIZE,
2713 + .descsize = sizeof(struct poly1305_desc_ctx),
2715 + .base.cra_name = "poly1305",
2716 + .base.cra_driver_name = "poly1305-arm",
2717 + .base.cra_priority = 150,
2718 + .base.cra_blocksize = POLY1305_BLOCK_SIZE,
2719 + .base.cra_module = THIS_MODULE,
2720 +#ifdef CONFIG_KERNEL_MODE_NEON
2722 + .init = arm_poly1305_init,
2723 + .update = arm_poly1305_update_neon,
2724 + .final = arm_poly1305_final,
2725 + .digestsize = POLY1305_DIGEST_SIZE,
2726 + .descsize = sizeof(struct poly1305_desc_ctx),
2728 + .base.cra_name = "poly1305",
2729 + .base.cra_driver_name = "poly1305-neon",
2730 + .base.cra_priority = 200,
2731 + .base.cra_blocksize = POLY1305_BLOCK_SIZE,
2732 + .base.cra_module = THIS_MODULE,
2736 +static int __init arm_poly1305_mod_init(void)
2738 + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
2739 + (elf_hwcap & HWCAP_NEON))
2740 + static_branch_enable(&have_neon);
2742 + /* register only the first entry */
2743 + return crypto_register_shash(&arm_poly1305_algs[0]);
2745 + return crypto_register_shashes(arm_poly1305_algs,
2746 + ARRAY_SIZE(arm_poly1305_algs));
2749 +static void __exit arm_poly1305_mod_exit(void)
2751 + if (!static_branch_likely(&have_neon)) {
2752 + crypto_unregister_shash(&arm_poly1305_algs[0]);
2755 + crypto_unregister_shashes(arm_poly1305_algs,
2756 + ARRAY_SIZE(arm_poly1305_algs));
2759 +module_init(arm_poly1305_mod_init);
2760 +module_exit(arm_poly1305_mod_exit);
2762 +MODULE_LICENSE("GPL v2");
2763 +MODULE_ALIAS_CRYPTO("poly1305");
2764 +MODULE_ALIAS_CRYPTO("poly1305-arm");
2765 +MODULE_ALIAS_CRYPTO("poly1305-neon");
2766 --- a/lib/crypto/Kconfig
2767 +++ b/lib/crypto/Kconfig
2768 @@ -40,7 +40,7 @@ config CRYPTO_LIB_DES
2769 config CRYPTO_LIB_POLY1305_RSIZE
2772 - default 9 if ARM64
2773 + default 9 if ARM || ARM64
2776 config CRYPTO_ARCH_HAVE_LIB_POLY1305