1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Ard Biesheuvel <ardb@kernel.org>
3 Date: Fri, 8 Nov 2019 13:22:24 +0100
4 Subject: [PATCH] crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON
5 implementation
6
7 commit f569ca16475155013525686d0f73bc379c67e635 upstream.
8
9 This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation
10 for NEON authored by Andy Polyakov, and contributed by him to the OpenSSL
11 project. The file 'poly1305-armv8.pl' is taken straight from this upstream
12 GitHub repository [0] at commit ec55a08dc0244ce570c4fc7cade330c60798952f,
13 and already contains all the changes required to build it as part of a
14 Linux kernel module.
15
16 [0] https://github.com/dot-asm/cryptogams
17
18 Co-developed-by: Andy Polyakov <appro@cryptogams.org>
19 Signed-off-by: Andy Polyakov <appro@cryptogams.org>
20 Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
21 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
22 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
23 ---
24 arch/arm64/crypto/Kconfig | 6 +
25 arch/arm64/crypto/Makefile | 10 +-
26 arch/arm64/crypto/poly1305-armv8.pl | 913 ++++++++++++++++++++++
27 arch/arm64/crypto/poly1305-core.S_shipped | 835 ++++++++++++++++++++
28 arch/arm64/crypto/poly1305-glue.c | 237 ++++++
29 lib/crypto/Kconfig | 1 +
30 6 files changed, 2001 insertions(+), 1 deletion(-)
31 create mode 100644 arch/arm64/crypto/poly1305-armv8.pl
32 create mode 100644 arch/arm64/crypto/poly1305-core.S_shipped
33 create mode 100644 arch/arm64/crypto/poly1305-glue.c
34
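Note (not part of the patch): the arithmetic performed by the scalar poly1305_blocks routine below on each 16-byte block is easier to follow in plain C. The following is only an illustrative sketch, assuming a compiler that provides unsigned __int128; the helper name and calling convention are invented for the example.

typedef unsigned long long u64;
typedef unsigned __int128 u128;

/* One scalar block step, mirroring poly1305_blocks below.
 * h[0..2]: accumulator (h[2] holds only the bits above 2^128),
 * r0/r1: clamped key, s1 = r1 + (r1 >> 2),
 * padbit: 1 for full 16-byte blocks, 0 for the padded final block. */
static void poly1305_block_sketch(u64 h[3], const u64 m[2],
                                  u64 r0, u64 r1, u64 s1, u64 padbit)
{
        u128 d0, d1, d2;
        u64 c;

        /* h += m, plus the pad bit at position 2^128 */
        d0 = (u128)h[0] + m[0];
        d1 = (u128)h[1] + m[1] + (u64)(d0 >> 64);
        h[0] = (u64)d0;
        h[1] = (u64)d1;
        h[2] += padbit + (u64)(d1 >> 64);

        /* h *= r modulo 2^130 - 5; wrapped terms use s1 instead of r1 */
        d0 = (u128)h[0] * r0 + (u128)h[1] * s1;
        d1 = (u128)h[0] * r1 + (u128)h[1] * r0 + (u128)h[2] * s1 + (u64)(d0 >> 64);
        d2 = (u128)h[2] * r0 + (u64)(d1 >> 64);

        /* partial reduction: fold the bits at and above 2^130 back in, times 5 */
        c = ((u64)d2 & ~3ULL) + ((u64)d2 >> 2);   /* = 5 * (d2 >> 2) */
        d0 = (u128)(u64)d0 + c;
        d1 = (u128)(u64)d1 + (u64)(d0 >> 64);
        h[0] = (u64)d0;
        h[1] = (u64)d1;
        h[2] = ((u64)d2 & 3) + (u64)(d1 >> 64);
}

The substitution s1 = r1 + (r1 >> 2) covers the key terms that wrap past 2^130: 2^130 is congruent to 5 modulo 2^130 - 5, and the shift by 2 is exact because key clamping clears the two low bits of r1. The NEON path added in the same file computes the equivalent sums in base 2^26, consuming 64 bytes per loop iteration against the precomputed powers r^1..r^4.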
35 --- a/arch/arm64/crypto/Kconfig
36 +++ b/arch/arm64/crypto/Kconfig
37 @@ -106,6 +106,12 @@ config CRYPTO_CHACHA20_NEON
38 select CRYPTO_LIB_CHACHA_GENERIC
39 select CRYPTO_ARCH_HAVE_LIB_CHACHA
40
41 +config CRYPTO_POLY1305_NEON
42 + tristate "Poly1305 hash function using scalar or NEON instructions"
43 + depends on KERNEL_MODE_NEON
44 + select CRYPTO_HASH
45 + select CRYPTO_ARCH_HAVE_LIB_POLY1305
46 +
47 config CRYPTO_NHPOLY1305_NEON
48 tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)"
49 depends on KERNEL_MODE_NEON
50 --- a/arch/arm64/crypto/Makefile
51 +++ b/arch/arm64/crypto/Makefile
52 @@ -50,6 +50,10 @@ sha512-arm64-y := sha512-glue.o sha512-c
53 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
54 chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
55
56 +obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
57 +poly1305-neon-y := poly1305-core.o poly1305-glue.o
58 +AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64
59 +
60 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
61 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
62
63 @@ -68,11 +72,15 @@ ifdef REGENERATE_ARM64_CRYPTO
64 quiet_cmd_perlasm = PERLASM $@
65 cmd_perlasm = $(PERL) $(<) void $(@)
66
67 +$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv8.pl
68 + $(call cmd,perlasm)
69 +
70 $(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
71 $(call cmd,perlasm)
72
73 $(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl
74 $(call cmd,perlasm)
75 +
76 endif
77
78 -clean-files += sha256-core.S sha512-core.S
79 +clean-files += poly1305-core.S sha256-core.S sha512-core.S
80 --- /dev/null
81 +++ b/arch/arm64/crypto/poly1305-armv8.pl
82 @@ -0,0 +1,913 @@
83 +#!/usr/bin/env perl
84 +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
85 +#
86 +# ====================================================================
87 +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
88 +# project.
89 +# ====================================================================
90 +#
91 +# This module implements Poly1305 hash for ARMv8.
92 +#
93 +# June 2015
94 +#
95 +# Numbers are cycles per processed byte with poly1305_blocks alone.
96 +#
97 +# IALU/gcc-4.9 NEON
98 +#
99 +# Apple A7 1.86/+5% 0.72
100 +# Cortex-A53 2.69/+58% 1.47
101 +# Cortex-A57 2.70/+7% 1.14
102 +# Denver 1.64/+50% 1.18(*)
103 +# X-Gene 2.13/+68% 2.27
104 +# Mongoose 1.77/+75% 1.12
105 +# Kryo 2.70/+55% 1.13
106 +# ThunderX2 1.17/+95% 1.36
107 +#
108 +# (*) estimate based on resources availability is less than 1.0,
109 +# i.e. measured result is worse than expected, presumably binary
110 +# translator is not almighty;
111 +
112 +$flavour=shift;
113 +$output=shift;
114 +
115 +if ($flavour && $flavour ne "void") {
116 + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
117 + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
118 + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
119 + die "can't locate arm-xlate.pl";
120 +
121 + open STDOUT,"| \"$^X\" $xlate $flavour $output";
122 +} else {
123 + open STDOUT,">$output";
124 +}
125 +
126 +my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
127 +my ($mac,$nonce)=($inp,$len);
128 +
129 +my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
130 +
131 +$code.=<<___;
132 +#ifndef __KERNEL__
133 +# include "arm_arch.h"
134 +.extern OPENSSL_armcap_P
135 +#endif
136 +
137 +.text
138 +
139 +// forward "declarations" are required for Apple
140 +.globl poly1305_blocks
141 +.globl poly1305_emit
142 +
143 +.globl poly1305_init
144 +.type poly1305_init,%function
145 +.align 5
146 +poly1305_init:
147 + cmp $inp,xzr
148 + stp xzr,xzr,[$ctx] // zero hash value
149 + stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
150 +
151 + csel x0,xzr,x0,eq
152 + b.eq .Lno_key
153 +
154 +#ifndef __KERNEL__
155 + adrp x17,OPENSSL_armcap_P
156 + ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
157 +#endif
158 +
159 + ldp $r0,$r1,[$inp] // load key
160 + mov $s1,#0xfffffffc0fffffff
161 + movk $s1,#0x0fff,lsl#48
162 +#ifdef __AARCH64EB__
163 + rev $r0,$r0 // flip bytes
164 + rev $r1,$r1
165 +#endif
166 + and $r0,$r0,$s1 // &=0ffffffc0fffffff
167 + and $s1,$s1,#-4
168 + and $r1,$r1,$s1 // &=0ffffffc0ffffffc
169 + mov w#$s1,#-1
170 + stp $r0,$r1,[$ctx,#32] // save key value
171 + str w#$s1,[$ctx,#48] // impossible key power value
172 +
173 +#ifndef __KERNEL__
174 + tst w17,#ARMV7_NEON
175 +
176 + adr $d0,.Lpoly1305_blocks
177 + adr $r0,.Lpoly1305_blocks_neon
178 + adr $d1,.Lpoly1305_emit
179 +
180 + csel $d0,$d0,$r0,eq
181 +
182 +# ifdef __ILP32__
183 + stp w#$d0,w#$d1,[$len]
184 +# else
185 + stp $d0,$d1,[$len]
186 +# endif
187 +#endif
188 + mov x0,#1
189 +.Lno_key:
190 + ret
191 +.size poly1305_init,.-poly1305_init
192 +
193 +.type poly1305_blocks,%function
194 +.align 5
195 +poly1305_blocks:
196 +.Lpoly1305_blocks:
197 + ands $len,$len,#-16
198 + b.eq .Lno_data
199 +
200 + ldp $h0,$h1,[$ctx] // load hash value
201 + ldp $h2,x17,[$ctx,#16] // [along with is_base2_26]
202 + ldp $r0,$r1,[$ctx,#32] // load key value
203 +
204 +#ifdef __AARCH64EB__
205 + lsr $d0,$h0,#32
206 + mov w#$d1,w#$h0
207 + lsr $d2,$h1,#32
208 + mov w15,w#$h1
209 + lsr x16,$h2,#32
210 +#else
211 + mov w#$d0,w#$h0
212 + lsr $d1,$h0,#32
213 + mov w#$d2,w#$h1
214 + lsr x15,$h1,#32
215 + mov w16,w#$h2
216 +#endif
217 +
218 + add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
219 + lsr $d1,$d2,#12
220 + adds $d0,$d0,$d2,lsl#52
221 + add $d1,$d1,x15,lsl#14
222 + adc $d1,$d1,xzr
223 + lsr $d2,x16,#24
224 + adds $d1,$d1,x16,lsl#40
225 + adc $d2,$d2,xzr
226 +
227 + cmp x17,#0 // is_base2_26?
228 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
229 + csel $h0,$h0,$d0,eq // choose between radixes
230 + csel $h1,$h1,$d1,eq
231 + csel $h2,$h2,$d2,eq
232 +
233 +.Loop:
234 + ldp $t0,$t1,[$inp],#16 // load input
235 + sub $len,$len,#16
236 +#ifdef __AARCH64EB__
237 + rev $t0,$t0
238 + rev $t1,$t1
239 +#endif
240 + adds $h0,$h0,$t0 // accumulate input
241 + adcs $h1,$h1,$t1
242 +
243 + mul $d0,$h0,$r0 // h0*r0
244 + adc $h2,$h2,$padbit
245 + umulh $d1,$h0,$r0
246 +
247 + mul $t0,$h1,$s1 // h1*5*r1
248 + umulh $t1,$h1,$s1
249 +
250 + adds $d0,$d0,$t0
251 + mul $t0,$h0,$r1 // h0*r1
252 + adc $d1,$d1,$t1
253 + umulh $d2,$h0,$r1
254 +
255 + adds $d1,$d1,$t0
256 + mul $t0,$h1,$r0 // h1*r0
257 + adc $d2,$d2,xzr
258 + umulh $t1,$h1,$r0
259 +
260 + adds $d1,$d1,$t0
261 + mul $t0,$h2,$s1 // h2*5*r1
262 + adc $d2,$d2,$t1
263 + mul $t1,$h2,$r0 // h2*r0
264 +
265 + adds $d1,$d1,$t0
266 + adc $d2,$d2,$t1
267 +
268 + and $t0,$d2,#-4 // final reduction
269 + and $h2,$d2,#3
270 + add $t0,$t0,$d2,lsr#2
271 + adds $h0,$d0,$t0
272 + adcs $h1,$d1,xzr
273 + adc $h2,$h2,xzr
274 +
275 + cbnz $len,.Loop
276 +
277 + stp $h0,$h1,[$ctx] // store hash value
278 + stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26]
279 +
280 +.Lno_data:
281 + ret
282 +.size poly1305_blocks,.-poly1305_blocks
283 +
284 +.type poly1305_emit,%function
285 +.align 5
286 +poly1305_emit:
287 +.Lpoly1305_emit:
288 + ldp $h0,$h1,[$ctx] // load hash base 2^64
289 + ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26]
290 + ldp $t0,$t1,[$nonce] // load nonce
291 +
292 +#ifdef __AARCH64EB__
293 + lsr $d0,$h0,#32
294 + mov w#$d1,w#$h0
295 + lsr $d2,$h1,#32
296 + mov w15,w#$h1
297 + lsr x16,$h2,#32
298 +#else
299 + mov w#$d0,w#$h0
300 + lsr $d1,$h0,#32
301 + mov w#$d2,w#$h1
302 + lsr x15,$h1,#32
303 + mov w16,w#$h2
304 +#endif
305 +
306 + add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
307 + lsr $d1,$d2,#12
308 + adds $d0,$d0,$d2,lsl#52
309 + add $d1,$d1,x15,lsl#14
310 + adc $d1,$d1,xzr
311 + lsr $d2,x16,#24
312 + adds $d1,$d1,x16,lsl#40
313 + adc $d2,$d2,xzr
314 +
315 + cmp $r0,#0 // is_base2_26?
316 + csel $h0,$h0,$d0,eq // choose between radixes
317 + csel $h1,$h1,$d1,eq
318 + csel $h2,$h2,$d2,eq
319 +
320 + adds $d0,$h0,#5 // compare to modulus
321 + adcs $d1,$h1,xzr
322 + adc $d2,$h2,xzr
323 +
324 + tst $d2,#-4 // see if it's carried/borrowed
325 +
326 + csel $h0,$h0,$d0,eq
327 + csel $h1,$h1,$d1,eq
328 +
329 +#ifdef __AARCH64EB__
330 + ror $t0,$t0,#32 // flip nonce words
331 + ror $t1,$t1,#32
332 +#endif
333 + adds $h0,$h0,$t0 // accumulate nonce
334 + adc $h1,$h1,$t1
335 +#ifdef __AARCH64EB__
336 + rev $h0,$h0 // flip output bytes
337 + rev $h1,$h1
338 +#endif
339 + stp $h0,$h1,[$mac] // write result
340 +
341 + ret
342 +.size poly1305_emit,.-poly1305_emit
343 +___
344 +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
345 +my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
346 +my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
347 +my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
348 +my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
349 +my ($T0,$T1,$MASK) = map("v$_",(29..31));
350 +
351 +my ($in2,$zeros)=("x16","x17");
352 +my $is_base2_26 = $zeros; # borrow
353 +
354 +$code.=<<___;
355 +.type poly1305_mult,%function
356 +.align 5
357 +poly1305_mult:
358 + mul $d0,$h0,$r0 // h0*r0
359 + umulh $d1,$h0,$r0
360 +
361 + mul $t0,$h1,$s1 // h1*5*r1
362 + umulh $t1,$h1,$s1
363 +
364 + adds $d0,$d0,$t0
365 + mul $t0,$h0,$r1 // h0*r1
366 + adc $d1,$d1,$t1
367 + umulh $d2,$h0,$r1
368 +
369 + adds $d1,$d1,$t0
370 + mul $t0,$h1,$r0 // h1*r0
371 + adc $d2,$d2,xzr
372 + umulh $t1,$h1,$r0
373 +
374 + adds $d1,$d1,$t0
375 + mul $t0,$h2,$s1 // h2*5*r1
376 + adc $d2,$d2,$t1
377 + mul $t1,$h2,$r0 // h2*r0
378 +
379 + adds $d1,$d1,$t0
380 + adc $d2,$d2,$t1
381 +
382 + and $t0,$d2,#-4 // final reduction
383 + and $h2,$d2,#3
384 + add $t0,$t0,$d2,lsr#2
385 + adds $h0,$d0,$t0
386 + adcs $h1,$d1,xzr
387 + adc $h2,$h2,xzr
388 +
389 + ret
390 +.size poly1305_mult,.-poly1305_mult
391 +
392 +.type poly1305_splat,%function
393 +.align 4
394 +poly1305_splat:
395 + and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
396 + ubfx x13,$h0,#26,#26
397 + extr x14,$h1,$h0,#52
398 + and x14,x14,#0x03ffffff
399 + ubfx x15,$h1,#14,#26
400 + extr x16,$h2,$h1,#40
401 +
402 + str w12,[$ctx,#16*0] // r0
403 + add w12,w13,w13,lsl#2 // r1*5
404 + str w13,[$ctx,#16*1] // r1
405 + add w13,w14,w14,lsl#2 // r2*5
406 + str w12,[$ctx,#16*2] // s1
407 + str w14,[$ctx,#16*3] // r2
408 + add w14,w15,w15,lsl#2 // r3*5
409 + str w13,[$ctx,#16*4] // s2
410 + str w15,[$ctx,#16*5] // r3
411 + add w15,w16,w16,lsl#2 // r4*5
412 + str w14,[$ctx,#16*6] // s3
413 + str w16,[$ctx,#16*7] // r4
414 + str w15,[$ctx,#16*8] // s4
415 +
416 + ret
417 +.size poly1305_splat,.-poly1305_splat
418 +
419 +#ifdef __KERNEL__
420 +.globl poly1305_blocks_neon
421 +#endif
422 +.type poly1305_blocks_neon,%function
423 +.align 5
424 +poly1305_blocks_neon:
425 +.Lpoly1305_blocks_neon:
426 + ldr $is_base2_26,[$ctx,#24]
427 + cmp $len,#128
428 + b.lo .Lpoly1305_blocks
429 +
430 + .inst 0xd503233f // paciasp
431 + stp x29,x30,[sp,#-80]!
432 + add x29,sp,#0
433 +
434 + stp d8,d9,[sp,#16] // meet ABI requirements
435 + stp d10,d11,[sp,#32]
436 + stp d12,d13,[sp,#48]
437 + stp d14,d15,[sp,#64]
438 +
439 + cbz $is_base2_26,.Lbase2_64_neon
440 +
441 + ldp w10,w11,[$ctx] // load hash value base 2^26
442 + ldp w12,w13,[$ctx,#8]
443 + ldr w14,[$ctx,#16]
444 +
445 + tst $len,#31
446 + b.eq .Leven_neon
447 +
448 + ldp $r0,$r1,[$ctx,#32] // load key value
449 +
450 + add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
451 + lsr $h1,x12,#12
452 + adds $h0,$h0,x12,lsl#52
453 + add $h1,$h1,x13,lsl#14
454 + adc $h1,$h1,xzr
455 + lsr $h2,x14,#24
456 + adds $h1,$h1,x14,lsl#40
457 + adc $d2,$h2,xzr // can be partially reduced...
458 +
459 + ldp $d0,$d1,[$inp],#16 // load input
460 + sub $len,$len,#16
461 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
462 +
463 +#ifdef __AARCH64EB__
464 + rev $d0,$d0
465 + rev $d1,$d1
466 +#endif
467 + adds $h0,$h0,$d0 // accumulate input
468 + adcs $h1,$h1,$d1
469 + adc $h2,$h2,$padbit
470 +
471 + bl poly1305_mult
472 +
473 + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
474 + ubfx x11,$h0,#26,#26
475 + extr x12,$h1,$h0,#52
476 + and x12,x12,#0x03ffffff
477 + ubfx x13,$h1,#14,#26
478 + extr x14,$h2,$h1,#40
479 +
480 + b .Leven_neon
481 +
482 +.align 4
483 +.Lbase2_64_neon:
484 + ldp $r0,$r1,[$ctx,#32] // load key value
485 +
486 + ldp $h0,$h1,[$ctx] // load hash value base 2^64
487 + ldr $h2,[$ctx,#16]
488 +
489 + tst $len,#31
490 + b.eq .Linit_neon
491 +
492 + ldp $d0,$d1,[$inp],#16 // load input
493 + sub $len,$len,#16
494 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
495 +#ifdef __AARCH64EB__
496 + rev $d0,$d0
497 + rev $d1,$d1
498 +#endif
499 + adds $h0,$h0,$d0 // accumulate input
500 + adcs $h1,$h1,$d1
501 + adc $h2,$h2,$padbit
502 +
503 + bl poly1305_mult
504 +
505 +.Linit_neon:
506 + ldr w17,[$ctx,#48] // first table element
507 + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
508 + ubfx x11,$h0,#26,#26
509 + extr x12,$h1,$h0,#52
510 + and x12,x12,#0x03ffffff
511 + ubfx x13,$h1,#14,#26
512 + extr x14,$h2,$h1,#40
513 +
514 + cmp w17,#-1 // is value impossible?
515 + b.ne .Leven_neon
516 +
517 + fmov ${H0},x10
518 + fmov ${H1},x11
519 + fmov ${H2},x12
520 + fmov ${H3},x13
521 + fmov ${H4},x14
522 +
523 + ////////////////////////////////// initialize r^n table
524 + mov $h0,$r0 // r^1
525 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
526 + mov $h1,$r1
527 + mov $h2,xzr
528 + add $ctx,$ctx,#48+12
529 + bl poly1305_splat
530 +
531 + bl poly1305_mult // r^2
532 + sub $ctx,$ctx,#4
533 + bl poly1305_splat
534 +
535 + bl poly1305_mult // r^3
536 + sub $ctx,$ctx,#4
537 + bl poly1305_splat
538 +
539 + bl poly1305_mult // r^4
540 + sub $ctx,$ctx,#4
541 + bl poly1305_splat
542 + sub $ctx,$ctx,#48 // restore original $ctx
543 + b .Ldo_neon
544 +
545 +.align 4
546 +.Leven_neon:
547 + fmov ${H0},x10
548 + fmov ${H1},x11
549 + fmov ${H2},x12
550 + fmov ${H3},x13
551 + fmov ${H4},x14
552 +
553 +.Ldo_neon:
554 + ldp x8,x12,[$inp,#32] // inp[2:3]
555 + subs $len,$len,#64
556 + ldp x9,x13,[$inp,#48]
557 + add $in2,$inp,#96
558 + adr $zeros,.Lzeros
559 +
560 + lsl $padbit,$padbit,#24
561 + add x15,$ctx,#48
562 +
563 +#ifdef __AARCH64EB__
564 + rev x8,x8
565 + rev x12,x12
566 + rev x9,x9
567 + rev x13,x13
568 +#endif
569 + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
570 + and x5,x9,#0x03ffffff
571 + ubfx x6,x8,#26,#26
572 + ubfx x7,x9,#26,#26
573 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
574 + extr x8,x12,x8,#52
575 + extr x9,x13,x9,#52
576 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
577 + fmov $IN23_0,x4
578 + and x8,x8,#0x03ffffff
579 + and x9,x9,#0x03ffffff
580 + ubfx x10,x12,#14,#26
581 + ubfx x11,x13,#14,#26
582 + add x12,$padbit,x12,lsr#40
583 + add x13,$padbit,x13,lsr#40
584 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
585 + fmov $IN23_1,x6
586 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
587 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
588 + fmov $IN23_2,x8
589 + fmov $IN23_3,x10
590 + fmov $IN23_4,x12
591 +
592 + ldp x8,x12,[$inp],#16 // inp[0:1]
593 + ldp x9,x13,[$inp],#48
594 +
595 + ld1 {$R0,$R1,$S1,$R2},[x15],#64
596 + ld1 {$S2,$R3,$S3,$R4},[x15],#64
597 + ld1 {$S4},[x15]
598 +
599 +#ifdef __AARCH64EB__
600 + rev x8,x8
601 + rev x12,x12
602 + rev x9,x9
603 + rev x13,x13
604 +#endif
605 + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
606 + and x5,x9,#0x03ffffff
607 + ubfx x6,x8,#26,#26
608 + ubfx x7,x9,#26,#26
609 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
610 + extr x8,x12,x8,#52
611 + extr x9,x13,x9,#52
612 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
613 + fmov $IN01_0,x4
614 + and x8,x8,#0x03ffffff
615 + and x9,x9,#0x03ffffff
616 + ubfx x10,x12,#14,#26
617 + ubfx x11,x13,#14,#26
618 + add x12,$padbit,x12,lsr#40
619 + add x13,$padbit,x13,lsr#40
620 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
621 + fmov $IN01_1,x6
622 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
623 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
624 + movi $MASK.2d,#-1
625 + fmov $IN01_2,x8
626 + fmov $IN01_3,x10
627 + fmov $IN01_4,x12
628 + ushr $MASK.2d,$MASK.2d,#38
629 +
630 + b.ls .Lskip_loop
631 +
632 +.align 4
633 +.Loop_neon:
634 + ////////////////////////////////////////////////////////////////
635 + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
636 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
637 + // \___________________/
638 + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
639 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
640 + // \___________________/ \____________________/
641 + //
642 + // Note that we start with inp[2:3]*r^2. This is because it
643 + // doesn't depend on reduction in previous iteration.
644 + ////////////////////////////////////////////////////////////////
645 + // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
646 + // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
647 + // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
648 + // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
649 + // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
650 +
651 + subs $len,$len,#64
652 + umull $ACC4,$IN23_0,${R4}[2]
653 + csel $in2,$zeros,$in2,lo
654 + umull $ACC3,$IN23_0,${R3}[2]
655 + umull $ACC2,$IN23_0,${R2}[2]
656 + ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
657 + umull $ACC1,$IN23_0,${R1}[2]
658 + ldp x9,x13,[$in2],#48
659 + umull $ACC0,$IN23_0,${R0}[2]
660 +#ifdef __AARCH64EB__
661 + rev x8,x8
662 + rev x12,x12
663 + rev x9,x9
664 + rev x13,x13
665 +#endif
666 +
667 + umlal $ACC4,$IN23_1,${R3}[2]
668 + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
669 + umlal $ACC3,$IN23_1,${R2}[2]
670 + and x5,x9,#0x03ffffff
671 + umlal $ACC2,$IN23_1,${R1}[2]
672 + ubfx x6,x8,#26,#26
673 + umlal $ACC1,$IN23_1,${R0}[2]
674 + ubfx x7,x9,#26,#26
675 + umlal $ACC0,$IN23_1,${S4}[2]
676 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
677 +
678 + umlal $ACC4,$IN23_2,${R2}[2]
679 + extr x8,x12,x8,#52
680 + umlal $ACC3,$IN23_2,${R1}[2]
681 + extr x9,x13,x9,#52
682 + umlal $ACC2,$IN23_2,${R0}[2]
683 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
684 + umlal $ACC1,$IN23_2,${S4}[2]
685 + fmov $IN23_0,x4
686 + umlal $ACC0,$IN23_2,${S3}[2]
687 + and x8,x8,#0x03ffffff
688 +
689 + umlal $ACC4,$IN23_3,${R1}[2]
690 + and x9,x9,#0x03ffffff
691 + umlal $ACC3,$IN23_3,${R0}[2]
692 + ubfx x10,x12,#14,#26
693 + umlal $ACC2,$IN23_3,${S4}[2]
694 + ubfx x11,x13,#14,#26
695 + umlal $ACC1,$IN23_3,${S3}[2]
696 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
697 + umlal $ACC0,$IN23_3,${S2}[2]
698 + fmov $IN23_1,x6
699 +
700 + add $IN01_2,$IN01_2,$H2
701 + add x12,$padbit,x12,lsr#40
702 + umlal $ACC4,$IN23_4,${R0}[2]
703 + add x13,$padbit,x13,lsr#40
704 + umlal $ACC3,$IN23_4,${S4}[2]
705 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
706 + umlal $ACC2,$IN23_4,${S3}[2]
707 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
708 + umlal $ACC1,$IN23_4,${S2}[2]
709 + fmov $IN23_2,x8
710 + umlal $ACC0,$IN23_4,${S1}[2]
711 + fmov $IN23_3,x10
712 +
713 + ////////////////////////////////////////////////////////////////
714 + // (hash+inp[0:1])*r^4 and accumulate
715 +
716 + add $IN01_0,$IN01_0,$H0
717 + fmov $IN23_4,x12
718 + umlal $ACC3,$IN01_2,${R1}[0]
719 + ldp x8,x12,[$inp],#16 // inp[0:1]
720 + umlal $ACC0,$IN01_2,${S3}[0]
721 + ldp x9,x13,[$inp],#48
722 + umlal $ACC4,$IN01_2,${R2}[0]
723 + umlal $ACC1,$IN01_2,${S4}[0]
724 + umlal $ACC2,$IN01_2,${R0}[0]
725 +#ifdef __AARCH64EB__
726 + rev x8,x8
727 + rev x12,x12
728 + rev x9,x9
729 + rev x13,x13
730 +#endif
731 +
732 + add $IN01_1,$IN01_1,$H1
733 + umlal $ACC3,$IN01_0,${R3}[0]
734 + umlal $ACC4,$IN01_0,${R4}[0]
735 + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
736 + umlal $ACC2,$IN01_0,${R2}[0]
737 + and x5,x9,#0x03ffffff
738 + umlal $ACC0,$IN01_0,${R0}[0]
739 + ubfx x6,x8,#26,#26
740 + umlal $ACC1,$IN01_0,${R1}[0]
741 + ubfx x7,x9,#26,#26
742 +
743 + add $IN01_3,$IN01_3,$H3
744 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
745 + umlal $ACC3,$IN01_1,${R2}[0]
746 + extr x8,x12,x8,#52
747 + umlal $ACC4,$IN01_1,${R3}[0]
748 + extr x9,x13,x9,#52
749 + umlal $ACC0,$IN01_1,${S4}[0]
750 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
751 + umlal $ACC2,$IN01_1,${R1}[0]
752 + fmov $IN01_0,x4
753 + umlal $ACC1,$IN01_1,${R0}[0]
754 + and x8,x8,#0x03ffffff
755 +
756 + add $IN01_4,$IN01_4,$H4
757 + and x9,x9,#0x03ffffff
758 + umlal $ACC3,$IN01_3,${R0}[0]
759 + ubfx x10,x12,#14,#26
760 + umlal $ACC0,$IN01_3,${S2}[0]
761 + ubfx x11,x13,#14,#26
762 + umlal $ACC4,$IN01_3,${R1}[0]
763 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
764 + umlal $ACC1,$IN01_3,${S3}[0]
765 + fmov $IN01_1,x6
766 + umlal $ACC2,$IN01_3,${S4}[0]
767 + add x12,$padbit,x12,lsr#40
768 +
769 + umlal $ACC3,$IN01_4,${S4}[0]
770 + add x13,$padbit,x13,lsr#40
771 + umlal $ACC0,$IN01_4,${S1}[0]
772 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
773 + umlal $ACC4,$IN01_4,${R0}[0]
774 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
775 + umlal $ACC1,$IN01_4,${S2}[0]
776 + fmov $IN01_2,x8
777 + umlal $ACC2,$IN01_4,${S3}[0]
778 + fmov $IN01_3,x10
779 + fmov $IN01_4,x12
780 +
781 + /////////////////////////////////////////////////////////////////
782 + // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
783 + // and P. Schwabe
784 + //
785 + // [see discussion in poly1305-armv4 module]
786 +
787 + ushr $T0.2d,$ACC3,#26
788 + xtn $H3,$ACC3
789 + ushr $T1.2d,$ACC0,#26
790 + and $ACC0,$ACC0,$MASK.2d
791 + add $ACC4,$ACC4,$T0.2d // h3 -> h4
792 + bic $H3,#0xfc,lsl#24 // &=0x03ffffff
793 + add $ACC1,$ACC1,$T1.2d // h0 -> h1
794 +
795 + ushr $T0.2d,$ACC4,#26
796 + xtn $H4,$ACC4
797 + ushr $T1.2d,$ACC1,#26
798 + xtn $H1,$ACC1
799 + bic $H4,#0xfc,lsl#24
800 + add $ACC2,$ACC2,$T1.2d // h1 -> h2
801 +
802 + add $ACC0,$ACC0,$T0.2d
803 + shl $T0.2d,$T0.2d,#2
804 + shrn $T1.2s,$ACC2,#26
805 + xtn $H2,$ACC2
806 + add $ACC0,$ACC0,$T0.2d // h4 -> h0
807 + bic $H1,#0xfc,lsl#24
808 + add $H3,$H3,$T1.2s // h2 -> h3
809 + bic $H2,#0xfc,lsl#24
810 +
811 + shrn $T0.2s,$ACC0,#26
812 + xtn $H0,$ACC0
813 + ushr $T1.2s,$H3,#26
814 + bic $H3,#0xfc,lsl#24
815 + bic $H0,#0xfc,lsl#24
816 + add $H1,$H1,$T0.2s // h0 -> h1
817 + add $H4,$H4,$T1.2s // h3 -> h4
818 +
819 + b.hi .Loop_neon
820 +
821 +.Lskip_loop:
822 + dup $IN23_2,${IN23_2}[0]
823 + add $IN01_2,$IN01_2,$H2
824 +
825 + ////////////////////////////////////////////////////////////////
826 + // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
827 +
828 + adds $len,$len,#32
829 + b.ne .Long_tail
830 +
831 + dup $IN23_2,${IN01_2}[0]
832 + add $IN23_0,$IN01_0,$H0
833 + add $IN23_3,$IN01_3,$H3
834 + add $IN23_1,$IN01_1,$H1
835 + add $IN23_4,$IN01_4,$H4
836 +
837 +.Long_tail:
838 + dup $IN23_0,${IN23_0}[0]
839 + umull2 $ACC0,$IN23_2,${S3}
840 + umull2 $ACC3,$IN23_2,${R1}
841 + umull2 $ACC4,$IN23_2,${R2}
842 + umull2 $ACC2,$IN23_2,${R0}
843 + umull2 $ACC1,$IN23_2,${S4}
844 +
845 + dup $IN23_1,${IN23_1}[0]
846 + umlal2 $ACC0,$IN23_0,${R0}
847 + umlal2 $ACC2,$IN23_0,${R2}
848 + umlal2 $ACC3,$IN23_0,${R3}
849 + umlal2 $ACC4,$IN23_0,${R4}
850 + umlal2 $ACC1,$IN23_0,${R1}
851 +
852 + dup $IN23_3,${IN23_3}[0]
853 + umlal2 $ACC0,$IN23_1,${S4}
854 + umlal2 $ACC3,$IN23_1,${R2}
855 + umlal2 $ACC2,$IN23_1,${R1}
856 + umlal2 $ACC4,$IN23_1,${R3}
857 + umlal2 $ACC1,$IN23_1,${R0}
858 +
859 + dup $IN23_4,${IN23_4}[0]
860 + umlal2 $ACC3,$IN23_3,${R0}
861 + umlal2 $ACC4,$IN23_3,${R1}
862 + umlal2 $ACC0,$IN23_3,${S2}
863 + umlal2 $ACC1,$IN23_3,${S3}
864 + umlal2 $ACC2,$IN23_3,${S4}
865 +
866 + umlal2 $ACC3,$IN23_4,${S4}
867 + umlal2 $ACC0,$IN23_4,${S1}
868 + umlal2 $ACC4,$IN23_4,${R0}
869 + umlal2 $ACC1,$IN23_4,${S2}
870 + umlal2 $ACC2,$IN23_4,${S3}
871 +
872 + b.eq .Lshort_tail
873 +
874 + ////////////////////////////////////////////////////////////////
875 + // (hash+inp[0:1])*r^4:r^3 and accumulate
876 +
877 + add $IN01_0,$IN01_0,$H0
878 + umlal $ACC3,$IN01_2,${R1}
879 + umlal $ACC0,$IN01_2,${S3}
880 + umlal $ACC4,$IN01_2,${R2}
881 + umlal $ACC1,$IN01_2,${S4}
882 + umlal $ACC2,$IN01_2,${R0}
883 +
884 + add $IN01_1,$IN01_1,$H1
885 + umlal $ACC3,$IN01_0,${R3}
886 + umlal $ACC0,$IN01_0,${R0}
887 + umlal $ACC4,$IN01_0,${R4}
888 + umlal $ACC1,$IN01_0,${R1}
889 + umlal $ACC2,$IN01_0,${R2}
890 +
891 + add $IN01_3,$IN01_3,$H3
892 + umlal $ACC3,$IN01_1,${R2}
893 + umlal $ACC0,$IN01_1,${S4}
894 + umlal $ACC4,$IN01_1,${R3}
895 + umlal $ACC1,$IN01_1,${R0}
896 + umlal $ACC2,$IN01_1,${R1}
897 +
898 + add $IN01_4,$IN01_4,$H4
899 + umlal $ACC3,$IN01_3,${R0}
900 + umlal $ACC0,$IN01_3,${S2}
901 + umlal $ACC4,$IN01_3,${R1}
902 + umlal $ACC1,$IN01_3,${S3}
903 + umlal $ACC2,$IN01_3,${S4}
904 +
905 + umlal $ACC3,$IN01_4,${S4}
906 + umlal $ACC0,$IN01_4,${S1}
907 + umlal $ACC4,$IN01_4,${R0}
908 + umlal $ACC1,$IN01_4,${S2}
909 + umlal $ACC2,$IN01_4,${S3}
910 +
911 +.Lshort_tail:
912 + ////////////////////////////////////////////////////////////////
913 + // horizontal add
914 +
915 + addp $ACC3,$ACC3,$ACC3
916 + ldp d8,d9,[sp,#16] // meet ABI requirements
917 + addp $ACC0,$ACC0,$ACC0
918 + ldp d10,d11,[sp,#32]
919 + addp $ACC4,$ACC4,$ACC4
920 + ldp d12,d13,[sp,#48]
921 + addp $ACC1,$ACC1,$ACC1
922 + ldp d14,d15,[sp,#64]
923 + addp $ACC2,$ACC2,$ACC2
924 + ldr x30,[sp,#8]
925 + .inst 0xd50323bf // autiasp
926 +
927 + ////////////////////////////////////////////////////////////////
928 + // lazy reduction, but without narrowing
929 +
930 + ushr $T0.2d,$ACC3,#26
931 + and $ACC3,$ACC3,$MASK.2d
932 + ushr $T1.2d,$ACC0,#26
933 + and $ACC0,$ACC0,$MASK.2d
934 +
935 + add $ACC4,$ACC4,$T0.2d // h3 -> h4
936 + add $ACC1,$ACC1,$T1.2d // h0 -> h1
937 +
938 + ushr $T0.2d,$ACC4,#26
939 + and $ACC4,$ACC4,$MASK.2d
940 + ushr $T1.2d,$ACC1,#26
941 + and $ACC1,$ACC1,$MASK.2d
942 + add $ACC2,$ACC2,$T1.2d // h1 -> h2
943 +
944 + add $ACC0,$ACC0,$T0.2d
945 + shl $T0.2d,$T0.2d,#2
946 + ushr $T1.2d,$ACC2,#26
947 + and $ACC2,$ACC2,$MASK.2d
948 + add $ACC0,$ACC0,$T0.2d // h4 -> h0
949 + add $ACC3,$ACC3,$T1.2d // h2 -> h3
950 +
951 + ushr $T0.2d,$ACC0,#26
952 + and $ACC0,$ACC0,$MASK.2d
953 + ushr $T1.2d,$ACC3,#26
954 + and $ACC3,$ACC3,$MASK.2d
955 + add $ACC1,$ACC1,$T0.2d // h0 -> h1
956 + add $ACC4,$ACC4,$T1.2d // h3 -> h4
957 +
958 + ////////////////////////////////////////////////////////////////
959 + // write the result, can be partially reduced
960 +
961 + st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
962 + mov x4,#1
963 + st1 {$ACC4}[0],[$ctx]
964 + str x4,[$ctx,#8] // set is_base2_26
965 +
966 + ldr x29,[sp],#80
967 + ret
968 +.size poly1305_blocks_neon,.-poly1305_blocks_neon
969 +
970 +.align 5
971 +.Lzeros:
972 +.long 0,0,0,0,0,0,0,0
973 +.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
974 +.align 2
975 +#if !defined(__KERNEL__) && !defined(_WIN64)
976 +.comm OPENSSL_armcap_P,4,4
977 +.hidden OPENSSL_armcap_P
978 +#endif
979 +___
980 +
981 +foreach (split("\n",$code)) {
982 + s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
983 + s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
984 + (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
985 + (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
986 + (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
987 + (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
988 + (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
989 +
990 + s/\.[124]([sd])\[/.$1\[/;
991 + s/w#x([0-9]+)/w$1/g;
992 +
993 + print $_,"\n";
994 +}
995 +close STDOUT;
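Note (not part of the patch): poly1305_splat above converts a value held in base 2^64 limbs into the five 26-bit limbs used by the NEON path, and also stores the *5 multiples (s1..s4) at 16-byte strides so that the key powers r^1..r^4 end up interleaved across vector lanes. A minimal C sketch of just the radix split, with an invented helper name and assuming h2 carries at most the two bits above 2^128:

/* base 2^64 -> base 2^26, mirroring the bit manipulation in poly1305_splat */
static void radix_2_26_sketch(unsigned int out[5],
                              unsigned long long h0,
                              unsigned long long h1,
                              unsigned long long h2)
{
        out[0] = h0 & 0x03ffffff;                               /* bits   0..25  */
        out[1] = (h0 >> 26) & 0x03ffffff;                       /* bits  26..51  */
        out[2] = ((h0 >> 52) | (h1 << 12)) & 0x03ffffff;        /* bits  52..77  */
        out[3] = (h1 >> 14) & 0x03ffffff;                       /* bits  78..103 */
        out[4] = (unsigned int)((h1 >> 40) | (h2 << 24));       /* bits 104..129 */
}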
996 --- /dev/null
997 +++ b/arch/arm64/crypto/poly1305-core.S_shipped
998 @@ -0,0 +1,835 @@
999 +#ifndef __KERNEL__
1000 +# include "arm_arch.h"
1001 +.extern OPENSSL_armcap_P
1002 +#endif
1003 +
1004 +.text
1005 +
1006 +// forward "declarations" are required for Apple
1007 +.globl poly1305_blocks
1008 +.globl poly1305_emit
1009 +
1010 +.globl poly1305_init
1011 +.type poly1305_init,%function
1012 +.align 5
1013 +poly1305_init:
1014 + cmp x1,xzr
1015 + stp xzr,xzr,[x0] // zero hash value
1016 + stp xzr,xzr,[x0,#16] // [along with is_base2_26]
1017 +
1018 + csel x0,xzr,x0,eq
1019 + b.eq .Lno_key
1020 +
1021 +#ifndef __KERNEL__
1022 + adrp x17,OPENSSL_armcap_P
1023 + ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
1024 +#endif
1025 +
1026 + ldp x7,x8,[x1] // load key
1027 + mov x9,#0xfffffffc0fffffff
1028 + movk x9,#0x0fff,lsl#48
1029 +#ifdef __AARCH64EB__
1030 + rev x7,x7 // flip bytes
1031 + rev x8,x8
1032 +#endif
1033 + and x7,x7,x9 // &=0ffffffc0fffffff
1034 + and x9,x9,#-4
1035 + and x8,x8,x9 // &=0ffffffc0ffffffc
1036 + mov w9,#-1
1037 + stp x7,x8,[x0,#32] // save key value
1038 + str w9,[x0,#48] // impossible key power value
1039 +
1040 +#ifndef __KERNEL__
1041 + tst w17,#ARMV7_NEON
1042 +
1043 + adr x12,.Lpoly1305_blocks
1044 + adr x7,.Lpoly1305_blocks_neon
1045 + adr x13,.Lpoly1305_emit
1046 +
1047 + csel x12,x12,x7,eq
1048 +
1049 +# ifdef __ILP32__
1050 + stp w12,w13,[x2]
1051 +# else
1052 + stp x12,x13,[x2]
1053 +# endif
1054 +#endif
1055 + mov x0,#1
1056 +.Lno_key:
1057 + ret
1058 +.size poly1305_init,.-poly1305_init
1059 +
1060 +.type poly1305_blocks,%function
1061 +.align 5
1062 +poly1305_blocks:
1063 +.Lpoly1305_blocks:
1064 + ands x2,x2,#-16
1065 + b.eq .Lno_data
1066 +
1067 + ldp x4,x5,[x0] // load hash value
1068 + ldp x6,x17,[x0,#16] // [along with is_base2_26]
1069 + ldp x7,x8,[x0,#32] // load key value
1070 +
1071 +#ifdef __AARCH64EB__
1072 + lsr x12,x4,#32
1073 + mov w13,w4
1074 + lsr x14,x5,#32
1075 + mov w15,w5
1076 + lsr x16,x6,#32
1077 +#else
1078 + mov w12,w4
1079 + lsr x13,x4,#32
1080 + mov w14,w5
1081 + lsr x15,x5,#32
1082 + mov w16,w6
1083 +#endif
1084 +
1085 + add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
1086 + lsr x13,x14,#12
1087 + adds x12,x12,x14,lsl#52
1088 + add x13,x13,x15,lsl#14
1089 + adc x13,x13,xzr
1090 + lsr x14,x16,#24
1091 + adds x13,x13,x16,lsl#40
1092 + adc x14,x14,xzr
1093 +
1094 + cmp x17,#0 // is_base2_26?
1095 + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
1096 + csel x4,x4,x12,eq // choose between radixes
1097 + csel x5,x5,x13,eq
1098 + csel x6,x6,x14,eq
1099 +
1100 +.Loop:
1101 + ldp x10,x11,[x1],#16 // load input
1102 + sub x2,x2,#16
1103 +#ifdef __AARCH64EB__
1104 + rev x10,x10
1105 + rev x11,x11
1106 +#endif
1107 + adds x4,x4,x10 // accumulate input
1108 + adcs x5,x5,x11
1109 +
1110 + mul x12,x4,x7 // h0*r0
1111 + adc x6,x6,x3
1112 + umulh x13,x4,x7
1113 +
1114 + mul x10,x5,x9 // h1*5*r1
1115 + umulh x11,x5,x9
1116 +
1117 + adds x12,x12,x10
1118 + mul x10,x4,x8 // h0*r1
1119 + adc x13,x13,x11
1120 + umulh x14,x4,x8
1121 +
1122 + adds x13,x13,x10
1123 + mul x10,x5,x7 // h1*r0
1124 + adc x14,x14,xzr
1125 + umulh x11,x5,x7
1126 +
1127 + adds x13,x13,x10
1128 + mul x10,x6,x9 // h2*5*r1
1129 + adc x14,x14,x11
1130 + mul x11,x6,x7 // h2*r0
1131 +
1132 + adds x13,x13,x10
1133 + adc x14,x14,x11
1134 +
1135 + and x10,x14,#-4 // final reduction
1136 + and x6,x14,#3
1137 + add x10,x10,x14,lsr#2
1138 + adds x4,x12,x10
1139 + adcs x5,x13,xzr
1140 + adc x6,x6,xzr
1141 +
1142 + cbnz x2,.Loop
1143 +
1144 + stp x4,x5,[x0] // store hash value
1145 + stp x6,xzr,[x0,#16] // [and clear is_base2_26]
1146 +
1147 +.Lno_data:
1148 + ret
1149 +.size poly1305_blocks,.-poly1305_blocks
1150 +
1151 +.type poly1305_emit,%function
1152 +.align 5
1153 +poly1305_emit:
1154 +.Lpoly1305_emit:
1155 + ldp x4,x5,[x0] // load hash base 2^64
1156 + ldp x6,x7,[x0,#16] // [along with is_base2_26]
1157 + ldp x10,x11,[x2] // load nonce
1158 +
1159 +#ifdef __AARCH64EB__
1160 + lsr x12,x4,#32
1161 + mov w13,w4
1162 + lsr x14,x5,#32
1163 + mov w15,w5
1164 + lsr x16,x6,#32
1165 +#else
1166 + mov w12,w4
1167 + lsr x13,x4,#32
1168 + mov w14,w5
1169 + lsr x15,x5,#32
1170 + mov w16,w6
1171 +#endif
1172 +
1173 + add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
1174 + lsr x13,x14,#12
1175 + adds x12,x12,x14,lsl#52
1176 + add x13,x13,x15,lsl#14
1177 + adc x13,x13,xzr
1178 + lsr x14,x16,#24
1179 + adds x13,x13,x16,lsl#40
1180 + adc x14,x14,xzr
1181 +
1182 + cmp x7,#0 // is_base2_26?
1183 + csel x4,x4,x12,eq // choose between radixes
1184 + csel x5,x5,x13,eq
1185 + csel x6,x6,x14,eq
1186 +
1187 + adds x12,x4,#5 // compare to modulus
1188 + adcs x13,x5,xzr
1189 + adc x14,x6,xzr
1190 +
1191 + tst x14,#-4 // see if it's carried/borrowed
1192 +
1193 + csel x4,x4,x12,eq
1194 + csel x5,x5,x13,eq
1195 +
1196 +#ifdef __AARCH64EB__
1197 + ror x10,x10,#32 // flip nonce words
1198 + ror x11,x11,#32
1199 +#endif
1200 + adds x4,x4,x10 // accumulate nonce
1201 + adc x5,x5,x11
1202 +#ifdef __AARCH64EB__
1203 + rev x4,x4 // flip output bytes
1204 + rev x5,x5
1205 +#endif
1206 + stp x4,x5,[x1] // write result
1207 +
1208 + ret
1209 +.size poly1305_emit,.-poly1305_emit
1210 +.type poly1305_mult,%function
1211 +.align 5
1212 +poly1305_mult:
1213 + mul x12,x4,x7 // h0*r0
1214 + umulh x13,x4,x7
1215 +
1216 + mul x10,x5,x9 // h1*5*r1
1217 + umulh x11,x5,x9
1218 +
1219 + adds x12,x12,x10
1220 + mul x10,x4,x8 // h0*r1
1221 + adc x13,x13,x11
1222 + umulh x14,x4,x8
1223 +
1224 + adds x13,x13,x10
1225 + mul x10,x5,x7 // h1*r0
1226 + adc x14,x14,xzr
1227 + umulh x11,x5,x7
1228 +
1229 + adds x13,x13,x10
1230 + mul x10,x6,x9 // h2*5*r1
1231 + adc x14,x14,x11
1232 + mul x11,x6,x7 // h2*r0
1233 +
1234 + adds x13,x13,x10
1235 + adc x14,x14,x11
1236 +
1237 + and x10,x14,#-4 // final reduction
1238 + and x6,x14,#3
1239 + add x10,x10,x14,lsr#2
1240 + adds x4,x12,x10
1241 + adcs x5,x13,xzr
1242 + adc x6,x6,xzr
1243 +
1244 + ret
1245 +.size poly1305_mult,.-poly1305_mult
1246 +
1247 +.type poly1305_splat,%function
1248 +.align 4
1249 +poly1305_splat:
1250 + and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
1251 + ubfx x13,x4,#26,#26
1252 + extr x14,x5,x4,#52
1253 + and x14,x14,#0x03ffffff
1254 + ubfx x15,x5,#14,#26
1255 + extr x16,x6,x5,#40
1256 +
1257 + str w12,[x0,#16*0] // r0
1258 + add w12,w13,w13,lsl#2 // r1*5
1259 + str w13,[x0,#16*1] // r1
1260 + add w13,w14,w14,lsl#2 // r2*5
1261 + str w12,[x0,#16*2] // s1
1262 + str w14,[x0,#16*3] // r2
1263 + add w14,w15,w15,lsl#2 // r3*5
1264 + str w13,[x0,#16*4] // s2
1265 + str w15,[x0,#16*5] // r3
1266 + add w15,w16,w16,lsl#2 // r4*5
1267 + str w14,[x0,#16*6] // s3
1268 + str w16,[x0,#16*7] // r4
1269 + str w15,[x0,#16*8] // s4
1270 +
1271 + ret
1272 +.size poly1305_splat,.-poly1305_splat
1273 +
1274 +#ifdef __KERNEL__
1275 +.globl poly1305_blocks_neon
1276 +#endif
1277 +.type poly1305_blocks_neon,%function
1278 +.align 5
1279 +poly1305_blocks_neon:
1280 +.Lpoly1305_blocks_neon:
1281 + ldr x17,[x0,#24]
1282 + cmp x2,#128
1283 + b.lo .Lpoly1305_blocks
1284 +
1285 + .inst 0xd503233f // paciasp
1286 + stp x29,x30,[sp,#-80]!
1287 + add x29,sp,#0
1288 +
1289 + stp d8,d9,[sp,#16] // meet ABI requirements
1290 + stp d10,d11,[sp,#32]
1291 + stp d12,d13,[sp,#48]
1292 + stp d14,d15,[sp,#64]
1293 +
1294 + cbz x17,.Lbase2_64_neon
1295 +
1296 + ldp w10,w11,[x0] // load hash value base 2^26
1297 + ldp w12,w13,[x0,#8]
1298 + ldr w14,[x0,#16]
1299 +
1300 + tst x2,#31
1301 + b.eq .Leven_neon
1302 +
1303 + ldp x7,x8,[x0,#32] // load key value
1304 +
1305 + add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
1306 + lsr x5,x12,#12
1307 + adds x4,x4,x12,lsl#52
1308 + add x5,x5,x13,lsl#14
1309 + adc x5,x5,xzr
1310 + lsr x6,x14,#24
1311 + adds x5,x5,x14,lsl#40
1312 + adc x14,x6,xzr // can be partially reduced...
1313 +
1314 + ldp x12,x13,[x1],#16 // load input
1315 + sub x2,x2,#16
1316 + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
1317 +
1318 +#ifdef __AARCH64EB__
1319 + rev x12,x12
1320 + rev x13,x13
1321 +#endif
1322 + adds x4,x4,x12 // accumulate input
1323 + adcs x5,x5,x13
1324 + adc x6,x6,x3
1325 +
1326 + bl poly1305_mult
1327 +
1328 + and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
1329 + ubfx x11,x4,#26,#26
1330 + extr x12,x5,x4,#52
1331 + and x12,x12,#0x03ffffff
1332 + ubfx x13,x5,#14,#26
1333 + extr x14,x6,x5,#40
1334 +
1335 + b .Leven_neon
1336 +
1337 +.align 4
1338 +.Lbase2_64_neon:
1339 + ldp x7,x8,[x0,#32] // load key value
1340 +
1341 + ldp x4,x5,[x0] // load hash value base 2^64
1342 + ldr x6,[x0,#16]
1343 +
1344 + tst x2,#31
1345 + b.eq .Linit_neon
1346 +
1347 + ldp x12,x13,[x1],#16 // load input
1348 + sub x2,x2,#16
1349 + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
1350 +#ifdef __AARCH64EB__
1351 + rev x12,x12
1352 + rev x13,x13
1353 +#endif
1354 + adds x4,x4,x12 // accumulate input
1355 + adcs x5,x5,x13
1356 + adc x6,x6,x3
1357 +
1358 + bl poly1305_mult
1359 +
1360 +.Linit_neon:
1361 + ldr w17,[x0,#48] // first table element
1362 + and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
1363 + ubfx x11,x4,#26,#26
1364 + extr x12,x5,x4,#52
1365 + and x12,x12,#0x03ffffff
1366 + ubfx x13,x5,#14,#26
1367 + extr x14,x6,x5,#40
1368 +
1369 + cmp w17,#-1 // is value impossible?
1370 + b.ne .Leven_neon
1371 +
1372 + fmov d24,x10
1373 + fmov d25,x11
1374 + fmov d26,x12
1375 + fmov d27,x13
1376 + fmov d28,x14
1377 +
1378 + ////////////////////////////////// initialize r^n table
1379 + mov x4,x7 // r^1
1380 + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
1381 + mov x5,x8
1382 + mov x6,xzr
1383 + add x0,x0,#48+12
1384 + bl poly1305_splat
1385 +
1386 + bl poly1305_mult // r^2
1387 + sub x0,x0,#4
1388 + bl poly1305_splat
1389 +
1390 + bl poly1305_mult // r^3
1391 + sub x0,x0,#4
1392 + bl poly1305_splat
1393 +
1394 + bl poly1305_mult // r^4
1395 + sub x0,x0,#4
1396 + bl poly1305_splat
1397 + sub x0,x0,#48 // restore original x0
1398 + b .Ldo_neon
1399 +
1400 +.align 4
1401 +.Leven_neon:
1402 + fmov d24,x10
1403 + fmov d25,x11
1404 + fmov d26,x12
1405 + fmov d27,x13
1406 + fmov d28,x14
1407 +
1408 +.Ldo_neon:
1409 + ldp x8,x12,[x1,#32] // inp[2:3]
1410 + subs x2,x2,#64
1411 + ldp x9,x13,[x1,#48]
1412 + add x16,x1,#96
1413 + adr x17,.Lzeros
1414 +
1415 + lsl x3,x3,#24
1416 + add x15,x0,#48
1417 +
1418 +#ifdef __AARCH64EB__
1419 + rev x8,x8
1420 + rev x12,x12
1421 + rev x9,x9
1422 + rev x13,x13
1423 +#endif
1424 + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
1425 + and x5,x9,#0x03ffffff
1426 + ubfx x6,x8,#26,#26
1427 + ubfx x7,x9,#26,#26
1428 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
1429 + extr x8,x12,x8,#52
1430 + extr x9,x13,x9,#52
1431 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
1432 + fmov d14,x4
1433 + and x8,x8,#0x03ffffff
1434 + and x9,x9,#0x03ffffff
1435 + ubfx x10,x12,#14,#26
1436 + ubfx x11,x13,#14,#26
1437 + add x12,x3,x12,lsr#40
1438 + add x13,x3,x13,lsr#40
1439 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
1440 + fmov d15,x6
1441 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
1442 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
1443 + fmov d16,x8
1444 + fmov d17,x10
1445 + fmov d18,x12
1446 +
1447 + ldp x8,x12,[x1],#16 // inp[0:1]
1448 + ldp x9,x13,[x1],#48
1449 +
1450 + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
1451 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
1452 + ld1 {v8.4s},[x15]
1453 +
1454 +#ifdef __AARCH64EB__
1455 + rev x8,x8
1456 + rev x12,x12
1457 + rev x9,x9
1458 + rev x13,x13
1459 +#endif
1460 + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
1461 + and x5,x9,#0x03ffffff
1462 + ubfx x6,x8,#26,#26
1463 + ubfx x7,x9,#26,#26
1464 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
1465 + extr x8,x12,x8,#52
1466 + extr x9,x13,x9,#52
1467 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
1468 + fmov d9,x4
1469 + and x8,x8,#0x03ffffff
1470 + and x9,x9,#0x03ffffff
1471 + ubfx x10,x12,#14,#26
1472 + ubfx x11,x13,#14,#26
1473 + add x12,x3,x12,lsr#40
1474 + add x13,x3,x13,lsr#40
1475 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
1476 + fmov d10,x6
1477 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
1478 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
1479 + movi v31.2d,#-1
1480 + fmov d11,x8
1481 + fmov d12,x10
1482 + fmov d13,x12
1483 + ushr v31.2d,v31.2d,#38
1484 +
1485 + b.ls .Lskip_loop
1486 +
1487 +.align 4
1488 +.Loop_neon:
1489 + ////////////////////////////////////////////////////////////////
1490 + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
1491 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
1492 + // ___________________/
1493 + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
1494 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
1495 + // ___________________/ ____________________/
1496 + //
1497 + // Note that we start with inp[2:3]*r^2. This is because it
1498 + // doesn't depend on reduction in previous iteration.
1499 + ////////////////////////////////////////////////////////////////
1500 + // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
1501 + // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
1502 + // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
1503 + // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
1504 + // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
1505 +
1506 + subs x2,x2,#64
1507 + umull v23.2d,v14.2s,v7.s[2]
1508 + csel x16,x17,x16,lo
1509 + umull v22.2d,v14.2s,v5.s[2]
1510 + umull v21.2d,v14.2s,v3.s[2]
1511 + ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
1512 + umull v20.2d,v14.2s,v1.s[2]
1513 + ldp x9,x13,[x16],#48
1514 + umull v19.2d,v14.2s,v0.s[2]
1515 +#ifdef __AARCH64EB__
1516 + rev x8,x8
1517 + rev x12,x12
1518 + rev x9,x9
1519 + rev x13,x13
1520 +#endif
1521 +
1522 + umlal v23.2d,v15.2s,v5.s[2]
1523 + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
1524 + umlal v22.2d,v15.2s,v3.s[2]
1525 + and x5,x9,#0x03ffffff
1526 + umlal v21.2d,v15.2s,v1.s[2]
1527 + ubfx x6,x8,#26,#26
1528 + umlal v20.2d,v15.2s,v0.s[2]
1529 + ubfx x7,x9,#26,#26
1530 + umlal v19.2d,v15.2s,v8.s[2]
1531 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
1532 +
1533 + umlal v23.2d,v16.2s,v3.s[2]
1534 + extr x8,x12,x8,#52
1535 + umlal v22.2d,v16.2s,v1.s[2]
1536 + extr x9,x13,x9,#52
1537 + umlal v21.2d,v16.2s,v0.s[2]
1538 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
1539 + umlal v20.2d,v16.2s,v8.s[2]
1540 + fmov d14,x4
1541 + umlal v19.2d,v16.2s,v6.s[2]
1542 + and x8,x8,#0x03ffffff
1543 +
1544 + umlal v23.2d,v17.2s,v1.s[2]
1545 + and x9,x9,#0x03ffffff
1546 + umlal v22.2d,v17.2s,v0.s[2]
1547 + ubfx x10,x12,#14,#26
1548 + umlal v21.2d,v17.2s,v8.s[2]
1549 + ubfx x11,x13,#14,#26
1550 + umlal v20.2d,v17.2s,v6.s[2]
1551 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
1552 + umlal v19.2d,v17.2s,v4.s[2]
1553 + fmov d15,x6
1554 +
1555 + add v11.2s,v11.2s,v26.2s
1556 + add x12,x3,x12,lsr#40
1557 + umlal v23.2d,v18.2s,v0.s[2]
1558 + add x13,x3,x13,lsr#40
1559 + umlal v22.2d,v18.2s,v8.s[2]
1560 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
1561 + umlal v21.2d,v18.2s,v6.s[2]
1562 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
1563 + umlal v20.2d,v18.2s,v4.s[2]
1564 + fmov d16,x8
1565 + umlal v19.2d,v18.2s,v2.s[2]
1566 + fmov d17,x10
1567 +
1568 + ////////////////////////////////////////////////////////////////
1569 + // (hash+inp[0:1])*r^4 and accumulate
1570 +
1571 + add v9.2s,v9.2s,v24.2s
1572 + fmov d18,x12
1573 + umlal v22.2d,v11.2s,v1.s[0]
1574 + ldp x8,x12,[x1],#16 // inp[0:1]
1575 + umlal v19.2d,v11.2s,v6.s[0]
1576 + ldp x9,x13,[x1],#48
1577 + umlal v23.2d,v11.2s,v3.s[0]
1578 + umlal v20.2d,v11.2s,v8.s[0]
1579 + umlal v21.2d,v11.2s,v0.s[0]
1580 +#ifdef __AARCH64EB__
1581 + rev x8,x8
1582 + rev x12,x12
1583 + rev x9,x9
1584 + rev x13,x13
1585 +#endif
1586 +
1587 + add v10.2s,v10.2s,v25.2s
1588 + umlal v22.2d,v9.2s,v5.s[0]
1589 + umlal v23.2d,v9.2s,v7.s[0]
1590 + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
1591 + umlal v21.2d,v9.2s,v3.s[0]
1592 + and x5,x9,#0x03ffffff
1593 + umlal v19.2d,v9.2s,v0.s[0]
1594 + ubfx x6,x8,#26,#26
1595 + umlal v20.2d,v9.2s,v1.s[0]
1596 + ubfx x7,x9,#26,#26
1597 +
1598 + add v12.2s,v12.2s,v27.2s
1599 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
1600 + umlal v22.2d,v10.2s,v3.s[0]
1601 + extr x8,x12,x8,#52
1602 + umlal v23.2d,v10.2s,v5.s[0]
1603 + extr x9,x13,x9,#52
1604 + umlal v19.2d,v10.2s,v8.s[0]
1605 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
1606 + umlal v21.2d,v10.2s,v1.s[0]
1607 + fmov d9,x4
1608 + umlal v20.2d,v10.2s,v0.s[0]
1609 + and x8,x8,#0x03ffffff
1610 +
1611 + add v13.2s,v13.2s,v28.2s
1612 + and x9,x9,#0x03ffffff
1613 + umlal v22.2d,v12.2s,v0.s[0]
1614 + ubfx x10,x12,#14,#26
1615 + umlal v19.2d,v12.2s,v4.s[0]
1616 + ubfx x11,x13,#14,#26
1617 + umlal v23.2d,v12.2s,v1.s[0]
1618 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
1619 + umlal v20.2d,v12.2s,v6.s[0]
1620 + fmov d10,x6
1621 + umlal v21.2d,v12.2s,v8.s[0]
1622 + add x12,x3,x12,lsr#40
1623 +
1624 + umlal v22.2d,v13.2s,v8.s[0]
1625 + add x13,x3,x13,lsr#40
1626 + umlal v19.2d,v13.2s,v2.s[0]
1627 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
1628 + umlal v23.2d,v13.2s,v0.s[0]
1629 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
1630 + umlal v20.2d,v13.2s,v4.s[0]
1631 + fmov d11,x8
1632 + umlal v21.2d,v13.2s,v6.s[0]
1633 + fmov d12,x10
1634 + fmov d13,x12
1635 +
1636 + /////////////////////////////////////////////////////////////////
1637 + // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1638 + // and P. Schwabe
1639 + //
1640 + // [see discussion in poly1305-armv4 module]
1641 +
1642 + ushr v29.2d,v22.2d,#26
1643 + xtn v27.2s,v22.2d
1644 + ushr v30.2d,v19.2d,#26
1645 + and v19.16b,v19.16b,v31.16b
1646 + add v23.2d,v23.2d,v29.2d // h3 -> h4
1647 + bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
1648 + add v20.2d,v20.2d,v30.2d // h0 -> h1
1649 +
1650 + ushr v29.2d,v23.2d,#26
1651 + xtn v28.2s,v23.2d
1652 + ushr v30.2d,v20.2d,#26
1653 + xtn v25.2s,v20.2d
1654 + bic v28.2s,#0xfc,lsl#24
1655 + add v21.2d,v21.2d,v30.2d // h1 -> h2
1656 +
1657 + add v19.2d,v19.2d,v29.2d
1658 + shl v29.2d,v29.2d,#2
1659 + shrn v30.2s,v21.2d,#26
1660 + xtn v26.2s,v21.2d
1661 + add v19.2d,v19.2d,v29.2d // h4 -> h0
1662 + bic v25.2s,#0xfc,lsl#24
1663 + add v27.2s,v27.2s,v30.2s // h2 -> h3
1664 + bic v26.2s,#0xfc,lsl#24
1665 +
1666 + shrn v29.2s,v19.2d,#26
1667 + xtn v24.2s,v19.2d
1668 + ushr v30.2s,v27.2s,#26
1669 + bic v27.2s,#0xfc,lsl#24
1670 + bic v24.2s,#0xfc,lsl#24
1671 + add v25.2s,v25.2s,v29.2s // h0 -> h1
1672 + add v28.2s,v28.2s,v30.2s // h3 -> h4
1673 +
1674 + b.hi .Loop_neon
1675 +
1676 +.Lskip_loop:
1677 + dup v16.2d,v16.d[0]
1678 + add v11.2s,v11.2s,v26.2s
1679 +
1680 + ////////////////////////////////////////////////////////////////
1681 + // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1682 +
1683 + adds x2,x2,#32
1684 + b.ne .Long_tail
1685 +
1686 + dup v16.2d,v11.d[0]
1687 + add v14.2s,v9.2s,v24.2s
1688 + add v17.2s,v12.2s,v27.2s
1689 + add v15.2s,v10.2s,v25.2s
1690 + add v18.2s,v13.2s,v28.2s
1691 +
1692 +.Long_tail:
1693 + dup v14.2d,v14.d[0]
1694 + umull2 v19.2d,v16.4s,v6.4s
1695 + umull2 v22.2d,v16.4s,v1.4s
1696 + umull2 v23.2d,v16.4s,v3.4s
1697 + umull2 v21.2d,v16.4s,v0.4s
1698 + umull2 v20.2d,v16.4s,v8.4s
1699 +
1700 + dup v15.2d,v15.d[0]
1701 + umlal2 v19.2d,v14.4s,v0.4s
1702 + umlal2 v21.2d,v14.4s,v3.4s
1703 + umlal2 v22.2d,v14.4s,v5.4s
1704 + umlal2 v23.2d,v14.4s,v7.4s
1705 + umlal2 v20.2d,v14.4s,v1.4s
1706 +
1707 + dup v17.2d,v17.d[0]
1708 + umlal2 v19.2d,v15.4s,v8.4s
1709 + umlal2 v22.2d,v15.4s,v3.4s
1710 + umlal2 v21.2d,v15.4s,v1.4s
1711 + umlal2 v23.2d,v15.4s,v5.4s
1712 + umlal2 v20.2d,v15.4s,v0.4s
1713 +
1714 + dup v18.2d,v18.d[0]
1715 + umlal2 v22.2d,v17.4s,v0.4s
1716 + umlal2 v23.2d,v17.4s,v1.4s
1717 + umlal2 v19.2d,v17.4s,v4.4s
1718 + umlal2 v20.2d,v17.4s,v6.4s
1719 + umlal2 v21.2d,v17.4s,v8.4s
1720 +
1721 + umlal2 v22.2d,v18.4s,v8.4s
1722 + umlal2 v19.2d,v18.4s,v2.4s
1723 + umlal2 v23.2d,v18.4s,v0.4s
1724 + umlal2 v20.2d,v18.4s,v4.4s
1725 + umlal2 v21.2d,v18.4s,v6.4s
1726 +
1727 + b.eq .Lshort_tail
1728 +
1729 + ////////////////////////////////////////////////////////////////
1730 + // (hash+inp[0:1])*r^4:r^3 and accumulate
1731 +
1732 + add v9.2s,v9.2s,v24.2s
1733 + umlal v22.2d,v11.2s,v1.2s
1734 + umlal v19.2d,v11.2s,v6.2s
1735 + umlal v23.2d,v11.2s,v3.2s
1736 + umlal v20.2d,v11.2s,v8.2s
1737 + umlal v21.2d,v11.2s,v0.2s
1738 +
1739 + add v10.2s,v10.2s,v25.2s
1740 + umlal v22.2d,v9.2s,v5.2s
1741 + umlal v19.2d,v9.2s,v0.2s
1742 + umlal v23.2d,v9.2s,v7.2s
1743 + umlal v20.2d,v9.2s,v1.2s
1744 + umlal v21.2d,v9.2s,v3.2s
1745 +
1746 + add v12.2s,v12.2s,v27.2s
1747 + umlal v22.2d,v10.2s,v3.2s
1748 + umlal v19.2d,v10.2s,v8.2s
1749 + umlal v23.2d,v10.2s,v5.2s
1750 + umlal v20.2d,v10.2s,v0.2s
1751 + umlal v21.2d,v10.2s,v1.2s
1752 +
1753 + add v13.2s,v13.2s,v28.2s
1754 + umlal v22.2d,v12.2s,v0.2s
1755 + umlal v19.2d,v12.2s,v4.2s
1756 + umlal v23.2d,v12.2s,v1.2s
1757 + umlal v20.2d,v12.2s,v6.2s
1758 + umlal v21.2d,v12.2s,v8.2s
1759 +
1760 + umlal v22.2d,v13.2s,v8.2s
1761 + umlal v19.2d,v13.2s,v2.2s
1762 + umlal v23.2d,v13.2s,v0.2s
1763 + umlal v20.2d,v13.2s,v4.2s
1764 + umlal v21.2d,v13.2s,v6.2s
1765 +
1766 +.Lshort_tail:
1767 + ////////////////////////////////////////////////////////////////
1768 + // horizontal add
1769 +
1770 + addp v22.2d,v22.2d,v22.2d
1771 + ldp d8,d9,[sp,#16] // meet ABI requirements
1772 + addp v19.2d,v19.2d,v19.2d
1773 + ldp d10,d11,[sp,#32]
1774 + addp v23.2d,v23.2d,v23.2d
1775 + ldp d12,d13,[sp,#48]
1776 + addp v20.2d,v20.2d,v20.2d
1777 + ldp d14,d15,[sp,#64]
1778 + addp v21.2d,v21.2d,v21.2d
1779 + ldr x30,[sp,#8]
1780 + .inst 0xd50323bf // autiasp
1781 +
1782 + ////////////////////////////////////////////////////////////////
1783 + // lazy reduction, but without narrowing
1784 +
1785 + ushr v29.2d,v22.2d,#26
1786 + and v22.16b,v22.16b,v31.16b
1787 + ushr v30.2d,v19.2d,#26
1788 + and v19.16b,v19.16b,v31.16b
1789 +
1790 + add v23.2d,v23.2d,v29.2d // h3 -> h4
1791 + add v20.2d,v20.2d,v30.2d // h0 -> h1
1792 +
1793 + ushr v29.2d,v23.2d,#26
1794 + and v23.16b,v23.16b,v31.16b
1795 + ushr v30.2d,v20.2d,#26
1796 + and v20.16b,v20.16b,v31.16b
1797 + add v21.2d,v21.2d,v30.2d // h1 -> h2
1798 +
1799 + add v19.2d,v19.2d,v29.2d
1800 + shl v29.2d,v29.2d,#2
1801 + ushr v30.2d,v21.2d,#26
1802 + and v21.16b,v21.16b,v31.16b
1803 + add v19.2d,v19.2d,v29.2d // h4 -> h0
1804 + add v22.2d,v22.2d,v30.2d // h2 -> h3
1805 +
1806 + ushr v29.2d,v19.2d,#26
1807 + and v19.16b,v19.16b,v31.16b
1808 + ushr v30.2d,v22.2d,#26
1809 + and v22.16b,v22.16b,v31.16b
1810 + add v20.2d,v20.2d,v29.2d // h0 -> h1
1811 + add v23.2d,v23.2d,v30.2d // h3 -> h4
1812 +
1813 + ////////////////////////////////////////////////////////////////
1814 + // write the result, can be partially reduced
1815 +
1816 + st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
1817 + mov x4,#1
1818 + st1 {v23.s}[0],[x0]
1819 + str x4,[x0,#8] // set is_base2_26
1820 +
1821 + ldr x29,[sp],#80
1822 + ret
1823 +.size poly1305_blocks_neon,.-poly1305_blocks_neon
1824 +
1825 +.align 5
1826 +.Lzeros:
1827 +.long 0,0,0,0,0,0,0,0
1828 +.asciz "Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
1829 +.align 2
1830 +#if !defined(__KERNEL__) && !defined(_WIN64)
1831 +.comm OPENSSL_armcap_P,4,4
1832 +.hidden OPENSSL_armcap_P
1833 +#endif
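Note (not part of the patch): after the radix check at its top, poly1305_emit above freezes the accumulator with a single conditional subtraction of 2^130 - 5 and then adds the 128-bit nonce, keeping only the low 128 bits. An illustrative C sketch of that tail (unsigned __int128 assumed, byte-order handling left to the caller, helper name invented):

typedef unsigned long long u64;
typedef unsigned __int128 u128;

/* final step of poly1305_emit: mac = (h mod (2^130 - 5) + s) mod 2^128 */
static void poly1305_emit_sketch(u64 mac[2], const u64 h_in[3], const u64 s[2])
{
        u64 h0 = h_in[0], h1 = h_in[1], h2 = h_in[2];
        u64 g0, g1, g2;
        u128 t;

        /* compute h + 5; if that reaches 2^130, then h >= 2^130 - 5 */
        t  = (u128)h0 + 5;              g0 = (u64)t;
        t  = (u128)h1 + (u64)(t >> 64); g1 = (u64)t;
        g2 = h2 + (u64)(t >> 64);

        if (g2 & ~3ULL) {               /* pick h - (2^130 - 5) instead of h */
                h0 = g0;
                h1 = g1;
        }

        /* accumulate the nonce modulo 2^128 */
        t = (u128)h0 + s[0];
        mac[0] = (u64)t;
        mac[1] = h1 + s[1] + (u64)(t >> 64);
}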
1834 --- /dev/null
1835 +++ b/arch/arm64/crypto/poly1305-glue.c
1836 @@ -0,0 +1,237 @@
1837 +// SPDX-License-Identifier: GPL-2.0
1838 +/*
1839 + * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
1840 + *
1841 + * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
1842 + */
1843 +
1844 +#include <asm/hwcap.h>
1845 +#include <asm/neon.h>
1846 +#include <asm/simd.h>
1847 +#include <asm/unaligned.h>
1848 +#include <crypto/algapi.h>
1849 +#include <crypto/internal/hash.h>
1850 +#include <crypto/internal/poly1305.h>
1851 +#include <crypto/internal/simd.h>
1852 +#include <linux/cpufeature.h>
1853 +#include <linux/crypto.h>
1854 +#include <linux/jump_label.h>
1855 +#include <linux/module.h>
1856 +
1857 +asmlinkage void poly1305_init_arm64(void *state, const u8 *key);
1858 +asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
1859 +asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
1860 +asmlinkage void poly1305_emit(void *state, __le32 *digest, const u32 *nonce);
1861 +
1862 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
1863 +
1864 +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
1865 +{
1866 + poly1305_init_arm64(&dctx->h, key);
1867 + dctx->s[0] = get_unaligned_le32(key + 16);
1868 + dctx->s[1] = get_unaligned_le32(key + 20);
1869 + dctx->s[2] = get_unaligned_le32(key + 24);
1870 + dctx->s[3] = get_unaligned_le32(key + 28);
1871 + dctx->buflen = 0;
1872 +}
1873 +EXPORT_SYMBOL(poly1305_init_arch);
1874 +
1875 +static int neon_poly1305_init(struct shash_desc *desc)
1876 +{
1877 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
1878 +
1879 + dctx->buflen = 0;
1880 + dctx->rset = 0;
1881 + dctx->sset = false;
1882 +
1883 + return 0;
1884 +}
1885 +
1886 +static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
1887 + u32 len, u32 hibit, bool do_neon)
1888 +{
1889 + if (unlikely(!dctx->sset)) {
1890 + if (!dctx->rset) {
1891 + poly1305_init_arch(dctx, src);
1892 + src += POLY1305_BLOCK_SIZE;
1893 + len -= POLY1305_BLOCK_SIZE;
1894 + dctx->rset = 1;
1895 + }
1896 + if (len >= POLY1305_BLOCK_SIZE) {
1897 + dctx->s[0] = get_unaligned_le32(src + 0);
1898 + dctx->s[1] = get_unaligned_le32(src + 4);
1899 + dctx->s[2] = get_unaligned_le32(src + 8);
1900 + dctx->s[3] = get_unaligned_le32(src + 12);
1901 + src += POLY1305_BLOCK_SIZE;
1902 + len -= POLY1305_BLOCK_SIZE;
1903 + dctx->sset = true;
1904 + }
1905 + if (len < POLY1305_BLOCK_SIZE)
1906 + return;
1907 + }
1908 +
1909 + len &= ~(POLY1305_BLOCK_SIZE - 1);
1910 +
1911 + if (static_branch_likely(&have_neon) && likely(do_neon))
1912 + poly1305_blocks_neon(&dctx->h, src, len, hibit);
1913 + else
1914 + poly1305_blocks(&dctx->h, src, len, hibit);
1915 +}
1916 +
1917 +static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx,
1918 + const u8 *src, u32 len, bool do_neon)
1919 +{
1920 + if (unlikely(dctx->buflen)) {
1921 + u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
1922 +
1923 + memcpy(dctx->buf + dctx->buflen, src, bytes);
1924 + src += bytes;
1925 + len -= bytes;
1926 + dctx->buflen += bytes;
1927 +
1928 + if (dctx->buflen == POLY1305_BLOCK_SIZE) {
1929 + neon_poly1305_blocks(dctx, dctx->buf,
1930 + POLY1305_BLOCK_SIZE, 1, false);
1931 + dctx->buflen = 0;
1932 + }
1933 + }
1934 +
1935 + if (likely(len >= POLY1305_BLOCK_SIZE)) {
1936 + neon_poly1305_blocks(dctx, src, len, 1, do_neon);
1937 + src += round_down(len, POLY1305_BLOCK_SIZE);
1938 + len %= POLY1305_BLOCK_SIZE;
1939 + }
1940 +
1941 + if (unlikely(len)) {
1942 + dctx->buflen = len;
1943 + memcpy(dctx->buf, src, len);
1944 + }
1945 +}
1946 +
1947 +static int neon_poly1305_update(struct shash_desc *desc,
1948 + const u8 *src, unsigned int srclen)
1949 +{
1950 + bool do_neon = crypto_simd_usable() && srclen > 128;
1951 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
1952 +
1953 + if (static_branch_likely(&have_neon) && do_neon)
1954 + kernel_neon_begin();
1955 + neon_poly1305_do_update(dctx, src, srclen, do_neon);
1956 + if (static_branch_likely(&have_neon) && do_neon)
1957 + kernel_neon_end();
1958 + return 0;
1959 +}
1960 +
1961 +void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
1962 + unsigned int nbytes)
1963 +{
1964 + if (unlikely(dctx->buflen)) {
1965 + u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
1966 +
1967 + memcpy(dctx->buf + dctx->buflen, src, bytes);
1968 + src += bytes;
1969 + nbytes -= bytes;
1970 + dctx->buflen += bytes;
1971 +
1972 + if (dctx->buflen == POLY1305_BLOCK_SIZE) {
1973 + poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
1974 + dctx->buflen = 0;
1975 + }
1976 + }
1977 +
1978 + if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
1979 + unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
1980 +
1981 + if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
1982 + kernel_neon_begin();
1983 + poly1305_blocks_neon(&dctx->h, src, len, 1);
1984 + kernel_neon_end();
1985 + } else {
1986 + poly1305_blocks(&dctx->h, src, len, 1);
1987 + }
1988 + src += len;
1989 + nbytes %= POLY1305_BLOCK_SIZE;
1990 + }
1991 +
1992 + if (unlikely(nbytes)) {
1993 + dctx->buflen = nbytes;
1994 + memcpy(dctx->buf, src, nbytes);
1995 + }
1996 +}
1997 +EXPORT_SYMBOL(poly1305_update_arch);
1998 +
1999 +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
2000 +{
2001 + __le32 digest[4];
2002 + u64 f = 0;
2003 +
2004 + if (unlikely(dctx->buflen)) {
2005 + dctx->buf[dctx->buflen++] = 1;
2006 + memset(dctx->buf + dctx->buflen, 0,
2007 + POLY1305_BLOCK_SIZE - dctx->buflen);
2008 + poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
2009 + }
2010 +
2011 + poly1305_emit(&dctx->h, digest, dctx->s);
2012 +
2013 + /* mac = (h + s) % (2^128) */
2014 + f = (f >> 32) + le32_to_cpu(digest[0]);
2015 + put_unaligned_le32(f, dst);
2016 + f = (f >> 32) + le32_to_cpu(digest[1]);
2017 + put_unaligned_le32(f, dst + 4);
2018 + f = (f >> 32) + le32_to_cpu(digest[2]);
2019 + put_unaligned_le32(f, dst + 8);
2020 + f = (f >> 32) + le32_to_cpu(digest[3]);
2021 + put_unaligned_le32(f, dst + 12);
2022 +
2023 + *dctx = (struct poly1305_desc_ctx){};
2024 +}
2025 +EXPORT_SYMBOL(poly1305_final_arch);
2026 +
2027 +static int neon_poly1305_final(struct shash_desc *desc, u8 *dst)
2028 +{
2029 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2030 +
2031 + if (unlikely(!dctx->sset))
2032 + return -ENOKEY;
2033 +
2034 + poly1305_final_arch(dctx, dst);
2035 + return 0;
2036 +}
2037 +
2038 +static struct shash_alg neon_poly1305_alg = {
2039 + .init = neon_poly1305_init,
2040 + .update = neon_poly1305_update,
2041 + .final = neon_poly1305_final,
2042 + .digestsize = POLY1305_DIGEST_SIZE,
2043 + .descsize = sizeof(struct poly1305_desc_ctx),
2044 +
2045 + .base.cra_name = "poly1305",
2046 + .base.cra_driver_name = "poly1305-neon",
2047 + .base.cra_priority = 200,
2048 + .base.cra_blocksize = POLY1305_BLOCK_SIZE,
2049 + .base.cra_module = THIS_MODULE,
2050 +};
2051 +
2052 +static int __init neon_poly1305_mod_init(void)
2053 +{
2054 + if (!cpu_have_named_feature(ASIMD))
2055 + return 0;
2056 +
2057 + static_branch_enable(&have_neon);
2058 +
2059 + return crypto_register_shash(&neon_poly1305_alg);
2060 +}
2061 +
2062 +static void __exit neon_poly1305_mod_exit(void)
2063 +{
2064 + if (cpu_have_named_feature(ASIMD))
2065 + crypto_unregister_shash(&neon_poly1305_alg);
2066 +}
2067 +
2068 +module_init(neon_poly1305_mod_init);
2069 +module_exit(neon_poly1305_mod_exit);
2070 +
2071 +MODULE_LICENSE("GPL v2");
2072 +MODULE_ALIAS_CRYPTO("poly1305");
2073 +MODULE_ALIAS_CRYPTO("poly1305-neon");
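Note (not part of the patch): a hypothetical example of driving the shash registered above from other kernel code. The algorithm provides no ->setkey; as the rset/sset logic in neon_poly1305_blocks() shows, the 32-byte one-time key is fed in as the first 32 bytes of data.

#include <crypto/hash.h>
#include <crypto/poly1305.h>
#include <linux/err.h>

/* Compute a Poly1305 MAC over msg via the "poly1305" shash (illustrative). */
static int poly1305_mac_example(const u8 key[POLY1305_KEY_SIZE],
                                const u8 *msg, unsigned int len,
                                u8 mac[POLY1305_DIGEST_SIZE])
{
        struct crypto_shash *tfm;
        int err;

        tfm = crypto_alloc_shash("poly1305", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        {
                SHASH_DESC_ON_STACK(desc, tfm);

                desc->tfm = tfm;
                err = crypto_shash_init(desc) ?:
                      crypto_shash_update(desc, key, POLY1305_KEY_SIZE) ?:
                      crypto_shash_update(desc, msg, len) ?:
                      crypto_shash_final(desc, mac);
        }
        crypto_free_shash(tfm);
        return err;
}

Which implementation backs the request is decided by priority: the glue code registers "poly1305-neon" at cra_priority 200, so it is preferred over the lower-priority generic C implementation whenever this module is loaded.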
2074 --- a/lib/crypto/Kconfig
2075 +++ b/lib/crypto/Kconfig
2076 @@ -40,6 +40,7 @@ config CRYPTO_LIB_DES
2077 config CRYPTO_LIB_POLY1305_RSIZE
2078 int
2079 default 4 if X86_64
2080 + default 9 if ARM64
2081 default 1
2082
2083 config CRYPTO_ARCH_HAVE_LIB_POLY1305