1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3 Date: Sun, 5 Jan 2020 22:40:47 -0500
4 Subject: [PATCH] crypto: x86/poly1305 - import unmodified cryptogams
7 commit 0896ca2a0cb6127e8a129f1f2a680d49b6b0f65c upstream.
9 These x86_64 vectorized implementations come from Andy Polyakov's
10 CRYPTOGAMS implementation, and are included here in raw form without
11 modification, so that subsequent commits that fix these up for the
12 kernel can see how it has changed.
14 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
15 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
16 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
18 arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 4159 +++++++++++++++++
19 1 file changed, 4159 insertions(+)
20 create mode 100644 arch/x86/crypto/poly1305-x86_64-cryptogams.pl
23 +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
26 +# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
28 +# Licensed under the OpenSSL license (the "License"). You may not use
29 +# this file except in compliance with the License. You can obtain a copy
30 +# in the file LICENSE in the source distribution or at
31 +# https://www.openssl.org/source/license.html
34 +# ====================================================================
35 +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
36 +# project. The module is, however, dual licensed under OpenSSL and
37 +# CRYPTOGAMS licenses depending on where you obtain it. For further
38 +# details see http://www.openssl.org/~appro/cryptogams/.
39 +# ====================================================================
41 +# This module implements Poly1305 hash for x86_64.
49 +# Add AVX512F+VL+BW code path.
53 +# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
54 +# executed even on Knights Landing. Trigger for modification was
55 +# observation that AVX512 code paths can negatively affect overall
56 +# Skylake-X system performance. Since we are likely to suppress
57 +# AVX512F capability flag [at least on Skylake-X], conversion serves
58 +# as kind of "investment protection". Note that next *lake processor,
59 +# Cannon Lake, has AVX512IFMA code path to execute...
61 +# Numbers are cycles per processed byte with poly1305_blocks alone,
62 +# measured with rdtsc at fixed clock frequency.
64 +# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512
67 +# Westmere 1.88/+120% -
68 +# Sandy Bridge 1.39/+140% 1.10
69 +# Haswell 1.14/+175% 1.11 0.65
70 +# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35]
71 +# Silvermont 2.83/+95% -
72 +# Knights L 3.60/? 1.65 1.10 0.41(***)
73 +# Goldmont 1.70/+180% -
74 +# VIA Nano 1.82/+150% -
75 +# Sledgehammer 1.38/+160% -
76 +# Bulldozer 2.30/+130% 0.97
77 +# Ryzen 1.15/+200% 1.08 1.18
79 +# (*) improvement coefficients relative to clang are more modest and
80 +# are ~50% on most processors, in both cases we are comparing to
82 +# (**) SSE2 implementation was attempted, but among non-AVX processors
83 +# it was faster than integer-only code only on older Intel P4 and
84 +# Core processors, by 50-30% (the newer the processor, the smaller the gain), but slower on
85 +# contemporary ones, for example almost 2x slower on Atom, and as
86 +# former are naturally disappearing, SSE2 is deemed unnecessary;
87 +# (***) strangely enough performance seems to vary from core to core,
88 +# listed result is best case;
92 +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
94 +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
96 +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
97 +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
98 +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
99 +die "can't locate x86_64-xlate.pl";
101 +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
102 + =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
103 + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
106 +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
107 + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
108 + $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
109 + $avx += 2 if ($1==2.11 && $2>=8);
112 +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
113 + `ml64 2>&1` =~ /Version ([0-9]+)\./) {
114 + $avx = ($1>=10) + ($1>=12);
117 +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
118 + $avx = ($2>=3.0) + ($2>3.0);
121 +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
124 +my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
125 +my ($mac,$nonce)=($inp,$len); # *_emit arguments
126 +my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
127 +my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
129 +sub poly1305_iteration {
130 +# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
131 +# output: $h0-$h2 *= $r0-$r1
139 + mov %rax,$h0 # future $h0
149 + mov $h2,$h1 # borrow $h1
153 + imulq $s1,$h1 # h2*s1
158 + imulq $r0,$h2 # h2*r0
160 + mov \$-4,%rax # mask value
163 + and $d3,%rax # last reduction step
174 +########################################################################
175 +# Layout of opaque area is following.
177 +# unsigned __int64 h[3]; # current hash value base 2^64
178 +# unsigned __int64 r[2]; # key value base 2^64
183 +.extern OPENSSL_ia32cap_P
185 +.globl poly1305_init
186 +.hidden poly1305_init
187 +.globl poly1305_blocks
188 +.hidden poly1305_blocks
189 +.globl poly1305_emit
190 +.hidden poly1305_emit
192 +.type poly1305_init,\@function,3
196 + mov %rax,0($ctx) # initialize hash value
203 + lea poly1305_blocks(%rip),%r10
204 + lea poly1305_emit(%rip),%r11
206 +$code.=<<___ if ($avx);
207 + mov OPENSSL_ia32cap_P+4(%rip),%r9
208 + lea poly1305_blocks_avx(%rip),%rax
209 + lea poly1305_emit_avx(%rip),%rcx
210 + bt \$`60-32`,%r9 # AVX?
214 +$code.=<<___ if ($avx>1);
215 + lea poly1305_blocks_avx2(%rip),%rax
216 + bt \$`5+32`,%r9 # AVX2?
219 +$code.=<<___ if ($avx>3);
220 + mov \$`(1<<31|1<<21|1<<16)`,%rax
227 + mov \$0x0ffffffc0fffffff,%rax
228 + mov \$0x0ffffffc0ffffffc,%rcx
234 +$code.=<<___ if ($flavour !~ /elf32/);
238 +$code.=<<___ if ($flavour =~ /elf32/);
246 +.size poly1305_init,.-poly1305_init
248 +.type poly1305_blocks,\@function,4
254 + jz .Lno_data # too short
270 + mov $len,%r15 # reassign $len
272 + mov 24($ctx),$r0 # load r
275 + mov 0($ctx),$h0 # load hash value
282 + add $r1,$s1 # s1 = r1 + (r1 >> 2)
287 + add 0($inp),$h0 # accumulate input
292 + &poly1305_iteration();
298 + mov $h0,0($ctx) # store hash value
315 +.cfi_adjust_cfa_offset -48
320 +.size poly1305_blocks,.-poly1305_blocks
322 +.type poly1305_emit,\@function,3
326 + mov 0($ctx),%r8 # load hash value
331 + add \$5,%r8 # compare to modulus
335 + shr \$2,%r10 # did 130-bit value overflow?
339 + add 0($nonce),%rax # accumulate nonce
341 + mov %rax,0($mac) # write result
345 +.size poly1305_emit,.-poly1305_emit
349 +########################################################################
350 +# Layout of opaque area is following.
352 +# unsigned __int32 h[5]; # current hash value base 2^26
353 +# unsigned __int32 is_base2_26;
354 +# unsigned __int64 r[2]; # key value base 2^64
355 +# unsigned __int64 pad;
356 +# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
358 +# where r^n are base 2^26 digits of degrees of multiplier key. There are
359 +# 5 digits, but last four are interleaved with multiples of 5, totalling
360 +# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
362 +my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
363 + map("%xmm$_",(0..15));
366 +.type __poly1305_block,\@abi-omnipotent
370 + &poly1305_iteration();
373 +.size __poly1305_block,.-__poly1305_block
375 +.type __poly1305_init_avx,\@abi-omnipotent
377 +__poly1305_init_avx:
382 + lea 48+64($ctx),$ctx # size optimization
385 + call __poly1305_block # r^2
387 + mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
388 + mov \$0x3ffffff,%edx
393 + mov %eax,`16*0+0-64`($ctx)
395 + mov %edx,`16*0+4-64`($ctx)
398 + mov \$0x3ffffff,%eax
399 + mov \$0x3ffffff,%edx
402 + mov %eax,`16*1+0-64`($ctx)
403 + lea (%rax,%rax,4),%eax # *5
404 + mov %edx,`16*1+4-64`($ctx)
405 + lea (%rdx,%rdx,4),%edx # *5
406 + mov %eax,`16*2+0-64`($ctx)
408 + mov %edx,`16*2+4-64`($ctx)
417 + and \$0x3ffffff,%eax
418 + and \$0x3ffffff,%edx
419 + mov %eax,`16*3+0-64`($ctx)
420 + lea (%rax,%rax,4),%eax # *5
421 + mov %edx,`16*3+4-64`($ctx)
422 + lea (%rdx,%rdx,4),%edx # *5
423 + mov %eax,`16*4+0-64`($ctx)
425 + mov %edx,`16*4+4-64`($ctx)
428 + mov \$0x3ffffff,%eax
429 + mov \$0x3ffffff,%edx
434 + mov %eax,`16*5+0-64`($ctx)
435 + lea (%rax,%rax,4),%eax # *5
436 + mov %edx,`16*5+4-64`($ctx)
437 + lea (%rdx,%rdx,4),%edx # *5
438 + mov %eax,`16*6+0-64`($ctx)
440 + mov %edx,`16*6+4-64`($ctx)
446 + mov $d1#d,`16*7+0-64`($ctx)
447 + lea ($d1,$d1,4),$d1 # *5
448 + mov $d2#d,`16*7+4-64`($ctx)
449 + lea ($d2,$d2,4),$d2 # *5
450 + mov $d1#d,`16*8+0-64`($ctx)
451 + mov $d2#d,`16*8+4-64`($ctx)
454 + call __poly1305_block # r^3
456 + mov \$0x3ffffff,%eax # save r^3 base 2^26
460 + mov %eax,`16*0+12-64`($ctx)
462 + mov \$0x3ffffff,%edx
464 + mov %edx,`16*1+12-64`($ctx)
465 + lea (%rdx,%rdx,4),%edx # *5
467 + mov %edx,`16*2+12-64`($ctx)
472 + and \$0x3ffffff,%eax
473 + mov %eax,`16*3+12-64`($ctx)
474 + lea (%rax,%rax,4),%eax # *5
476 + mov %eax,`16*4+12-64`($ctx)
478 + mov \$0x3ffffff,%edx
481 + mov %edx,`16*5+12-64`($ctx)
482 + lea (%rdx,%rdx,4),%edx # *5
484 + mov %edx,`16*6+12-64`($ctx)
489 + mov $d1#d,`16*7+12-64`($ctx)
490 + lea ($d1,$d1,4),$d1 # *5
491 + mov $d1#d,`16*8+12-64`($ctx)
494 + call __poly1305_block # r^4
496 + mov \$0x3ffffff,%eax # save r^4 base 2^26
500 + mov %eax,`16*0+8-64`($ctx)
502 + mov \$0x3ffffff,%edx
504 + mov %edx,`16*1+8-64`($ctx)
505 + lea (%rdx,%rdx,4),%edx # *5
507 + mov %edx,`16*2+8-64`($ctx)
512 + and \$0x3ffffff,%eax
513 + mov %eax,`16*3+8-64`($ctx)
514 + lea (%rax,%rax,4),%eax # *5
516 + mov %eax,`16*4+8-64`($ctx)
518 + mov \$0x3ffffff,%edx
521 + mov %edx,`16*5+8-64`($ctx)
522 + lea (%rdx,%rdx,4),%edx # *5
524 + mov %edx,`16*6+8-64`($ctx)
529 + mov $d1#d,`16*7+8-64`($ctx)
530 + lea ($d1,$d1,4),$d1 # *5
531 + mov $d1#d,`16*8+8-64`($ctx)
533 + lea -48-64($ctx),$ctx # size [de-]optimization
535 +.size __poly1305_init_avx,.-__poly1305_init_avx
537 +.type poly1305_blocks_avx,\@function,4
539 +poly1305_blocks_avx:
541 + mov 20($ctx),%r8d # is_base2_26
573 + mov $len,%r15 # reassign $len
575 + mov 0($ctx),$d1 # load hash value
579 + mov 24($ctx),$r0 # load r
582 + ################################# base 2^26 -> base 2^64
584 + and \$`-1*(1<<31)`,$d1
585 + mov $d2,$r1 # borrow $r1
587 + and \$`-1*(1<<31)`,$d2
601 + adc \$0,$h2 # can be partially reduced...
603 + mov \$-4,$d2 # ... so reduce
616 + add $r1,$s1 # s1 = r1 + (r1 >> 2)
618 + add 0($inp),$h0 # accumulate input
623 + call __poly1305_block
625 + test $padbit,$padbit # if $padbit is zero,
626 + jz .Lstore_base2_64_avx # store hash in base 2^64 format
628 + ################################# base 2^64 -> base 2^26
635 + and \$0x3ffffff,%rax # h[0]
637 + and \$0x3ffffff,%rdx # h[1]
641 + and \$0x3ffffff,$h0 # h[2]
643 + and \$0x3ffffff,$h1 # h[3]
647 + jz .Lstore_base2_26_avx
657 +.Lstore_base2_64_avx:
660 + mov $h2,16($ctx) # note that is_base2_26 is zeroed
664 +.Lstore_base2_26_avx:
665 + mov %rax#d,0($ctx) # store hash value base 2^26
685 +.cfi_adjust_cfa_offset -48
687 +.Lblocks_avx_epilogue:
706 +.Lbase2_64_avx_body:
708 + mov $len,%r15 # reassign $len
710 + mov 24($ctx),$r0 # load r
713 + mov 0($ctx),$h0 # load hash value
720 + add $r1,$s1 # s1 = r1 + (r1 >> 2)
725 + add 0($inp),$h0 # accumulate input
731 + call __poly1305_block
734 + ################################# base 2^64 -> base 2^26
741 + and \$0x3ffffff,%rax # h[0]
743 + and \$0x3ffffff,%rdx # h[1]
747 + and \$0x3ffffff,$h0 # h[2]
749 + and \$0x3ffffff,$h1 # h[3]
757 + movl \$1,20($ctx) # set is_base2_26
759 + call __poly1305_init_avx
778 +.cfi_adjust_cfa_offset -48
779 +.Lbase2_64_avx_epilogue:
786 + vmovd 4*0($ctx),$H0 # load hash value
787 + vmovd 4*1($ctx),$H1
788 + vmovd 4*2($ctx),$H2
789 + vmovd 4*3($ctx),$H3
790 + vmovd 4*4($ctx),$H4
794 +$code.=<<___ if (!$win64);
795 + lea -0x58(%rsp),%r11
796 +.cfi_def_cfa %r11,0x60
799 +$code.=<<___ if ($win64);
800 + lea -0xf8(%rsp),%r11
802 + vmovdqa %xmm6,0x50(%r11)
803 + vmovdqa %xmm7,0x60(%r11)
804 + vmovdqa %xmm8,0x70(%r11)
805 + vmovdqa %xmm9,0x80(%r11)
806 + vmovdqa %xmm10,0x90(%r11)
807 + vmovdqa %xmm11,0xa0(%r11)
808 + vmovdqa %xmm12,0xb0(%r11)
809 + vmovdqa %xmm13,0xc0(%r11)
810 + vmovdqa %xmm14,0xd0(%r11)
811 + vmovdqa %xmm15,0xe0(%r11)
819 + vmovdqu `16*3`($ctx),$D4 # preload r0^2
820 + lea `16*3+64`($ctx),$ctx # size optimization
821 + lea .Lconst(%rip),%rcx
823 + ################################################################
825 + vmovdqu 16*2($inp),$T0
826 + vmovdqu 16*3($inp),$T1
827 + vmovdqa 64(%rcx),$MASK # .Lmask26
829 + vpsrldq \$6,$T0,$T2 # splat input
830 + vpsrldq \$6,$T1,$T3
831 + vpunpckhqdq $T1,$T0,$T4 # 4
832 + vpunpcklqdq $T1,$T0,$T0 # 0:1
833 + vpunpcklqdq $T3,$T2,$T3 # 2:3
835 + vpsrlq \$40,$T4,$T4 # 4
836 + vpsrlq \$26,$T0,$T1
837 + vpand $MASK,$T0,$T0 # 0
839 + vpand $MASK,$T1,$T1 # 1
840 + vpsrlq \$30,$T3,$T3
841 + vpand $MASK,$T2,$T2 # 2
842 + vpand $MASK,$T3,$T3 # 3
843 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always
845 + jbe .Lskip_loop_avx
847 + # expand and copy pre-calculated table to stack
848 + vmovdqu `16*1-64`($ctx),$D1
849 + vmovdqu `16*2-64`($ctx),$D2
850 + vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
851 + vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
852 + vmovdqa $D3,-0x90(%r11)
853 + vmovdqa $D0,0x00(%rsp)
854 + vpshufd \$0xEE,$D1,$D4
855 + vmovdqu `16*3-64`($ctx),$D0
856 + vpshufd \$0x44,$D1,$D1
857 + vmovdqa $D4,-0x80(%r11)
858 + vmovdqa $D1,0x10(%rsp)
859 + vpshufd \$0xEE,$D2,$D3
860 + vmovdqu `16*4-64`($ctx),$D1
861 + vpshufd \$0x44,$D2,$D2
862 + vmovdqa $D3,-0x70(%r11)
863 + vmovdqa $D2,0x20(%rsp)
864 + vpshufd \$0xEE,$D0,$D4
865 + vmovdqu `16*5-64`($ctx),$D2
866 + vpshufd \$0x44,$D0,$D0
867 + vmovdqa $D4,-0x60(%r11)
868 + vmovdqa $D0,0x30(%rsp)
869 + vpshufd \$0xEE,$D1,$D3
870 + vmovdqu `16*6-64`($ctx),$D0
871 + vpshufd \$0x44,$D1,$D1
872 + vmovdqa $D3,-0x50(%r11)
873 + vmovdqa $D1,0x40(%rsp)
874 + vpshufd \$0xEE,$D2,$D4
875 + vmovdqu `16*7-64`($ctx),$D1
876 + vpshufd \$0x44,$D2,$D2
877 + vmovdqa $D4,-0x40(%r11)
878 + vmovdqa $D2,0x50(%rsp)
879 + vpshufd \$0xEE,$D0,$D3
880 + vmovdqu `16*8-64`($ctx),$D2
881 + vpshufd \$0x44,$D0,$D0
882 + vmovdqa $D3,-0x30(%r11)
883 + vmovdqa $D0,0x60(%rsp)
884 + vpshufd \$0xEE,$D1,$D4
885 + vpshufd \$0x44,$D1,$D1
886 + vmovdqa $D4,-0x20(%r11)
887 + vmovdqa $D1,0x70(%rsp)
888 + vpshufd \$0xEE,$D2,$D3
889 + vmovdqa 0x00(%rsp),$D4 # preload r0^2
890 + vpshufd \$0x44,$D2,$D2
891 + vmovdqa $D3,-0x10(%r11)
892 + vmovdqa $D2,0x80(%rsp)
898 + ################################################################
899 + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
900 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
901 + # \___________________/
902 + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
903 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
904 + # \___________________/ \____________________/
906 + # Note that we start with inp[2:3]*r^2. This is because it
907 + # doesn't depend on reduction in previous iteration.
908 + ################################################################
909 + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
910 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
911 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
912 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
913 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
915 + # though note that $Tx and $Hx are "reversed" in this section,
916 + # and $D4 is preloaded with r0^2...
918 + vpmuludq $T0,$D4,$D0 # d0 = h0*r0
919 + vpmuludq $T1,$D4,$D1 # d1 = h1*r0
920 + vmovdqa $H2,0x20(%r11) # offload hash
921 + vpmuludq $T2,$D4,$D2 # d3 = h2*r0
922 + vmovdqa 0x10(%rsp),$H2 # r1^2
923 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0
924 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0
926 + vmovdqa $H0,0x00(%r11) #
927 + vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
928 + vmovdqa $H1,0x10(%r11) #
929 + vpmuludq $T3,$H2,$H1 # h3*r1
930 + vpaddq $H0,$D0,$D0 # d0 += h4*s1
931 + vpaddq $H1,$D4,$D4 # d4 += h3*r1
932 + vmovdqa $H3,0x30(%r11) #
933 + vpmuludq $T2,$H2,$H0 # h2*r1
934 + vpmuludq $T1,$H2,$H1 # h1*r1
935 + vpaddq $H0,$D3,$D3 # d3 += h2*r1
936 + vmovdqa 0x30(%rsp),$H3 # r2^2
937 + vpaddq $H1,$D2,$D2 # d2 += h1*r1
938 + vmovdqa $H4,0x40(%r11) #
939 + vpmuludq $T0,$H2,$H2 # h0*r1
940 + vpmuludq $T2,$H3,$H0 # h2*r2
941 + vpaddq $H2,$D1,$D1 # d1 += h0*r1
943 + vmovdqa 0x40(%rsp),$H4 # s2^2
944 + vpaddq $H0,$D4,$D4 # d4 += h2*r2
945 + vpmuludq $T1,$H3,$H1 # h1*r2
946 + vpmuludq $T0,$H3,$H3 # h0*r2
947 + vpaddq $H1,$D3,$D3 # d3 += h1*r2
948 + vmovdqa 0x50(%rsp),$H2 # r3^2
949 + vpaddq $H3,$D2,$D2 # d2 += h0*r2
950 + vpmuludq $T4,$H4,$H0 # h4*s2
951 + vpmuludq $T3,$H4,$H4 # h3*s2
952 + vpaddq $H0,$D1,$D1 # d1 += h4*s2
953 + vmovdqa 0x60(%rsp),$H3 # s3^2
954 + vpaddq $H4,$D0,$D0 # d0 += h3*s2
956 + vmovdqa 0x80(%rsp),$H4 # s4^2
957 + vpmuludq $T1,$H2,$H1 # h1*r3
958 + vpmuludq $T0,$H2,$H2 # h0*r3
959 + vpaddq $H1,$D4,$D4 # d4 += h1*r3
960 + vpaddq $H2,$D3,$D3 # d3 += h0*r3
961 + vpmuludq $T4,$H3,$H0 # h4*s3
962 + vpmuludq $T3,$H3,$H1 # h3*s3
963 + vpaddq $H0,$D2,$D2 # d2 += h4*s3
964 + vmovdqu 16*0($inp),$H0 # load input
965 + vpaddq $H1,$D1,$D1 # d1 += h3*s3
966 + vpmuludq $T2,$H3,$H3 # h2*s3
967 + vpmuludq $T2,$H4,$T2 # h2*s4
968 + vpaddq $H3,$D0,$D0 # d0 += h2*s3
970 + vmovdqu 16*1($inp),$H1 #
971 + vpaddq $T2,$D1,$D1 # d1 += h2*s4
972 + vpmuludq $T3,$H4,$T3 # h3*s4
973 + vpmuludq $T4,$H4,$T4 # h4*s4
974 + vpsrldq \$6,$H0,$H2 # splat input
975 + vpaddq $T3,$D2,$D2 # d2 += h3*s4
976 + vpaddq $T4,$D3,$D3 # d3 += h4*s4
977 + vpsrldq \$6,$H1,$H3 #
978 + vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
979 + vpmuludq $T1,$H4,$T0 # h1*s4
980 + vpunpckhqdq $H1,$H0,$H4 # 4
981 + vpaddq $T4,$D4,$D4 # d4 += h0*r4
982 + vmovdqa -0x90(%r11),$T4 # r0^4
983 + vpaddq $T0,$D0,$D0 # d0 += h1*s4
985 + vpunpcklqdq $H1,$H0,$H0 # 0:1
986 + vpunpcklqdq $H3,$H2,$H3 # 2:3
988 + #vpsrlq \$40,$H4,$H4 # 4
989 + vpsrldq \$`40/8`,$H4,$H4 # 4
990 + vpsrlq \$26,$H0,$H1
991 + vpand $MASK,$H0,$H0 # 0
993 + vpand $MASK,$H1,$H1 # 1
994 + vpand 0(%rcx),$H4,$H4 # .Lmask24
995 + vpsrlq \$30,$H3,$H3
996 + vpand $MASK,$H2,$H2 # 2
997 + vpand $MASK,$H3,$H3 # 3
998 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always
1000 + vpaddq 0x00(%r11),$H0,$H0 # add hash value
1001 + vpaddq 0x10(%r11),$H1,$H1
1002 + vpaddq 0x20(%r11),$H2,$H2
1003 + vpaddq 0x30(%r11),$H3,$H3
1004 + vpaddq 0x40(%r11),$H4,$H4
1006 + lea 16*2($inp),%rax
1007 + lea 16*4($inp),$inp
1011 + ################################################################
1012 + # Now we accumulate (inp[0:1]+hash)*r^4
1013 + ################################################################
1014 + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1015 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1016 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1017 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1018 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1020 + vpmuludq $H0,$T4,$T0 # h0*r0
1021 + vpmuludq $H1,$T4,$T1 # h1*r0
1022 + vpaddq $T0,$D0,$D0
1023 + vpaddq $T1,$D1,$D1
1024 + vmovdqa -0x80(%r11),$T2 # r1^4
1025 + vpmuludq $H2,$T4,$T0 # h2*r0
1026 + vpmuludq $H3,$T4,$T1 # h3*r0
1027 + vpaddq $T0,$D2,$D2
1028 + vpaddq $T1,$D3,$D3
1029 + vpmuludq $H4,$T4,$T4 # h4*r0
1030 + vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
1031 + vpaddq $T4,$D4,$D4
1033 + vpaddq $T0,$D0,$D0 # d0 += h4*s1
1034 + vpmuludq $H2,$T2,$T1 # h2*r1
1035 + vpmuludq $H3,$T2,$T0 # h3*r1
1036 + vpaddq $T1,$D3,$D3 # d3 += h2*r1
1037 + vmovdqa -0x60(%r11),$T3 # r2^4
1038 + vpaddq $T0,$D4,$D4 # d4 += h3*r1
1039 + vpmuludq $H1,$T2,$T1 # h1*r1
1040 + vpmuludq $H0,$T2,$T2 # h0*r1
1041 + vpaddq $T1,$D2,$D2 # d2 += h1*r1
1042 + vpaddq $T2,$D1,$D1 # d1 += h0*r1
1044 + vmovdqa -0x50(%r11),$T4 # s2^4
1045 + vpmuludq $H2,$T3,$T0 # h2*r2
1046 + vpmuludq $H1,$T3,$T1 # h1*r2
1047 + vpaddq $T0,$D4,$D4 # d4 += h2*r2
1048 + vpaddq $T1,$D3,$D3 # d3 += h1*r2
1049 + vmovdqa -0x40(%r11),$T2 # r3^4
1050 + vpmuludq $H0,$T3,$T3 # h0*r2
1051 + vpmuludq $H4,$T4,$T0 # h4*s2
1052 + vpaddq $T3,$D2,$D2 # d2 += h0*r2
1053 + vpaddq $T0,$D1,$D1 # d1 += h4*s2
1054 + vmovdqa -0x30(%r11),$T3 # s3^4
1055 + vpmuludq $H3,$T4,$T4 # h3*s2
1056 + vpmuludq $H1,$T2,$T1 # h1*r3
1057 + vpaddq $T4,$D0,$D0 # d0 += h3*s2
1059 + vmovdqa -0x10(%r11),$T4 # s4^4
1060 + vpaddq $T1,$D4,$D4 # d4 += h1*r3
1061 + vpmuludq $H0,$T2,$T2 # h0*r3
1062 + vpmuludq $H4,$T3,$T0 # h4*s3
1063 + vpaddq $T2,$D3,$D3 # d3 += h0*r3
1064 + vpaddq $T0,$D2,$D2 # d2 += h4*s3
1065 + vmovdqu 16*2($inp),$T0 # load input
1066 + vpmuludq $H3,$T3,$T2 # h3*s3
1067 + vpmuludq $H2,$T3,$T3 # h2*s3
1068 + vpaddq $T2,$D1,$D1 # d1 += h3*s3
1069 + vmovdqu 16*3($inp),$T1 #
1070 + vpaddq $T3,$D0,$D0 # d0 += h2*s3
1072 + vpmuludq $H2,$T4,$H2 # h2*s4
1073 + vpmuludq $H3,$T4,$H3 # h3*s4
1074 + vpsrldq \$6,$T0,$T2 # splat input
1075 + vpaddq $H2,$D1,$D1 # d1 += h2*s4
1076 + vpmuludq $H4,$T4,$H4 # h4*s4
1077 + vpsrldq \$6,$T1,$T3 #
1078 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1079 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
1080 + vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
1081 + vpmuludq $H1,$T4,$H0
1082 + vpunpckhqdq $T1,$T0,$T4 # 4
1083 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1084 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1086 + vpunpcklqdq $T1,$T0,$T0 # 0:1
1087 + vpunpcklqdq $T3,$T2,$T3 # 2:3
1089 + #vpsrlq \$40,$T4,$T4 # 4
1090 + vpsrldq \$`40/8`,$T4,$T4 # 4
1091 + vpsrlq \$26,$T0,$T1
1092 + vmovdqa 0x00(%rsp),$D4 # preload r0^2
1093 + vpand $MASK,$T0,$T0 # 0
1094 + vpsrlq \$4,$T3,$T2
1095 + vpand $MASK,$T1,$T1 # 1
1096 + vpand 0(%rcx),$T4,$T4 # .Lmask24
1097 + vpsrlq \$30,$T3,$T3
1098 + vpand $MASK,$T2,$T2 # 2
1099 + vpand $MASK,$T3,$T3 # 3
1100 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1102 + ################################################################
1103 + # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1106 + vpsrlq \$26,$H3,$D3
1107 + vpand $MASK,$H3,$H3
1108 + vpaddq $D3,$H4,$H4 # h3 -> h4
1110 + vpsrlq \$26,$H0,$D0
1111 + vpand $MASK,$H0,$H0
1112 + vpaddq $D0,$D1,$H1 # h0 -> h1
1114 + vpsrlq \$26,$H4,$D0
1115 + vpand $MASK,$H4,$H4
1117 + vpsrlq \$26,$H1,$D1
1118 + vpand $MASK,$H1,$H1
1119 + vpaddq $D1,$H2,$H2 # h1 -> h2
1121 + vpaddq $D0,$H0,$H0
1122 + vpsllq \$2,$D0,$D0
1123 + vpaddq $D0,$H0,$H0 # h4 -> h0
1125 + vpsrlq \$26,$H2,$D2
1126 + vpand $MASK,$H2,$H2
1127 + vpaddq $D2,$H3,$H3 # h2 -> h3
1129 + vpsrlq \$26,$H0,$D0
1130 + vpand $MASK,$H0,$H0
1131 + vpaddq $D0,$H1,$H1 # h0 -> h1
1133 + vpsrlq \$26,$H3,$D3
1134 + vpand $MASK,$H3,$H3
1135 + vpaddq $D3,$H4,$H4 # h3 -> h4
1140 + ################################################################
1141 + # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1143 + vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
1145 + jnz .Long_tail_avx
1147 + vpaddq $H2,$T2,$T2
1148 + vpaddq $H0,$T0,$T0
1149 + vpaddq $H1,$T1,$T1
1150 + vpaddq $H3,$T3,$T3
1151 + vpaddq $H4,$T4,$T4
1154 + vmovdqa $H2,0x20(%r11)
1155 + vmovdqa $H0,0x00(%r11)
1156 + vmovdqa $H1,0x10(%r11)
1157 + vmovdqa $H3,0x30(%r11)
1158 + vmovdqa $H4,0x40(%r11)
1160 + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1161 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1162 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1163 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1164 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1166 + vpmuludq $T2,$D4,$D2 # d2 = h2*r0
1167 + vpmuludq $T0,$D4,$D0 # d0 = h0*r0
1168 + vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
1169 + vpmuludq $T1,$D4,$D1 # d1 = h1*r0
1170 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0
1171 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0
1173 + vpmuludq $T3,$H2,$H0 # h3*r1
1174 + vpaddq $H0,$D4,$D4 # d4 += h3*r1
1175 + vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
1176 + vpmuludq $T2,$H2,$H1 # h2*r1
1177 + vpaddq $H1,$D3,$D3 # d3 += h2*r1
1178 + vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
1179 + vpmuludq $T1,$H2,$H0 # h1*r1
1180 + vpaddq $H0,$D2,$D2 # d2 += h1*r1
1181 + vpmuludq $T0,$H2,$H2 # h0*r1
1182 + vpaddq $H2,$D1,$D1 # d1 += h0*r1
1183 + vpmuludq $T4,$H3,$H3 # h4*s1
1184 + vpaddq $H3,$D0,$D0 # d0 += h4*s1
1186 + vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
1187 + vpmuludq $T2,$H4,$H1 # h2*r2
1188 + vpaddq $H1,$D4,$D4 # d4 += h2*r2
1189 + vpmuludq $T1,$H4,$H0 # h1*r2
1190 + vpaddq $H0,$D3,$D3 # d3 += h1*r2
1191 + vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
1192 + vpmuludq $T0,$H4,$H4 # h0*r2
1193 + vpaddq $H4,$D2,$D2 # d2 += h0*r2
1194 + vpmuludq $T4,$H2,$H1 # h4*s2
1195 + vpaddq $H1,$D1,$D1 # d1 += h4*s2
1196 + vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
1197 + vpmuludq $T3,$H2,$H2 # h3*s2
1198 + vpaddq $H2,$D0,$D0 # d0 += h3*s2
1200 + vpmuludq $T1,$H3,$H0 # h1*r3
1201 + vpaddq $H0,$D4,$D4 # d4 += h1*r3
1202 + vpmuludq $T0,$H3,$H3 # h0*r3
1203 + vpaddq $H3,$D3,$D3 # d3 += h0*r3
1204 + vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
1205 + vpmuludq $T4,$H4,$H1 # h4*s3
1206 + vpaddq $H1,$D2,$D2 # d2 += h4*s3
1207 + vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
1208 + vpmuludq $T3,$H4,$H0 # h3*s3
1209 + vpaddq $H0,$D1,$D1 # d1 += h3*s3
1210 + vpmuludq $T2,$H4,$H4 # h2*s3
1211 + vpaddq $H4,$D0,$D0 # d0 += h2*s3
1213 + vpmuludq $T0,$H2,$H2 # h0*r4
1214 + vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
1215 + vpmuludq $T4,$H3,$H1 # h4*s4
1216 + vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
1217 + vpmuludq $T3,$H3,$H0 # h3*s4
1218 + vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
1219 + vpmuludq $T2,$H3,$H1 # h2*s4
1220 + vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
1221 + vpmuludq $T1,$H3,$H3 # h1*s4
1222 + vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
1224 + jz .Lshort_tail_avx
1226 + vmovdqu 16*0($inp),$H0 # load input
1227 + vmovdqu 16*1($inp),$H1
1229 + vpsrldq \$6,$H0,$H2 # splat input
1230 + vpsrldq \$6,$H1,$H3
1231 + vpunpckhqdq $H1,$H0,$H4 # 4
1232 + vpunpcklqdq $H1,$H0,$H0 # 0:1
1233 + vpunpcklqdq $H3,$H2,$H3 # 2:3
1235 + vpsrlq \$40,$H4,$H4 # 4
1236 + vpsrlq \$26,$H0,$H1
1237 + vpand $MASK,$H0,$H0 # 0
1238 + vpsrlq \$4,$H3,$H2
1239 + vpand $MASK,$H1,$H1 # 1
1240 + vpsrlq \$30,$H3,$H3
1241 + vpand $MASK,$H2,$H2 # 2
1242 + vpand $MASK,$H3,$H3 # 3
1243 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always
1245 + vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
1246 + vpaddq 0x00(%r11),$H0,$H0
1247 + vpaddq 0x10(%r11),$H1,$H1
1248 + vpaddq 0x20(%r11),$H2,$H2
1249 + vpaddq 0x30(%r11),$H3,$H3
1250 + vpaddq 0x40(%r11),$H4,$H4
1252 + ################################################################
1253 + # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1255 + vpmuludq $H0,$T4,$T0 # h0*r0
1256 + vpaddq $T0,$D0,$D0 # d0 += h0*r0
1257 + vpmuludq $H1,$T4,$T1 # h1*r0
1258 + vpaddq $T1,$D1,$D1 # d1 += h1*r0
1259 + vpmuludq $H2,$T4,$T0 # h2*r0
1260 + vpaddq $T0,$D2,$D2 # d2 += h2*r0
1261 + vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
1262 + vpmuludq $H3,$T4,$T1 # h3*r0
1263 + vpaddq $T1,$D3,$D3 # d3 += h3*r0
1264 + vpmuludq $H4,$T4,$T4 # h4*r0
1265 + vpaddq $T4,$D4,$D4 # d4 += h4*r0
1267 + vpmuludq $H3,$T2,$T0 # h3*r1
1268 + vpaddq $T0,$D4,$D4 # d4 += h3*r1
1269 + vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
1270 + vpmuludq $H2,$T2,$T1 # h2*r1
1271 + vpaddq $T1,$D3,$D3 # d3 += h2*r1
1272 + vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
1273 + vpmuludq $H1,$T2,$T0 # h1*r1
1274 + vpaddq $T0,$D2,$D2 # d2 += h1*r1
1275 + vpmuludq $H0,$T2,$T2 # h0*r1
1276 + vpaddq $T2,$D1,$D1 # d1 += h0*r1
1277 + vpmuludq $H4,$T3,$T3 # h4*s1
1278 + vpaddq $T3,$D0,$D0 # d0 += h4*s1
1280 + vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
1281 + vpmuludq $H2,$T4,$T1 # h2*r2
1282 + vpaddq $T1,$D4,$D4 # d4 += h2*r2
1283 + vpmuludq $H1,$T4,$T0 # h1*r2
1284 + vpaddq $T0,$D3,$D3 # d3 += h1*r2
1285 + vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
1286 + vpmuludq $H0,$T4,$T4 # h0*r2
1287 + vpaddq $T4,$D2,$D2 # d2 += h0*r2
1288 + vpmuludq $H4,$T2,$T1 # h4*s2
1289 + vpaddq $T1,$D1,$D1 # d1 += h4*s2
1290 + vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
1291 + vpmuludq $H3,$T2,$T2 # h3*s2
1292 + vpaddq $T2,$D0,$D0 # d0 += h3*s2
1294 + vpmuludq $H1,$T3,$T0 # h1*r3
1295 + vpaddq $T0,$D4,$D4 # d4 += h1*r3
1296 + vpmuludq $H0,$T3,$T3 # h0*r3
1297 + vpaddq $T3,$D3,$D3 # d3 += h0*r3
1298 + vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
1299 + vpmuludq $H4,$T4,$T1 # h4*s3
1300 + vpaddq $T1,$D2,$D2 # d2 += h4*s3
1301 + vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
1302 + vpmuludq $H3,$T4,$T0 # h3*s3
1303 + vpaddq $T0,$D1,$D1 # d1 += h3*s3
1304 + vpmuludq $H2,$T4,$T4 # h2*s3
1305 + vpaddq $T4,$D0,$D0 # d0 += h2*s3
1307 + vpmuludq $H0,$T2,$T2 # h0*r4
1308 + vpaddq $T2,$D4,$D4 # d4 += h0*r4
1309 + vpmuludq $H4,$T3,$T1 # h4*s4
1310 + vpaddq $T1,$D3,$D3 # d3 += h4*s4
1311 + vpmuludq $H3,$T3,$T0 # h3*s4
1312 + vpaddq $T0,$D2,$D2 # d2 += h3*s4
1313 + vpmuludq $H2,$T3,$T1 # h2*s4
1314 + vpaddq $T1,$D1,$D1 # d1 += h2*s4
1315 + vpmuludq $H1,$T3,$T3 # h1*s4
1316 + vpaddq $T3,$D0,$D0 # d0 += h1*s4
1319 + ################################################################
1320 + # horizontal addition
1322 + vpsrldq \$8,$D4,$T4
1323 + vpsrldq \$8,$D3,$T3
1324 + vpsrldq \$8,$D1,$T1
1325 + vpsrldq \$8,$D0,$T0
1326 + vpsrldq \$8,$D2,$T2
1327 + vpaddq $T3,$D3,$D3
1328 + vpaddq $T4,$D4,$D4
1329 + vpaddq $T0,$D0,$D0
1330 + vpaddq $T1,$D1,$D1
1331 + vpaddq $T2,$D2,$D2
1333 + ################################################################
1336 + vpsrlq \$26,$D3,$H3
1337 + vpand $MASK,$D3,$D3
1338 + vpaddq $H3,$D4,$D4 # h3 -> h4
1340 + vpsrlq \$26,$D0,$H0
1341 + vpand $MASK,$D0,$D0
1342 + vpaddq $H0,$D1,$D1 # h0 -> h1
1344 + vpsrlq \$26,$D4,$H4
1345 + vpand $MASK,$D4,$D4
1347 + vpsrlq \$26,$D1,$H1
1348 + vpand $MASK,$D1,$D1
1349 + vpaddq $H1,$D2,$D2 # h1 -> h2
1351 + vpaddq $H4,$D0,$D0
1352 + vpsllq \$2,$H4,$H4
1353 + vpaddq $H4,$D0,$D0 # h4 -> h0
1355 + vpsrlq \$26,$D2,$H2
1356 + vpand $MASK,$D2,$D2
1357 + vpaddq $H2,$D3,$D3 # h2 -> h3
1359 + vpsrlq \$26,$D0,$H0
1360 + vpand $MASK,$D0,$D0
1361 + vpaddq $H0,$D1,$D1 # h0 -> h1
1363 + vpsrlq \$26,$D3,$H3
1364 + vpand $MASK,$D3,$D3
1365 + vpaddq $H3,$D4,$D4 # h3 -> h4
1367 + vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
1368 + vmovd $D1,`4*1-48-64`($ctx)
1369 + vmovd $D2,`4*2-48-64`($ctx)
1370 + vmovd $D3,`4*3-48-64`($ctx)
1371 + vmovd $D4,`4*4-48-64`($ctx)
1373 +$code.=<<___ if ($win64);
1374 + vmovdqa 0x50(%r11),%xmm6
1375 + vmovdqa 0x60(%r11),%xmm7
1376 + vmovdqa 0x70(%r11),%xmm8
1377 + vmovdqa 0x80(%r11),%xmm9
1378 + vmovdqa 0x90(%r11),%xmm10
1379 + vmovdqa 0xa0(%r11),%xmm11
1380 + vmovdqa 0xb0(%r11),%xmm12
1381 + vmovdqa 0xc0(%r11),%xmm13
1382 + vmovdqa 0xd0(%r11),%xmm14
1383 + vmovdqa 0xe0(%r11),%xmm15
1384 + lea 0xf8(%r11),%rsp
1387 +$code.=<<___ if (!$win64);
1388 + lea 0x58(%r11),%rsp
1389 +.cfi_def_cfa %rsp,8
1395 +.size poly1305_blocks_avx,.-poly1305_blocks_avx
1397 +.type poly1305_emit_avx,\@function,3
1400 + cmpl \$0,20($ctx) # is_base2_26?
1403 + mov 0($ctx),%eax # load hash value base 2^26
1406 + mov 12($ctx),%r11d
1407 + mov 16($ctx),%r10d
1409 + shl \$26,%rcx # base 2^26 -> base 2^64
1425 + mov %r10,%rax # could be partially reduced, so reduce
1436 + add \$5,%r8 # compare to modulus
1440 + shr \$2,%r10 # did 130-bit value overflow?
1444 + add 0($nonce),%rax # accumulate nonce
1445 + adc 8($nonce),%rcx
1446 + mov %rax,0($mac) # write result
1450 +.size poly1305_emit_avx,.-poly1305_emit_avx
1454 +my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1455 + map("%ymm$_",(0..15));
1459 +.type poly1305_blocks_avx2,\@function,4
1461 +poly1305_blocks_avx2:
1463 + mov 20($ctx),%r8d # is_base2_26
1476 + jz .Lbase2_64_avx2
1493 +.Lblocks_avx2_body:
1495 + mov $len,%r15 # reassign $len
1497 + mov 0($ctx),$d1 # load hash value
1499 + mov 16($ctx),$h2#d
1501 + mov 24($ctx),$r0 # load r
1504 + ################################# base 2^26 -> base 2^64
1506 + and \$`-1*(1<<31)`,$d1
1507 + mov $d2,$r1 # borrow $r1
1509 + and \$`-1*(1<<31)`,$d2
1523 + adc \$0,$h2 # can be partially reduced...
1525 + mov \$-4,$d2 # ... so reduce
1538 + add $r1,$s1 # s1 = r1 + (r1 >> 2)
1540 +.Lbase2_26_pre_avx2:
1541 + add 0($inp),$h0 # accumulate input
1547 + call __poly1305_block
1551 + jnz .Lbase2_26_pre_avx2
1553 + test $padbit,$padbit # if $padbit is zero,
1554 + jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
1556 + ################################# base 2^64 -> base 2^26
1563 + and \$0x3ffffff,%rax # h[0]
1565 + and \$0x3ffffff,%rdx # h[1]
1569 + and \$0x3ffffff,$h0 # h[2]
1571 + and \$0x3ffffff,$h1 # h[3]
1575 + jz .Lstore_base2_26_avx2
1577 + vmovd %rax#d,%x#$H0
1578 + vmovd %rdx#d,%x#$H1
1579 + vmovd $h0#d,%x#$H2
1580 + vmovd $h1#d,%x#$H3
1581 + vmovd $h2#d,%x#$H4
1582 + jmp .Lproceed_avx2
1585 +.Lstore_base2_64_avx2:
1588 + mov $h2,16($ctx) # note that is_base2_26 is zeroed
1592 +.Lstore_base2_26_avx2:
1593 + mov %rax#d,0($ctx) # store hash value base 2^26
1594 + mov %rdx#d,4($ctx)
1596 + mov $h1#d,12($ctx)
1597 + mov $h2#d,16($ctx)
1613 +.cfi_adjust_cfa_offset -48
1615 +.Lblocks_avx2_epilogue:
1634 +.Lbase2_64_avx2_body:
1636 + mov $len,%r15 # reassign $len
1638 + mov 24($ctx),$r0 # load r
1641 + mov 0($ctx),$h0 # load hash value
1643 + mov 16($ctx),$h2#d
1648 + add $r1,$s1 # s1 = r1 + (r1 >> 2)
1653 +.Lbase2_64_pre_avx2:
1654 + add 0($inp),$h0 # accumulate input
1660 + call __poly1305_block
1664 + jnz .Lbase2_64_pre_avx2
1667 + ################################# base 2^64 -> base 2^26
1674 + and \$0x3ffffff,%rax # h[0]
1676 + and \$0x3ffffff,%rdx # h[1]
1680 + and \$0x3ffffff,$h0 # h[2]
1682 + and \$0x3ffffff,$h1 # h[3]
1685 + vmovd %rax#d,%x#$H0
1686 + vmovd %rdx#d,%x#$H1
1687 + vmovd $h0#d,%x#$H2
1688 + vmovd $h1#d,%x#$H3
1689 + vmovd $h2#d,%x#$H4
1690 + movl \$1,20($ctx) # set is_base2_26
1692 + call __poly1305_init_avx
1695 + mov %r15,$len # restore $len
1696 + mov OPENSSL_ia32cap_P+8(%rip),%r10d
1697 + mov \$`(1<<31|1<<30|1<<16)`,%r11d
1713 +.cfi_adjust_cfa_offset -48
1714 +.Lbase2_64_avx2_epilogue:
1721 + mov OPENSSL_ia32cap_P+8(%rip),%r10d
1722 + vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
1723 + vmovd 4*1($ctx),%x#$H1
1724 + vmovd 4*2($ctx),%x#$H2
1725 + vmovd 4*3($ctx),%x#$H3
1726 + vmovd 4*4($ctx),%x#$H4
1730 +$code.=<<___ if ($avx>2);
1734 + test \$`1<<16`,%r10d # check for AVX512F
1735 + jnz .Lblocks_avx512
1738 +$code.=<<___ if (!$win64);
1740 +.cfi_def_cfa %r11,16
1743 +$code.=<<___ if ($win64);
1744 + lea -0xf8(%rsp),%r11
1746 + vmovdqa %xmm6,0x50(%r11)
1747 + vmovdqa %xmm7,0x60(%r11)
1748 + vmovdqa %xmm8,0x70(%r11)
1749 + vmovdqa %xmm9,0x80(%r11)
1750 + vmovdqa %xmm10,0x90(%r11)
1751 + vmovdqa %xmm11,0xa0(%r11)
1752 + vmovdqa %xmm12,0xb0(%r11)
1753 + vmovdqa %xmm13,0xc0(%r11)
1754 + vmovdqa %xmm14,0xd0(%r11)
1755 + vmovdqa %xmm15,0xe0(%r11)
1759 + lea .Lconst(%rip),%rcx
1760 + lea 48+64($ctx),$ctx # size optimization
1761 + vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
1763 + # expand and copy pre-calculated table to stack
1764 + vmovdqu `16*0-64`($ctx),%x#$T2
1766 + vmovdqu `16*1-64`($ctx),%x#$T3
1767 + vmovdqu `16*2-64`($ctx),%x#$T4
1768 + vmovdqu `16*3-64`($ctx),%x#$D0
1769 + vmovdqu `16*4-64`($ctx),%x#$D1
1770 + vmovdqu `16*5-64`($ctx),%x#$D2
1771 + lea 0x90(%rsp),%rax # size optimization
1772 + vmovdqu `16*6-64`($ctx),%x#$D3
1773 + vpermd $T2,$T0,$T2 # 00003412 -> 14243444
1774 + vmovdqu `16*7-64`($ctx),%x#$D4
1775 + vpermd $T3,$T0,$T3
1776 + vmovdqu `16*8-64`($ctx),%x#$MASK
1777 + vpermd $T4,$T0,$T4
1778 + vmovdqa $T2,0x00(%rsp)
1779 + vpermd $D0,$T0,$D0
1780 + vmovdqa $T3,0x20-0x90(%rax)
1781 + vpermd $D1,$T0,$D1
1782 + vmovdqa $T4,0x40-0x90(%rax)
1783 + vpermd $D2,$T0,$D2
1784 + vmovdqa $D0,0x60-0x90(%rax)
1785 + vpermd $D3,$T0,$D3
1786 + vmovdqa $D1,0x80-0x90(%rax)
1787 + vpermd $D4,$T0,$D4
1788 + vmovdqa $D2,0xa0-0x90(%rax)
1789 + vpermd $MASK,$T0,$MASK
1790 + vmovdqa $D3,0xc0-0x90(%rax)
1791 + vmovdqa $D4,0xe0-0x90(%rax)
1792 + vmovdqa $MASK,0x100-0x90(%rax)
1793 + vmovdqa 64(%rcx),$MASK # .Lmask26
1795 + ################################################################
1797 + vmovdqu 16*0($inp),%x#$T0
1798 + vmovdqu 16*1($inp),%x#$T1
1799 + vinserti128 \$1,16*2($inp),$T0,$T0
1800 + vinserti128 \$1,16*3($inp),$T1,$T1
1801 + lea 16*4($inp),$inp
1803 + vpsrldq \$6,$T0,$T2 # splat input
1804 + vpsrldq \$6,$T1,$T3
1805 + vpunpckhqdq $T1,$T0,$T4 # 4
1806 + vpunpcklqdq $T3,$T2,$T2 # 2:3
1807 + vpunpcklqdq $T1,$T0,$T0 # 0:1
1809 + vpsrlq \$30,$T2,$T3
1810 + vpsrlq \$4,$T2,$T2
1811 + vpsrlq \$26,$T0,$T1
1812 + vpsrlq \$40,$T4,$T4 # 4
1813 + vpand $MASK,$T2,$T2 # 2
1814 + vpand $MASK,$T0,$T0 # 0
1815 + vpand $MASK,$T1,$T1 # 1
1816 + vpand $MASK,$T3,$T3 # 3
1817 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1819 + vpaddq $H2,$T2,$H2 # accumulate input
1826 + ################################################################
1827 + # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1828 + # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1829 + # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1830 + # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1831 + # \________/\__________/
1832 + ################################################################
1833 + #vpaddq $H2,$T2,$H2 # accumulate input
1834 + vpaddq $H0,$T0,$H0
1835 + vmovdqa `32*0`(%rsp),$T0 # r0^4
1836 + vpaddq $H1,$T1,$H1
1837 + vmovdqa `32*1`(%rsp),$T1 # r1^4
1838 + vpaddq $H3,$T3,$H3
1839 + vmovdqa `32*3`(%rsp),$T2 # r2^4
1840 + vpaddq $H4,$T4,$H4
1841 + vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
1842 + vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
1844 + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1845 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1846 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1847 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1848 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1850 + # however, as h2 is "chronologically" first one available pull
1851 + # corresponding operations up, so it's
1853 + # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
1854 + # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
1855 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1856 + # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
1857 + # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
1859 + vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1860 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1861 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1862 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1863 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1865 + vpmuludq $H0,$T1,$T4 # h0*r1
1866 + vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
1867 + vpaddq $T4,$D1,$D1 # d1 += h0*r1
1868 + vpaddq $H2,$D2,$D2 # d2 += h1*r1
1869 + vpmuludq $H3,$T1,$T4 # h3*r1
1870 + vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
1871 + vpaddq $T4,$D4,$D4 # d4 += h3*r1
1872 + vpaddq $H2,$D0,$D0 # d0 += h4*s1
1873 + vmovdqa `32*4-0x90`(%rax),$T1 # s2
1875 + vpmuludq $H0,$T0,$T4 # h0*r0
1876 + vpmuludq $H1,$T0,$H2 # h1*r0
1877 + vpaddq $T4,$D0,$D0 # d0 += h0*r0
1878 + vpaddq $H2,$D1,$D1 # d1 += h1*r0
1879 + vpmuludq $H3,$T0,$T4 # h3*r0
1880 + vpmuludq $H4,$T0,$H2 # h4*r0
1881 + vmovdqu 16*0($inp),%x#$T0 # load input
1882 + vpaddq $T4,$D3,$D3 # d3 += h3*r0
1883 + vpaddq $H2,$D4,$D4 # d4 += h4*r0
1884 + vinserti128 \$1,16*2($inp),$T0,$T0
1886 + vpmuludq $H3,$T1,$T4 # h3*s2
1887 + vpmuludq $H4,$T1,$H2 # h4*s2
1888 + vmovdqu 16*1($inp),%x#$T1
1889 + vpaddq $T4,$D0,$D0 # d0 += h3*s2
1890 + vpaddq $H2,$D1,$D1 # d1 += h4*s2
1891 + vmovdqa `32*5-0x90`(%rax),$H2 # r3
1892 + vpmuludq $H1,$T2,$T4 # h1*r2
1893 + vpmuludq $H0,$T2,$T2 # h0*r2
1894 + vpaddq $T4,$D3,$D3 # d3 += h1*r2
1895 + vpaddq $T2,$D2,$D2 # d2 += h0*r2
1896 + vinserti128 \$1,16*3($inp),$T1,$T1
1897 + lea 16*4($inp),$inp
1899 + vpmuludq $H1,$H2,$T4 # h1*r3
1900 + vpmuludq $H0,$H2,$H2 # h0*r3
1901 + vpsrldq \$6,$T0,$T2 # splat input
1902 + vpaddq $T4,$D4,$D4 # d4 += h1*r3
1903 + vpaddq $H2,$D3,$D3 # d3 += h0*r3
1904 + vpmuludq $H3,$T3,$T4 # h3*s3
1905 + vpmuludq $H4,$T3,$H2 # h4*s3
1906 + vpsrldq \$6,$T1,$T3
1907 + vpaddq $T4,$D1,$D1 # d1 += h3*s3
1908 + vpaddq $H2,$D2,$D2 # d2 += h4*s3
1909 + vpunpckhqdq $T1,$T0,$T4 # 4
1911 + vpmuludq $H3,$S4,$H3 # h3*s4
1912 + vpmuludq $H4,$S4,$H4 # h4*s4
1913 + vpunpcklqdq $T1,$T0,$T0 # 0:1
1914 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
1915 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
1916 + vpunpcklqdq $T3,$T2,$T3 # 2:3
1917 + vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
1918 + vpmuludq $H1,$S4,$H0 # h1*s4
1919 + vmovdqa 64(%rcx),$MASK # .Lmask26
1920 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1921 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1923 + ################################################################
1924 + # lazy reduction (interleaved with tail of input splat)
1926 + vpsrlq \$26,$H3,$D3
1927 + vpand $MASK,$H3,$H3
1928 + vpaddq $D3,$H4,$H4 # h3 -> h4
1930 + vpsrlq \$26,$H0,$D0
1931 + vpand $MASK,$H0,$H0
1932 + vpaddq $D0,$D1,$H1 # h0 -> h1
1934 + vpsrlq \$26,$H4,$D4
1935 + vpand $MASK,$H4,$H4
1937 + vpsrlq \$4,$T3,$T2
1939 + vpsrlq \$26,$H1,$D1
1940 + vpand $MASK,$H1,$H1
1941 + vpaddq $D1,$H2,$H2 # h1 -> h2
1943 + vpaddq $D4,$H0,$H0
1944 + vpsllq \$2,$D4,$D4
1945 + vpaddq $D4,$H0,$H0 # h4 -> h0
1947 + vpand $MASK,$T2,$T2 # 2
1948 + vpsrlq \$26,$T0,$T1
1950 + vpsrlq \$26,$H2,$D2
1951 + vpand $MASK,$H2,$H2
1952 + vpaddq $D2,$H3,$H3 # h2 -> h3
1954 + vpaddq $T2,$H2,$H2 # modulo-scheduled
1955 + vpsrlq \$30,$T3,$T3
1957 + vpsrlq \$26,$H0,$D0
1958 + vpand $MASK,$H0,$H0
1959 + vpaddq $D0,$H1,$H1 # h0 -> h1
1961 + vpsrlq \$40,$T4,$T4 # 4
1963 + vpsrlq \$26,$H3,$D3
1964 + vpand $MASK,$H3,$H3
1965 + vpaddq $D3,$H4,$H4 # h3 -> h4
1967 + vpand $MASK,$T0,$T0 # 0
1968 + vpand $MASK,$T1,$T1 # 1
1969 + vpand $MASK,$T3,$T3 # 3
1970 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1977 + ################################################################
1978 + # while above multiplications were by r^4 in all lanes, in last
1979 + # iteration we multiply least significant lane by r^4 and most
1980 + # significant one by r, so copy of above except that references
1981 + # to the precomputed table are displaced by 4...
1983 + #vpaddq $H2,$T2,$H2 # accumulate input
1984 + vpaddq $H0,$T0,$H0
1985 + vmovdqu `32*0+4`(%rsp),$T0 # r0^4
1986 + vpaddq $H1,$T1,$H1
1987 + vmovdqu `32*1+4`(%rsp),$T1 # r1^4
1988 + vpaddq $H3,$T3,$H3
1989 + vmovdqu `32*3+4`(%rsp),$T2 # r2^4
1990 + vpaddq $H4,$T4,$H4
1991 + vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
1992 + vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
1994 + vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1995 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1996 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1997 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1998 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2000 + vpmuludq $H0,$T1,$T4 # h0*r1
2001 + vpmuludq $H1,$T1,$H2 # h1*r1
2002 + vpaddq $T4,$D1,$D1 # d1 += h0*r1
2003 + vpaddq $H2,$D2,$D2 # d2 += h1*r1
2004 + vpmuludq $H3,$T1,$T4 # h3*r1
2005 + vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
2006 + vpaddq $T4,$D4,$D4 # d4 += h3*r1
2007 + vpaddq $H2,$D0,$D0 # d0 += h4*s1
2009 + vpmuludq $H0,$T0,$T4 # h0*r0
2010 + vpmuludq $H1,$T0,$H2 # h1*r0
2011 + vpaddq $T4,$D0,$D0 # d0 += h0*r0
2012 + vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
2013 + vpaddq $H2,$D1,$D1 # d1 += h1*r0
2014 + vpmuludq $H3,$T0,$T4 # h3*r0
2015 + vpmuludq $H4,$T0,$H2 # h4*r0
2016 + vpaddq $T4,$D3,$D3 # d3 += h3*r0
2017 + vpaddq $H2,$D4,$D4 # d4 += h4*r0
2019 + vpmuludq $H3,$T1,$T4 # h3*s2
2020 + vpmuludq $H4,$T1,$H2 # h4*s2
2021 + vpaddq $T4,$D0,$D0 # d0 += h3*s2
2022 + vpaddq $H2,$D1,$D1 # d1 += h4*s2
2023 + vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
2024 + vpmuludq $H1,$T2,$T4 # h1*r2
2025 + vpmuludq $H0,$T2,$T2 # h0*r2
2026 + vpaddq $T4,$D3,$D3 # d3 += h1*r2
2027 + vpaddq $T2,$D2,$D2 # d2 += h0*r2
2029 + vpmuludq $H1,$H2,$T4 # h1*r3
2030 + vpmuludq $H0,$H2,$H2 # h0*r3
2031 + vpaddq $T4,$D4,$D4 # d4 += h1*r3
2032 + vpaddq $H2,$D3,$D3 # d3 += h0*r3
2033 + vpmuludq $H3,$T3,$T4 # h3*s3
2034 + vpmuludq $H4,$T3,$H2 # h4*s3
2035 + vpaddq $T4,$D1,$D1 # d1 += h3*s3
2036 + vpaddq $H2,$D2,$D2 # d2 += h4*s3
2038 + vpmuludq $H3,$S4,$H3 # h3*s4
2039 + vpmuludq $H4,$S4,$H4 # h4*s4
2040 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
2041 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
2042 + vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
2043 + vpmuludq $H1,$S4,$H0 # h1*s4
2044 + vmovdqa 64(%rcx),$MASK # .Lmask26
2045 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
2046 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
2048 + ################################################################
2049 + # horizontal addition
2051 + vpsrldq \$8,$D1,$T1
2052 + vpsrldq \$8,$H2,$T2
2053 + vpsrldq \$8,$H3,$T3
2054 + vpsrldq \$8,$H4,$T4
2055 + vpsrldq \$8,$H0,$T0
2056 + vpaddq $T1,$D1,$D1
2057 + vpaddq $T2,$H2,$H2
2058 + vpaddq $T3,$H3,$H3
2059 + vpaddq $T4,$H4,$H4
2060 + vpaddq $T0,$H0,$H0
2062 + vpermq \$0x2,$H3,$T3
2063 + vpermq \$0x2,$H4,$T4
2064 + vpermq \$0x2,$H0,$T0
2065 + vpermq \$0x2,$D1,$T1
2066 + vpermq \$0x2,$H2,$T2
2067 + vpaddq $T3,$H3,$H3
2068 + vpaddq $T4,$H4,$H4
2069 + vpaddq $T0,$H0,$H0
2070 + vpaddq $T1,$D1,$D1
2071 + vpaddq $T2,$H2,$H2
2073 + ################################################################
2076 + vpsrlq \$26,$H3,$D3
2077 + vpand $MASK,$H3,$H3
2078 + vpaddq $D3,$H4,$H4 # h3 -> h4
2080 + vpsrlq \$26,$H0,$D0
2081 + vpand $MASK,$H0,$H0
2082 + vpaddq $D0,$D1,$H1 # h0 -> h1
2084 + vpsrlq \$26,$H4,$D4
2085 + vpand $MASK,$H4,$H4
2087 + vpsrlq \$26,$H1,$D1
2088 + vpand $MASK,$H1,$H1
2089 + vpaddq $D1,$H2,$H2 # h1 -> h2
2091 + vpaddq $D4,$H0,$H0
2092 + vpsllq \$2,$D4,$D4
2093 + vpaddq $D4,$H0,$H0 # h4 -> h0
2095 + vpsrlq \$26,$H2,$D2
2096 + vpand $MASK,$H2,$H2
2097 + vpaddq $D2,$H3,$H3 # h2 -> h3
2099 + vpsrlq \$26,$H0,$D0
2100 + vpand $MASK,$H0,$H0
2101 + vpaddq $D0,$H1,$H1 # h0 -> h1
2103 + vpsrlq \$26,$H3,$D3
2104 + vpand $MASK,$H3,$H3
2105 + vpaddq $D3,$H4,$H4 # h3 -> h4
2107 + vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2108 + vmovd %x#$H1,`4*1-48-64`($ctx)
2109 + vmovd %x#$H2,`4*2-48-64`($ctx)
2110 + vmovd %x#$H3,`4*3-48-64`($ctx)
2111 + vmovd %x#$H4,`4*4-48-64`($ctx)
2113 +$code.=<<___ if ($win64);
2114 + vmovdqa 0x50(%r11),%xmm6
2115 + vmovdqa 0x60(%r11),%xmm7
2116 + vmovdqa 0x70(%r11),%xmm8
2117 + vmovdqa 0x80(%r11),%xmm9
2118 + vmovdqa 0x90(%r11),%xmm10
2119 + vmovdqa 0xa0(%r11),%xmm11
2120 + vmovdqa 0xb0(%r11),%xmm12
2121 + vmovdqa 0xc0(%r11),%xmm13
2122 + vmovdqa 0xd0(%r11),%xmm14
2123 + vmovdqa 0xe0(%r11),%xmm15
2124 + lea 0xf8(%r11),%rsp
2125 +.Ldo_avx2_epilogue:
2127 +$code.=<<___ if (!$win64);
2129 +.cfi_def_cfa %rsp,8
2135 +.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
2137 +#######################################################################
2139 +# On entry we have input length divisible by 64. But since inner loop
2140 +# processes 128 bytes per iteration, cases when length is not divisible
2141 +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
2142 +# reason stack layout is kept identical to poly1305_blocks_avx2. If not
2143 +# for this tail, we wouldn't have to even allocate stack frame...
2145 +my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
2146 +my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
2147 +my $PADBIT="%zmm30";
2149 +map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
2150 +map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2151 +map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2152 +map(s/%y/%z/,($MASK));
2155 +.type poly1305_blocks_avx512,\@function,4
2157 +poly1305_blocks_avx512:
2163 +$code.=<<___ if (!$win64);
2165 +.cfi_def_cfa %r11,16
2168 +$code.=<<___ if ($win64);
2169 + lea -0xf8(%rsp),%r11
2171 + vmovdqa %xmm6,0x50(%r11)
2172 + vmovdqa %xmm7,0x60(%r11)
2173 + vmovdqa %xmm8,0x70(%r11)
2174 + vmovdqa %xmm9,0x80(%r11)
2175 + vmovdqa %xmm10,0x90(%r11)
2176 + vmovdqa %xmm11,0xa0(%r11)
2177 + vmovdqa %xmm12,0xb0(%r11)
2178 + vmovdqa %xmm13,0xc0(%r11)
2179 + vmovdqa %xmm14,0xd0(%r11)
2180 + vmovdqa %xmm15,0xe0(%r11)
2184 + lea .Lconst(%rip),%rcx
2185 + lea 48+64($ctx),$ctx # size optimization
2186 + vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
2188 + # expand pre-calculated table
2189 + vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
2191 + vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
2193 + vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
2194 + vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
2195 + vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
2196 + vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
2197 + vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
2198 + vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
2199 + vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
2200 + vpermd $D0,$T2,$R0 # 00003412 -> 14243444
2201 + vpbroadcastq 64(%rcx),$MASK # .Lmask26
2202 + vpermd $D1,$T2,$R1
2203 + vpermd $T0,$T2,$S1
2204 + vpermd $D2,$T2,$R2
2205 + vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
2206 + vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
2207 + vpermd $T1,$T2,$S2
2208 + vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
2209 + vpsrlq \$32,$R1,$T1
2210 + vpermd $D3,$T2,$R3
2211 + vmovdqa64 $S1,0x40(%rsp){%k2}
2212 + vpermd $T3,$T2,$S3
2213 + vpermd $D4,$T2,$R4
2214 + vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
2215 + vpermd $T4,$T2,$S4
2216 + vmovdqa64 $S2,0x80(%rsp){%k2}
2217 + vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
2218 + vmovdqa64 $S3,0xc0(%rsp){%k2}
2219 + vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
2220 + vmovdqa64 $S4,0x100(%rsp){%k2}
2222 + ################################################################
2223 + # calculate 5th through 8th powers of the key
2225 + # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
2226 + # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2227 + # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
2228 + # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
2229 + # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
2231 + vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
2232 + vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
2233 + vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
2234 + vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
2235 + vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
2236 + vpsrlq \$32,$R2,$T2
2238 + vpmuludq $T1,$S4,$M0
2239 + vpmuludq $T1,$R0,$M1
2240 + vpmuludq $T1,$R1,$M2
2241 + vpmuludq $T1,$R2,$M3
2242 + vpmuludq $T1,$R3,$M4
2243 + vpsrlq \$32,$R3,$T3
2244 + vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
2245 + vpaddq $M1,$D1,$D1 # d1 += r1'*r0
2246 + vpaddq $M2,$D2,$D2 # d2 += r1'*r1
2247 + vpaddq $M3,$D3,$D3 # d3 += r1'*r2
2248 + vpaddq $M4,$D4,$D4 # d4 += r1'*r3
2250 + vpmuludq $T2,$S3,$M0
2251 + vpmuludq $T2,$S4,$M1
2252 + vpmuludq $T2,$R1,$M3
2253 + vpmuludq $T2,$R2,$M4
2254 + vpmuludq $T2,$R0,$M2
2255 + vpsrlq \$32,$R4,$T4
2256 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
2257 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
2258 + vpaddq $M3,$D3,$D3 # d3 += r2'*r1
2259 + vpaddq $M4,$D4,$D4 # d4 += r2'*r2
2260 + vpaddq $M2,$D2,$D2 # d2 += r2'*r0
2262 + vpmuludq $T3,$S2,$M0
2263 + vpmuludq $T3,$R0,$M3
2264 + vpmuludq $T3,$R1,$M4
2265 + vpmuludq $T3,$S3,$M1
2266 + vpmuludq $T3,$S4,$M2
2267 + vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
2268 + vpaddq $M3,$D3,$D3 # d3 += r3'*r0
2269 + vpaddq $M4,$D4,$D4 # d4 += r3'*r1
2270 + vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
2271 + vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
2273 + vpmuludq $T4,$S4,$M3
2274 + vpmuludq $T4,$R0,$M4
2275 + vpmuludq $T4,$S1,$M0
2276 + vpmuludq $T4,$S2,$M1
2277 + vpmuludq $T4,$S3,$M2
2278 + vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4
2279 + vpaddq $M4,$D4,$D4 # d4 += r2'*r0
2280 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1
2281 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2
2282 + vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3
2284 + ################################################################
2286 + vmovdqu64 16*0($inp),%z#$T3
2287 + vmovdqu64 16*4($inp),%z#$T4
2288 + lea 16*8($inp),$inp
2290 + ################################################################
2293 + vpsrlq \$26,$D3,$M3
2294 + vpandq $MASK,$D3,$D3
2295 + vpaddq $M3,$D4,$D4 # d3 -> d4
2297 + vpsrlq \$26,$D0,$M0
2298 + vpandq $MASK,$D0,$D0
2299 + vpaddq $M0,$D1,$D1 # d0 -> d1
2301 + vpsrlq \$26,$D4,$M4
2302 + vpandq $MASK,$D4,$D4
2304 + vpsrlq \$26,$D1,$M1
2305 + vpandq $MASK,$D1,$D1
2306 + vpaddq $M1,$D2,$D2 # d1 -> d2
2308 + vpaddq $M4,$D0,$D0
2309 + vpsllq \$2,$M4,$M4
2310 + vpaddq $M4,$D0,$D0 # d4 -> d0
2312 + vpsrlq \$26,$D2,$M2
2313 + vpandq $MASK,$D2,$D2
2314 + vpaddq $M2,$D3,$D3 # d2 -> d3
2316 + vpsrlq \$26,$D0,$M0
2317 + vpandq $MASK,$D0,$D0
2318 + vpaddq $M0,$D1,$D1 # d0 -> d1
2320 + vpsrlq \$26,$D3,$M3
2321 + vpandq $MASK,$D3,$D3
2322 + vpaddq $M3,$D4,$D4 # d3 -> d4
2324 + ################################################################
2325 + # at this point we have 14243444 in $R0-$S4 and 05060708 in
2328 + vpunpcklqdq $T4,$T3,$T0 # transpose input
2329 + vpunpckhqdq $T4,$T3,$T4
2331 + # ... since input 64-bit lanes are ordered as 73625140, we could
2332 + # "vperm" it to 76543210 (here and in each loop iteration), *or*
2333 + # we could just flow along, hence the goal for $R0-$S4 is
2334 + # 1858286838784888 ...
2336 + vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
2340 + vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
2341 + vpermd $R1,$M0,$R1
2342 + vpermd $R2,$M0,$R2
2343 + vpermd $R3,$M0,$R3
2344 + vpermd $R4,$M0,$R4
2346 + vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
2347 + vpermd $D1,$M0,${R1}{%k1}
2348 + vpermd $D2,$M0,${R2}{%k1}
2349 + vpermd $D3,$M0,${R3}{%k1}
2350 + vpermd $D4,$M0,${R4}{%k1}
2352 + vpslld \$2,$R1,$S1 # *5
2353 + vpslld \$2,$R2,$S2
2354 + vpslld \$2,$R3,$S3
2355 + vpslld \$2,$R4,$S4
2356 + vpaddd $R1,$S1,$S1
2357 + vpaddd $R2,$S2,$S2
2358 + vpaddd $R3,$S3,$S3
2359 + vpaddd $R4,$S4,$S4
2361 + vpbroadcastq 32(%rcx),$PADBIT # .L129
2363 + vpsrlq \$52,$T0,$T2 # splat input
2364 + vpsllq \$12,$T4,$T3
2366 + vpsrlq \$26,$T0,$T1
2367 + vpsrlq \$14,$T4,$T3
2368 + vpsrlq \$40,$T4,$T4 # 4
2369 + vpandq $MASK,$T2,$T2 # 2
2370 + vpandq $MASK,$T0,$T0 # 0
2371 + #vpandq $MASK,$T1,$T1 # 1
2372 + #vpandq $MASK,$T3,$T3 # 3
2373 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2375 + vpaddq $H2,$T2,$H2 # accumulate input
2382 + ################################################################
2383 + # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
2384 + # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
2385 + # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
2386 + # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2387 + # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
2388 + # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2389 + # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
2390 + # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
2391 + # \________/\___________/
2392 + ################################################################
2393 + #vpaddq $H2,$T2,$H2 # accumulate input
2395 + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
2396 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
2397 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
2398 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
2399 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2401 + # however, as h2 is "chronologically" first one available pull
2402 + # corresponding operations up, so it's
2404 + # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
2405 + # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
2406 + # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
2407 + # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
2408 + # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
2410 + vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2411 + vpaddq $H0,$T0,$H0
2412 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2413 + vpandq $MASK,$T1,$T1 # 1
2414 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2415 + vpandq $MASK,$T3,$T3 # 3
2416 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2417 + vporq $PADBIT,$T4,$T4 # padbit, yes, always
2418 + vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2419 + vpaddq $H1,$T1,$H1 # accumulate input
2420 + vpaddq $H3,$T3,$H3
2421 + vpaddq $H4,$T4,$H4
2423 + vmovdqu64 16*0($inp),$T3 # load input
2424 + vmovdqu64 16*4($inp),$T4
2425 + lea 16*8($inp),$inp
2426 + vpmuludq $H0,$R3,$M3
2427 + vpmuludq $H0,$R4,$M4
2428 + vpmuludq $H0,$R0,$M0
2429 + vpmuludq $H0,$R1,$M1
2430 + vpaddq $M3,$D3,$D3 # d3 += h0*r3
2431 + vpaddq $M4,$D4,$D4 # d4 += h0*r4
2432 + vpaddq $M0,$D0,$D0 # d0 += h0*r0
2433 + vpaddq $M1,$D1,$D1 # d1 += h0*r1
2435 + vpmuludq $H1,$R2,$M3
2436 + vpmuludq $H1,$R3,$M4
2437 + vpmuludq $H1,$S4,$M0
2438 + vpmuludq $H0,$R2,$M2
2439 + vpaddq $M3,$D3,$D3 # d3 += h1*r2
2440 + vpaddq $M4,$D4,$D4 # d4 += h1*r3
2441 + vpaddq $M0,$D0,$D0 # d0 += h1*s4
2442 + vpaddq $M2,$D2,$D2 # d2 += h0*r2
2444 + vpunpcklqdq $T4,$T3,$T0 # transpose input
2445 + vpunpckhqdq $T4,$T3,$T4
2447 + vpmuludq $H3,$R0,$M3
2448 + vpmuludq $H3,$R1,$M4
2449 + vpmuludq $H1,$R0,$M1
2450 + vpmuludq $H1,$R1,$M2
2451 + vpaddq $M3,$D3,$D3 # d3 += h3*r0
2452 + vpaddq $M4,$D4,$D4 # d4 += h3*r1
2453 + vpaddq $M1,$D1,$D1 # d1 += h1*r0
2454 + vpaddq $M2,$D2,$D2 # d2 += h1*r1
2456 + vpmuludq $H4,$S4,$M3
2457 + vpmuludq $H4,$R0,$M4
2458 + vpmuludq $H3,$S2,$M0
2459 + vpmuludq $H3,$S3,$M1
2460 + vpaddq $M3,$D3,$D3 # d3 += h4*s4
2461 + vpmuludq $H3,$S4,$M2
2462 + vpaddq $M4,$D4,$D4 # d4 += h4*r0
2463 + vpaddq $M0,$D0,$D0 # d0 += h3*s2
2464 + vpaddq $M1,$D1,$D1 # d1 += h3*s3
2465 + vpaddq $M2,$D2,$D2 # d2 += h3*s4
2467 + vpmuludq $H4,$S1,$M0
2468 + vpmuludq $H4,$S2,$M1
2469 + vpmuludq $H4,$S3,$M2
2470 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2471 + vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2
2472 + vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3
2474 + ################################################################
2475 + # lazy reduction (interleaved with input splat)
2477 + vpsrlq \$52,$T0,$T2 # splat input
2478 + vpsllq \$12,$T4,$T3
2480 + vpsrlq \$26,$D3,$H3
2481 + vpandq $MASK,$D3,$D3
2482 + vpaddq $H3,$D4,$H4 # h3 -> h4
2486 + vpsrlq \$26,$H0,$D0
2487 + vpandq $MASK,$H0,$H0
2488 + vpaddq $D0,$H1,$H1 # h0 -> h1
2490 + vpandq $MASK,$T2,$T2 # 2
2492 + vpsrlq \$26,$H4,$D4
2493 + vpandq $MASK,$H4,$H4
2495 + vpsrlq \$26,$H1,$D1
2496 + vpandq $MASK,$H1,$H1
2497 + vpaddq $D1,$H2,$H2 # h1 -> h2
2499 + vpaddq $D4,$H0,$H0
2500 + vpsllq \$2,$D4,$D4
2501 + vpaddq $D4,$H0,$H0 # h4 -> h0
2503 + vpaddq $T2,$H2,$H2 # modulo-scheduled
2504 + vpsrlq \$26,$T0,$T1
2506 + vpsrlq \$26,$H2,$D2
2507 + vpandq $MASK,$H2,$H2
2508 + vpaddq $D2,$D3,$H3 # h2 -> h3
2510 + vpsrlq \$14,$T4,$T3
2512 + vpsrlq \$26,$H0,$D0
2513 + vpandq $MASK,$H0,$H0
2514 + vpaddq $D0,$H1,$H1 # h0 -> h1
2516 + vpsrlq \$40,$T4,$T4 # 4
2518 + vpsrlq \$26,$H3,$D3
2519 + vpandq $MASK,$H3,$H3
2520 + vpaddq $D3,$H4,$H4 # h3 -> h4
2522 + vpandq $MASK,$T0,$T0 # 0
2523 + #vpandq $MASK,$T1,$T1 # 1
2524 + #vpandq $MASK,$T3,$T3 # 3
2525 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2531 + ################################################################
2532 + # while above multiplications were by r^8 in all lanes, in last
2533 + # iteration we multiply least significant lane by r^8 and most
2534 + # significant one by r, that's why table gets shifted...
2536 + vpsrlq \$32,$R0,$R0 # 0105020603070408
2537 + vpsrlq \$32,$R1,$R1
2538 + vpsrlq \$32,$R2,$R2
2539 + vpsrlq \$32,$S3,$S3
2540 + vpsrlq \$32,$S4,$S4
2541 + vpsrlq \$32,$R3,$R3
2542 + vpsrlq \$32,$R4,$R4
2543 + vpsrlq \$32,$S1,$S1
2544 + vpsrlq \$32,$S2,$S2
2546 + ################################################################
2547 + # load either next or last 64 byte of input
2548 + lea ($inp,$len),$inp
2550 + #vpaddq $H2,$T2,$H2 # accumulate input
2551 + vpaddq $H0,$T0,$H0
2553 + vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2554 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2555 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2556 + vpandq $MASK,$T1,$T1 # 1
2557 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2558 + vpandq $MASK,$T3,$T3 # 3
2559 + vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2560 + vporq $PADBIT,$T4,$T4 # padbit, yes, always
2561 + vpaddq $H1,$T1,$H1 # accumulate input
2562 + vpaddq $H3,$T3,$H3
2563 + vpaddq $H4,$T4,$H4
2565 + vmovdqu 16*0($inp),%x#$T0
2566 + vpmuludq $H0,$R3,$M3
2567 + vpmuludq $H0,$R4,$M4
2568 + vpmuludq $H0,$R0,$M0
2569 + vpmuludq $H0,$R1,$M1
2570 + vpaddq $M3,$D3,$D3 # d3 += h0*r3
2571 + vpaddq $M4,$D4,$D4 # d4 += h0*r4
2572 + vpaddq $M0,$D0,$D0 # d0 += h0*r0
2573 + vpaddq $M1,$D1,$D1 # d1 += h0*r1
2575 + vmovdqu 16*1($inp),%x#$T1
2576 + vpmuludq $H1,$R2,$M3
2577 + vpmuludq $H1,$R3,$M4
2578 + vpmuludq $H1,$S4,$M0
2579 + vpmuludq $H0,$R2,$M2
2580 + vpaddq $M3,$D3,$D3 # d3 += h1*r2
2581 + vpaddq $M4,$D4,$D4 # d4 += h1*r3
2582 + vpaddq $M0,$D0,$D0 # d0 += h1*s4
2583 + vpaddq $M2,$D2,$D2 # d2 += h0*r2
2585 + vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
2586 + vpmuludq $H3,$R0,$M3
2587 + vpmuludq $H3,$R1,$M4
2588 + vpmuludq $H1,$R0,$M1
2589 + vpmuludq $H1,$R1,$M2
2590 + vpaddq $M3,$D3,$D3 # d3 += h3*r0
2591 + vpaddq $M4,$D4,$D4 # d4 += h3*r1
2592 + vpaddq $M1,$D1,$D1 # d1 += h1*r0
2593 + vpaddq $M2,$D2,$D2 # d2 += h1*r1
2595 + vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
2596 + vpmuludq $H4,$S4,$M3
2597 + vpmuludq $H4,$R0,$M4
2598 + vpmuludq $H3,$S2,$M0
2599 + vpmuludq $H3,$S3,$M1
2600 + vpmuludq $H3,$S4,$M2
2601 + vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
2602 + vpaddq $M4,$D4,$D4 # d4 += h4*r0
2603 + vpaddq $M0,$D0,$D0 # d0 += h3*s2
2604 + vpaddq $M1,$D1,$D1 # d1 += h3*s3
2605 + vpaddq $M2,$D2,$D2 # d2 += h3*s4
2607 + vpmuludq $H4,$S1,$M0
2608 + vpmuludq $H4,$S2,$M1
2609 + vpmuludq $H4,$S3,$M2
2610 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2611 + vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2
2612 + vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3
2614 + ################################################################
2615 + # horizontal addition
2618 + vpermq \$0xb1,$H3,$D3
2619 + vpermq \$0xb1,$D4,$H4
2620 + vpermq \$0xb1,$H0,$D0
2621 + vpermq \$0xb1,$H1,$D1
2622 + vpermq \$0xb1,$H2,$D2
2623 + vpaddq $D3,$H3,$H3
2624 + vpaddq $D4,$H4,$H4
2625 + vpaddq $D0,$H0,$H0
2626 + vpaddq $D1,$H1,$H1
2627 + vpaddq $D2,$H2,$H2
2630 + vpermq \$0x2,$H3,$D3
2631 + vpermq \$0x2,$H4,$D4
2632 + vpermq \$0x2,$H0,$D0
2633 + vpermq \$0x2,$H1,$D1
2634 + vpermq \$0x2,$H2,$D2
2635 + vpaddq $D3,$H3,$H3
2636 + vpaddq $D4,$H4,$H4
2637 + vpaddq $D0,$H0,$H0
2638 + vpaddq $D1,$H1,$H1
2639 + vpaddq $D2,$H2,$H2
2641 + vextracti64x4 \$0x1,$H3,%y#$D3
2642 + vextracti64x4 \$0x1,$H4,%y#$D4
2643 + vextracti64x4 \$0x1,$H0,%y#$D0
2644 + vextracti64x4 \$0x1,$H1,%y#$D1
2645 + vextracti64x4 \$0x1,$H2,%y#$D2
2646 + vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
2647 + vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
2648 + vpaddq $D0,$H0,${H0}{%k3}{z}
2649 + vpaddq $D1,$H1,${H1}{%k3}{z}
2650 + vpaddq $D2,$H2,${H2}{%k3}{z}
2652 +map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
2653 +map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2655 + ################################################################
2656 + # lazy reduction (interleaved with input splat)
2658 + vpsrlq \$26,$H3,$D3
2659 + vpand $MASK,$H3,$H3
2660 + vpsrldq \$6,$T0,$T2 # splat input
2661 + vpsrldq \$6,$T1,$T3
2662 + vpunpckhqdq $T1,$T0,$T4 # 4
2663 + vpaddq $D3,$H4,$H4 # h3 -> h4
2665 + vpsrlq \$26,$H0,$D0
2666 + vpand $MASK,$H0,$H0
2667 + vpunpcklqdq $T3,$T2,$T2 # 2:3
2668 + vpunpcklqdq $T1,$T0,$T0 # 0:1
2669 + vpaddq $D0,$H1,$H1 # h0 -> h1
2671 + vpsrlq \$26,$H4,$D4
2672 + vpand $MASK,$H4,$H4
2674 + vpsrlq \$26,$H1,$D1
2675 + vpand $MASK,$H1,$H1
2676 + vpsrlq \$30,$T2,$T3
2677 + vpsrlq \$4,$T2,$T2
2678 + vpaddq $D1,$H2,$H2 # h1 -> h2
2680 + vpaddq $D4,$H0,$H0
2681 + vpsllq \$2,$D4,$D4
2682 + vpsrlq \$26,$T0,$T1
2683 + vpsrlq \$40,$T4,$T4 # 4
2684 + vpaddq $D4,$H0,$H0 # h4 -> h0
2686 + vpsrlq \$26,$H2,$D2
2687 + vpand $MASK,$H2,$H2
2688 + vpand $MASK,$T2,$T2 # 2
2689 + vpand $MASK,$T0,$T0 # 0
2690 + vpaddq $D2,$H3,$H3 # h2 -> h3
2692 + vpsrlq \$26,$H0,$D0
2693 + vpand $MASK,$H0,$H0
2694 + vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
2695 + vpand $MASK,$T1,$T1 # 1
2696 + vpaddq $D0,$H1,$H1 # h0 -> h1
2698 + vpsrlq \$26,$H3,$D3
2699 + vpand $MASK,$H3,$H3
2700 + vpand $MASK,$T3,$T3 # 3
2701 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always
2702 + vpaddq $D3,$H4,$H4 # h3 -> h4
2704 + lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
2708 + vpsubq $T2,$H2,$H2 # undo input accumulation
2709 + vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2710 + vmovd %x#$H1,`4*1-48-64`($ctx)
2711 + vmovd %x#$H2,`4*2-48-64`($ctx)
2712 + vmovd %x#$H3,`4*3-48-64`($ctx)
2713 + vmovd %x#$H4,`4*4-48-64`($ctx)
2716 +$code.=<<___ if ($win64);
2717 + movdqa 0x50(%r11),%xmm6
2718 + movdqa 0x60(%r11),%xmm7
2719 + movdqa 0x70(%r11),%xmm8
2720 + movdqa 0x80(%r11),%xmm9
2721 + movdqa 0x90(%r11),%xmm10
2722 + movdqa 0xa0(%r11),%xmm11
2723 + movdqa 0xb0(%r11),%xmm12
2724 + movdqa 0xc0(%r11),%xmm13
2725 + movdqa 0xd0(%r11),%xmm14
2726 + movdqa 0xe0(%r11),%xmm15
2727 + lea 0xf8(%r11),%rsp
2728 +.Ldo_avx512_epilogue:
2730 +$code.=<<___ if (!$win64);
2732 +.cfi_def_cfa %rsp,8
2737 +.size poly1305_blocks_avx512,.-poly1305_blocks_avx512
2740 +########################################################################
2741 +# VPMADD52 version using 2^44 radix.
2743 +# One can argue that base 2^52 would be more natural. Well, even though
2744 +# some operations would be more natural, one has to recognize couple of
2745 +# things. Base 2^52 doesn't provide advantage over base 2^44 if you look
2746 +# at amount of multiply-n-accumulate operations. Secondly, it makes it
2747 +# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
2748 +# reference implementations], which means that more such operations
2749 +# would have to be performed in inner loop, which in turn makes critical
2750 +# path longer. In other words, even though base 2^44 reduction might
2751 +# look less elegant, overall critical path is actually shorter...
2753 +########################################################################
2754 +# Layout of opaque area is following.
2756 +# unsigned __int64 h[3]; # current hash value base 2^44
2757 +# unsigned __int64 s[2]; # key value*20 base 2^44
2758 +# unsigned __int64 r[3]; # key value base 2^44
2759 +# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
2760 +# # r^n positions reflect
2761 +# # placement in register, not
2762 +# # memory, R[3] is R[1]*20
2765 +.type poly1305_init_base2_44,\@function,3
2767 +poly1305_init_base2_44:
2769 + mov %rax,0($ctx) # initialize hash value
2774 + lea poly1305_blocks_vpmadd52(%rip),%r10
2775 + lea poly1305_emit_base2_44(%rip),%r11
2777 + mov \$0x0ffffffc0fffffff,%rax
2778 + mov \$0x0ffffffc0ffffffc,%rcx
2780 + mov \$0x00000fffffffffff,%r8
2782 + mov \$0x00000fffffffffff,%r9
2784 + shrd \$44,%rcx,%rax
2785 + mov %r8,40($ctx) # r0
2788 + mov %rax,48($ctx) # r1
2789 + lea (%rax,%rax,4),%rax # *5
2790 + mov %rcx,56($ctx) # r2
2791 + shl \$2,%rax # magic <<2
2792 + lea (%rcx,%rcx,4),%rcx # *5
2793 + shl \$2,%rcx # magic <<2
2794 + mov %rax,24($ctx) # s1
2795 + mov %rcx,32($ctx) # s2
2796 + movq \$-1,64($ctx) # write impossible value
2798 +$code.=<<___ if ($flavour !~ /elf32/);
2802 +$code.=<<___ if ($flavour =~ /elf32/);
2809 +.size poly1305_init_base2_44,.-poly1305_init_base2_44
2812 +my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
2813 +my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
2814 +my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
2817 +.type poly1305_blocks_vpmadd52,\@function,4
2819 +poly1305_blocks_vpmadd52:
2821 + jz .Lno_data_vpmadd52 # too short
2824 + mov 64($ctx),%r8 # peek on power of the key
2826 + # if powers of the key are not calculated yet, process up to 3
2827 + # blocks with this single-block subroutine, otherwise ensure that
2828 + # length is divisible by 2 blocks and pass the rest down to next
2833 + cmp \$4,$len # is input long
2835 + test %r8,%r8 # is power value impossible?
2838 + and $len,%rax # is input of favourable length?
2839 + jz .Lblocks_vpmadd52_4x
2845 + lea .L2_44_inp_permd(%rip),%r10
2848 + vmovq $padbit,%x#$PAD
2849 + vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
2850 + vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
2851 + vpermq \$0xcf,$PAD,$PAD
2852 + vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
2854 + vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
2855 + vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
2856 + vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
2857 + vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
2859 + vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
2860 + vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
2862 + jmp .Loop_vpmadd52
2866 + vmovdqu32 0($inp),%x#$T0 # load input as ----3210
2869 + vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
2870 + vpsrlvq $inp_shift,$T0,$T0
2871 + vpandq $reduc_mask,$T0,$T0
2872 + vporq $PAD,$T0,$T0
2874 + vpaddq $T0,$Dlo,$Dlo # accumulate input
2876 + vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
2877 + vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
2878 + vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
2880 + vpxord $Dlo,$Dlo,$Dlo
2881 + vpxord $Dhi,$Dhi,$Dhi
2883 + vpmadd52luq $r2r1r0,$H0,$Dlo
2884 + vpmadd52huq $r2r1r0,$H0,$Dhi
2886 + vpmadd52luq $r1r0s2,$H1,$Dlo
2887 + vpmadd52huq $r1r0s2,$H1,$Dhi
2889 + vpmadd52luq $r0s2s1,$H2,$Dlo
2890 + vpmadd52huq $r0s2s1,$H2,$Dhi
2892 + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
2893 + vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
2894 + vpandq $reduc_mask,$Dlo,$Dlo
2896 + vpaddq $T0,$Dhi,$Dhi
2898 + vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
2900 + vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
2902 + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
2903 + vpandq $reduc_mask,$Dlo,$Dlo
2905 + vpermq \$0b10010011,$T0,$T0
2907 + vpaddq $T0,$Dlo,$Dlo
2909 + vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
2911 + vpaddq $T0,$Dlo,$Dlo
2912 + vpsllq \$2,$T0,$T0
2914 + vpaddq $T0,$Dlo,$Dlo
2916 + dec %rax # len-=16
2917 + jnz .Loop_vpmadd52
2919 + vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
2922 + jnz .Lblocks_vpmadd52_4x
2924 +.Lno_data_vpmadd52:
2926 +.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
2930 +########################################################################
2931 +# As implied by its name 4x subroutine processes 4 blocks in parallel
2932 +# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power
2933 +# and is handled in 256-bit %ymm registers.
2935 +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
2936 +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
2937 +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
2940 +.type poly1305_blocks_vpmadd52_4x,\@function,4
2942 +poly1305_blocks_vpmadd52_4x:
2944 + jz .Lno_data_vpmadd52_4x # too short
2947 + mov 64($ctx),%r8 # peek on power of the key
2949 +.Lblocks_vpmadd52_4x:
2950 + vpbroadcastq $padbit,$PAD
2952 + vmovdqa64 .Lx_mask44(%rip),$mask44
2954 + vmovdqa64 .Lx_mask42(%rip),$mask42
2955 + kmovw %eax,%k1 # used in 2x path
2957 + test %r8,%r8 # is power value impossible?
2958 + js .Linit_vpmadd52 # if it is, then init R[4]
2960 + vmovq 0($ctx),%x#$H0 # load current hash value
2961 + vmovq 8($ctx),%x#$H1
2962 + vmovq 16($ctx),%x#$H2
2964 + test \$3,$len # is length 4*n+2?
2965 + jnz .Lblocks_vpmadd52_2x_do
2967 +.Lblocks_vpmadd52_4x_do:
2968 + vpbroadcastq 64($ctx),$R0 # load 4th power of the key
2969 + vpbroadcastq 96($ctx),$R1
2970 + vpbroadcastq 128($ctx),$R2
2971 + vpbroadcastq 160($ctx),$S1
2973 +.Lblocks_vpmadd52_4x_key_loaded:
2974 + vpsllq \$2,$R2,$S2 # S2 = R2*5*4
2975 + vpaddq $R2,$S2,$S2
2976 + vpsllq \$2,$S2,$S2
2978 + test \$7,$len # is len 8*n?
2979 + jz .Lblocks_vpmadd52_8x
2981 + vmovdqu64 16*0($inp),$T2 # load data
2982 + vmovdqu64 16*2($inp),$T3
2983 + lea 16*4($inp),$inp
2985 + vpunpcklqdq $T3,$T2,$T1 # transpose data
2986 + vpunpckhqdq $T3,$T2,$T3
2988 + # at this point 64-bit lanes are ordered as 3-1-2-0
2990 + vpsrlq \$24,$T3,$T2 # splat the data
2991 + vporq $PAD,$T2,$T2
2992 + vpaddq $T2,$H2,$H2 # accumulate input
2993 + vpandq $mask44,$T1,$T0
2994 + vpsrlq \$44,$T1,$T1
2995 + vpsllq \$20,$T3,$T3
2997 + vpandq $mask44,$T1,$T1
3000 + jz .Ltail_vpmadd52_4x
3001 + jmp .Loop_vpmadd52_4x
3006 + vmovq 24($ctx),%x#$S1 # load key
3007 + vmovq 56($ctx),%x#$H2
3008 + vmovq 32($ctx),%x#$S2
3009 + vmovq 40($ctx),%x#$R0
3010 + vmovq 48($ctx),%x#$R1
3018 +.Lmul_init_vpmadd52:
3019 + vpxorq $D0lo,$D0lo,$D0lo
3020 + vpmadd52luq $H2,$S1,$D0lo
3021 + vpxorq $D0hi,$D0hi,$D0hi
3022 + vpmadd52huq $H2,$S1,$D0hi
3023 + vpxorq $D1lo,$D1lo,$D1lo
3024 + vpmadd52luq $H2,$S2,$D1lo
3025 + vpxorq $D1hi,$D1hi,$D1hi
3026 + vpmadd52huq $H2,$S2,$D1hi
3027 + vpxorq $D2lo,$D2lo,$D2lo
3028 + vpmadd52luq $H2,$R0,$D2lo
3029 + vpxorq $D2hi,$D2hi,$D2hi
3030 + vpmadd52huq $H2,$R0,$D2hi
3032 + vpmadd52luq $H0,$R0,$D0lo
3033 + vpmadd52huq $H0,$R0,$D0hi
3034 + vpmadd52luq $H0,$R1,$D1lo
3035 + vpmadd52huq $H0,$R1,$D1hi
3036 + vpmadd52luq $H0,$R2,$D2lo
3037 + vpmadd52huq $H0,$R2,$D2hi
3039 + vpmadd52luq $H1,$S2,$D0lo
3040 + vpmadd52huq $H1,$S2,$D0hi
3041 + vpmadd52luq $H1,$R0,$D1lo
3042 + vpmadd52huq $H1,$R0,$D1hi
3043 + vpmadd52luq $H1,$R1,$D2lo
3044 + vpmadd52huq $H1,$R1,$D2hi
3046 + ################################################################
3047 + # partial reduction
3048 + vpsrlq \$44,$D0lo,$tmp
3049 + vpsllq \$8,$D0hi,$D0hi
3050 + vpandq $mask44,$D0lo,$H0
3051 + vpaddq $tmp,$D0hi,$D0hi
3053 + vpaddq $D0hi,$D1lo,$D1lo
3055 + vpsrlq \$44,$D1lo,$tmp
3056 + vpsllq \$8,$D1hi,$D1hi
3057 + vpandq $mask44,$D1lo,$H1
3058 + vpaddq $tmp,$D1hi,$D1hi
3060 + vpaddq $D1hi,$D2lo,$D2lo
3062 + vpsrlq \$42,$D2lo,$tmp
3063 + vpsllq \$10,$D2hi,$D2hi
3064 + vpandq $mask42,$D2lo,$H2
3065 + vpaddq $tmp,$D2hi,$D2hi
3067 + vpaddq $D2hi,$H0,$H0
3068 + vpsllq \$2,$D2hi,$D2hi
3070 + vpaddq $D2hi,$H0,$H0
3072 + vpsrlq \$44,$H0,$tmp # additional step
3073 + vpandq $mask44,$H0,$H0
3075 + vpaddq $tmp,$H1,$H1
3078 + jz .Ldone_init_vpmadd52
3080 + vpunpcklqdq $R1,$H1,$R1 # 1,2
3081 + vpbroadcastq %x#$H1,%x#$H1 # 2,2
3082 + vpunpcklqdq $R2,$H2,$R2
3083 + vpbroadcastq %x#$H2,%x#$H2
3084 + vpunpcklqdq $R0,$H0,$R0
3085 + vpbroadcastq %x#$H0,%x#$H0
3087 + vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3088 + vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3089 + vpaddq $R1,$S1,$S1
3090 + vpaddq $R2,$S2,$S2
3091 + vpsllq \$2,$S1,$S1
3092 + vpsllq \$2,$S2,$S2
3094 + jmp .Lmul_init_vpmadd52
3098 +.Ldone_init_vpmadd52:
3099 + vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
3100 + vinserti128 \$1,%x#$R2,$H2,$R2
3101 + vinserti128 \$1,%x#$R0,$H0,$R0
3103 + vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
3104 + vpermq \$0b11011000,$R2,$R2
3105 + vpermq \$0b11011000,$R0,$R0
3107 + vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3108 + vpaddq $R1,$S1,$S1
3109 + vpsllq \$2,$S1,$S1
3111 + vmovq 0($ctx),%x#$H0 # load current hash value
3112 + vmovq 8($ctx),%x#$H1
3113 + vmovq 16($ctx),%x#$H2
3115 + test \$3,$len # is length 4*n+2?
3116 + jnz .Ldone_init_vpmadd52_2x
3118 + vmovdqu64 $R0,64($ctx) # save key powers
3119 + vpbroadcastq %x#$R0,$R0 # broadcast 4th power
3120 + vmovdqu64 $R1,96($ctx)
3121 + vpbroadcastq %x#$R1,$R1
3122 + vmovdqu64 $R2,128($ctx)
3123 + vpbroadcastq %x#$R2,$R2
3124 + vmovdqu64 $S1,160($ctx)
3125 + vpbroadcastq %x#$S1,$S1
3127 + jmp .Lblocks_vpmadd52_4x_key_loaded
3131 +.Ldone_init_vpmadd52_2x:
3132 + vmovdqu64 $R0,64($ctx) # save key powers
3133 + vpsrldq \$8,$R0,$R0 # 0-1-0-2
3134 + vmovdqu64 $R1,96($ctx)
3135 + vpsrldq \$8,$R1,$R1
3136 + vmovdqu64 $R2,128($ctx)
3137 + vpsrldq \$8,$R2,$R2
3138 + vmovdqu64 $S1,160($ctx)
3139 + vpsrldq \$8,$S1,$S1
3140 + jmp .Lblocks_vpmadd52_2x_key_loaded
3144 +.Lblocks_vpmadd52_2x_do:
3145 + vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
3146 + vmovdqu64 160+8($ctx),${S1}{%k1}{z}
3147 + vmovdqu64 64+8($ctx),${R0}{%k1}{z}
3148 + vmovdqu64 96+8($ctx),${R1}{%k1}{z}
3150 +.Lblocks_vpmadd52_2x_key_loaded:
3151 + vmovdqu64 16*0($inp),$T2 # load data
3152 + vpxorq $T3,$T3,$T3
3153 + lea 16*2($inp),$inp
3155 + vpunpcklqdq $T3,$T2,$T1 # transpose data
3156 + vpunpckhqdq $T3,$T2,$T3
3158 + # at this point 64-bit lanes are ordered as x-1-x-0
3160 + vpsrlq \$24,$T3,$T2 # splat the data
3161 + vporq $PAD,$T2,$T2
3162 + vpaddq $T2,$H2,$H2 # accumulate input
3163 + vpandq $mask44,$T1,$T0
3164 + vpsrlq \$44,$T1,$T1
3165 + vpsllq \$20,$T3,$T3
3167 + vpandq $mask44,$T1,$T1
3169 + jmp .Ltail_vpmadd52_2x
3174 + #vpaddq $T2,$H2,$H2 # accumulate input
3175 + vpaddq $T0,$H0,$H0
3176 + vpaddq $T1,$H1,$H1
3178 + vpxorq $D0lo,$D0lo,$D0lo
3179 + vpmadd52luq $H2,$S1,$D0lo
3180 + vpxorq $D0hi,$D0hi,$D0hi
3181 + vpmadd52huq $H2,$S1,$D0hi
3182 + vpxorq $D1lo,$D1lo,$D1lo
3183 + vpmadd52luq $H2,$S2,$D1lo
3184 + vpxorq $D1hi,$D1hi,$D1hi
3185 + vpmadd52huq $H2,$S2,$D1hi
3186 + vpxorq $D2lo,$D2lo,$D2lo
3187 + vpmadd52luq $H2,$R0,$D2lo
3188 + vpxorq $D2hi,$D2hi,$D2hi
3189 + vpmadd52huq $H2,$R0,$D2hi
3191 + vmovdqu64 16*0($inp),$T2 # load data
3192 + vmovdqu64 16*2($inp),$T3
3193 + lea 16*4($inp),$inp
3194 + vpmadd52luq $H0,$R0,$D0lo
3195 + vpmadd52huq $H0,$R0,$D0hi
3196 + vpmadd52luq $H0,$R1,$D1lo
3197 + vpmadd52huq $H0,$R1,$D1hi
3198 + vpmadd52luq $H0,$R2,$D2lo
3199 + vpmadd52huq $H0,$R2,$D2hi
3201 + vpunpcklqdq $T3,$T2,$T1 # transpose data
3202 + vpunpckhqdq $T3,$T2,$T3
3203 + vpmadd52luq $H1,$S2,$D0lo
3204 + vpmadd52huq $H1,$S2,$D0hi
3205 + vpmadd52luq $H1,$R0,$D1lo
3206 + vpmadd52huq $H1,$R0,$D1hi
3207 + vpmadd52luq $H1,$R1,$D2lo
3208 + vpmadd52huq $H1,$R1,$D2hi
3210 + ################################################################
3211 + # partial reduction (interleaved with data splat)
3212 + vpsrlq \$44,$D0lo,$tmp
3213 + vpsllq \$8,$D0hi,$D0hi
3214 + vpandq $mask44,$D0lo,$H0
3215 + vpaddq $tmp,$D0hi,$D0hi
3217 + vpsrlq \$24,$T3,$T2
3218 + vporq $PAD,$T2,$T2
3219 + vpaddq $D0hi,$D1lo,$D1lo
3221 + vpsrlq \$44,$D1lo,$tmp
3222 + vpsllq \$8,$D1hi,$D1hi
3223 + vpandq $mask44,$D1lo,$H1
3224 + vpaddq $tmp,$D1hi,$D1hi
3226 + vpandq $mask44,$T1,$T0
3227 + vpsrlq \$44,$T1,$T1
3228 + vpsllq \$20,$T3,$T3
3229 + vpaddq $D1hi,$D2lo,$D2lo
3231 + vpsrlq \$42,$D2lo,$tmp
3232 + vpsllq \$10,$D2hi,$D2hi
3233 + vpandq $mask42,$D2lo,$H2
3234 + vpaddq $tmp,$D2hi,$D2hi
3236 + vpaddq $T2,$H2,$H2 # accumulate input
3237 + vpaddq $D2hi,$H0,$H0
3238 + vpsllq \$2,$D2hi,$D2hi
3240 + vpaddq $D2hi,$H0,$H0
3242 + vpandq $mask44,$T1,$T1
3244 + vpsrlq \$44,$H0,$tmp # additional step
3245 + vpandq $mask44,$H0,$H0
3247 + vpaddq $tmp,$H1,$H1
3249 + sub \$4,$len # len-=64
3250 + jnz .Loop_vpmadd52_4x
3252 +.Ltail_vpmadd52_4x:
3253 + vmovdqu64 128($ctx),$R2 # load all key powers
3254 + vmovdqu64 160($ctx),$S1
3255 + vmovdqu64 64($ctx),$R0
3256 + vmovdqu64 96($ctx),$R1
3258 +.Ltail_vpmadd52_2x:
3259 + vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3260 + vpaddq $R2,$S2,$S2
3261 + vpsllq \$2,$S2,$S2
3263 + #vpaddq $T2,$H2,$H2 # accumulate input
3264 + vpaddq $T0,$H0,$H0
3265 + vpaddq $T1,$H1,$H1
3267 + vpxorq $D0lo,$D0lo,$D0lo
3268 + vpmadd52luq $H2,$S1,$D0lo
3269 + vpxorq $D0hi,$D0hi,$D0hi
3270 + vpmadd52huq $H2,$S1,$D0hi
3271 + vpxorq $D1lo,$D1lo,$D1lo
3272 + vpmadd52luq $H2,$S2,$D1lo
3273 + vpxorq $D1hi,$D1hi,$D1hi
3274 + vpmadd52huq $H2,$S2,$D1hi
3275 + vpxorq $D2lo,$D2lo,$D2lo
3276 + vpmadd52luq $H2,$R0,$D2lo
3277 + vpxorq $D2hi,$D2hi,$D2hi
3278 + vpmadd52huq $H2,$R0,$D2hi
3280 + vpmadd52luq $H0,$R0,$D0lo
3281 + vpmadd52huq $H0,$R0,$D0hi
3282 + vpmadd52luq $H0,$R1,$D1lo
3283 + vpmadd52huq $H0,$R1,$D1hi
3284 + vpmadd52luq $H0,$R2,$D2lo
3285 + vpmadd52huq $H0,$R2,$D2hi
3287 + vpmadd52luq $H1,$S2,$D0lo
3288 + vpmadd52huq $H1,$S2,$D0hi
3289 + vpmadd52luq $H1,$R0,$D1lo
3290 + vpmadd52huq $H1,$R0,$D1hi
3291 + vpmadd52luq $H1,$R1,$D2lo
3292 + vpmadd52huq $H1,$R1,$D2hi
3294 + ################################################################
3295 + # horizontal addition
3299 + vpsrldq \$8,$D0lo,$T0
3300 + vpsrldq \$8,$D0hi,$H0
3301 + vpsrldq \$8,$D1lo,$T1
3302 + vpsrldq \$8,$D1hi,$H1
3303 + vpaddq $T0,$D0lo,$D0lo
3304 + vpaddq $H0,$D0hi,$D0hi
3305 + vpsrldq \$8,$D2lo,$T2
3306 + vpsrldq \$8,$D2hi,$H2
3307 + vpaddq $T1,$D1lo,$D1lo
3308 + vpaddq $H1,$D1hi,$D1hi
3309 + vpermq \$0x2,$D0lo,$T0
3310 + vpermq \$0x2,$D0hi,$H0
3311 + vpaddq $T2,$D2lo,$D2lo
3312 + vpaddq $H2,$D2hi,$D2hi
3314 + vpermq \$0x2,$D1lo,$T1
3315 + vpermq \$0x2,$D1hi,$H1
3316 + vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3317 + vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3318 + vpermq \$0x2,$D2lo,$T2
3319 + vpermq \$0x2,$D2hi,$H2
3320 + vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3321 + vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3322 + vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3323 + vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3325 + ################################################################
3326 + # partial reduction
3327 + vpsrlq \$44,$D0lo,$tmp
3328 + vpsllq \$8,$D0hi,$D0hi
3329 + vpandq $mask44,$D0lo,$H0
3330 + vpaddq $tmp,$D0hi,$D0hi
3332 + vpaddq $D0hi,$D1lo,$D1lo
3334 + vpsrlq \$44,$D1lo,$tmp
3335 + vpsllq \$8,$D1hi,$D1hi
3336 + vpandq $mask44,$D1lo,$H1
3337 + vpaddq $tmp,$D1hi,$D1hi
3339 + vpaddq $D1hi,$D2lo,$D2lo
3341 + vpsrlq \$42,$D2lo,$tmp
3342 + vpsllq \$10,$D2hi,$D2hi
3343 + vpandq $mask42,$D2lo,$H2
3344 + vpaddq $tmp,$D2hi,$D2hi
3346 + vpaddq $D2hi,$H0,$H0
3347 + vpsllq \$2,$D2hi,$D2hi
3349 + vpaddq $D2hi,$H0,$H0
3351 + vpsrlq \$44,$H0,$tmp # additional step
3352 + vpandq $mask44,$H0,$H0
3354 + vpaddq $tmp,$H1,$H1
3355 + # at this point $len is
3356 + # either 4*n+2 or 0...
3357 + sub \$2,$len # len-=32
3358 + ja .Lblocks_vpmadd52_4x_do
3360 + vmovq %x#$H0,0($ctx)
3361 + vmovq %x#$H1,8($ctx)
3362 + vmovq %x#$H2,16($ctx)
3365 +.Lno_data_vpmadd52_4x:
3367 +.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
3371 +########################################################################
3372 +# As implied by its name 8x subroutine processes 8 blocks in parallel...
3373 +# This is intermediate version, as it's used only in cases when input
3374 +# length is either 8*n, 8*n+1 or 8*n+2...
3376 +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3377 +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3378 +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3379 +my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
3382 +.type poly1305_blocks_vpmadd52_8x,\@function,4
3384 +poly1305_blocks_vpmadd52_8x:
3386 + jz .Lno_data_vpmadd52_8x # too short
3389 + mov 64($ctx),%r8 # peek on power of the key
3391 + vmovdqa64 .Lx_mask44(%rip),$mask44
3392 + vmovdqa64 .Lx_mask42(%rip),$mask42
3394 + test %r8,%r8 # is power value impossible?
3395 + js .Linit_vpmadd52 # if it is, then init R[4]
3397 + vmovq 0($ctx),%x#$H0 # load current hash value
3398 + vmovq 8($ctx),%x#$H1
3399 + vmovq 16($ctx),%x#$H2
3401 +.Lblocks_vpmadd52_8x:
3402 + ################################################################
3403 + # fist we calculate more key powers
3405 + vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
3406 + vmovdqu64 160($ctx),$S1
3407 + vmovdqu64 64($ctx),$R0
3408 + vmovdqu64 96($ctx),$R1
3410 + vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3411 + vpaddq $R2,$S2,$S2
3412 + vpsllq \$2,$S2,$S2
3414 + vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
3415 + vpbroadcastq %x#$R0,$RR0
3416 + vpbroadcastq %x#$R1,$RR1
3418 + vpxorq $D0lo,$D0lo,$D0lo
3419 + vpmadd52luq $RR2,$S1,$D0lo
3420 + vpxorq $D0hi,$D0hi,$D0hi
3421 + vpmadd52huq $RR2,$S1,$D0hi
3422 + vpxorq $D1lo,$D1lo,$D1lo
3423 + vpmadd52luq $RR2,$S2,$D1lo
3424 + vpxorq $D1hi,$D1hi,$D1hi
3425 + vpmadd52huq $RR2,$S2,$D1hi
3426 + vpxorq $D2lo,$D2lo,$D2lo
3427 + vpmadd52luq $RR2,$R0,$D2lo
3428 + vpxorq $D2hi,$D2hi,$D2hi
3429 + vpmadd52huq $RR2,$R0,$D2hi
3431 + vpmadd52luq $RR0,$R0,$D0lo
3432 + vpmadd52huq $RR0,$R0,$D0hi
3433 + vpmadd52luq $RR0,$R1,$D1lo
3434 + vpmadd52huq $RR0,$R1,$D1hi
3435 + vpmadd52luq $RR0,$R2,$D2lo
3436 + vpmadd52huq $RR0,$R2,$D2hi
3438 + vpmadd52luq $RR1,$S2,$D0lo
3439 + vpmadd52huq $RR1,$S2,$D0hi
3440 + vpmadd52luq $RR1,$R0,$D1lo
3441 + vpmadd52huq $RR1,$R0,$D1hi
3442 + vpmadd52luq $RR1,$R1,$D2lo
3443 + vpmadd52huq $RR1,$R1,$D2hi
3445 + ################################################################
3446 + # partial reduction
3447 + vpsrlq \$44,$D0lo,$tmp
3448 + vpsllq \$8,$D0hi,$D0hi
3449 + vpandq $mask44,$D0lo,$RR0
3450 + vpaddq $tmp,$D0hi,$D0hi
3452 + vpaddq $D0hi,$D1lo,$D1lo
3454 + vpsrlq \$44,$D1lo,$tmp
3455 + vpsllq \$8,$D1hi,$D1hi
3456 + vpandq $mask44,$D1lo,$RR1
3457 + vpaddq $tmp,$D1hi,$D1hi
3459 + vpaddq $D1hi,$D2lo,$D2lo
3461 + vpsrlq \$42,$D2lo,$tmp
3462 + vpsllq \$10,$D2hi,$D2hi
3463 + vpandq $mask42,$D2lo,$RR2
3464 + vpaddq $tmp,$D2hi,$D2hi
3466 + vpaddq $D2hi,$RR0,$RR0
3467 + vpsllq \$2,$D2hi,$D2hi
3469 + vpaddq $D2hi,$RR0,$RR0
3471 + vpsrlq \$44,$RR0,$tmp # additional step
3472 + vpandq $mask44,$RR0,$RR0
3474 + vpaddq $tmp,$RR1,$RR1
3476 + ################################################################
3477 + # At this point Rx holds 1324 powers, RRx - 5768, and the goal
3478 + # is 15263748, which reflects how data is loaded...
3480 + vpunpcklqdq $R2,$RR2,$T2 # 3748
3481 + vpunpckhqdq $R2,$RR2,$R2 # 1526
3482 + vpunpcklqdq $R0,$RR0,$T0
3483 + vpunpckhqdq $R0,$RR0,$R0
3484 + vpunpcklqdq $R1,$RR1,$T1
3485 + vpunpckhqdq $R1,$RR1,$R1
3487 +######## switch to %zmm
3488 +map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3489 +map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3490 +map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3491 +map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
3494 + vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
3495 + vshufi64x2 \$0x44,$R0,$T0,$RR0
3496 + vshufi64x2 \$0x44,$R1,$T1,$RR1
3498 + vmovdqu64 16*0($inp),$T2 # load data
3499 + vmovdqu64 16*4($inp),$T3
3500 + lea 16*8($inp),$inp
3502 + vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
3503 + vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
3504 + vpaddq $RR2,$SS2,$SS2
3505 + vpaddq $RR1,$SS1,$SS1
3506 + vpsllq \$2,$SS2,$SS2
3507 + vpsllq \$2,$SS1,$SS1
3509 + vpbroadcastq $padbit,$PAD
3510 + vpbroadcastq %x#$mask44,$mask44
3511 + vpbroadcastq %x#$mask42,$mask42
3513 + vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
3514 + vpbroadcastq %x#$SS2,$S2
3515 + vpbroadcastq %x#$RR0,$R0
3516 + vpbroadcastq %x#$RR1,$R1
3517 + vpbroadcastq %x#$RR2,$R2
3519 + vpunpcklqdq $T3,$T2,$T1 # transpose data
3520 + vpunpckhqdq $T3,$T2,$T3
3522 + # at this point 64-bit lanes are ordered as 73625140
3524 + vpsrlq \$24,$T3,$T2 # splat the data
3525 + vporq $PAD,$T2,$T2
3526 + vpaddq $T2,$H2,$H2 # accumulate input
3527 + vpandq $mask44,$T1,$T0
3528 + vpsrlq \$44,$T1,$T1
3529 + vpsllq \$20,$T3,$T3
3531 + vpandq $mask44,$T1,$T1
3534 + jz .Ltail_vpmadd52_8x
3535 + jmp .Loop_vpmadd52_8x
3539 + #vpaddq $T2,$H2,$H2 # accumulate input
3540 + vpaddq $T0,$H0,$H0
3541 + vpaddq $T1,$H1,$H1
3543 + vpxorq $D0lo,$D0lo,$D0lo
3544 + vpmadd52luq $H2,$S1,$D0lo
3545 + vpxorq $D0hi,$D0hi,$D0hi
3546 + vpmadd52huq $H2,$S1,$D0hi
3547 + vpxorq $D1lo,$D1lo,$D1lo
3548 + vpmadd52luq $H2,$S2,$D1lo
3549 + vpxorq $D1hi,$D1hi,$D1hi
3550 + vpmadd52huq $H2,$S2,$D1hi
3551 + vpxorq $D2lo,$D2lo,$D2lo
3552 + vpmadd52luq $H2,$R0,$D2lo
3553 + vpxorq $D2hi,$D2hi,$D2hi
3554 + vpmadd52huq $H2,$R0,$D2hi
3556 + vmovdqu64 16*0($inp),$T2 # load data
3557 + vmovdqu64 16*4($inp),$T3
3558 + lea 16*8($inp),$inp
3559 + vpmadd52luq $H0,$R0,$D0lo
3560 + vpmadd52huq $H0,$R0,$D0hi
3561 + vpmadd52luq $H0,$R1,$D1lo
3562 + vpmadd52huq $H0,$R1,$D1hi
3563 + vpmadd52luq $H0,$R2,$D2lo
3564 + vpmadd52huq $H0,$R2,$D2hi
3566 + vpunpcklqdq $T3,$T2,$T1 # transpose data
3567 + vpunpckhqdq $T3,$T2,$T3
3568 + vpmadd52luq $H1,$S2,$D0lo
3569 + vpmadd52huq $H1,$S2,$D0hi
3570 + vpmadd52luq $H1,$R0,$D1lo
3571 + vpmadd52huq $H1,$R0,$D1hi
3572 + vpmadd52luq $H1,$R1,$D2lo
3573 + vpmadd52huq $H1,$R1,$D2hi
3575 + ################################################################
3576 + # partial reduction (interleaved with data splat)
3577 + vpsrlq \$44,$D0lo,$tmp
3578 + vpsllq \$8,$D0hi,$D0hi
3579 + vpandq $mask44,$D0lo,$H0
3580 + vpaddq $tmp,$D0hi,$D0hi
3582 + vpsrlq \$24,$T3,$T2
3583 + vporq $PAD,$T2,$T2
3584 + vpaddq $D0hi,$D1lo,$D1lo
3586 + vpsrlq \$44,$D1lo,$tmp
3587 + vpsllq \$8,$D1hi,$D1hi
3588 + vpandq $mask44,$D1lo,$H1
3589 + vpaddq $tmp,$D1hi,$D1hi
3591 + vpandq $mask44,$T1,$T0
3592 + vpsrlq \$44,$T1,$T1
3593 + vpsllq \$20,$T3,$T3
3594 + vpaddq $D1hi,$D2lo,$D2lo
3596 + vpsrlq \$42,$D2lo,$tmp
3597 + vpsllq \$10,$D2hi,$D2hi
3598 + vpandq $mask42,$D2lo,$H2
3599 + vpaddq $tmp,$D2hi,$D2hi
3601 + vpaddq $T2,$H2,$H2 # accumulate input
3602 + vpaddq $D2hi,$H0,$H0
3603 + vpsllq \$2,$D2hi,$D2hi
3605 + vpaddq $D2hi,$H0,$H0
3607 + vpandq $mask44,$T1,$T1
3609 + vpsrlq \$44,$H0,$tmp # additional step
3610 + vpandq $mask44,$H0,$H0
3612 + vpaddq $tmp,$H1,$H1
3614 + sub \$8,$len # len-=128
3615 + jnz .Loop_vpmadd52_8x
3617 +.Ltail_vpmadd52_8x:
3618 + #vpaddq $T2,$H2,$H2 # accumulate input
3619 + vpaddq $T0,$H0,$H0
3620 + vpaddq $T1,$H1,$H1
3622 + vpxorq $D0lo,$D0lo,$D0lo
3623 + vpmadd52luq $H2,$SS1,$D0lo
3624 + vpxorq $D0hi,$D0hi,$D0hi
3625 + vpmadd52huq $H2,$SS1,$D0hi
3626 + vpxorq $D1lo,$D1lo,$D1lo
3627 + vpmadd52luq $H2,$SS2,$D1lo
3628 + vpxorq $D1hi,$D1hi,$D1hi
3629 + vpmadd52huq $H2,$SS2,$D1hi
3630 + vpxorq $D2lo,$D2lo,$D2lo
3631 + vpmadd52luq $H2,$RR0,$D2lo
3632 + vpxorq $D2hi,$D2hi,$D2hi
3633 + vpmadd52huq $H2,$RR0,$D2hi
3635 + vpmadd52luq $H0,$RR0,$D0lo
3636 + vpmadd52huq $H0,$RR0,$D0hi
3637 + vpmadd52luq $H0,$RR1,$D1lo
3638 + vpmadd52huq $H0,$RR1,$D1hi
3639 + vpmadd52luq $H0,$RR2,$D2lo
3640 + vpmadd52huq $H0,$RR2,$D2hi
3642 + vpmadd52luq $H1,$SS2,$D0lo
3643 + vpmadd52huq $H1,$SS2,$D0hi
3644 + vpmadd52luq $H1,$RR0,$D1lo
3645 + vpmadd52huq $H1,$RR0,$D1hi
3646 + vpmadd52luq $H1,$RR1,$D2lo
3647 + vpmadd52huq $H1,$RR1,$D2hi
3649 + ################################################################
3650 + # horizontal addition
3654 + vpsrldq \$8,$D0lo,$T0
3655 + vpsrldq \$8,$D0hi,$H0
3656 + vpsrldq \$8,$D1lo,$T1
3657 + vpsrldq \$8,$D1hi,$H1
3658 + vpaddq $T0,$D0lo,$D0lo
3659 + vpaddq $H0,$D0hi,$D0hi
3660 + vpsrldq \$8,$D2lo,$T2
3661 + vpsrldq \$8,$D2hi,$H2
3662 + vpaddq $T1,$D1lo,$D1lo
3663 + vpaddq $H1,$D1hi,$D1hi
3664 + vpermq \$0x2,$D0lo,$T0
3665 + vpermq \$0x2,$D0hi,$H0
3666 + vpaddq $T2,$D2lo,$D2lo
3667 + vpaddq $H2,$D2hi,$D2hi
3669 + vpermq \$0x2,$D1lo,$T1
3670 + vpermq \$0x2,$D1hi,$H1
3671 + vpaddq $T0,$D0lo,$D0lo
3672 + vpaddq $H0,$D0hi,$D0hi
3673 + vpermq \$0x2,$D2lo,$T2
3674 + vpermq \$0x2,$D2hi,$H2
3675 + vpaddq $T1,$D1lo,$D1lo
3676 + vpaddq $H1,$D1hi,$D1hi
3677 + vextracti64x4 \$1,$D0lo,%y#$T0
3678 + vextracti64x4 \$1,$D0hi,%y#$H0
3679 + vpaddq $T2,$D2lo,$D2lo
3680 + vpaddq $H2,$D2hi,$D2hi
3682 + vextracti64x4 \$1,$D1lo,%y#$T1
3683 + vextracti64x4 \$1,$D1hi,%y#$H1
3684 + vextracti64x4 \$1,$D2lo,%y#$T2
3685 + vextracti64x4 \$1,$D2hi,%y#$H2
3687 +######## switch back to %ymm
3688 +map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3689 +map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3690 +map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3693 + vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3694 + vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3695 + vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3696 + vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3697 + vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3698 + vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3700 + ################################################################
3701 + # partial reduction
3702 + vpsrlq \$44,$D0lo,$tmp
3703 + vpsllq \$8,$D0hi,$D0hi
3704 + vpandq $mask44,$D0lo,$H0
3705 + vpaddq $tmp,$D0hi,$D0hi
3707 + vpaddq $D0hi,$D1lo,$D1lo
3709 + vpsrlq \$44,$D1lo,$tmp
3710 + vpsllq \$8,$D1hi,$D1hi
3711 + vpandq $mask44,$D1lo,$H1
3712 + vpaddq $tmp,$D1hi,$D1hi
3714 + vpaddq $D1hi,$D2lo,$D2lo
3716 + vpsrlq \$42,$D2lo,$tmp
3717 + vpsllq \$10,$D2hi,$D2hi
3718 + vpandq $mask42,$D2lo,$H2
3719 + vpaddq $tmp,$D2hi,$D2hi
3721 + vpaddq $D2hi,$H0,$H0
3722 + vpsllq \$2,$D2hi,$D2hi
3724 + vpaddq $D2hi,$H0,$H0
3726 + vpsrlq \$44,$H0,$tmp # additional step
3727 + vpandq $mask44,$H0,$H0
3729 + vpaddq $tmp,$H1,$H1
3731 + ################################################################
3733 + vmovq %x#$H0,0($ctx)
3734 + vmovq %x#$H1,8($ctx)
3735 + vmovq %x#$H2,16($ctx)
3738 +.Lno_data_vpmadd52_8x:
3740 +.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
3744 +.type poly1305_emit_base2_44,\@function,3
3746 +poly1305_emit_base2_44:
3747 + mov 0($ctx),%r8 # load hash value
3763 + add \$5,%r8 # compare to modulus
3767 + shr \$2,%r10 # did 130-bit value overflow?
3771 + add 0($nonce),%rax # accumulate nonce
3772 + adc 8($nonce),%rcx
3773 + mov %rax,0($mac) # write result
3777 +.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
3784 +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
3786 +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
3788 +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
3790 +.long 2,2,2,3,2,0,2,1
3792 +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
3795 +.long 0,1,1,2,2,3,7,7
3799 +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
3807 +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
3808 +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
3810 +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
3811 +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
3815 +.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3819 +{ # chacha20-poly1305 helpers
3820 +my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
3821 + ("%rdi","%rsi","%rdx","%rcx"); # Unix order
3823 +.globl xor128_encrypt_n_pad
3824 +.type xor128_encrypt_n_pad,\@abi-omnipotent
3826 +xor128_encrypt_n_pad:
3829 + mov $len,%r10 # put len aside
3830 + shr \$4,$len # len / 16
3834 + movdqu ($inp,$otp),%xmm0
3836 + movdqu %xmm0,($out,$otp)
3837 + movdqa %xmm0,($otp)
3842 + and \$15,%r10 # len % 16
3850 + mov ($inp,$otp),%al
3852 + mov %al,($out,$otp)
3856 + jnz .Loop_enc_byte
3868 +.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
3870 +.globl xor128_decrypt_n_pad
3871 +.type xor128_decrypt_n_pad,\@abi-omnipotent
3873 +xor128_decrypt_n_pad:
3876 + mov $len,%r10 # put len aside
3877 + shr \$4,$len # len / 16
3881 + movdqu ($inp,$otp),%xmm0
3882 + movdqa ($otp),%xmm1
3884 + movdqu %xmm1,($out,$otp)
3885 + movdqa %xmm0,($otp)
3891 + and \$15,%r10 # len % 16
3900 + mov ($inp,$otp),%r11b
3903 + mov %al,($out,$otp)
3907 + jnz .Loop_dec_byte
3919 +.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
3923 +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3924 +# CONTEXT *context,DISPATCHER_CONTEXT *disp)
3932 +.extern __imp_RtlVirtualUnwind
3933 +.type se_handler,\@abi-omnipotent
3947 + mov 120($context),%rax # pull context->Rax
3948 + mov 248($context),%rbx # pull context->Rip
3950 + mov 8($disp),%rsi # disp->ImageBase
3951 + mov 56($disp),%r11 # disp->HandlerData
3953 + mov 0(%r11),%r10d # HandlerData[0]
3954 + lea (%rsi,%r10),%r10 # prologue label
3955 + cmp %r10,%rbx # context->Rip<.Lprologue
3956 + jb .Lcommon_seh_tail
3958 + mov 152($context),%rax # pull context->Rsp
3960 + mov 4(%r11),%r10d # HandlerData[1]
3961 + lea (%rsi,%r10),%r10 # epilogue label
3962 + cmp %r10,%rbx # context->Rip>=.Lepilogue
3963 + jae .Lcommon_seh_tail
3968 + mov -16(%rax),%rbp
3969 + mov -24(%rax),%r12
3970 + mov -32(%rax),%r13
3971 + mov -40(%rax),%r14
3972 + mov -48(%rax),%r15
3973 + mov %rbx,144($context) # restore context->Rbx
3974 + mov %rbp,160($context) # restore context->Rbp
3975 + mov %r12,216($context) # restore context->R12
3976 + mov %r13,224($context) # restore context->R13
3977 + mov %r14,232($context) # restore context->R14
3978 + mov %r15,240($context) # restore context->R14
3980 + jmp .Lcommon_seh_tail
3981 +.size se_handler,.-se_handler
3983 +.type avx_handler,\@abi-omnipotent
3997 + mov 120($context),%rax # pull context->Rax
3998 + mov 248($context),%rbx # pull context->Rip
4000 + mov 8($disp),%rsi # disp->ImageBase
4001 + mov 56($disp),%r11 # disp->HandlerData
4003 + mov 0(%r11),%r10d # HandlerData[0]
4004 + lea (%rsi,%r10),%r10 # prologue label
4005 + cmp %r10,%rbx # context->Rip<prologue label
4006 + jb .Lcommon_seh_tail
4008 + mov 152($context),%rax # pull context->Rsp
4010 + mov 4(%r11),%r10d # HandlerData[1]
4011 + lea (%rsi,%r10),%r10 # epilogue label
4012 + cmp %r10,%rbx # context->Rip>=epilogue label
4013 + jae .Lcommon_seh_tail
4015 + mov 208($context),%rax # pull context->R11
4017 + lea 0x50(%rax),%rsi
4018 + lea 0xf8(%rax),%rax
4019 + lea 512($context),%rdi # &context.Xmm6
4021 + .long 0xa548f3fc # cld; rep movsq
4026 + mov %rax,152($context) # restore context->Rsp
4027 + mov %rsi,168($context) # restore context->Rsi
4028 + mov %rdi,176($context) # restore context->Rdi
4030 + mov 40($disp),%rdi # disp->ContextRecord
4031 + mov $context,%rsi # context
4032 + mov \$154,%ecx # sizeof(CONTEXT)
4033 + .long 0xa548f3fc # cld; rep movsq
4036 + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
4037 + mov 8(%rsi),%rdx # arg2, disp->ImageBase
4038 + mov 0(%rsi),%r8 # arg3, disp->ControlPc
4039 + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4040 + mov 40(%rsi),%r10 # disp->ContextRecord
4041 + lea 56(%rsi),%r11 # &disp->HandlerData
4042 + lea 24(%rsi),%r12 # &disp->EstablisherFrame
4043 + mov %r10,32(%rsp) # arg5
4044 + mov %r11,40(%rsp) # arg6
4045 + mov %r12,48(%rsp) # arg7
4046 + mov %rcx,56(%rsp) # arg8, (NULL)
4047 + call *__imp_RtlVirtualUnwind(%rip)
4049 + mov \$1,%eax # ExceptionContinueSearch
4061 +.size avx_handler,.-avx_handler
4065 + .rva .LSEH_begin_poly1305_init
4066 + .rva .LSEH_end_poly1305_init
4067 + .rva .LSEH_info_poly1305_init
4069 + .rva .LSEH_begin_poly1305_blocks
4070 + .rva .LSEH_end_poly1305_blocks
4071 + .rva .LSEH_info_poly1305_blocks
4073 + .rva .LSEH_begin_poly1305_emit
4074 + .rva .LSEH_end_poly1305_emit
4075 + .rva .LSEH_info_poly1305_emit
4077 +$code.=<<___ if ($avx);
4078 + .rva .LSEH_begin_poly1305_blocks_avx
4079 + .rva .Lbase2_64_avx
4080 + .rva .LSEH_info_poly1305_blocks_avx_1
4082 + .rva .Lbase2_64_avx
4084 + .rva .LSEH_info_poly1305_blocks_avx_2
4087 + .rva .LSEH_end_poly1305_blocks_avx
4088 + .rva .LSEH_info_poly1305_blocks_avx_3
4090 + .rva .LSEH_begin_poly1305_emit_avx
4091 + .rva .LSEH_end_poly1305_emit_avx
4092 + .rva .LSEH_info_poly1305_emit_avx
4094 +$code.=<<___ if ($avx>1);
4095 + .rva .LSEH_begin_poly1305_blocks_avx2
4096 + .rva .Lbase2_64_avx2
4097 + .rva .LSEH_info_poly1305_blocks_avx2_1
4099 + .rva .Lbase2_64_avx2
4101 + .rva .LSEH_info_poly1305_blocks_avx2_2
4104 + .rva .LSEH_end_poly1305_blocks_avx2
4105 + .rva .LSEH_info_poly1305_blocks_avx2_3
4107 +$code.=<<___ if ($avx>2);
4108 + .rva .LSEH_begin_poly1305_blocks_avx512
4109 + .rva .LSEH_end_poly1305_blocks_avx512
4110 + .rva .LSEH_info_poly1305_blocks_avx512
4115 +.LSEH_info_poly1305_init:
4118 + .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
4120 +.LSEH_info_poly1305_blocks:
4123 + .rva .Lblocks_body,.Lblocks_epilogue
4125 +.LSEH_info_poly1305_emit:
4128 + .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
4130 +$code.=<<___ if ($avx);
4131 +.LSEH_info_poly1305_blocks_avx_1:
4134 + .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
4136 +.LSEH_info_poly1305_blocks_avx_2:
4139 + .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
4141 +.LSEH_info_poly1305_blocks_avx_3:
4144 + .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
4146 +.LSEH_info_poly1305_emit_avx:
4149 + .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
4151 +$code.=<<___ if ($avx>1);
4152 +.LSEH_info_poly1305_blocks_avx2_1:
4155 + .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
4157 +.LSEH_info_poly1305_blocks_avx2_2:
4160 + .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
4162 +.LSEH_info_poly1305_blocks_avx2_3:
4165 + .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
4167 +$code.=<<___ if ($avx>2);
4168 +.LSEH_info_poly1305_blocks_avx512:
4171 + .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
4175 +foreach (split('\n',$code)) {
4176 + s/\`([^\`]*)\`/eval($1)/ge;
4177 + s/%r([a-z]+)#d/%e$1/g;
4178 + s/%r([0-9]+)#d/%r$1d/g;
4179 + s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;