crypto: x86/chacha20 - refactor to allow varying number of rounds
author Eric Biggers <ebiggers@google.com>
Wed, 5 Dec 2018 06:20:03 +0000 (22:20 -0800)
committer Herbert Xu <herbert@gondor.apana.org.au>
Thu, 13 Dec 2018 10:24:58 +0000 (18:24 +0800)
In preparation for adding XChaCha12 support, rename/refactor the x86_64
SIMD implementations of ChaCha20 to support different numbers of rounds.
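
Each renamed entry point now takes the round count as an explicit
argument (passed in %r8d, per the register comments in the new .S
files) instead of hard-coding 20 rounds.  As a rough C-level sketch of
what that parameter controls (the standard ChaCha double-round loop,
not code taken verbatim from this patch), the permutation driven by
nrounds looks like:

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t v, int n)
    {
            return (v << n) | (v >> (32 - n));
    }

    /* One quarter-round, matching the "x0 += x1, x3 = rotl32(...)"
     * comments in the assembly. */
    static void quarterround(uint32_t x[16], int a, int b, int c, int d)
    {
            x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
            x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
            x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
            x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
    }

    /* nrounds is 20 for ChaCha20 and would be 12 for XChaCha12; this
     * mirrors the "sub $2,%r8d; jnz .Ldoubleround" loops in the SIMD
     * code. */
    static void chacha_permute(uint32_t x[16], int nrounds)
    {
            int i;

            for (i = 0; i < nrounds; i += 2) {
                    /* column round */
                    quarterround(x, 0, 4,  8, 12);
                    quarterround(x, 1, 5,  9, 13);
                    quarterround(x, 2, 6, 10, 14);
                    quarterround(x, 3, 7, 11, 15);
                    /* diagonal round */
                    quarterround(x, 0, 5, 10, 15);
                    quarterround(x, 1, 6, 11, 12);
                    quarterround(x, 2, 7,  8, 13);
                    quarterround(x, 3, 4,  9, 14);
            }
    }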

Reviewed-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/x86/crypto/Makefile
arch/x86/crypto/chacha-avx2-x86_64.S [new file with mode: 0644]
arch/x86/crypto/chacha-avx512vl-x86_64.S [new file with mode: 0644]
arch/x86/crypto/chacha-ssse3-x86_64.S [new file with mode: 0644]
arch/x86/crypto/chacha20-avx2-x86_64.S [deleted file]
arch/x86/crypto/chacha20-avx512vl-x86_64.S [deleted file]
arch/x86/crypto/chacha20-ssse3-x86_64.S [deleted file]
arch/x86/crypto/chacha20_glue.c [deleted file]
arch/x86/crypto/chacha_glue.c [new file with mode: 0644]

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 0b31b16f49d8a45ed06433608c25ac8564bd65ee..45734e1cf96720c07eefef5c99e36bfa161cb54a 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
+obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
@@ -78,7 +78,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
+chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
 
 aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
@@ -103,7 +103,7 @@ endif
 
 ifeq ($(avx2_supported),yes)
        camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
-       chacha20-x86_64-y += chacha20-avx2-x86_64.o
+       chacha-x86_64-y += chacha-avx2-x86_64.o
        serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
 
        morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
@@ -112,7 +112,7 @@ ifeq ($(avx2_supported),yes)
 endif
 
 ifeq ($(avx512_supported),yes)
-       chacha20-x86_64-y += chacha20-avx512vl-x86_64.o
+       chacha-x86_64-y += chacha-avx512vl-x86_64.o
 endif
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S
new file mode 100644
index 0000000..32903fd
--- /dev/null
+++ b/arch/x86/crypto/chacha-avx2-x86_64.S
@@ -0,0 +1,1025 @@
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.section       .rodata.cst32.ROT8, "aM", @progbits, 32
+.align 32
+ROT8:  .octa 0x0e0d0c0f0a09080b0605040702010003
+       .octa 0x0e0d0c0f0a09080b0605040702010003
+
+.section       .rodata.cst32.ROT16, "aM", @progbits, 32
+.align 32
+ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
+       .octa 0x0d0c0f0e09080b0a0504070601000302
+
+.section       .rodata.cst32.CTRINC, "aM", @progbits, 32
+.align 32
+CTRINC:        .octa 0x00000003000000020000000100000000
+       .octa 0x00000007000000060000000500000004
+
+.section       .rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL:        .octa 0x00000000000000000000000000000000
+       .octa 0x00000000000000000000000000000001
+
+.section       .rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL:        .octa 0x00000000000000000000000000000002
+       .octa 0x00000000000000000000000000000003
+
+.text
+
+ENTRY(chacha_2block_xor_avx2)
+       # %rdi: Input state matrix, s
+       # %rsi: up to 2 data blocks output, o
+       # %rdx: up to 2 data blocks input, i
+       # %rcx: input/output length in bytes
+       # %r8d: nrounds
+
+       # This function encrypts two ChaCha blocks by loading the state
+       # matrix twice across four AVX registers. It performs matrix operations
+       # on four words in each matrix in parallel, but requires shuffling to
+       # rearrange the words after each round.
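+       #
+       # In rough C terms (len in %rcx, nrounds in %r8d) the flow below is:
+       # run nrounds/2 double-rounds on both block states, then write
+       # out[i] = in[i] ^ (x[i] + s[i]) only for i < len, with any partial
+       # final 16-byte chunk handled at .Lxorpart2.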
+
+       vzeroupper
+
+       # x0..3[0-2] = s0..3
+       vbroadcasti128  0x00(%rdi),%ymm0
+       vbroadcasti128  0x10(%rdi),%ymm1
+       vbroadcasti128  0x20(%rdi),%ymm2
+       vbroadcasti128  0x30(%rdi),%ymm3
+
+       vpaddd          CTR2BL(%rip),%ymm3,%ymm3
+
+       vmovdqa         %ymm0,%ymm8
+       vmovdqa         %ymm1,%ymm9
+       vmovdqa         %ymm2,%ymm10
+       vmovdqa         %ymm3,%ymm11
+
+       vmovdqa         ROT8(%rip),%ymm4
+       vmovdqa         ROT16(%rip),%ymm5
+
+       mov             %rcx,%rax
+
+.Ldoubleround:
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+       vpaddd          %ymm1,%ymm0,%ymm0
+       vpxor           %ymm0,%ymm3,%ymm3
+       vpshufb         %ymm5,%ymm3,%ymm3
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+       vpaddd          %ymm3,%ymm2,%ymm2
+       vpxor           %ymm2,%ymm1,%ymm1
+       vmovdqa         %ymm1,%ymm6
+       vpslld          $12,%ymm6,%ymm6
+       vpsrld          $20,%ymm1,%ymm1
+       vpor            %ymm6,%ymm1,%ymm1
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+       vpaddd          %ymm1,%ymm0,%ymm0
+       vpxor           %ymm0,%ymm3,%ymm3
+       vpshufb         %ymm4,%ymm3,%ymm3
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+       vpaddd          %ymm3,%ymm2,%ymm2
+       vpxor           %ymm2,%ymm1,%ymm1
+       vmovdqa         %ymm1,%ymm7
+       vpslld          $7,%ymm7,%ymm7
+       vpsrld          $25,%ymm1,%ymm1
+       vpor            %ymm7,%ymm1,%ymm1
+
+       # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+       vpshufd         $0x39,%ymm1,%ymm1
+       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+       vpshufd         $0x4e,%ymm2,%ymm2
+       # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+       vpshufd         $0x93,%ymm3,%ymm3
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+       vpaddd          %ymm1,%ymm0,%ymm0
+       vpxor           %ymm0,%ymm3,%ymm3
+       vpshufb         %ymm5,%ymm3,%ymm3
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+       vpaddd          %ymm3,%ymm2,%ymm2
+       vpxor           %ymm2,%ymm1,%ymm1
+       vmovdqa         %ymm1,%ymm6
+       vpslld          $12,%ymm6,%ymm6
+       vpsrld          $20,%ymm1,%ymm1
+       vpor            %ymm6,%ymm1,%ymm1
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+       vpaddd          %ymm1,%ymm0,%ymm0
+       vpxor           %ymm0,%ymm3,%ymm3
+       vpshufb         %ymm4,%ymm3,%ymm3
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+       vpaddd          %ymm3,%ymm2,%ymm2
+       vpxor           %ymm2,%ymm1,%ymm1
+       vmovdqa         %ymm1,%ymm7
+       vpslld          $7,%ymm7,%ymm7
+       vpsrld          $25,%ymm1,%ymm1
+       vpor            %ymm7,%ymm1,%ymm1
+
+       # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+       vpshufd         $0x93,%ymm1,%ymm1
+       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+       vpshufd         $0x4e,%ymm2,%ymm2
+       # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+       vpshufd         $0x39,%ymm3,%ymm3
+
+       sub             $2,%r8d
+       jnz             .Ldoubleround
+
+       # o0 = i0 ^ (x0 + s0)
+       vpaddd          %ymm8,%ymm0,%ymm7
+       cmp             $0x10,%rax
+       jl              .Lxorpart2
+       vpxor           0x00(%rdx),%xmm7,%xmm6
+       vmovdqu         %xmm6,0x00(%rsi)
+       vextracti128    $1,%ymm7,%xmm0
+       # o1 = i1 ^ (x1 + s1)
+       vpaddd          %ymm9,%ymm1,%ymm7
+       cmp             $0x20,%rax
+       jl              .Lxorpart2
+       vpxor           0x10(%rdx),%xmm7,%xmm6
+       vmovdqu         %xmm6,0x10(%rsi)
+       vextracti128    $1,%ymm7,%xmm1
+       # o2 = i2 ^ (x2 + s2)
+       vpaddd          %ymm10,%ymm2,%ymm7
+       cmp             $0x30,%rax
+       jl              .Lxorpart2
+       vpxor           0x20(%rdx),%xmm7,%xmm6
+       vmovdqu         %xmm6,0x20(%rsi)
+       vextracti128    $1,%ymm7,%xmm2
+       # o3 = i3 ^ (x3 + s3)
+       vpaddd          %ymm11,%ymm3,%ymm7
+       cmp             $0x40,%rax
+       jl              .Lxorpart2
+       vpxor           0x30(%rdx),%xmm7,%xmm6
+       vmovdqu         %xmm6,0x30(%rsi)
+       vextracti128    $1,%ymm7,%xmm3
+
+       # xor and write second block
+       vmovdqa         %xmm0,%xmm7
+       cmp             $0x50,%rax
+       jl              .Lxorpart2
+       vpxor           0x40(%rdx),%xmm7,%xmm6
+       vmovdqu         %xmm6,0x40(%rsi)
+
+       vmovdqa         %xmm1,%xmm7
+       cmp             $0x60,%rax
+       jl              .Lxorpart2
+       vpxor           0x50(%rdx),%xmm7,%xmm6
+       vmovdqu         %xmm6,0x50(%rsi)
+
+       vmovdqa         %xmm2,%xmm7
+       cmp             $0x70,%rax
+       jl              .Lxorpart2
+       vpxor           0x60(%rdx),%xmm7,%xmm6
+       vmovdqu         %xmm6,0x60(%rsi)
+
+       vmovdqa         %xmm3,%xmm7
+       cmp             $0x80,%rax
+       jl              .Lxorpart2
+       vpxor           0x70(%rdx),%xmm7,%xmm6
+       vmovdqu         %xmm6,0x70(%rsi)
+
+.Ldone2:
+       vzeroupper
+       ret
+
+.Lxorpart2:
+       # xor remaining bytes from partial register into output
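+       # (the len & 0x0f tail bytes are staged through an aligned stack
+       # slot: copied in with rep movsb, xored with the keystream chunk
+       # left in %xmm7, then copied back out to the destination)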
+       mov             %rax,%r9
+       and             $0x0f,%r9
+       jz              .Ldone2
+       and             $~0x0f,%rax
+
+       mov             %rsi,%r11
+
+       lea             8(%rsp),%r10
+       sub             $0x10,%rsp
+       and             $~31,%rsp
+
+       lea             (%rdx,%rax),%rsi
+       mov             %rsp,%rdi
+       mov             %r9,%rcx
+       rep movsb
+
+       vpxor           0x00(%rsp),%xmm7,%xmm7
+       vmovdqa         %xmm7,0x00(%rsp)
+
+       mov             %rsp,%rsi
+       lea             (%r11,%rax),%rdi
+       mov             %r9,%rcx
+       rep movsb
+
+       lea             -8(%r10),%rsp
+       jmp             .Ldone2
+
+ENDPROC(chacha_2block_xor_avx2)
+
+ENTRY(chacha_4block_xor_avx2)
+       # %rdi: Input state matrix, s
+       # %rsi: up to 4 data blocks output, o
+       # %rdx: up to 4 data blocks input, i
+       # %rcx: input/output length in bytes
+       # %r8d: nrounds
+
+       # This function encrypts four ChaCha blocks by loading the state
+       # matrix four times across eight AVX registers. It performs matrix
+       # operations on four words in two matrices in parallel, sequentially
+       # to the operations on the four words of the other two matrices. As
+       # the required word shuffling has a rather high latency, we can do the
+       # arithmetic on two matrix-pairs without much slowdown.
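+       # In effect this runs two copies of the 2-block loop above in
+       # lockstep: %ymm0-3 carry blocks 1-2 and %ymm4-7 carry blocks 3-4,
+       # with their counters offset via CTR2BL and CTR4BL.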
+
+       vzeroupper
+
+       # x0..3[0-4] = s0..3
+       vbroadcasti128  0x00(%rdi),%ymm0
+       vbroadcasti128  0x10(%rdi),%ymm1
+       vbroadcasti128  0x20(%rdi),%ymm2
+       vbroadcasti128  0x30(%rdi),%ymm3
+
+       vmovdqa         %ymm0,%ymm4
+       vmovdqa         %ymm1,%ymm5
+       vmovdqa         %ymm2,%ymm6
+       vmovdqa         %ymm3,%ymm7
+
+       vpaddd          CTR2BL(%rip),%ymm3,%ymm3
+       vpaddd          CTR4BL(%rip),%ymm7,%ymm7
+
+       vmovdqa         %ymm0,%ymm11
+       vmovdqa         %ymm1,%ymm12
+       vmovdqa         %ymm2,%ymm13
+       vmovdqa         %ymm3,%ymm14
+       vmovdqa         %ymm7,%ymm15
+
+       vmovdqa         ROT8(%rip),%ymm8
+       vmovdqa         ROT16(%rip),%ymm9
+
+       mov             %rcx,%rax
+
+.Ldoubleround4:
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+       vpaddd          %ymm1,%ymm0,%ymm0
+       vpxor           %ymm0,%ymm3,%ymm3
+       vpshufb         %ymm9,%ymm3,%ymm3
+
+       vpaddd          %ymm5,%ymm4,%ymm4
+       vpxor           %ymm4,%ymm7,%ymm7
+       vpshufb         %ymm9,%ymm7,%ymm7
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+       vpaddd          %ymm3,%ymm2,%ymm2
+       vpxor           %ymm2,%ymm1,%ymm1
+       vmovdqa         %ymm1,%ymm10
+       vpslld          $12,%ymm10,%ymm10
+       vpsrld          $20,%ymm1,%ymm1
+       vpor            %ymm10,%ymm1,%ymm1
+
+       vpaddd          %ymm7,%ymm6,%ymm6
+       vpxor           %ymm6,%ymm5,%ymm5
+       vmovdqa         %ymm5,%ymm10
+       vpslld          $12,%ymm10,%ymm10
+       vpsrld          $20,%ymm5,%ymm5
+       vpor            %ymm10,%ymm5,%ymm5
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+       vpaddd          %ymm1,%ymm0,%ymm0
+       vpxor           %ymm0,%ymm3,%ymm3
+       vpshufb         %ymm8,%ymm3,%ymm3
+
+       vpaddd          %ymm5,%ymm4,%ymm4
+       vpxor           %ymm4,%ymm7,%ymm7
+       vpshufb         %ymm8,%ymm7,%ymm7
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+       vpaddd          %ymm3,%ymm2,%ymm2
+       vpxor           %ymm2,%ymm1,%ymm1
+       vmovdqa         %ymm1,%ymm10
+       vpslld          $7,%ymm10,%ymm10
+       vpsrld          $25,%ymm1,%ymm1
+       vpor            %ymm10,%ymm1,%ymm1
+
+       vpaddd          %ymm7,%ymm6,%ymm6
+       vpxor           %ymm6,%ymm5,%ymm5
+       vmovdqa         %ymm5,%ymm10
+       vpslld          $7,%ymm10,%ymm10
+       vpsrld          $25,%ymm5,%ymm5
+       vpor            %ymm10,%ymm5,%ymm5
+
+       # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+       vpshufd         $0x39,%ymm1,%ymm1
+       vpshufd         $0x39,%ymm5,%ymm5
+       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+       vpshufd         $0x4e,%ymm2,%ymm2
+       vpshufd         $0x4e,%ymm6,%ymm6
+       # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+       vpshufd         $0x93,%ymm3,%ymm3
+       vpshufd         $0x93,%ymm7,%ymm7
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+       vpaddd          %ymm1,%ymm0,%ymm0
+       vpxor           %ymm0,%ymm3,%ymm3
+       vpshufb         %ymm9,%ymm3,%ymm3
+
+       vpaddd          %ymm5,%ymm4,%ymm4
+       vpxor           %ymm4,%ymm7,%ymm7
+       vpshufb         %ymm9,%ymm7,%ymm7
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+       vpaddd          %ymm3,%ymm2,%ymm2
+       vpxor           %ymm2,%ymm1,%ymm1
+       vmovdqa         %ymm1,%ymm10
+       vpslld          $12,%ymm10,%ymm10
+       vpsrld          $20,%ymm1,%ymm1
+       vpor            %ymm10,%ymm1,%ymm1
+
+       vpaddd          %ymm7,%ymm6,%ymm6
+       vpxor           %ymm6,%ymm5,%ymm5
+       vmovdqa         %ymm5,%ymm10
+       vpslld          $12,%ymm10,%ymm10
+       vpsrld          $20,%ymm5,%ymm5
+       vpor            %ymm10,%ymm5,%ymm5
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+       vpaddd          %ymm1,%ymm0,%ymm0
+       vpxor           %ymm0,%ymm3,%ymm3
+       vpshufb         %ymm8,%ymm3,%ymm3
+
+       vpaddd          %ymm5,%ymm4,%ymm4
+       vpxor           %ymm4,%ymm7,%ymm7
+       vpshufb         %ymm8,%ymm7,%ymm7
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+       vpaddd          %ymm3,%ymm2,%ymm2
+       vpxor           %ymm2,%ymm1,%ymm1
+       vmovdqa         %ymm1,%ymm10
+       vpslld          $7,%ymm10,%ymm10
+       vpsrld          $25,%ymm1,%ymm1
+       vpor            %ymm10,%ymm1,%ymm1
+
+       vpaddd          %ymm7,%ymm6,%ymm6
+       vpxor           %ymm6,%ymm5,%ymm5
+       vmovdqa         %ymm5,%ymm10
+       vpslld          $7,%ymm10,%ymm10
+       vpsrld          $25,%ymm5,%ymm5
+       vpor            %ymm10,%ymm5,%ymm5
+
+       # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+       vpshufd         $0x93,%ymm1,%ymm1
+       vpshufd         $0x93,%ymm5,%ymm5
+       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+       vpshufd         $0x4e,%ymm2,%ymm2
+       vpshufd         $0x4e,%ymm6,%ymm6
+       # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+       vpshufd         $0x39,%ymm3,%ymm3
+       vpshufd         $0x39,%ymm7,%ymm7
+
+       sub             $2,%r8d
+       jnz             .Ldoubleround4
+
+       # o0 = i0 ^ (x0 + s0), first block
+       vpaddd          %ymm11,%ymm0,%ymm10
+       cmp             $0x10,%rax
+       jl              .Lxorpart4
+       vpxor           0x00(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x00(%rsi)
+       vextracti128    $1,%ymm10,%xmm0
+       # o1 = i1 ^ (x1 + s1), first block
+       vpaddd          %ymm12,%ymm1,%ymm10
+       cmp             $0x20,%rax
+       jl              .Lxorpart4
+       vpxor           0x10(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x10(%rsi)
+       vextracti128    $1,%ymm10,%xmm1
+       # o2 = i2 ^ (x2 + s2), first block
+       vpaddd          %ymm13,%ymm2,%ymm10
+       cmp             $0x30,%rax
+       jl              .Lxorpart4
+       vpxor           0x20(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x20(%rsi)
+       vextracti128    $1,%ymm10,%xmm2
+       # o3 = i3 ^ (x3 + s3), first block
+       vpaddd          %ymm14,%ymm3,%ymm10
+       cmp             $0x40,%rax
+       jl              .Lxorpart4
+       vpxor           0x30(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x30(%rsi)
+       vextracti128    $1,%ymm10,%xmm3
+
+       # xor and write second block
+       vmovdqa         %xmm0,%xmm10
+       cmp             $0x50,%rax
+       jl              .Lxorpart4
+       vpxor           0x40(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x40(%rsi)
+
+       vmovdqa         %xmm1,%xmm10
+       cmp             $0x60,%rax
+       jl              .Lxorpart4
+       vpxor           0x50(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x50(%rsi)
+
+       vmovdqa         %xmm2,%xmm10
+       cmp             $0x70,%rax
+       jl              .Lxorpart4
+       vpxor           0x60(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x60(%rsi)
+
+       vmovdqa         %xmm3,%xmm10
+       cmp             $0x80,%rax
+       jl              .Lxorpart4
+       vpxor           0x70(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x70(%rsi)
+
+       # o0 = i0 ^ (x0 + s0), third block
+       vpaddd          %ymm11,%ymm4,%ymm10
+       cmp             $0x90,%rax
+       jl              .Lxorpart4
+       vpxor           0x80(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x80(%rsi)
+       vextracti128    $1,%ymm10,%xmm4
+       # o1 = i1 ^ (x1 + s1), third block
+       vpaddd          %ymm12,%ymm5,%ymm10
+       cmp             $0xa0,%rax
+       jl              .Lxorpart4
+       vpxor           0x90(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x90(%rsi)
+       vextracti128    $1,%ymm10,%xmm5
+       # o2 = i2 ^ (x2 + s2), third block
+       vpaddd          %ymm13,%ymm6,%ymm10
+       cmp             $0xb0,%rax
+       jl              .Lxorpart4
+       vpxor           0xa0(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0xa0(%rsi)
+       vextracti128    $1,%ymm10,%xmm6
+       # o3 = i3 ^ (x3 + s3), third block
+       vpaddd          %ymm15,%ymm7,%ymm10
+       cmp             $0xc0,%rax
+       jl              .Lxorpart4
+       vpxor           0xb0(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0xb0(%rsi)
+       vextracti128    $1,%ymm10,%xmm7
+
+       # xor and write fourth block
+       vmovdqa         %xmm4,%xmm10
+       cmp             $0xd0,%rax
+       jl              .Lxorpart4
+       vpxor           0xc0(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0xc0(%rsi)
+
+       vmovdqa         %xmm5,%xmm10
+       cmp             $0xe0,%rax
+       jl              .Lxorpart4
+       vpxor           0xd0(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0xd0(%rsi)
+
+       vmovdqa         %xmm6,%xmm10
+       cmp             $0xf0,%rax
+       jl              .Lxorpart4
+       vpxor           0xe0(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0xe0(%rsi)
+
+       vmovdqa         %xmm7,%xmm10
+       cmp             $0x100,%rax
+       jl              .Lxorpart4
+       vpxor           0xf0(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0xf0(%rsi)
+
+.Ldone4:
+       vzeroupper
+       ret
+
+.Lxorpart4:
+       # xor remaining bytes from partial register into output
+       mov             %rax,%r9
+       and             $0x0f,%r9
+       jz              .Ldone4
+       and             $~0x0f,%rax
+
+       mov             %rsi,%r11
+
+       lea             8(%rsp),%r10
+       sub             $0x10,%rsp
+       and             $~31,%rsp
+
+       lea             (%rdx,%rax),%rsi
+       mov             %rsp,%rdi
+       mov             %r9,%rcx
+       rep movsb
+
+       vpxor           0x00(%rsp),%xmm10,%xmm10
+       vmovdqa         %xmm10,0x00(%rsp)
+
+       mov             %rsp,%rsi
+       lea             (%r11,%rax),%rdi
+       mov             %r9,%rcx
+       rep movsb
+
+       lea             -8(%r10),%rsp
+       jmp             .Ldone4
+
+ENDPROC(chacha_4block_xor_avx2)
+
+ENTRY(chacha_8block_xor_avx2)
+       # %rdi: Input state matrix, s
+       # %rsi: up to 8 data blocks output, o
+       # %rdx: up to 8 data blocks input, i
+       # %rcx: input/output length in bytes
+       # %r8d: nrounds
+
+       # This function encrypts eight consecutive ChaCha blocks by loading
+       # the state matrix in AVX registers eight times. As we need some
+       # scratch registers, we save the first four registers on the stack. The
+       # algorithm performs each operation on the corresponding word of each
+       # state matrix, hence requires no word shuffling. For the final XORing
+       # we transpose the matrix by interleaving 32-, 64- and then 128-bit
+       # words, which allows us to do XOR in AVX registers. 8/16-bit word
+       # rotation is done with the slightly better performing byte shuffling,
+       # 7/12-bit word rotation uses traditional shift+OR.
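+       #
+       # After the rounds, word w of block b sits in 32-bit lane b of
+       # register (or stack slot) w; the vpunpck/vperm2i128 steps below
+       # transpose that layout so each %ymm ends up holding 32 contiguous
+       # keystream bytes ready to be xored with the input.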
+
+       vzeroupper
+       # 4 * 32 byte stack, 32-byte aligned
+       lea             8(%rsp),%r10
+       and             $~31, %rsp
+       sub             $0x80, %rsp
+       mov             %rcx,%rax
+
+       # x0..15[0-7] = s[0..15]
+       vpbroadcastd    0x00(%rdi),%ymm0
+       vpbroadcastd    0x04(%rdi),%ymm1
+       vpbroadcastd    0x08(%rdi),%ymm2
+       vpbroadcastd    0x0c(%rdi),%ymm3
+       vpbroadcastd    0x10(%rdi),%ymm4
+       vpbroadcastd    0x14(%rdi),%ymm5
+       vpbroadcastd    0x18(%rdi),%ymm6
+       vpbroadcastd    0x1c(%rdi),%ymm7
+       vpbroadcastd    0x20(%rdi),%ymm8
+       vpbroadcastd    0x24(%rdi),%ymm9
+       vpbroadcastd    0x28(%rdi),%ymm10
+       vpbroadcastd    0x2c(%rdi),%ymm11
+       vpbroadcastd    0x30(%rdi),%ymm12
+       vpbroadcastd    0x34(%rdi),%ymm13
+       vpbroadcastd    0x38(%rdi),%ymm14
+       vpbroadcastd    0x3c(%rdi),%ymm15
+       # x0..3 on stack
+       vmovdqa         %ymm0,0x00(%rsp)
+       vmovdqa         %ymm1,0x20(%rsp)
+       vmovdqa         %ymm2,0x40(%rsp)
+       vmovdqa         %ymm3,0x60(%rsp)
+
+       vmovdqa         CTRINC(%rip),%ymm1
+       vmovdqa         ROT8(%rip),%ymm2
+       vmovdqa         ROT16(%rip),%ymm3
+
+       # x12 += counter values 0-3
+       vpaddd          %ymm1,%ymm12,%ymm12
+
+.Ldoubleround8:
+       # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+       vpaddd          0x00(%rsp),%ymm4,%ymm0
+       vmovdqa         %ymm0,0x00(%rsp)
+       vpxor           %ymm0,%ymm12,%ymm12
+       vpshufb         %ymm3,%ymm12,%ymm12
+       # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+       vpaddd          0x20(%rsp),%ymm5,%ymm0
+       vmovdqa         %ymm0,0x20(%rsp)
+       vpxor           %ymm0,%ymm13,%ymm13
+       vpshufb         %ymm3,%ymm13,%ymm13
+       # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+       vpaddd          0x40(%rsp),%ymm6,%ymm0
+       vmovdqa         %ymm0,0x40(%rsp)
+       vpxor           %ymm0,%ymm14,%ymm14
+       vpshufb         %ymm3,%ymm14,%ymm14
+       # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+       vpaddd          0x60(%rsp),%ymm7,%ymm0
+       vmovdqa         %ymm0,0x60(%rsp)
+       vpxor           %ymm0,%ymm15,%ymm15
+       vpshufb         %ymm3,%ymm15,%ymm15
+
+       # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+       vpaddd          %ymm12,%ymm8,%ymm8
+       vpxor           %ymm8,%ymm4,%ymm4
+       vpslld          $12,%ymm4,%ymm0
+       vpsrld          $20,%ymm4,%ymm4
+       vpor            %ymm0,%ymm4,%ymm4
+       # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+       vpaddd          %ymm13,%ymm9,%ymm9
+       vpxor           %ymm9,%ymm5,%ymm5
+       vpslld          $12,%ymm5,%ymm0
+       vpsrld          $20,%ymm5,%ymm5
+       vpor            %ymm0,%ymm5,%ymm5
+       # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+       vpaddd          %ymm14,%ymm10,%ymm10
+       vpxor           %ymm10,%ymm6,%ymm6
+       vpslld          $12,%ymm6,%ymm0
+       vpsrld          $20,%ymm6,%ymm6
+       vpor            %ymm0,%ymm6,%ymm6
+       # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+       vpaddd          %ymm15,%ymm11,%ymm11
+       vpxor           %ymm11,%ymm7,%ymm7
+       vpslld          $12,%ymm7,%ymm0
+       vpsrld          $20,%ymm7,%ymm7
+       vpor            %ymm0,%ymm7,%ymm7
+
+       # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+       vpaddd          0x00(%rsp),%ymm4,%ymm0
+       vmovdqa         %ymm0,0x00(%rsp)
+       vpxor           %ymm0,%ymm12,%ymm12
+       vpshufb         %ymm2,%ymm12,%ymm12
+       # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+       vpaddd          0x20(%rsp),%ymm5,%ymm0
+       vmovdqa         %ymm0,0x20(%rsp)
+       vpxor           %ymm0,%ymm13,%ymm13
+       vpshufb         %ymm2,%ymm13,%ymm13
+       # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+       vpaddd          0x40(%rsp),%ymm6,%ymm0
+       vmovdqa         %ymm0,0x40(%rsp)
+       vpxor           %ymm0,%ymm14,%ymm14
+       vpshufb         %ymm2,%ymm14,%ymm14
+       # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+       vpaddd          0x60(%rsp),%ymm7,%ymm0
+       vmovdqa         %ymm0,0x60(%rsp)
+       vpxor           %ymm0,%ymm15,%ymm15
+       vpshufb         %ymm2,%ymm15,%ymm15
+
+       # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+       vpaddd          %ymm12,%ymm8,%ymm8
+       vpxor           %ymm8,%ymm4,%ymm4
+       vpslld          $7,%ymm4,%ymm0
+       vpsrld          $25,%ymm4,%ymm4
+       vpor            %ymm0,%ymm4,%ymm4
+       # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+       vpaddd          %ymm13,%ymm9,%ymm9
+       vpxor           %ymm9,%ymm5,%ymm5
+       vpslld          $7,%ymm5,%ymm0
+       vpsrld          $25,%ymm5,%ymm5
+       vpor            %ymm0,%ymm5,%ymm5
+       # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+       vpaddd          %ymm14,%ymm10,%ymm10
+       vpxor           %ymm10,%ymm6,%ymm6
+       vpslld          $7,%ymm6,%ymm0
+       vpsrld          $25,%ymm6,%ymm6
+       vpor            %ymm0,%ymm6,%ymm6
+       # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+       vpaddd          %ymm15,%ymm11,%ymm11
+       vpxor           %ymm11,%ymm7,%ymm7
+       vpslld          $7,%ymm7,%ymm0
+       vpsrld          $25,%ymm7,%ymm7
+       vpor            %ymm0,%ymm7,%ymm7
+
+       # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+       vpaddd          0x00(%rsp),%ymm5,%ymm0
+       vmovdqa         %ymm0,0x00(%rsp)
+       vpxor           %ymm0,%ymm15,%ymm15
+       vpshufb         %ymm3,%ymm15,%ymm15
+       # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+       vpaddd          0x20(%rsp),%ymm6,%ymm0
+       vmovdqa         %ymm0,0x20(%rsp)
+       vpxor           %ymm0,%ymm12,%ymm12
+       vpshufb         %ymm3,%ymm12,%ymm12
+       # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+       vpaddd          0x40(%rsp),%ymm7,%ymm0
+       vmovdqa         %ymm0,0x40(%rsp)
+       vpxor           %ymm0,%ymm13,%ymm13
+       vpshufb         %ymm3,%ymm13,%ymm13
+       # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+       vpaddd          0x60(%rsp),%ymm4,%ymm0
+       vmovdqa         %ymm0,0x60(%rsp)
+       vpxor           %ymm0,%ymm14,%ymm14
+       vpshufb         %ymm3,%ymm14,%ymm14
+
+       # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+       vpaddd          %ymm15,%ymm10,%ymm10
+       vpxor           %ymm10,%ymm5,%ymm5
+       vpslld          $12,%ymm5,%ymm0
+       vpsrld          $20,%ymm5,%ymm5
+       vpor            %ymm0,%ymm5,%ymm5
+       # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+       vpaddd          %ymm12,%ymm11,%ymm11
+       vpxor           %ymm11,%ymm6,%ymm6
+       vpslld          $12,%ymm6,%ymm0
+       vpsrld          $20,%ymm6,%ymm6
+       vpor            %ymm0,%ymm6,%ymm6
+       # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+       vpaddd          %ymm13,%ymm8,%ymm8
+       vpxor           %ymm8,%ymm7,%ymm7
+       vpslld          $12,%ymm7,%ymm0
+       vpsrld          $20,%ymm7,%ymm7
+       vpor            %ymm0,%ymm7,%ymm7
+       # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+       vpaddd          %ymm14,%ymm9,%ymm9
+       vpxor           %ymm9,%ymm4,%ymm4
+       vpslld          $12,%ymm4,%ymm0
+       vpsrld          $20,%ymm4,%ymm4
+       vpor            %ymm0,%ymm4,%ymm4
+
+       # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+       vpaddd          0x00(%rsp),%ymm5,%ymm0
+       vmovdqa         %ymm0,0x00(%rsp)
+       vpxor           %ymm0,%ymm15,%ymm15
+       vpshufb         %ymm2,%ymm15,%ymm15
+       # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+       vpaddd          0x20(%rsp),%ymm6,%ymm0
+       vmovdqa         %ymm0,0x20(%rsp)
+       vpxor           %ymm0,%ymm12,%ymm12
+       vpshufb         %ymm2,%ymm12,%ymm12
+       # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+       vpaddd          0x40(%rsp),%ymm7,%ymm0
+       vmovdqa         %ymm0,0x40(%rsp)
+       vpxor           %ymm0,%ymm13,%ymm13
+       vpshufb         %ymm2,%ymm13,%ymm13
+       # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+       vpaddd          0x60(%rsp),%ymm4,%ymm0
+       vmovdqa         %ymm0,0x60(%rsp)
+       vpxor           %ymm0,%ymm14,%ymm14
+       vpshufb         %ymm2,%ymm14,%ymm14
+
+       # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+       vpaddd          %ymm15,%ymm10,%ymm10
+       vpxor           %ymm10,%ymm5,%ymm5
+       vpslld          $7,%ymm5,%ymm0
+       vpsrld          $25,%ymm5,%ymm5
+       vpor            %ymm0,%ymm5,%ymm5
+       # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+       vpaddd          %ymm12,%ymm11,%ymm11
+       vpxor           %ymm11,%ymm6,%ymm6
+       vpslld          $7,%ymm6,%ymm0
+       vpsrld          $25,%ymm6,%ymm6
+       vpor            %ymm0,%ymm6,%ymm6
+       # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+       vpaddd          %ymm13,%ymm8,%ymm8
+       vpxor           %ymm8,%ymm7,%ymm7
+       vpslld          $7,%ymm7,%ymm0
+       vpsrld          $25,%ymm7,%ymm7
+       vpor            %ymm0,%ymm7,%ymm7
+       # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+       vpaddd          %ymm14,%ymm9,%ymm9
+       vpxor           %ymm9,%ymm4,%ymm4
+       vpslld          $7,%ymm4,%ymm0
+       vpsrld          $25,%ymm4,%ymm4
+       vpor            %ymm0,%ymm4,%ymm4
+
+       sub             $2,%r8d
+       jnz             .Ldoubleround8
+
+       # x0..15[0-3] += s[0..15]
+       vpbroadcastd    0x00(%rdi),%ymm0
+       vpaddd          0x00(%rsp),%ymm0,%ymm0
+       vmovdqa         %ymm0,0x00(%rsp)
+       vpbroadcastd    0x04(%rdi),%ymm0
+       vpaddd          0x20(%rsp),%ymm0,%ymm0
+       vmovdqa         %ymm0,0x20(%rsp)
+       vpbroadcastd    0x08(%rdi),%ymm0
+       vpaddd          0x40(%rsp),%ymm0,%ymm0
+       vmovdqa         %ymm0,0x40(%rsp)
+       vpbroadcastd    0x0c(%rdi),%ymm0
+       vpaddd          0x60(%rsp),%ymm0,%ymm0
+       vmovdqa         %ymm0,0x60(%rsp)
+       vpbroadcastd    0x10(%rdi),%ymm0
+       vpaddd          %ymm0,%ymm4,%ymm4
+       vpbroadcastd    0x14(%rdi),%ymm0
+       vpaddd          %ymm0,%ymm5,%ymm5
+       vpbroadcastd    0x18(%rdi),%ymm0
+       vpaddd          %ymm0,%ymm6,%ymm6
+       vpbroadcastd    0x1c(%rdi),%ymm0
+       vpaddd          %ymm0,%ymm7,%ymm7
+       vpbroadcastd    0x20(%rdi),%ymm0
+       vpaddd          %ymm0,%ymm8,%ymm8
+       vpbroadcastd    0x24(%rdi),%ymm0
+       vpaddd          %ymm0,%ymm9,%ymm9
+       vpbroadcastd    0x28(%rdi),%ymm0
+       vpaddd          %ymm0,%ymm10,%ymm10
+       vpbroadcastd    0x2c(%rdi),%ymm0
+       vpaddd          %ymm0,%ymm11,%ymm11
+       vpbroadcastd    0x30(%rdi),%ymm0
+       vpaddd          %ymm0,%ymm12,%ymm12
+       vpbroadcastd    0x34(%rdi),%ymm0
+       vpaddd          %ymm0,%ymm13,%ymm13
+       vpbroadcastd    0x38(%rdi),%ymm0
+       vpaddd          %ymm0,%ymm14,%ymm14
+       vpbroadcastd    0x3c(%rdi),%ymm0
+       vpaddd          %ymm0,%ymm15,%ymm15
+
+       # x12 += counter values 0-3
+       vpaddd          %ymm1,%ymm12,%ymm12
+
+       # interleave 32-bit words in state n, n+1
+       vmovdqa         0x00(%rsp),%ymm0
+       vmovdqa         0x20(%rsp),%ymm1
+       vpunpckldq      %ymm1,%ymm0,%ymm2
+       vpunpckhdq      %ymm1,%ymm0,%ymm1
+       vmovdqa         %ymm2,0x00(%rsp)
+       vmovdqa         %ymm1,0x20(%rsp)
+       vmovdqa         0x40(%rsp),%ymm0
+       vmovdqa         0x60(%rsp),%ymm1
+       vpunpckldq      %ymm1,%ymm0,%ymm2
+       vpunpckhdq      %ymm1,%ymm0,%ymm1
+       vmovdqa         %ymm2,0x40(%rsp)
+       vmovdqa         %ymm1,0x60(%rsp)
+       vmovdqa         %ymm4,%ymm0
+       vpunpckldq      %ymm5,%ymm0,%ymm4
+       vpunpckhdq      %ymm5,%ymm0,%ymm5
+       vmovdqa         %ymm6,%ymm0
+       vpunpckldq      %ymm7,%ymm0,%ymm6
+       vpunpckhdq      %ymm7,%ymm0,%ymm7
+       vmovdqa         %ymm8,%ymm0
+       vpunpckldq      %ymm9,%ymm0,%ymm8
+       vpunpckhdq      %ymm9,%ymm0,%ymm9
+       vmovdqa         %ymm10,%ymm0
+       vpunpckldq      %ymm11,%ymm0,%ymm10
+       vpunpckhdq      %ymm11,%ymm0,%ymm11
+       vmovdqa         %ymm12,%ymm0
+       vpunpckldq      %ymm13,%ymm0,%ymm12
+       vpunpckhdq      %ymm13,%ymm0,%ymm13
+       vmovdqa         %ymm14,%ymm0
+       vpunpckldq      %ymm15,%ymm0,%ymm14
+       vpunpckhdq      %ymm15,%ymm0,%ymm15
+
+       # interleave 64-bit words in state n, n+2
+       vmovdqa         0x00(%rsp),%ymm0
+       vmovdqa         0x40(%rsp),%ymm2
+       vpunpcklqdq     %ymm2,%ymm0,%ymm1
+       vpunpckhqdq     %ymm2,%ymm0,%ymm2
+       vmovdqa         %ymm1,0x00(%rsp)
+       vmovdqa         %ymm2,0x40(%rsp)
+       vmovdqa         0x20(%rsp),%ymm0
+       vmovdqa         0x60(%rsp),%ymm2
+       vpunpcklqdq     %ymm2,%ymm0,%ymm1
+       vpunpckhqdq     %ymm2,%ymm0,%ymm2
+       vmovdqa         %ymm1,0x20(%rsp)
+       vmovdqa         %ymm2,0x60(%rsp)
+       vmovdqa         %ymm4,%ymm0
+       vpunpcklqdq     %ymm6,%ymm0,%ymm4
+       vpunpckhqdq     %ymm6,%ymm0,%ymm6
+       vmovdqa         %ymm5,%ymm0
+       vpunpcklqdq     %ymm7,%ymm0,%ymm5
+       vpunpckhqdq     %ymm7,%ymm0,%ymm7
+       vmovdqa         %ymm8,%ymm0
+       vpunpcklqdq     %ymm10,%ymm0,%ymm8
+       vpunpckhqdq     %ymm10,%ymm0,%ymm10
+       vmovdqa         %ymm9,%ymm0
+       vpunpcklqdq     %ymm11,%ymm0,%ymm9
+       vpunpckhqdq     %ymm11,%ymm0,%ymm11
+       vmovdqa         %ymm12,%ymm0
+       vpunpcklqdq     %ymm14,%ymm0,%ymm12
+       vpunpckhqdq     %ymm14,%ymm0,%ymm14
+       vmovdqa         %ymm13,%ymm0
+       vpunpcklqdq     %ymm15,%ymm0,%ymm13
+       vpunpckhqdq     %ymm15,%ymm0,%ymm15
+
+       # interleave 128-bit words in state n, n+4
+       # xor/write first four blocks
+       vmovdqa         0x00(%rsp),%ymm1
+       vperm2i128      $0x20,%ymm4,%ymm1,%ymm0
+       cmp             $0x0020,%rax
+       jl              .Lxorpart8
+       vpxor           0x0000(%rdx),%ymm0,%ymm0
+       vmovdqu         %ymm0,0x0000(%rsi)
+       vperm2i128      $0x31,%ymm4,%ymm1,%ymm4
+
+       vperm2i128      $0x20,%ymm12,%ymm8,%ymm0
+       cmp             $0x0040,%rax
+       jl              .Lxorpart8
+       vpxor           0x0020(%rdx),%ymm0,%ymm0
+       vmovdqu         %ymm0,0x0020(%rsi)
+       vperm2i128      $0x31,%ymm12,%ymm8,%ymm12
+
+       vmovdqa         0x40(%rsp),%ymm1
+       vperm2i128      $0x20,%ymm6,%ymm1,%ymm0
+       cmp             $0x0060,%rax
+       jl              .Lxorpart8
+       vpxor           0x0040(%rdx),%ymm0,%ymm0
+       vmovdqu         %ymm0,0x0040(%rsi)
+       vperm2i128      $0x31,%ymm6,%ymm1,%ymm6
+
+       vperm2i128      $0x20,%ymm14,%ymm10,%ymm0
+       cmp             $0x0080,%rax
+       jl              .Lxorpart8
+       vpxor           0x0060(%rdx),%ymm0,%ymm0
+       vmovdqu         %ymm0,0x0060(%rsi)
+       vperm2i128      $0x31,%ymm14,%ymm10,%ymm14
+
+       vmovdqa         0x20(%rsp),%ymm1
+       vperm2i128      $0x20,%ymm5,%ymm1,%ymm0
+       cmp             $0x00a0,%rax
+       jl              .Lxorpart8
+       vpxor           0x0080(%rdx),%ymm0,%ymm0
+       vmovdqu         %ymm0,0x0080(%rsi)
+       vperm2i128      $0x31,%ymm5,%ymm1,%ymm5
+
+       vperm2i128      $0x20,%ymm13,%ymm9,%ymm0
+       cmp             $0x00c0,%rax
+       jl              .Lxorpart8
+       vpxor           0x00a0(%rdx),%ymm0,%ymm0
+       vmovdqu         %ymm0,0x00a0(%rsi)
+       vperm2i128      $0x31,%ymm13,%ymm9,%ymm13
+
+       vmovdqa         0x60(%rsp),%ymm1
+       vperm2i128      $0x20,%ymm7,%ymm1,%ymm0
+       cmp             $0x00e0,%rax
+       jl              .Lxorpart8
+       vpxor           0x00c0(%rdx),%ymm0,%ymm0
+       vmovdqu         %ymm0,0x00c0(%rsi)
+       vperm2i128      $0x31,%ymm7,%ymm1,%ymm7
+
+       vperm2i128      $0x20,%ymm15,%ymm11,%ymm0
+       cmp             $0x0100,%rax
+       jl              .Lxorpart8
+       vpxor           0x00e0(%rdx),%ymm0,%ymm0
+       vmovdqu         %ymm0,0x00e0(%rsi)
+       vperm2i128      $0x31,%ymm15,%ymm11,%ymm15
+
+       # xor remaining blocks, write to output
+       vmovdqa         %ymm4,%ymm0
+       cmp             $0x0120,%rax
+       jl              .Lxorpart8
+       vpxor           0x0100(%rdx),%ymm0,%ymm0
+       vmovdqu         %ymm0,0x0100(%rsi)
+
+       vmovdqa         %ymm12,%ymm0
+       cmp             $0x0140,%rax
+       jl              .Lxorpart8
+       vpxor           0x0120(%rdx),%ymm0,%ymm0
+       vmovdqu         %ymm0,0x0120(%rsi)
+
+       vmovdqa         %ymm6,%ymm0
+       cmp             $0x0160,%rax
+       jl              .Lxorpart8
+       vpxor           0x0140(%rdx),%ymm0,%ymm0
+       vmovdqu         %ymm0,0x0140(%rsi)
+
+       vmovdqa         %ymm14,%ymm0
+       cmp             $0x0180,%rax
+       jl              .Lxorpart8
+       vpxor           0x0160(%rdx),%ymm0,%ymm0
+       vmovdqu         %ymm0,0x0160(%rsi)
+
+       vmovdqa         %ymm5,%ymm0
+       cmp             $0x01a0,%rax
+       jl              .Lxorpart8
+       vpxor           0x0180(%rdx),%ymm0,%ymm0
+       vmovdqu         %ymm0,0x0180(%rsi)
+
+       vmovdqa         %ymm13,%ymm0
+       cmp             $0x01c0,%rax
+       jl              .Lxorpart8
+       vpxor           0x01a0(%rdx),%ymm0,%ymm0
+       vmovdqu         %ymm0,0x01a0(%rsi)
+
+       vmovdqa         %ymm7,%ymm0
+       cmp             $0x01e0,%rax
+       jl              .Lxorpart8
+       vpxor           0x01c0(%rdx),%ymm0,%ymm0
+       vmovdqu         %ymm0,0x01c0(%rsi)
+
+       vmovdqa         %ymm15,%ymm0
+       cmp             $0x0200,%rax
+       jl              .Lxorpart8
+       vpxor           0x01e0(%rdx),%ymm0,%ymm0
+       vmovdqu         %ymm0,0x01e0(%rsi)
+
+.Ldone8:
+       vzeroupper
+       lea             -8(%r10),%rsp
+       ret
+
+.Lxorpart8:
+       # xor remaining bytes from partial register into output
+       mov             %rax,%r9
+       and             $0x1f,%r9
+       jz              .Ldone8
+       and             $~0x1f,%rax
+
+       mov             %rsi,%r11
+
+       lea             (%rdx,%rax),%rsi
+       mov             %rsp,%rdi
+       mov             %r9,%rcx
+       rep movsb
+
+       vpxor           0x00(%rsp),%ymm0,%ymm0
+       vmovdqa         %ymm0,0x00(%rsp)
+
+       mov             %rsp,%rsi
+       lea             (%r11,%rax),%rdi
+       mov             %r9,%rcx
+       rep movsb
+
+       jmp             .Ldone8
+
+ENDPROC(chacha_8block_xor_avx2)
diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S
new file mode 100644
index 0000000..848f9c7
--- /dev/null
+++ b/arch/x86/crypto/chacha-avx512vl-x86_64.S
@@ -0,0 +1,836 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
+ *
+ * Copyright (C) 2018 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section       .rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL:        .octa 0x00000000000000000000000000000000
+       .octa 0x00000000000000000000000000000001
+
+.section       .rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL:        .octa 0x00000000000000000000000000000002
+       .octa 0x00000000000000000000000000000003
+
+.section       .rodata.cst32.CTR8BL, "aM", @progbits, 32
+.align 32
+CTR8BL:        .octa 0x00000003000000020000000100000000
+       .octa 0x00000007000000060000000500000004
+
+.text
+
+ENTRY(chacha_2block_xor_avx512vl)
+       # %rdi: Input state matrix, s
+       # %rsi: up to 2 data blocks output, o
+       # %rdx: up to 2 data blocks input, i
+       # %rcx: input/output length in bytes
+       # %r8d: nrounds
+
+       # This function encrypts two ChaCha blocks by loading the state
+       # matrix twice across four AVX registers. It performs matrix operations
+       # on four words in each matrix in parallel, but requires shuffling to
+       # rearrange the words after each round.
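+       #
+       # The structure matches the AVX2 2-block variant; here AVX-512VL's
+       # vprold does each 32-bit rotate in one instruction, and partial
+       # blocks are handled with byte-masked vmovdqu8 loads/stores instead
+       # of being staged through the stack.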
+
+       vzeroupper
+
+       # x0..3[0-2] = s0..3
+       vbroadcasti128  0x00(%rdi),%ymm0
+       vbroadcasti128  0x10(%rdi),%ymm1
+       vbroadcasti128  0x20(%rdi),%ymm2
+       vbroadcasti128  0x30(%rdi),%ymm3
+
+       vpaddd          CTR2BL(%rip),%ymm3,%ymm3
+
+       vmovdqa         %ymm0,%ymm8
+       vmovdqa         %ymm1,%ymm9
+       vmovdqa         %ymm2,%ymm10
+       vmovdqa         %ymm3,%ymm11
+
+.Ldoubleround:
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+       vpaddd          %ymm1,%ymm0,%ymm0
+       vpxord          %ymm0,%ymm3,%ymm3
+       vprold          $16,%ymm3,%ymm3
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+       vpaddd          %ymm3,%ymm2,%ymm2
+       vpxord          %ymm2,%ymm1,%ymm1
+       vprold          $12,%ymm1,%ymm1
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+       vpaddd          %ymm1,%ymm0,%ymm0
+       vpxord          %ymm0,%ymm3,%ymm3
+       vprold          $8,%ymm3,%ymm3
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+       vpaddd          %ymm3,%ymm2,%ymm2
+       vpxord          %ymm2,%ymm1,%ymm1
+       vprold          $7,%ymm1,%ymm1
+
+       # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+       vpshufd         $0x39,%ymm1,%ymm1
+       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+       vpshufd         $0x4e,%ymm2,%ymm2
+       # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+       vpshufd         $0x93,%ymm3,%ymm3
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+       vpaddd          %ymm1,%ymm0,%ymm0
+       vpxord          %ymm0,%ymm3,%ymm3
+       vprold          $16,%ymm3,%ymm3
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+       vpaddd          %ymm3,%ymm2,%ymm2
+       vpxord          %ymm2,%ymm1,%ymm1
+       vprold          $12,%ymm1,%ymm1
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+       vpaddd          %ymm1,%ymm0,%ymm0
+       vpxord          %ymm0,%ymm3,%ymm3
+       vprold          $8,%ymm3,%ymm3
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+       vpaddd          %ymm3,%ymm2,%ymm2
+       vpxord          %ymm2,%ymm1,%ymm1
+       vprold          $7,%ymm1,%ymm1
+
+       # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+       vpshufd         $0x93,%ymm1,%ymm1
+       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+       vpshufd         $0x4e,%ymm2,%ymm2
+       # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+       vpshufd         $0x39,%ymm3,%ymm3
+
+       sub             $2,%r8d
+       jnz             .Ldoubleround
+
+       # o0 = i0 ^ (x0 + s0)
+       vpaddd          %ymm8,%ymm0,%ymm7
+       cmp             $0x10,%rcx
+       jl              .Lxorpart2
+       vpxord          0x00(%rdx),%xmm7,%xmm6
+       vmovdqu         %xmm6,0x00(%rsi)
+       vextracti128    $1,%ymm7,%xmm0
+       # o1 = i1 ^ (x1 + s1)
+       vpaddd          %ymm9,%ymm1,%ymm7
+       cmp             $0x20,%rcx
+       jl              .Lxorpart2
+       vpxord          0x10(%rdx),%xmm7,%xmm6
+       vmovdqu         %xmm6,0x10(%rsi)
+       vextracti128    $1,%ymm7,%xmm1
+       # o2 = i2 ^ (x2 + s2)
+       vpaddd          %ymm10,%ymm2,%ymm7
+       cmp             $0x30,%rcx
+       jl              .Lxorpart2
+       vpxord          0x20(%rdx),%xmm7,%xmm6
+       vmovdqu         %xmm6,0x20(%rsi)
+       vextracti128    $1,%ymm7,%xmm2
+       # o3 = i3 ^ (x3 + s3)
+       vpaddd          %ymm11,%ymm3,%ymm7
+       cmp             $0x40,%rcx
+       jl              .Lxorpart2
+       vpxord          0x30(%rdx),%xmm7,%xmm6
+       vmovdqu         %xmm6,0x30(%rsi)
+       vextracti128    $1,%ymm7,%xmm3
+
+       # xor and write second block
+       vmovdqa         %xmm0,%xmm7
+       cmp             $0x50,%rcx
+       jl              .Lxorpart2
+       vpxord          0x40(%rdx),%xmm7,%xmm6
+       vmovdqu         %xmm6,0x40(%rsi)
+
+       vmovdqa         %xmm1,%xmm7
+       cmp             $0x60,%rcx
+       jl              .Lxorpart2
+       vpxord          0x50(%rdx),%xmm7,%xmm6
+       vmovdqu         %xmm6,0x50(%rsi)
+
+       vmovdqa         %xmm2,%xmm7
+       cmp             $0x70,%rcx
+       jl              .Lxorpart2
+       vpxord          0x60(%rdx),%xmm7,%xmm6
+       vmovdqu         %xmm6,0x60(%rsi)
+
+       vmovdqa         %xmm3,%xmm7
+       cmp             $0x80,%rcx
+       jl              .Lxorpart2
+       vpxord          0x70(%rdx),%xmm7,%xmm6
+       vmovdqu         %xmm6,0x70(%rsi)
+
+.Ldone2:
+       vzeroupper
+       ret
+
+.Lxorpart2:
+       # xor remaining bytes from partial register into output
+       mov             %rcx,%rax
+       and             $0xf,%rcx
+       jz              .Ldone2
+       mov             %rax,%r9
+       and             $~0xf,%r9
+
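+       # build a byte mask for the tail: %k1 = (1 << (len & 0x0f)) - 1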
+       mov             $1,%rax
+       shld            %cl,%rax,%rax
+       sub             $1,%rax
+       kmovq           %rax,%k1
+
+       vmovdqu8        (%rdx,%r9),%xmm1{%k1}{z}
+       vpxord          %xmm7,%xmm1,%xmm1
+       vmovdqu8        %xmm1,(%rsi,%r9){%k1}
+
+       jmp             .Ldone2
+
+ENDPROC(chacha_2block_xor_avx512vl)
+
+ENTRY(chacha_4block_xor_avx512vl)
+       # %rdi: Input state matrix, s
+       # %rsi: up to 4 data blocks output, o
+       # %rdx: up to 4 data blocks input, i
+       # %rcx: input/output length in bytes
+       # %r8d: nrounds
+
+       # This function encrypts four ChaCha blocks by loading the state
+       # matrix four times across eight AVX registers. It performs matrix
+       # operations on four words in two matrices in parallel, sequentially
+       # to the operations on the four words of the other two matrices. As
+       # the required word shuffling has a rather high latency, we can do the
+       # arithmetic on two matrix-pairs without much slowdown.
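+       # As in the AVX2 4-block variant, %ymm0-3 and %ymm4-7 carry two
+       # block-pairs in lockstep, with counters offset via CTR2BL and
+       # CTR4BL.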
+
+       vzeroupper
+
+       # x0..3[0-4] = s0..3
+       vbroadcasti128  0x00(%rdi),%ymm0
+       vbroadcasti128  0x10(%rdi),%ymm1
+       vbroadcasti128  0x20(%rdi),%ymm2
+       vbroadcasti128  0x30(%rdi),%ymm3
+
+       vmovdqa         %ymm0,%ymm4
+       vmovdqa         %ymm1,%ymm5
+       vmovdqa         %ymm2,%ymm6
+       vmovdqa         %ymm3,%ymm7
+
+       vpaddd          CTR2BL(%rip),%ymm3,%ymm3
+       vpaddd          CTR4BL(%rip),%ymm7,%ymm7
+
+       vmovdqa         %ymm0,%ymm11
+       vmovdqa         %ymm1,%ymm12
+       vmovdqa         %ymm2,%ymm13
+       vmovdqa         %ymm3,%ymm14
+       vmovdqa         %ymm7,%ymm15
+
+.Ldoubleround4:
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+       vpaddd          %ymm1,%ymm0,%ymm0
+       vpxord          %ymm0,%ymm3,%ymm3
+       vprold          $16,%ymm3,%ymm3
+
+       vpaddd          %ymm5,%ymm4,%ymm4
+       vpxord          %ymm4,%ymm7,%ymm7
+       vprold          $16,%ymm7,%ymm7
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+       vpaddd          %ymm3,%ymm2,%ymm2
+       vpxord          %ymm2,%ymm1,%ymm1
+       vprold          $12,%ymm1,%ymm1
+
+       vpaddd          %ymm7,%ymm6,%ymm6
+       vpxord          %ymm6,%ymm5,%ymm5
+       vprold          $12,%ymm5,%ymm5
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+       vpaddd          %ymm1,%ymm0,%ymm0
+       vpxord          %ymm0,%ymm3,%ymm3
+       vprold          $8,%ymm3,%ymm3
+
+       vpaddd          %ymm5,%ymm4,%ymm4
+       vpxord          %ymm4,%ymm7,%ymm7
+       vprold          $8,%ymm7,%ymm7
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+       vpaddd          %ymm3,%ymm2,%ymm2
+       vpxord          %ymm2,%ymm1,%ymm1
+       vprold          $7,%ymm1,%ymm1
+
+       vpaddd          %ymm7,%ymm6,%ymm6
+       vpxord          %ymm6,%ymm5,%ymm5
+       vprold          $7,%ymm5,%ymm5
+
+       # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+       vpshufd         $0x39,%ymm1,%ymm1
+       vpshufd         $0x39,%ymm5,%ymm5
+       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+       vpshufd         $0x4e,%ymm2,%ymm2
+       vpshufd         $0x4e,%ymm6,%ymm6
+       # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+       vpshufd         $0x93,%ymm3,%ymm3
+       vpshufd         $0x93,%ymm7,%ymm7
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+       vpaddd          %ymm1,%ymm0,%ymm0
+       vpxord          %ymm0,%ymm3,%ymm3
+       vprold          $16,%ymm3,%ymm3
+
+       vpaddd          %ymm5,%ymm4,%ymm4
+       vpxord          %ymm4,%ymm7,%ymm7
+       vprold          $16,%ymm7,%ymm7
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+       vpaddd          %ymm3,%ymm2,%ymm2
+       vpxord          %ymm2,%ymm1,%ymm1
+       vprold          $12,%ymm1,%ymm1
+
+       vpaddd          %ymm7,%ymm6,%ymm6
+       vpxord          %ymm6,%ymm5,%ymm5
+       vprold          $12,%ymm5,%ymm5
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+       vpaddd          %ymm1,%ymm0,%ymm0
+       vpxord          %ymm0,%ymm3,%ymm3
+       vprold          $8,%ymm3,%ymm3
+
+       vpaddd          %ymm5,%ymm4,%ymm4
+       vpxord          %ymm4,%ymm7,%ymm7
+       vprold          $8,%ymm7,%ymm7
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+       vpaddd          %ymm3,%ymm2,%ymm2
+       vpxord          %ymm2,%ymm1,%ymm1
+       vprold          $7,%ymm1,%ymm1
+
+       vpaddd          %ymm7,%ymm6,%ymm6
+       vpxord          %ymm6,%ymm5,%ymm5
+       vprold          $7,%ymm5,%ymm5
+
+       # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+       vpshufd         $0x93,%ymm1,%ymm1
+       vpshufd         $0x93,%ymm5,%ymm5
+       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+       vpshufd         $0x4e,%ymm2,%ymm2
+       vpshufd         $0x4e,%ymm6,%ymm6
+       # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+       vpshufd         $0x39,%ymm3,%ymm3
+       vpshufd         $0x39,%ymm7,%ymm7
+
+       sub             $2,%r8d
+       jnz             .Ldoubleround4
+
+       # o0 = i0 ^ (x0 + s0), first block
+       vpaddd          %ymm11,%ymm0,%ymm10
+       cmp             $0x10,%rcx
+       jl              .Lxorpart4
+       vpxord          0x00(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x00(%rsi)
+       vextracti128    $1,%ymm10,%xmm0
+       # o1 = i1 ^ (x1 + s1), first block
+       vpaddd          %ymm12,%ymm1,%ymm10
+       cmp             $0x20,%rcx
+       jl              .Lxorpart4
+       vpxord          0x10(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x10(%rsi)
+       vextracti128    $1,%ymm10,%xmm1
+       # o2 = i2 ^ (x2 + s2), first block
+       vpaddd          %ymm13,%ymm2,%ymm10
+       cmp             $0x30,%rcx
+       jl              .Lxorpart4
+       vpxord          0x20(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x20(%rsi)
+       vextracti128    $1,%ymm10,%xmm2
+       # o3 = i3 ^ (x3 + s3), first block
+       vpaddd          %ymm14,%ymm3,%ymm10
+       cmp             $0x40,%rcx
+       jl              .Lxorpart4
+       vpxord          0x30(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x30(%rsi)
+       vextracti128    $1,%ymm10,%xmm3
+
+       # xor and write second block
+       vmovdqa         %xmm0,%xmm10
+       cmp             $0x50,%rcx
+       jl              .Lxorpart4
+       vpxord          0x40(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x40(%rsi)
+
+       vmovdqa         %xmm1,%xmm10
+       cmp             $0x60,%rcx
+       jl              .Lxorpart4
+       vpxord          0x50(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x50(%rsi)
+
+       vmovdqa         %xmm2,%xmm10
+       cmp             $0x70,%rcx
+       jl              .Lxorpart4
+       vpxord          0x60(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x60(%rsi)
+
+       vmovdqa         %xmm3,%xmm10
+       cmp             $0x80,%rcx
+       jl              .Lxorpart4
+       vpxord          0x70(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x70(%rsi)
+
+       # o0 = i0 ^ (x0 + s0), third block
+       vpaddd          %ymm11,%ymm4,%ymm10
+       cmp             $0x90,%rcx
+       jl              .Lxorpart4
+       vpxord          0x80(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x80(%rsi)
+       vextracti128    $1,%ymm10,%xmm4
+       # o1 = i1 ^ (x1 + s1), third block
+       vpaddd          %ymm12,%ymm5,%ymm10
+       cmp             $0xa0,%rcx
+       jl              .Lxorpart4
+       vpxord          0x90(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0x90(%rsi)
+       vextracti128    $1,%ymm10,%xmm5
+       # o2 = i2 ^ (x2 + s2), third block
+       vpaddd          %ymm13,%ymm6,%ymm10
+       cmp             $0xb0,%rcx
+       jl              .Lxorpart4
+       vpxord          0xa0(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0xa0(%rsi)
+       vextracti128    $1,%ymm10,%xmm6
+       # o3 = i3 ^ (x3 + s3), third block
+       vpaddd          %ymm15,%ymm7,%ymm10
+       cmp             $0xc0,%rcx
+       jl              .Lxorpart4
+       vpxord          0xb0(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0xb0(%rsi)
+       vextracti128    $1,%ymm10,%xmm7
+
+       # xor and write fourth block
+       vmovdqa         %xmm4,%xmm10
+       cmp             $0xd0,%rcx
+       jl              .Lxorpart4
+       vpxord          0xc0(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0xc0(%rsi)
+
+       vmovdqa         %xmm5,%xmm10
+       cmp             $0xe0,%rcx
+       jl              .Lxorpart4
+       vpxord          0xd0(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0xd0(%rsi)
+
+       vmovdqa         %xmm6,%xmm10
+       cmp             $0xf0,%rcx
+       jl              .Lxorpart4
+       vpxord          0xe0(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0xe0(%rsi)
+
+       vmovdqa         %xmm7,%xmm10
+       cmp             $0x100,%rcx
+       jl              .Lxorpart4
+       vpxord          0xf0(%rdx),%xmm10,%xmm9
+       vmovdqu         %xmm9,0xf0(%rsi)
+
+.Ldone4:
+       vzeroupper
+       ret
+
+.Lxorpart4:
+       # xor remaining bytes from partial register into output
+       mov             %rcx,%rax
+       and             $0xf,%rcx
+       jz              .Ldone4
+       mov             %rax,%r9
+       and             $~0xf,%r9
+
+       mov             $1,%rax
+       shld            %cl,%rax,%rax
+       sub             $1,%rax
+       kmovq           %rax,%k1
+
+       vmovdqu8        (%rdx,%r9),%xmm1{%k1}{z}
+       vpxord          %xmm10,%xmm1,%xmm1
+       vmovdqu8        %xmm1,(%rsi,%r9){%k1}
+
+       jmp             .Ldone4
+
+ENDPROC(chacha_4block_xor_avx512vl)
+
+ENTRY(chacha_8block_xor_avx512vl)
+       # %rdi: Input state matrix, s
+       # %rsi: up to 8 data blocks output, o
+       # %rdx: up to 8 data blocks input, i
+       # %rcx: input/output length in bytes
+       # %r8d: nrounds
+
+       # This function encrypts eight consecutive ChaCha blocks by loading
+       # the state matrix in AVX registers eight times. Compared to AVX2, this
+       # mostly benefits from the new rotate instructions in VL and the
+       # additional registers.
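+       # Unlike the AVX2 version, the saved input state lives entirely in
+       # %ymm16-%ymm31, so no stack scratch area is needed and all rotates
+       # use vprold.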
+
+       vzeroupper
+
+       # x0..15[0-7] = s[0..15]
+       vpbroadcastd    0x00(%rdi),%ymm0
+       vpbroadcastd    0x04(%rdi),%ymm1
+       vpbroadcastd    0x08(%rdi),%ymm2
+       vpbroadcastd    0x0c(%rdi),%ymm3
+       vpbroadcastd    0x10(%rdi),%ymm4
+       vpbroadcastd    0x14(%rdi),%ymm5
+       vpbroadcastd    0x18(%rdi),%ymm6
+       vpbroadcastd    0x1c(%rdi),%ymm7
+       vpbroadcastd    0x20(%rdi),%ymm8
+       vpbroadcastd    0x24(%rdi),%ymm9
+       vpbroadcastd    0x28(%rdi),%ymm10
+       vpbroadcastd    0x2c(%rdi),%ymm11
+       vpbroadcastd    0x30(%rdi),%ymm12
+       vpbroadcastd    0x34(%rdi),%ymm13
+       vpbroadcastd    0x38(%rdi),%ymm14
+       vpbroadcastd    0x3c(%rdi),%ymm15
+
+       # x12 += counter values 0-3
+       vpaddd          CTR8BL(%rip),%ymm12,%ymm12
+
+       vmovdqa64       %ymm0,%ymm16
+       vmovdqa64       %ymm1,%ymm17
+       vmovdqa64       %ymm2,%ymm18
+       vmovdqa64       %ymm3,%ymm19
+       vmovdqa64       %ymm4,%ymm20
+       vmovdqa64       %ymm5,%ymm21
+       vmovdqa64       %ymm6,%ymm22
+       vmovdqa64       %ymm7,%ymm23
+       vmovdqa64       %ymm8,%ymm24
+       vmovdqa64       %ymm9,%ymm25
+       vmovdqa64       %ymm10,%ymm26
+       vmovdqa64       %ymm11,%ymm27
+       vmovdqa64       %ymm12,%ymm28
+       vmovdqa64       %ymm13,%ymm29
+       vmovdqa64       %ymm14,%ymm30
+       vmovdqa64       %ymm15,%ymm31
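+       # %ymm16-%ymm31 now hold a copy of the whole initial state, including
+       # the per-block counters, for the feed-forward additions after the
+       # rounds; no stack spills are needed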
+
+.Ldoubleround8:
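+       # first half of the double round: quarter-rounds on the columns
+       # (x0,x4,x8,x12), (x1,x5,x9,x13), (x2,x6,x10,x14), (x3,x7,x11,x15)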
+       # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+       vpaddd          %ymm0,%ymm4,%ymm0
+       vpxord          %ymm0,%ymm12,%ymm12
+       vprold          $16,%ymm12,%ymm12
+       # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+       vpaddd          %ymm1,%ymm5,%ymm1
+       vpxord          %ymm1,%ymm13,%ymm13
+       vprold          $16,%ymm13,%ymm13
+       # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+       vpaddd          %ymm2,%ymm6,%ymm2
+       vpxord          %ymm2,%ymm14,%ymm14
+       vprold          $16,%ymm14,%ymm14
+       # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+       vpaddd          %ymm3,%ymm7,%ymm3
+       vpxord          %ymm3,%ymm15,%ymm15
+       vprold          $16,%ymm15,%ymm15
+
+       # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+       vpaddd          %ymm12,%ymm8,%ymm8
+       vpxord          %ymm8,%ymm4,%ymm4
+       vprold          $12,%ymm4,%ymm4
+       # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+       vpaddd          %ymm13,%ymm9,%ymm9
+       vpxord          %ymm9,%ymm5,%ymm5
+       vprold          $12,%ymm5,%ymm5
+       # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+       vpaddd          %ymm14,%ymm10,%ymm10
+       vpxord          %ymm10,%ymm6,%ymm6
+       vprold          $12,%ymm6,%ymm6
+       # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+       vpaddd          %ymm15,%ymm11,%ymm11
+       vpxord          %ymm11,%ymm7,%ymm7
+       vprold          $12,%ymm7,%ymm7
+
+       # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+       vpaddd          %ymm0,%ymm4,%ymm0
+       vpxord          %ymm0,%ymm12,%ymm12
+       vprold          $8,%ymm12,%ymm12
+       # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+       vpaddd          %ymm1,%ymm5,%ymm1
+       vpxord          %ymm1,%ymm13,%ymm13
+       vprold          $8,%ymm13,%ymm13
+       # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+       vpaddd          %ymm2,%ymm6,%ymm2
+       vpxord          %ymm2,%ymm14,%ymm14
+       vprold          $8,%ymm14,%ymm14
+       # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+       vpaddd          %ymm3,%ymm7,%ymm3
+       vpxord          %ymm3,%ymm15,%ymm15
+       vprold          $8,%ymm15,%ymm15
+
+       # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+       vpaddd          %ymm12,%ymm8,%ymm8
+       vpxord          %ymm8,%ymm4,%ymm4
+       vprold          $7,%ymm4,%ymm4
+       # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+       vpaddd          %ymm13,%ymm9,%ymm9
+       vpxord          %ymm9,%ymm5,%ymm5
+       vprold          $7,%ymm5,%ymm5
+       # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+       vpaddd          %ymm14,%ymm10,%ymm10
+       vpxord          %ymm10,%ymm6,%ymm6
+       vprold          $7,%ymm6,%ymm6
+       # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+       vpaddd          %ymm15,%ymm11,%ymm11
+       vpxord          %ymm11,%ymm7,%ymm7
+       vprold          $7,%ymm7,%ymm7
+
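+       # second half of the double round: quarter-rounds on the diagonals
+       # (x0,x5,x10,x15), (x1,x6,x11,x12), (x2,x7,x8,x13), (x3,x4,x9,x14)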
+       # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+       vpaddd          %ymm0,%ymm5,%ymm0
+       vpxord          %ymm0,%ymm15,%ymm15
+       vprold          $16,%ymm15,%ymm15
+       # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+       vpaddd          %ymm1,%ymm6,%ymm1
+       vpxord          %ymm1,%ymm12,%ymm12
+       vprold          $16,%ymm12,%ymm12
+       # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+       vpaddd          %ymm2,%ymm7,%ymm2
+       vpxord          %ymm2,%ymm13,%ymm13
+       vprold          $16,%ymm13,%ymm13
+       # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+       vpaddd          %ymm3,%ymm4,%ymm3
+       vpxord          %ymm3,%ymm14,%ymm14
+       vprold          $16,%ymm14,%ymm14
+
+       # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+       vpaddd          %ymm15,%ymm10,%ymm10
+       vpxord          %ymm10,%ymm5,%ymm5
+       vprold          $12,%ymm5,%ymm5
+       # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+       vpaddd          %ymm12,%ymm11,%ymm11
+       vpxord          %ymm11,%ymm6,%ymm6
+       vprold          $12,%ymm6,%ymm6
+       # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+       vpaddd          %ymm13,%ymm8,%ymm8
+       vpxord          %ymm8,%ymm7,%ymm7
+       vprold          $12,%ymm7,%ymm7
+       # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+       vpaddd          %ymm14,%ymm9,%ymm9
+       vpxord          %ymm9,%ymm4,%ymm4
+       vprold          $12,%ymm4,%ymm4
+
+       # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+       vpaddd          %ymm0,%ymm5,%ymm0
+       vpxord          %ymm0,%ymm15,%ymm15
+       vprold          $8,%ymm15,%ymm15
+       # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+       vpaddd          %ymm1,%ymm6,%ymm1
+       vpxord          %ymm1,%ymm12,%ymm12
+       vprold          $8,%ymm12,%ymm12
+       # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+       vpaddd          %ymm2,%ymm7,%ymm2
+       vpxord          %ymm2,%ymm13,%ymm13
+       vprold          $8,%ymm13,%ymm13
+       # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+       vpaddd          %ymm3,%ymm4,%ymm3
+       vpxord          %ymm3,%ymm14,%ymm14
+       vprold          $8,%ymm14,%ymm14
+
+       # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+       vpaddd          %ymm15,%ymm10,%ymm10
+       vpxord          %ymm10,%ymm5,%ymm5
+       vprold          $7,%ymm5,%ymm5
+       # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+       vpaddd          %ymm12,%ymm11,%ymm11
+       vpxord          %ymm11,%ymm6,%ymm6
+       vprold          $7,%ymm6,%ymm6
+       # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+       vpaddd          %ymm13,%ymm8,%ymm8
+       vpxord          %ymm8,%ymm7,%ymm7
+       vprold          $7,%ymm7,%ymm7
+       # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+       vpaddd          %ymm14,%ymm9,%ymm9
+       vpxord          %ymm9,%ymm4,%ymm4
+       vprold          $7,%ymm4,%ymm4
+
+       sub             $2,%r8d
+       jnz             .Ldoubleround8
+
+       # x0..15[0-7] += s[0..15]
+       vpaddd          %ymm16,%ymm0,%ymm0
+       vpaddd          %ymm17,%ymm1,%ymm1
+       vpaddd          %ymm18,%ymm2,%ymm2
+       vpaddd          %ymm19,%ymm3,%ymm3
+       vpaddd          %ymm20,%ymm4,%ymm4
+       vpaddd          %ymm21,%ymm5,%ymm5
+       vpaddd          %ymm22,%ymm6,%ymm6
+       vpaddd          %ymm23,%ymm7,%ymm7
+       vpaddd          %ymm24,%ymm8,%ymm8
+       vpaddd          %ymm25,%ymm9,%ymm9
+       vpaddd          %ymm26,%ymm10,%ymm10
+       vpaddd          %ymm27,%ymm11,%ymm11
+       vpaddd          %ymm28,%ymm12,%ymm12
+       vpaddd          %ymm29,%ymm13,%ymm13
+       vpaddd          %ymm30,%ymm14,%ymm14
+       vpaddd          %ymm31,%ymm15,%ymm15
+
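+       # transpose so the data can be xored in full registers: after the
+       # 32-bit, 64-bit and 128-bit interleave steps below, each 32-byte
+       # register holds one contiguous half (words 0-7 or 8-15) of a single
+       # block's keystream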
+       # interleave 32-bit words in state n, n+1
+       vpunpckldq      %ymm1,%ymm0,%ymm16
+       vpunpckhdq      %ymm1,%ymm0,%ymm17
+       vpunpckldq      %ymm3,%ymm2,%ymm18
+       vpunpckhdq      %ymm3,%ymm2,%ymm19
+       vpunpckldq      %ymm5,%ymm4,%ymm20
+       vpunpckhdq      %ymm5,%ymm4,%ymm21
+       vpunpckldq      %ymm7,%ymm6,%ymm22
+       vpunpckhdq      %ymm7,%ymm6,%ymm23
+       vpunpckldq      %ymm9,%ymm8,%ymm24
+       vpunpckhdq      %ymm9,%ymm8,%ymm25
+       vpunpckldq      %ymm11,%ymm10,%ymm26
+       vpunpckhdq      %ymm11,%ymm10,%ymm27
+       vpunpckldq      %ymm13,%ymm12,%ymm28
+       vpunpckhdq      %ymm13,%ymm12,%ymm29
+       vpunpckldq      %ymm15,%ymm14,%ymm30
+       vpunpckhdq      %ymm15,%ymm14,%ymm31
+
+       # interleave 64-bit words in state n, n+2
+       vpunpcklqdq     %ymm18,%ymm16,%ymm0
+       vpunpcklqdq     %ymm19,%ymm17,%ymm1
+       vpunpckhqdq     %ymm18,%ymm16,%ymm2
+       vpunpckhqdq     %ymm19,%ymm17,%ymm3
+       vpunpcklqdq     %ymm22,%ymm20,%ymm4
+       vpunpcklqdq     %ymm23,%ymm21,%ymm5
+       vpunpckhqdq     %ymm22,%ymm20,%ymm6
+       vpunpckhqdq     %ymm23,%ymm21,%ymm7
+       vpunpcklqdq     %ymm26,%ymm24,%ymm8
+       vpunpcklqdq     %ymm27,%ymm25,%ymm9
+       vpunpckhqdq     %ymm26,%ymm24,%ymm10
+       vpunpckhqdq     %ymm27,%ymm25,%ymm11
+       vpunpcklqdq     %ymm30,%ymm28,%ymm12
+       vpunpcklqdq     %ymm31,%ymm29,%ymm13
+       vpunpckhqdq     %ymm30,%ymm28,%ymm14
+       vpunpckhqdq     %ymm31,%ymm29,%ymm15
+
+       # interleave 128-bit words in state n, n+4
+       # xor/write first four blocks
+       vmovdqa64       %ymm0,%ymm16
+       vperm2i128      $0x20,%ymm4,%ymm0,%ymm0
+       cmp             $0x0020,%rcx
+       jl              .Lxorpart8
+       vpxord          0x0000(%rdx),%ymm0,%ymm0
+       vmovdqu64       %ymm0,0x0000(%rsi)
+       vmovdqa64       %ymm16,%ymm0
+       vperm2i128      $0x31,%ymm4,%ymm0,%ymm4
+
+       vperm2i128      $0x20,%ymm12,%ymm8,%ymm0
+       cmp             $0x0040,%rcx
+       jl              .Lxorpart8
+       vpxord          0x0020(%rdx),%ymm0,%ymm0
+       vmovdqu64       %ymm0,0x0020(%rsi)
+       vperm2i128      $0x31,%ymm12,%ymm8,%ymm12
+
+       vperm2i128      $0x20,%ymm6,%ymm2,%ymm0
+       cmp             $0x0060,%rcx
+       jl              .Lxorpart8
+       vpxord          0x0040(%rdx),%ymm0,%ymm0
+       vmovdqu64       %ymm0,0x0040(%rsi)
+       vperm2i128      $0x31,%ymm6,%ymm2,%ymm6
+
+       vperm2i128      $0x20,%ymm14,%ymm10,%ymm0
+       cmp             $0x0080,%rcx
+       jl              .Lxorpart8
+       vpxord          0x0060(%rdx),%ymm0,%ymm0
+       vmovdqu64       %ymm0,0x0060(%rsi)
+       vperm2i128      $0x31,%ymm14,%ymm10,%ymm14
+
+       vperm2i128      $0x20,%ymm5,%ymm1,%ymm0
+       cmp             $0x00a0,%rcx
+       jl              .Lxorpart8
+       vpxord          0x0080(%rdx),%ymm0,%ymm0
+       vmovdqu64       %ymm0,0x0080(%rsi)
+       vperm2i128      $0x31,%ymm5,%ymm1,%ymm5
+
+       vperm2i128      $0x20,%ymm13,%ymm9,%ymm0
+       cmp             $0x00c0,%rcx
+       jl              .Lxorpart8
+       vpxord          0x00a0(%rdx),%ymm0,%ymm0
+       vmovdqu64       %ymm0,0x00a0(%rsi)
+       vperm2i128      $0x31,%ymm13,%ymm9,%ymm13
+
+       vperm2i128      $0x20,%ymm7,%ymm3,%ymm0
+       cmp             $0x00e0,%rcx
+       jl              .Lxorpart8
+       vpxord          0x00c0(%rdx),%ymm0,%ymm0
+       vmovdqu64       %ymm0,0x00c0(%rsi)
+       vperm2i128      $0x31,%ymm7,%ymm3,%ymm7
+
+       vperm2i128      $0x20,%ymm15,%ymm11,%ymm0
+       cmp             $0x0100,%rcx
+       jl              .Lxorpart8
+       vpxord          0x00e0(%rdx),%ymm0,%ymm0
+       vmovdqu64       %ymm0,0x00e0(%rsi)
+       vperm2i128      $0x31,%ymm15,%ymm11,%ymm15
+
+       # xor remaining blocks, write to output
+       vmovdqa64       %ymm4,%ymm0
+       cmp             $0x0120,%rcx
+       jl              .Lxorpart8
+       vpxord          0x0100(%rdx),%ymm0,%ymm0
+       vmovdqu64       %ymm0,0x0100(%rsi)
+
+       vmovdqa64       %ymm12,%ymm0
+       cmp             $0x0140,%rcx
+       jl              .Lxorpart8
+       vpxord          0x0120(%rdx),%ymm0,%ymm0
+       vmovdqu64       %ymm0,0x0120(%rsi)
+
+       vmovdqa64       %ymm6,%ymm0
+       cmp             $0x0160,%rcx
+       jl              .Lxorpart8
+       vpxord          0x0140(%rdx),%ymm0,%ymm0
+       vmovdqu64       %ymm0,0x0140(%rsi)
+
+       vmovdqa64       %ymm14,%ymm0
+       cmp             $0x0180,%rcx
+       jl              .Lxorpart8
+       vpxord          0x0160(%rdx),%ymm0,%ymm0
+       vmovdqu64       %ymm0,0x0160(%rsi)
+
+       vmovdqa64       %ymm5,%ymm0
+       cmp             $0x01a0,%rcx
+       jl              .Lxorpart8
+       vpxord          0x0180(%rdx),%ymm0,%ymm0
+       vmovdqu64       %ymm0,0x0180(%rsi)
+
+       vmovdqa64       %ymm13,%ymm0
+       cmp             $0x01c0,%rcx
+       jl              .Lxorpart8
+       vpxord          0x01a0(%rdx),%ymm0,%ymm0
+       vmovdqu64       %ymm0,0x01a0(%rsi)
+
+       vmovdqa64       %ymm7,%ymm0
+       cmp             $0x01e0,%rcx
+       jl              .Lxorpart8
+       vpxord          0x01c0(%rdx),%ymm0,%ymm0
+       vmovdqu64       %ymm0,0x01c0(%rsi)
+
+       vmovdqa64       %ymm15,%ymm0
+       cmp             $0x0200,%rcx
+       jl              .Lxorpart8
+       vpxord          0x01e0(%rdx),%ymm0,%ymm0
+       vmovdqu64       %ymm0,0x01e0(%rsi)
+
+.Ldone8:
+       vzeroupper
+       ret
+
+.Lxorpart8:
+       # xor remaining bytes from partial register into output
+       mov             %rcx,%rax
+       and             $0x1f,%rcx
+       jz              .Ldone8
+       mov             %rax,%r9
+       and             $~0x1f,%r9
+
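+       # same masking trick as in the 4-block version, but with 32-byte
+       # registers: %k1 selects the low (len mod 32) bytes of the trailing
+       # partial block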
+       mov             $1,%rax
+       shld            %cl,%rax,%rax
+       sub             $1,%rax
+       kmovq           %rax,%k1
+
+       vmovdqu8        (%rdx,%r9),%ymm1{%k1}{z}
+       vpxord          %ymm0,%ymm1,%ymm1
+       vmovdqu8        %ymm1,(%rsi,%r9){%k1}
+
+       jmp             .Ldone8
+
+ENDPROC(chacha_8block_xor_avx512vl)
diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S
new file mode 100644 (file)
index 0000000..c05a7a9
--- /dev/null
@@ -0,0 +1,795 @@
+/*
+ * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+.section       .rodata.cst16.ROT8, "aM", @progbits, 16
+.align 16
+ROT8:  .octa 0x0e0d0c0f0a09080b0605040702010003
+.section       .rodata.cst16.ROT16, "aM", @progbits, 16
+.align 16
+ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
+.section       .rodata.cst16.CTRINC, "aM", @progbits, 16
+.align 16
+CTRINC:        .octa 0x00000003000000020000000100000000
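+# ROT8 and ROT16 are pshufb masks that rotate each 32-bit word left by 8 and
+# 16 bits respectively; CTRINC holds the per-block counter increments 0..3.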
+
+.text
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3.  This
+ * function performs matrix operations on four words in parallel, but requires
+ * shuffling to rearrange the words after each round.  8/16-bit word rotation is
+ * done with the slightly better performing SSSE3 byte shuffling, while 7/12-bit
+ * word rotation uses traditional shift+OR.
+ *
+ * The round count is given in %r8d.
+ *
+ * Clobbers: %r8d, %xmm4-%xmm7
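+ *
+ * For reference, each column below goes through the standard ChaCha
+ * quarter-round:
+ *
+ *   a += b; d = rol32(d ^ a, 16);
+ *   c += d; b = rol32(b ^ c, 12);
+ *   a += b; d = rol32(d ^ a, 8);
+ *   c += d; b = rol32(b ^ c, 7);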
+ */
+chacha_permute:
+
+       movdqa          ROT8(%rip),%xmm4
+       movdqa          ROT16(%rip),%xmm5
+
+.Ldoubleround:
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+       paddd           %xmm1,%xmm0
+       pxor            %xmm0,%xmm3
+       pshufb          %xmm5,%xmm3
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+       paddd           %xmm3,%xmm2
+       pxor            %xmm2,%xmm1
+       movdqa          %xmm1,%xmm6
+       pslld           $12,%xmm6
+       psrld           $20,%xmm1
+       por             %xmm6,%xmm1
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+       paddd           %xmm1,%xmm0
+       pxor            %xmm0,%xmm3
+       pshufb          %xmm4,%xmm3
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+       paddd           %xmm3,%xmm2
+       pxor            %xmm2,%xmm1
+       movdqa          %xmm1,%xmm7
+       pslld           $7,%xmm7
+       psrld           $25,%xmm1
+       por             %xmm7,%xmm1
+
+       # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+       pshufd          $0x39,%xmm1,%xmm1
+       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+       pshufd          $0x4e,%xmm2,%xmm2
+       # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+       pshufd          $0x93,%xmm3,%xmm3
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+       paddd           %xmm1,%xmm0
+       pxor            %xmm0,%xmm3
+       pshufb          %xmm5,%xmm3
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+       paddd           %xmm3,%xmm2
+       pxor            %xmm2,%xmm1
+       movdqa          %xmm1,%xmm6
+       pslld           $12,%xmm6
+       psrld           $20,%xmm1
+       por             %xmm6,%xmm1
+
+       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+       paddd           %xmm1,%xmm0
+       pxor            %xmm0,%xmm3
+       pshufb          %xmm4,%xmm3
+
+       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+       paddd           %xmm3,%xmm2
+       pxor            %xmm2,%xmm1
+       movdqa          %xmm1,%xmm7
+       pslld           $7,%xmm7
+       psrld           $25,%xmm1
+       por             %xmm7,%xmm1
+
+       # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+       pshufd          $0x93,%xmm1,%xmm1
+       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+       pshufd          $0x4e,%xmm2,%xmm2
+       # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+       pshufd          $0x39,%xmm3,%xmm3
+
+       sub             $2,%r8d
+       jnz             .Ldoubleround
+
+       ret
+ENDPROC(chacha_permute)
+
+ENTRY(chacha_block_xor_ssse3)
+       # %rdi: Input state matrix, s
+       # %rsi: up to 1 data block output, o
+       # %rdx: up to 1 data block input, i
+       # %rcx: input/output length in bytes
+       # %r8d: nrounds
+       FRAME_BEGIN
+
+       # x0..3 = s0..3
+       movdqa          0x00(%rdi),%xmm0
+       movdqa          0x10(%rdi),%xmm1
+       movdqa          0x20(%rdi),%xmm2
+       movdqa          0x30(%rdi),%xmm3
+       movdqa          %xmm0,%xmm8
+       movdqa          %xmm1,%xmm9
+       movdqa          %xmm2,%xmm10
+       movdqa          %xmm3,%xmm11
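+       # %xmm8-%xmm11 keep a copy of the initial state for the final additions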
+
+       mov             %rcx,%rax
+       call            chacha_permute
+
+       # o0 = i0 ^ (x0 + s0)
+       paddd           %xmm8,%xmm0
+       cmp             $0x10,%rax
+       jl              .Lxorpart
+       movdqu          0x00(%rdx),%xmm4
+       pxor            %xmm4,%xmm0
+       movdqu          %xmm0,0x00(%rsi)
+       # o1 = i1 ^ (x1 + s1)
+       paddd           %xmm9,%xmm1
+       movdqa          %xmm1,%xmm0
+       cmp             $0x20,%rax
+       jl              .Lxorpart
+       movdqu          0x10(%rdx),%xmm0
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x10(%rsi)
+       # o2 = i2 ^ (x2 + s2)
+       paddd           %xmm10,%xmm2
+       movdqa          %xmm2,%xmm0
+       cmp             $0x30,%rax
+       jl              .Lxorpart
+       movdqu          0x20(%rdx),%xmm0
+       pxor            %xmm2,%xmm0
+       movdqu          %xmm0,0x20(%rsi)
+       # o3 = i3 ^ (x3 + s3)
+       paddd           %xmm11,%xmm3
+       movdqa          %xmm3,%xmm0
+       cmp             $0x40,%rax
+       jl              .Lxorpart
+       movdqu          0x30(%rdx),%xmm0
+       pxor            %xmm3,%xmm0
+       movdqu          %xmm0,0x30(%rsi)
+
+.Ldone:
+       FRAME_END
+       ret
+
+.Lxorpart:
+       # xor remaining bytes from partial register into output
+       mov             %rax,%r9
+       and             $0x0f,%r9
+       jz              .Ldone
+       and             $~0x0f,%rax
+
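+       # handle the trailing partial block through an aligned stack bounce
+       # buffer: copy the remaining input bytes in, xor them with the
+       # keystream in %xmm0, and copy the result back out to the output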
+       mov             %rsi,%r11
+
+       lea             8(%rsp),%r10
+       sub             $0x10,%rsp
+       and             $~31,%rsp
+
+       lea             (%rdx,%rax),%rsi
+       mov             %rsp,%rdi
+       mov             %r9,%rcx
+       rep movsb
+
+       pxor            0x00(%rsp),%xmm0
+       movdqa          %xmm0,0x00(%rsp)
+
+       mov             %rsp,%rsi
+       lea             (%r11,%rax),%rdi
+       mov             %r9,%rcx
+       rep movsb
+
+       lea             -8(%r10),%rsp
+       jmp             .Ldone
+
+ENDPROC(chacha_block_xor_ssse3)
+
+ENTRY(hchacha_block_ssse3)
+       # %rdi: Input state matrix, s
+       # %rsi: output (8 32-bit words)
+       # %edx: nrounds
+       FRAME_BEGIN
+
+       movdqa          0x00(%rdi),%xmm0
+       movdqa          0x10(%rdi),%xmm1
+       movdqa          0x20(%rdi),%xmm2
+       movdqa          0x30(%rdi),%xmm3
+
+       mov             %edx,%r8d
+       call            chacha_permute
+
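+       # HChaCha returns only rows 0 and 3 of the permuted state, i.e. state
+       # words 0..3 and 12..15, without the final feed-forward addition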
+       movdqu          %xmm0,0x00(%rsi)
+       movdqu          %xmm3,0x10(%rsi)
+
+       FRAME_END
+       ret
+ENDPROC(hchacha_block_ssse3)
+
+ENTRY(chacha_4block_xor_ssse3)
+       # %rdi: Input state matrix, s
+       # %rsi: up to 4 data blocks output, o
+       # %rdx: up to 4 data blocks input, i
+       # %rcx: input/output length in bytes
+       # %r8d: nrounds
+
+       # This function encrypts four consecutive ChaCha blocks by loading the
+       # state matrix in SSE registers four times. As we need some scratch
+       # registers, we save the first four registers on the stack. The
+       # algorithm performs each operation on the corresponding word of each
+       # state matrix, hence requires no word shuffling. For final XORing step
+       # we transpose the matrix by interleaving 32- and then 64-bit words,
+       # which allows us to do XOR in SSE registers. 8/16-bit word rotation is
+       # done with the slightly better performing SSSE3 byte shuffling, while
+       # 7/12-bit word rotation uses traditional shift+OR.
+
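+       # reserve an aligned stack scratch area for x0..x3; %r10 is used to
+       # restore the original %rsp on exit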
+       lea             8(%rsp),%r10
+       sub             $0x80,%rsp
+       and             $~63,%rsp
+       mov             %rcx,%rax
+
+       # x0..15[0-3] = s0..3[0..3]
+       movq            0x00(%rdi),%xmm1
+       pshufd          $0x00,%xmm1,%xmm0
+       pshufd          $0x55,%xmm1,%xmm1
+       movq            0x08(%rdi),%xmm3
+       pshufd          $0x00,%xmm3,%xmm2
+       pshufd          $0x55,%xmm3,%xmm3
+       movq            0x10(%rdi),%xmm5
+       pshufd          $0x00,%xmm5,%xmm4
+       pshufd          $0x55,%xmm5,%xmm5
+       movq            0x18(%rdi),%xmm7
+       pshufd          $0x00,%xmm7,%xmm6
+       pshufd          $0x55,%xmm7,%xmm7
+       movq            0x20(%rdi),%xmm9
+       pshufd          $0x00,%xmm9,%xmm8
+       pshufd          $0x55,%xmm9,%xmm9
+       movq            0x28(%rdi),%xmm11
+       pshufd          $0x00,%xmm11,%xmm10
+       pshufd          $0x55,%xmm11,%xmm11
+       movq            0x30(%rdi),%xmm13
+       pshufd          $0x00,%xmm13,%xmm12
+       pshufd          $0x55,%xmm13,%xmm13
+       movq            0x38(%rdi),%xmm15
+       pshufd          $0x00,%xmm15,%xmm14
+       pshufd          $0x55,%xmm15,%xmm15
+       # x0..3 on stack
+       movdqa          %xmm0,0x00(%rsp)
+       movdqa          %xmm1,0x10(%rsp)
+       movdqa          %xmm2,0x20(%rsp)
+       movdqa          %xmm3,0x30(%rsp)
+
+       movdqa          CTRINC(%rip),%xmm1
+       movdqa          ROT8(%rip),%xmm2
+       movdqa          ROT16(%rip),%xmm3
+
+       # x12 += counter values 0-3
+       paddd           %xmm1,%xmm12
+
+.Ldoubleround4:
+       # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+       movdqa          0x00(%rsp),%xmm0
+       paddd           %xmm4,%xmm0
+       movdqa          %xmm0,0x00(%rsp)
+       pxor            %xmm0,%xmm12
+       pshufb          %xmm3,%xmm12
+       # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+       movdqa          0x10(%rsp),%xmm0
+       paddd           %xmm5,%xmm0
+       movdqa          %xmm0,0x10(%rsp)
+       pxor            %xmm0,%xmm13
+       pshufb          %xmm3,%xmm13
+       # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+       movdqa          0x20(%rsp),%xmm0
+       paddd           %xmm6,%xmm0
+       movdqa          %xmm0,0x20(%rsp)
+       pxor            %xmm0,%xmm14
+       pshufb          %xmm3,%xmm14
+       # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+       movdqa          0x30(%rsp),%xmm0
+       paddd           %xmm7,%xmm0
+       movdqa          %xmm0,0x30(%rsp)
+       pxor            %xmm0,%xmm15
+       pshufb          %xmm3,%xmm15
+
+       # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+       paddd           %xmm12,%xmm8
+       pxor            %xmm8,%xmm4
+       movdqa          %xmm4,%xmm0
+       pslld           $12,%xmm0
+       psrld           $20,%xmm4
+       por             %xmm0,%xmm4
+       # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+       paddd           %xmm13,%xmm9
+       pxor            %xmm9,%xmm5
+       movdqa          %xmm5,%xmm0
+       pslld           $12,%xmm0
+       psrld           $20,%xmm5
+       por             %xmm0,%xmm5
+       # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+       paddd           %xmm14,%xmm10
+       pxor            %xmm10,%xmm6
+       movdqa          %xmm6,%xmm0
+       pslld           $12,%xmm0
+       psrld           $20,%xmm6
+       por             %xmm0,%xmm6
+       # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+       paddd           %xmm15,%xmm11
+       pxor            %xmm11,%xmm7
+       movdqa          %xmm7,%xmm0
+       pslld           $12,%xmm0
+       psrld           $20,%xmm7
+       por             %xmm0,%xmm7
+
+       # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+       movdqa          0x00(%rsp),%xmm0
+       paddd           %xmm4,%xmm0
+       movdqa          %xmm0,0x00(%rsp)
+       pxor            %xmm0,%xmm12
+       pshufb          %xmm2,%xmm12
+       # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+       movdqa          0x10(%rsp),%xmm0
+       paddd           %xmm5,%xmm0
+       movdqa          %xmm0,0x10(%rsp)
+       pxor            %xmm0,%xmm13
+       pshufb          %xmm2,%xmm13
+       # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+       movdqa          0x20(%rsp),%xmm0
+       paddd           %xmm6,%xmm0
+       movdqa          %xmm0,0x20(%rsp)
+       pxor            %xmm0,%xmm14
+       pshufb          %xmm2,%xmm14
+       # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+       movdqa          0x30(%rsp),%xmm0
+       paddd           %xmm7,%xmm0
+       movdqa          %xmm0,0x30(%rsp)
+       pxor            %xmm0,%xmm15
+       pshufb          %xmm2,%xmm15
+
+       # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+       paddd           %xmm12,%xmm8
+       pxor            %xmm8,%xmm4
+       movdqa          %xmm4,%xmm0
+       pslld           $7,%xmm0
+       psrld           $25,%xmm4
+       por             %xmm0,%xmm4
+       # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+       paddd           %xmm13,%xmm9
+       pxor            %xmm9,%xmm5
+       movdqa          %xmm5,%xmm0
+       pslld           $7,%xmm0
+       psrld           $25,%xmm5
+       por             %xmm0,%xmm5
+       # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+       paddd           %xmm14,%xmm10
+       pxor            %xmm10,%xmm6
+       movdqa          %xmm6,%xmm0
+       pslld           $7,%xmm0
+       psrld           $25,%xmm6
+       por             %xmm0,%xmm6
+       # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+       paddd           %xmm15,%xmm11
+       pxor            %xmm11,%xmm7
+       movdqa          %xmm7,%xmm0
+       pslld           $7,%xmm0
+       psrld           $25,%xmm7
+       por             %xmm0,%xmm7
+
+       # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+       movdqa          0x00(%rsp),%xmm0
+       paddd           %xmm5,%xmm0
+       movdqa          %xmm0,0x00(%rsp)
+       pxor            %xmm0,%xmm15
+       pshufb          %xmm3,%xmm15
+       # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+       movdqa          0x10(%rsp),%xmm0
+       paddd           %xmm6,%xmm0
+       movdqa          %xmm0,0x10(%rsp)
+       pxor            %xmm0,%xmm12
+       pshufb          %xmm3,%xmm12
+       # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+       movdqa          0x20(%rsp),%xmm0
+       paddd           %xmm7,%xmm0
+       movdqa          %xmm0,0x20(%rsp)
+       pxor            %xmm0,%xmm13
+       pshufb          %xmm3,%xmm13
+       # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+       movdqa          0x30(%rsp),%xmm0
+       paddd           %xmm4,%xmm0
+       movdqa          %xmm0,0x30(%rsp)
+       pxor            %xmm0,%xmm14
+       pshufb          %xmm3,%xmm14
+
+       # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+       paddd           %xmm15,%xmm10
+       pxor            %xmm10,%xmm5
+       movdqa          %xmm5,%xmm0
+       pslld           $12,%xmm0
+       psrld           $20,%xmm5
+       por             %xmm0,%xmm5
+       # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+       paddd           %xmm12,%xmm11
+       pxor            %xmm11,%xmm6
+       movdqa          %xmm6,%xmm0
+       pslld           $12,%xmm0
+       psrld           $20,%xmm6
+       por             %xmm0,%xmm6
+       # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+       paddd           %xmm13,%xmm8
+       pxor            %xmm8,%xmm7
+       movdqa          %xmm7,%xmm0
+       pslld           $12,%xmm0
+       psrld           $20,%xmm7
+       por             %xmm0,%xmm7
+       # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+       paddd           %xmm14,%xmm9
+       pxor            %xmm9,%xmm4
+       movdqa          %xmm4,%xmm0
+       pslld           $12,%xmm0
+       psrld           $20,%xmm4
+       por             %xmm0,%xmm4
+
+       # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+       movdqa          0x00(%rsp),%xmm0
+       paddd           %xmm5,%xmm0
+       movdqa          %xmm0,0x00(%rsp)
+       pxor            %xmm0,%xmm15
+       pshufb          %xmm2,%xmm15
+       # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+       movdqa          0x10(%rsp),%xmm0
+       paddd           %xmm6,%xmm0
+       movdqa          %xmm0,0x10(%rsp)
+       pxor            %xmm0,%xmm12
+       pshufb          %xmm2,%xmm12
+       # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+       movdqa          0x20(%rsp),%xmm0
+       paddd           %xmm7,%xmm0
+       movdqa          %xmm0,0x20(%rsp)
+       pxor            %xmm0,%xmm13
+       pshufb          %xmm2,%xmm13
+       # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+       movdqa          0x30(%rsp),%xmm0
+       paddd           %xmm4,%xmm0
+       movdqa          %xmm0,0x30(%rsp)
+       pxor            %xmm0,%xmm14
+       pshufb          %xmm2,%xmm14
+
+       # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+       paddd           %xmm15,%xmm10
+       pxor            %xmm10,%xmm5
+       movdqa          %xmm5,%xmm0
+       pslld           $7,%xmm0
+       psrld           $25,%xmm5
+       por             %xmm0,%xmm5
+       # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+       paddd           %xmm12,%xmm11
+       pxor            %xmm11,%xmm6
+       movdqa          %xmm6,%xmm0
+       pslld           $7,%xmm0
+       psrld           $25,%xmm6
+       por             %xmm0,%xmm6
+       # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+       paddd           %xmm13,%xmm8
+       pxor            %xmm8,%xmm7
+       movdqa          %xmm7,%xmm0
+       pslld           $7,%xmm0
+       psrld           $25,%xmm7
+       por             %xmm0,%xmm7
+       # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+       paddd           %xmm14,%xmm9
+       pxor            %xmm9,%xmm4
+       movdqa          %xmm4,%xmm0
+       pslld           $7,%xmm0
+       psrld           $25,%xmm4
+       por             %xmm0,%xmm4
+
+       sub             $2,%r8d
+       jnz             .Ldoubleround4
+
+       # x0[0-3] += s0[0]
+       # x1[0-3] += s0[1]
+       movq            0x00(%rdi),%xmm3
+       pshufd          $0x00,%xmm3,%xmm2
+       pshufd          $0x55,%xmm3,%xmm3
+       paddd           0x00(%rsp),%xmm2
+       movdqa          %xmm2,0x00(%rsp)
+       paddd           0x10(%rsp),%xmm3
+       movdqa          %xmm3,0x10(%rsp)
+       # x2[0-3] += s0[2]
+       # x3[0-3] += s0[3]
+       movq            0x08(%rdi),%xmm3
+       pshufd          $0x00,%xmm3,%xmm2
+       pshufd          $0x55,%xmm3,%xmm3
+       paddd           0x20(%rsp),%xmm2
+       movdqa          %xmm2,0x20(%rsp)
+       paddd           0x30(%rsp),%xmm3
+       movdqa          %xmm3,0x30(%rsp)
+
+       # x4[0-3] += s1[0]
+       # x5[0-3] += s1[1]
+       movq            0x10(%rdi),%xmm3
+       pshufd          $0x00,%xmm3,%xmm2
+       pshufd          $0x55,%xmm3,%xmm3
+       paddd           %xmm2,%xmm4
+       paddd           %xmm3,%xmm5
+       # x6[0-3] += s1[2]
+       # x7[0-3] += s1[3]
+       movq            0x18(%rdi),%xmm3
+       pshufd          $0x00,%xmm3,%xmm2
+       pshufd          $0x55,%xmm3,%xmm3
+       paddd           %xmm2,%xmm6
+       paddd           %xmm3,%xmm7
+
+       # x8[0-3] += s2[0]
+       # x9[0-3] += s2[1]
+       movq            0x20(%rdi),%xmm3
+       pshufd          $0x00,%xmm3,%xmm2
+       pshufd          $0x55,%xmm3,%xmm3
+       paddd           %xmm2,%xmm8
+       paddd           %xmm3,%xmm9
+       # x10[0-3] += s2[2]
+       # x11[0-3] += s2[3]
+       movq            0x28(%rdi),%xmm3
+       pshufd          $0x00,%xmm3,%xmm2
+       pshufd          $0x55,%xmm3,%xmm3
+       paddd           %xmm2,%xmm10
+       paddd           %xmm3,%xmm11
+
+       # x12[0-3] += s3[0]
+       # x13[0-3] += s3[1]
+       movq            0x30(%rdi),%xmm3
+       pshufd          $0x00,%xmm3,%xmm2
+       pshufd          $0x55,%xmm3,%xmm3
+       paddd           %xmm2,%xmm12
+       paddd           %xmm3,%xmm13
+       # x14[0-3] += s3[2]
+       # x15[0-3] += s3[3]
+       movq            0x38(%rdi),%xmm3
+       pshufd          $0x00,%xmm3,%xmm2
+       pshufd          $0x55,%xmm3,%xmm3
+       paddd           %xmm2,%xmm14
+       paddd           %xmm3,%xmm15
+
+       # x12 += counter values 0-3
+       paddd           %xmm1,%xmm12
+
+       # interleave 32-bit words in state n, n+1
+       movdqa          0x00(%rsp),%xmm0
+       movdqa          0x10(%rsp),%xmm1
+       movdqa          %xmm0,%xmm2
+       punpckldq       %xmm1,%xmm2
+       punpckhdq       %xmm1,%xmm0
+       movdqa          %xmm2,0x00(%rsp)
+       movdqa          %xmm0,0x10(%rsp)
+       movdqa          0x20(%rsp),%xmm0
+       movdqa          0x30(%rsp),%xmm1
+       movdqa          %xmm0,%xmm2
+       punpckldq       %xmm1,%xmm2
+       punpckhdq       %xmm1,%xmm0
+       movdqa          %xmm2,0x20(%rsp)
+       movdqa          %xmm0,0x30(%rsp)
+       movdqa          %xmm4,%xmm0
+       punpckldq       %xmm5,%xmm4
+       punpckhdq       %xmm5,%xmm0
+       movdqa          %xmm0,%xmm5
+       movdqa          %xmm6,%xmm0
+       punpckldq       %xmm7,%xmm6
+       punpckhdq       %xmm7,%xmm0
+       movdqa          %xmm0,%xmm7
+       movdqa          %xmm8,%xmm0
+       punpckldq       %xmm9,%xmm8
+       punpckhdq       %xmm9,%xmm0
+       movdqa          %xmm0,%xmm9
+       movdqa          %xmm10,%xmm0
+       punpckldq       %xmm11,%xmm10
+       punpckhdq       %xmm11,%xmm0
+       movdqa          %xmm0,%xmm11
+       movdqa          %xmm12,%xmm0
+       punpckldq       %xmm13,%xmm12
+       punpckhdq       %xmm13,%xmm0
+       movdqa          %xmm0,%xmm13
+       movdqa          %xmm14,%xmm0
+       punpckldq       %xmm15,%xmm14
+       punpckhdq       %xmm15,%xmm0
+       movdqa          %xmm0,%xmm15
+
+       # interleave 64-bit words in state n, n+2
+       movdqa          0x00(%rsp),%xmm0
+       movdqa          0x20(%rsp),%xmm1
+       movdqa          %xmm0,%xmm2
+       punpcklqdq      %xmm1,%xmm2
+       punpckhqdq      %xmm1,%xmm0
+       movdqa          %xmm2,0x00(%rsp)
+       movdqa          %xmm0,0x20(%rsp)
+       movdqa          0x10(%rsp),%xmm0
+       movdqa          0x30(%rsp),%xmm1
+       movdqa          %xmm0,%xmm2
+       punpcklqdq      %xmm1,%xmm2
+       punpckhqdq      %xmm1,%xmm0
+       movdqa          %xmm2,0x10(%rsp)
+       movdqa          %xmm0,0x30(%rsp)
+       movdqa          %xmm4,%xmm0
+       punpcklqdq      %xmm6,%xmm4
+       punpckhqdq      %xmm6,%xmm0
+       movdqa          %xmm0,%xmm6
+       movdqa          %xmm5,%xmm0
+       punpcklqdq      %xmm7,%xmm5
+       punpckhqdq      %xmm7,%xmm0
+       movdqa          %xmm0,%xmm7
+       movdqa          %xmm8,%xmm0
+       punpcklqdq      %xmm10,%xmm8
+       punpckhqdq      %xmm10,%xmm0
+       movdqa          %xmm0,%xmm10
+       movdqa          %xmm9,%xmm0
+       punpcklqdq      %xmm11,%xmm9
+       punpckhqdq      %xmm11,%xmm0
+       movdqa          %xmm0,%xmm11
+       movdqa          %xmm12,%xmm0
+       punpcklqdq      %xmm14,%xmm12
+       punpckhqdq      %xmm14,%xmm0
+       movdqa          %xmm0,%xmm14
+       movdqa          %xmm13,%xmm0
+       punpcklqdq      %xmm15,%xmm13
+       punpckhqdq      %xmm15,%xmm0
+       movdqa          %xmm0,%xmm15
+
+       # xor with corresponding input, write to output
+       movdqa          0x00(%rsp),%xmm0
+       cmp             $0x10,%rax
+       jl              .Lxorpart4
+       movdqu          0x00(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x00(%rsi)
+
+       movdqu          %xmm4,%xmm0
+       cmp             $0x20,%rax
+       jl              .Lxorpart4
+       movdqu          0x10(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x10(%rsi)
+
+       movdqu          %xmm8,%xmm0
+       cmp             $0x30,%rax
+       jl              .Lxorpart4
+       movdqu          0x20(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x20(%rsi)
+
+       movdqu          %xmm12,%xmm0
+       cmp             $0x40,%rax
+       jl              .Lxorpart4
+       movdqu          0x30(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x30(%rsi)
+
+       movdqa          0x20(%rsp),%xmm0
+       cmp             $0x50,%rax
+       jl              .Lxorpart4
+       movdqu          0x40(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x40(%rsi)
+
+       movdqu          %xmm6,%xmm0
+       cmp             $0x60,%rax
+       jl              .Lxorpart4
+       movdqu          0x50(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x50(%rsi)
+
+       movdqu          %xmm10,%xmm0
+       cmp             $0x70,%rax
+       jl              .Lxorpart4
+       movdqu          0x60(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x60(%rsi)
+
+       movdqu          %xmm14,%xmm0
+       cmp             $0x80,%rax
+       jl              .Lxorpart4
+       movdqu          0x70(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x70(%rsi)
+
+       movdqa          0x10(%rsp),%xmm0
+       cmp             $0x90,%rax
+       jl              .Lxorpart4
+       movdqu          0x80(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x80(%rsi)
+
+       movdqu          %xmm5,%xmm0
+       cmp             $0xa0,%rax
+       jl              .Lxorpart4
+       movdqu          0x90(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x90(%rsi)
+
+       movdqu          %xmm9,%xmm0
+       cmp             $0xb0,%rax
+       jl              .Lxorpart4
+       movdqu          0xa0(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0xa0(%rsi)
+
+       movdqu          %xmm13,%xmm0
+       cmp             $0xc0,%rax
+       jl              .Lxorpart4
+       movdqu          0xb0(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0xb0(%rsi)
+
+       movdqa          0x30(%rsp),%xmm0
+       cmp             $0xd0,%rax
+       jl              .Lxorpart4
+       movdqu          0xc0(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0xc0(%rsi)
+
+       movdqu          %xmm7,%xmm0
+       cmp             $0xe0,%rax
+       jl              .Lxorpart4
+       movdqu          0xd0(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0xd0(%rsi)
+
+       movdqu          %xmm11,%xmm0
+       cmp             $0xf0,%rax
+       jl              .Lxorpart4
+       movdqu          0xe0(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0xe0(%rsi)
+
+       movdqu          %xmm15,%xmm0
+       cmp             $0x100,%rax
+       jl              .Lxorpart4
+       movdqu          0xf0(%rdx),%xmm1
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0xf0(%rsi)
+
+.Ldone4:
+       lea             -8(%r10),%rsp
+       ret
+
+.Lxorpart4:
+       # xor remaining bytes from partial register into output
+       mov             %rax,%r9
+       and             $0x0f,%r9
+       jz              .Ldone4
+       and             $~0x0f,%rax
+
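+       # as in chacha_block_xor_ssse3: bounce the trailing partial block
+       # through the stack scratch area and xor it with the keystream in %xmm0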
+       mov             %rsi,%r11
+
+       lea             (%rdx,%rax),%rsi
+       mov             %rsp,%rdi
+       mov             %r9,%rcx
+       rep movsb
+
+       pxor            0x00(%rsp),%xmm0
+       movdqa          %xmm0,0x00(%rsp)
+
+       mov             %rsp,%rsi
+       lea             (%r11,%rax),%rdi
+       mov             %r9,%rcx
+       rep movsb
+
+       jmp             .Ldone4
+
+ENDPROC(chacha_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S
deleted file mode 100644 (file)
index b6ab082..0000000
+++ /dev/null
@@ -1,1026 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/linkage.h>
-
-.section       .rodata.cst32.ROT8, "aM", @progbits, 32
-.align 32
-ROT8:  .octa 0x0e0d0c0f0a09080b0605040702010003
-       .octa 0x0e0d0c0f0a09080b0605040702010003
-
-.section       .rodata.cst32.ROT16, "aM", @progbits, 32
-.align 32
-ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
-       .octa 0x0d0c0f0e09080b0a0504070601000302
-
-.section       .rodata.cst32.CTRINC, "aM", @progbits, 32
-.align 32
-CTRINC:        .octa 0x00000003000000020000000100000000
-       .octa 0x00000007000000060000000500000004
-
-.section       .rodata.cst32.CTR2BL, "aM", @progbits, 32
-.align 32
-CTR2BL:        .octa 0x00000000000000000000000000000000
-       .octa 0x00000000000000000000000000000001
-
-.section       .rodata.cst32.CTR4BL, "aM", @progbits, 32
-.align 32
-CTR4BL:        .octa 0x00000000000000000000000000000002
-       .octa 0x00000000000000000000000000000003
-
-.text
-
-ENTRY(chacha20_2block_xor_avx2)
-       # %rdi: Input state matrix, s
-       # %rsi: up to 2 data blocks output, o
-       # %rdx: up to 2 data blocks input, i
-       # %rcx: input/output length in bytes
-
-       # This function encrypts two ChaCha20 blocks by loading the state
-       # matrix twice across four AVX registers. It performs matrix operations
-       # on four words in each matrix in parallel, but requires shuffling to
-       # rearrange the words after each round.
-
-       vzeroupper
-
-       # x0..3[0-2] = s0..3
-       vbroadcasti128  0x00(%rdi),%ymm0
-       vbroadcasti128  0x10(%rdi),%ymm1
-       vbroadcasti128  0x20(%rdi),%ymm2
-       vbroadcasti128  0x30(%rdi),%ymm3
-
-       vpaddd          CTR2BL(%rip),%ymm3,%ymm3
-
-       vmovdqa         %ymm0,%ymm8
-       vmovdqa         %ymm1,%ymm9
-       vmovdqa         %ymm2,%ymm10
-       vmovdqa         %ymm3,%ymm11
-
-       vmovdqa         ROT8(%rip),%ymm4
-       vmovdqa         ROT16(%rip),%ymm5
-
-       mov             %rcx,%rax
-       mov             $10,%ecx
-
-.Ldoubleround:
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-       vpaddd          %ymm1,%ymm0,%ymm0
-       vpxor           %ymm0,%ymm3,%ymm3
-       vpshufb         %ymm5,%ymm3,%ymm3
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-       vpaddd          %ymm3,%ymm2,%ymm2
-       vpxor           %ymm2,%ymm1,%ymm1
-       vmovdqa         %ymm1,%ymm6
-       vpslld          $12,%ymm6,%ymm6
-       vpsrld          $20,%ymm1,%ymm1
-       vpor            %ymm6,%ymm1,%ymm1
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-       vpaddd          %ymm1,%ymm0,%ymm0
-       vpxor           %ymm0,%ymm3,%ymm3
-       vpshufb         %ymm4,%ymm3,%ymm3
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-       vpaddd          %ymm3,%ymm2,%ymm2
-       vpxor           %ymm2,%ymm1,%ymm1
-       vmovdqa         %ymm1,%ymm7
-       vpslld          $7,%ymm7,%ymm7
-       vpsrld          $25,%ymm1,%ymm1
-       vpor            %ymm7,%ymm1,%ymm1
-
-       # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
-       vpshufd         $0x39,%ymm1,%ymm1
-       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-       vpshufd         $0x4e,%ymm2,%ymm2
-       # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
-       vpshufd         $0x93,%ymm3,%ymm3
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-       vpaddd          %ymm1,%ymm0,%ymm0
-       vpxor           %ymm0,%ymm3,%ymm3
-       vpshufb         %ymm5,%ymm3,%ymm3
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-       vpaddd          %ymm3,%ymm2,%ymm2
-       vpxor           %ymm2,%ymm1,%ymm1
-       vmovdqa         %ymm1,%ymm6
-       vpslld          $12,%ymm6,%ymm6
-       vpsrld          $20,%ymm1,%ymm1
-       vpor            %ymm6,%ymm1,%ymm1
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-       vpaddd          %ymm1,%ymm0,%ymm0
-       vpxor           %ymm0,%ymm3,%ymm3
-       vpshufb         %ymm4,%ymm3,%ymm3
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-       vpaddd          %ymm3,%ymm2,%ymm2
-       vpxor           %ymm2,%ymm1,%ymm1
-       vmovdqa         %ymm1,%ymm7
-       vpslld          $7,%ymm7,%ymm7
-       vpsrld          $25,%ymm1,%ymm1
-       vpor            %ymm7,%ymm1,%ymm1
-
-       # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
-       vpshufd         $0x93,%ymm1,%ymm1
-       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-       vpshufd         $0x4e,%ymm2,%ymm2
-       # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
-       vpshufd         $0x39,%ymm3,%ymm3
-
-       dec             %ecx
-       jnz             .Ldoubleround
-
-       # o0 = i0 ^ (x0 + s0)
-       vpaddd          %ymm8,%ymm0,%ymm7
-       cmp             $0x10,%rax
-       jl              .Lxorpart2
-       vpxor           0x00(%rdx),%xmm7,%xmm6
-       vmovdqu         %xmm6,0x00(%rsi)
-       vextracti128    $1,%ymm7,%xmm0
-       # o1 = i1 ^ (x1 + s1)
-       vpaddd          %ymm9,%ymm1,%ymm7
-       cmp             $0x20,%rax
-       jl              .Lxorpart2
-       vpxor           0x10(%rdx),%xmm7,%xmm6
-       vmovdqu         %xmm6,0x10(%rsi)
-       vextracti128    $1,%ymm7,%xmm1
-       # o2 = i2 ^ (x2 + s2)
-       vpaddd          %ymm10,%ymm2,%ymm7
-       cmp             $0x30,%rax
-       jl              .Lxorpart2
-       vpxor           0x20(%rdx),%xmm7,%xmm6
-       vmovdqu         %xmm6,0x20(%rsi)
-       vextracti128    $1,%ymm7,%xmm2
-       # o3 = i3 ^ (x3 + s3)
-       vpaddd          %ymm11,%ymm3,%ymm7
-       cmp             $0x40,%rax
-       jl              .Lxorpart2
-       vpxor           0x30(%rdx),%xmm7,%xmm6
-       vmovdqu         %xmm6,0x30(%rsi)
-       vextracti128    $1,%ymm7,%xmm3
-
-       # xor and write second block
-       vmovdqa         %xmm0,%xmm7
-       cmp             $0x50,%rax
-       jl              .Lxorpart2
-       vpxor           0x40(%rdx),%xmm7,%xmm6
-       vmovdqu         %xmm6,0x40(%rsi)
-
-       vmovdqa         %xmm1,%xmm7
-       cmp             $0x60,%rax
-       jl              .Lxorpart2
-       vpxor           0x50(%rdx),%xmm7,%xmm6
-       vmovdqu         %xmm6,0x50(%rsi)
-
-       vmovdqa         %xmm2,%xmm7
-       cmp             $0x70,%rax
-       jl              .Lxorpart2
-       vpxor           0x60(%rdx),%xmm7,%xmm6
-       vmovdqu         %xmm6,0x60(%rsi)
-
-       vmovdqa         %xmm3,%xmm7
-       cmp             $0x80,%rax
-       jl              .Lxorpart2
-       vpxor           0x70(%rdx),%xmm7,%xmm6
-       vmovdqu         %xmm6,0x70(%rsi)
-
-.Ldone2:
-       vzeroupper
-       ret
-
-.Lxorpart2:
-       # xor remaining bytes from partial register into output
-       mov             %rax,%r9
-       and             $0x0f,%r9
-       jz              .Ldone2
-       and             $~0x0f,%rax
-
-       mov             %rsi,%r11
-
-       lea             8(%rsp),%r10
-       sub             $0x10,%rsp
-       and             $~31,%rsp
-
-       lea             (%rdx,%rax),%rsi
-       mov             %rsp,%rdi
-       mov             %r9,%rcx
-       rep movsb
-
-       vpxor           0x00(%rsp),%xmm7,%xmm7
-       vmovdqa         %xmm7,0x00(%rsp)
-
-       mov             %rsp,%rsi
-       lea             (%r11,%rax),%rdi
-       mov             %r9,%rcx
-       rep movsb
-
-       lea             -8(%r10),%rsp
-       jmp             .Ldone2
-
-ENDPROC(chacha20_2block_xor_avx2)
-
-ENTRY(chacha20_4block_xor_avx2)
-       # %rdi: Input state matrix, s
-       # %rsi: up to 4 data blocks output, o
-       # %rdx: up to 4 data blocks input, i
-       # %rcx: input/output length in bytes
-
-       # This function encrypts four ChaCha20 block by loading the state
-       # matrix four times across eight AVX registers. It performs matrix
-       # operations on four words in two matrices in parallel, sequentially
-       # to the operations on the four words of the other two matrices. The
-       # required word shuffling has a rather high latency, we can do the
-       # arithmetic on two matrix-pairs without much slowdown.
-
-       vzeroupper
-
-       # x0..3[0-4] = s0..3
-       vbroadcasti128  0x00(%rdi),%ymm0
-       vbroadcasti128  0x10(%rdi),%ymm1
-       vbroadcasti128  0x20(%rdi),%ymm2
-       vbroadcasti128  0x30(%rdi),%ymm3
-
-       vmovdqa         %ymm0,%ymm4
-       vmovdqa         %ymm1,%ymm5
-       vmovdqa         %ymm2,%ymm6
-       vmovdqa         %ymm3,%ymm7
-
-       vpaddd          CTR2BL(%rip),%ymm3,%ymm3
-       vpaddd          CTR4BL(%rip),%ymm7,%ymm7
-
-       vmovdqa         %ymm0,%ymm11
-       vmovdqa         %ymm1,%ymm12
-       vmovdqa         %ymm2,%ymm13
-       vmovdqa         %ymm3,%ymm14
-       vmovdqa         %ymm7,%ymm15
-
-       vmovdqa         ROT8(%rip),%ymm8
-       vmovdqa         ROT16(%rip),%ymm9
-
-       mov             %rcx,%rax
-       mov             $10,%ecx
-
-.Ldoubleround4:
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-       vpaddd          %ymm1,%ymm0,%ymm0
-       vpxor           %ymm0,%ymm3,%ymm3
-       vpshufb         %ymm9,%ymm3,%ymm3
-
-       vpaddd          %ymm5,%ymm4,%ymm4
-       vpxor           %ymm4,%ymm7,%ymm7
-       vpshufb         %ymm9,%ymm7,%ymm7
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-       vpaddd          %ymm3,%ymm2,%ymm2
-       vpxor           %ymm2,%ymm1,%ymm1
-       vmovdqa         %ymm1,%ymm10
-       vpslld          $12,%ymm10,%ymm10
-       vpsrld          $20,%ymm1,%ymm1
-       vpor            %ymm10,%ymm1,%ymm1
-
-       vpaddd          %ymm7,%ymm6,%ymm6
-       vpxor           %ymm6,%ymm5,%ymm5
-       vmovdqa         %ymm5,%ymm10
-       vpslld          $12,%ymm10,%ymm10
-       vpsrld          $20,%ymm5,%ymm5
-       vpor            %ymm10,%ymm5,%ymm5
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-       vpaddd          %ymm1,%ymm0,%ymm0
-       vpxor           %ymm0,%ymm3,%ymm3
-       vpshufb         %ymm8,%ymm3,%ymm3
-
-       vpaddd          %ymm5,%ymm4,%ymm4
-       vpxor           %ymm4,%ymm7,%ymm7
-       vpshufb         %ymm8,%ymm7,%ymm7
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-       vpaddd          %ymm3,%ymm2,%ymm2
-       vpxor           %ymm2,%ymm1,%ymm1
-       vmovdqa         %ymm1,%ymm10
-       vpslld          $7,%ymm10,%ymm10
-       vpsrld          $25,%ymm1,%ymm1
-       vpor            %ymm10,%ymm1,%ymm1
-
-       vpaddd          %ymm7,%ymm6,%ymm6
-       vpxor           %ymm6,%ymm5,%ymm5
-       vmovdqa         %ymm5,%ymm10
-       vpslld          $7,%ymm10,%ymm10
-       vpsrld          $25,%ymm5,%ymm5
-       vpor            %ymm10,%ymm5,%ymm5
-
-       # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
-       vpshufd         $0x39,%ymm1,%ymm1
-       vpshufd         $0x39,%ymm5,%ymm5
-       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-       vpshufd         $0x4e,%ymm2,%ymm2
-       vpshufd         $0x4e,%ymm6,%ymm6
-       # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
-       vpshufd         $0x93,%ymm3,%ymm3
-       vpshufd         $0x93,%ymm7,%ymm7
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-       vpaddd          %ymm1,%ymm0,%ymm0
-       vpxor           %ymm0,%ymm3,%ymm3
-       vpshufb         %ymm9,%ymm3,%ymm3
-
-       vpaddd          %ymm5,%ymm4,%ymm4
-       vpxor           %ymm4,%ymm7,%ymm7
-       vpshufb         %ymm9,%ymm7,%ymm7
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-       vpaddd          %ymm3,%ymm2,%ymm2
-       vpxor           %ymm2,%ymm1,%ymm1
-       vmovdqa         %ymm1,%ymm10
-       vpslld          $12,%ymm10,%ymm10
-       vpsrld          $20,%ymm1,%ymm1
-       vpor            %ymm10,%ymm1,%ymm1
-
-       vpaddd          %ymm7,%ymm6,%ymm6
-       vpxor           %ymm6,%ymm5,%ymm5
-       vmovdqa         %ymm5,%ymm10
-       vpslld          $12,%ymm10,%ymm10
-       vpsrld          $20,%ymm5,%ymm5
-       vpor            %ymm10,%ymm5,%ymm5
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-       vpaddd          %ymm1,%ymm0,%ymm0
-       vpxor           %ymm0,%ymm3,%ymm3
-       vpshufb         %ymm8,%ymm3,%ymm3
-
-       vpaddd          %ymm5,%ymm4,%ymm4
-       vpxor           %ymm4,%ymm7,%ymm7
-       vpshufb         %ymm8,%ymm7,%ymm7
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-       vpaddd          %ymm3,%ymm2,%ymm2
-       vpxor           %ymm2,%ymm1,%ymm1
-       vmovdqa         %ymm1,%ymm10
-       vpslld          $7,%ymm10,%ymm10
-       vpsrld          $25,%ymm1,%ymm1
-       vpor            %ymm10,%ymm1,%ymm1
-
-       vpaddd          %ymm7,%ymm6,%ymm6
-       vpxor           %ymm6,%ymm5,%ymm5
-       vmovdqa         %ymm5,%ymm10
-       vpslld          $7,%ymm10,%ymm10
-       vpsrld          $25,%ymm5,%ymm5
-       vpor            %ymm10,%ymm5,%ymm5
-
-       # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
-       vpshufd         $0x93,%ymm1,%ymm1
-       vpshufd         $0x93,%ymm5,%ymm5
-       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-       vpshufd         $0x4e,%ymm2,%ymm2
-       vpshufd         $0x4e,%ymm6,%ymm6
-       # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
-       vpshufd         $0x39,%ymm3,%ymm3
-       vpshufd         $0x39,%ymm7,%ymm7
-
-       dec             %ecx
-       jnz             .Ldoubleround4
-
-       # o0 = i0 ^ (x0 + s0), first block
-       vpaddd          %ymm11,%ymm0,%ymm10
-       cmp             $0x10,%rax
-       jl              .Lxorpart4
-       vpxor           0x00(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x00(%rsi)
-       vextracti128    $1,%ymm10,%xmm0
-       # o1 = i1 ^ (x1 + s1), first block
-       vpaddd          %ymm12,%ymm1,%ymm10
-       cmp             $0x20,%rax
-       jl              .Lxorpart4
-       vpxor           0x10(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x10(%rsi)
-       vextracti128    $1,%ymm10,%xmm1
-       # o2 = i2 ^ (x2 + s2), first block
-       vpaddd          %ymm13,%ymm2,%ymm10
-       cmp             $0x30,%rax
-       jl              .Lxorpart4
-       vpxor           0x20(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x20(%rsi)
-       vextracti128    $1,%ymm10,%xmm2
-       # o3 = i3 ^ (x3 + s3), first block
-       vpaddd          %ymm14,%ymm3,%ymm10
-       cmp             $0x40,%rax
-       jl              .Lxorpart4
-       vpxor           0x30(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x30(%rsi)
-       vextracti128    $1,%ymm10,%xmm3
-
-       # xor and write second block
-       vmovdqa         %xmm0,%xmm10
-       cmp             $0x50,%rax
-       jl              .Lxorpart4
-       vpxor           0x40(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x40(%rsi)
-
-       vmovdqa         %xmm1,%xmm10
-       cmp             $0x60,%rax
-       jl              .Lxorpart4
-       vpxor           0x50(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x50(%rsi)
-
-       vmovdqa         %xmm2,%xmm10
-       cmp             $0x70,%rax
-       jl              .Lxorpart4
-       vpxor           0x60(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x60(%rsi)
-
-       vmovdqa         %xmm3,%xmm10
-       cmp             $0x80,%rax
-       jl              .Lxorpart4
-       vpxor           0x70(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x70(%rsi)
-
-       # o0 = i0 ^ (x0 + s0), third block
-       vpaddd          %ymm11,%ymm4,%ymm10
-       cmp             $0x90,%rax
-       jl              .Lxorpart4
-       vpxor           0x80(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x80(%rsi)
-       vextracti128    $1,%ymm10,%xmm4
-       # o1 = i1 ^ (x1 + s1), third block
-       vpaddd          %ymm12,%ymm5,%ymm10
-       cmp             $0xa0,%rax
-       jl              .Lxorpart4
-       vpxor           0x90(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x90(%rsi)
-       vextracti128    $1,%ymm10,%xmm5
-       # o2 = i2 ^ (x2 + s2), third block
-       vpaddd          %ymm13,%ymm6,%ymm10
-       cmp             $0xb0,%rax
-       jl              .Lxorpart4
-       vpxor           0xa0(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0xa0(%rsi)
-       vextracti128    $1,%ymm10,%xmm6
-       # o3 = i3 ^ (x3 + s3), third block
-       vpaddd          %ymm15,%ymm7,%ymm10
-       cmp             $0xc0,%rax
-       jl              .Lxorpart4
-       vpxor           0xb0(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0xb0(%rsi)
-       vextracti128    $1,%ymm10,%xmm7
-
-       # xor and write fourth block
-       vmovdqa         %xmm4,%xmm10
-       cmp             $0xd0,%rax
-       jl              .Lxorpart4
-       vpxor           0xc0(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0xc0(%rsi)
-
-       vmovdqa         %xmm5,%xmm10
-       cmp             $0xe0,%rax
-       jl              .Lxorpart4
-       vpxor           0xd0(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0xd0(%rsi)
-
-       vmovdqa         %xmm6,%xmm10
-       cmp             $0xf0,%rax
-       jl              .Lxorpart4
-       vpxor           0xe0(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0xe0(%rsi)
-
-       vmovdqa         %xmm7,%xmm10
-       cmp             $0x100,%rax
-       jl              .Lxorpart4
-       vpxor           0xf0(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0xf0(%rsi)
-
-.Ldone4:
-       vzeroupper
-       ret
-
-.Lxorpart4:
-       # xor remaining bytes from partial register into output
-       mov             %rax,%r9
-       and             $0x0f,%r9
-       jz              .Ldone4
-       and             $~0x0f,%rax
-
-       mov             %rsi,%r11
-
-       lea             8(%rsp),%r10
-       sub             $0x10,%rsp
-       and             $~31,%rsp
-
-       lea             (%rdx,%rax),%rsi
-       mov             %rsp,%rdi
-       mov             %r9,%rcx
-       rep movsb
-
-       vpxor           0x00(%rsp),%xmm10,%xmm10
-       vmovdqa         %xmm10,0x00(%rsp)
-
-       mov             %rsp,%rsi
-       lea             (%r11,%rax),%rdi
-       mov             %r9,%rcx
-       rep movsb
-
-       lea             -8(%r10),%rsp
-       jmp             .Ldone4
-
-ENDPROC(chacha20_4block_xor_avx2)
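
The .Lxorpart4 tail above bounces the final partial 16-byte chunk through an aligned stack buffer, so the full-width vector XOR and the rep movsb copies never touch bytes beyond the requested length. A minimal C sketch of that flow follows; it is illustrative only, and the buffer, the names and the separate keystream argument are assumptions, not part of this code.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* 'keystream' stands in for the already-computed register (%xmm10 above),
 * 'len' for the byte count in %rax.
 */
static void xor_partial_tail(uint8_t *dst, const uint8_t *src, size_t len,
			     const uint8_t keystream[16])
{
	uint8_t buf[16] = { 0 };
	size_t done = len & ~(size_t)0x0f;	/* and $~0x0f,%rax  */
	size_t rem  = len & 0x0f;		/* and $0x0f,%r9    */

	if (!rem)				/* jz .Ldone4       */
		return;

	memcpy(buf, src + done, rem);		/* rep movsb (in)   */
	for (size_t i = 0; i < sizeof(buf); i++)
		buf[i] ^= keystream[i];		/* vpxor 0x00(%rsp) */
	memcpy(dst + done, buf, rem);		/* rep movsb (out), only the
						 * valid bytes are written back */
}
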
-
-ENTRY(chacha20_8block_xor_avx2)
-       # %rdi: Input state matrix, s
-       # %rsi: up to 8 data blocks output, o
-       # %rdx: up to 8 data blocks input, i
-       # %rcx: input/output length in bytes
-
-       # This function encrypts eight consecutive ChaCha20 blocks by loading
-       # the state matrix in AVX registers eight times. As we need some
-       # scratch registers, we save the first four registers on the stack. The
-       # algorithm performs each operation on the corresponding word of each
-       # state matrix, hence requires no word shuffling. For the final XORing
-       # step we transpose the matrix by interleaving 32-, 64- and then 128-bit
-       # words, which allows us to do the XOR in AVX registers. 8/16-bit word
-       # rotation is done with the slightly better performing byte shuffling,
-       # while 7/12-bit word rotation uses the traditional shift+OR.
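
The per-word layout described above can also be written in plain C. The following is a hedged reference sketch only (NLANES, rotl32 and the array names are illustrative); each eight-lane array stands for one of the sixteen state-word vectors, and lane i of every array belongs to block i, so a double round needs no in-register word shuffling.

#include <stdint.h>

#define NLANES 8

static inline uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

static void quarterround_lanes(uint32_t a[NLANES], uint32_t b[NLANES],
			       uint32_t c[NLANES], uint32_t d[NLANES])
{
	for (int i = 0; i < NLANES; i++) {
		a[i] += b[i]; d[i] = rotl32(d[i] ^ a[i], 16);
		c[i] += d[i]; b[i] = rotl32(b[i] ^ c[i], 12);
		a[i] += b[i]; d[i] = rotl32(d[i] ^ a[i], 8);
		c[i] += d[i]; b[i] = rotl32(b[i] ^ c[i], 7);
	}
}

static void doubleround_lanes(uint32_t x[16][NLANES])
{
	/* column rounds */
	quarterround_lanes(x[0], x[4], x[8],  x[12]);
	quarterround_lanes(x[1], x[5], x[9],  x[13]);
	quarterround_lanes(x[2], x[6], x[10], x[14]);
	quarterround_lanes(x[3], x[7], x[11], x[15]);
	/* diagonal rounds */
	quarterround_lanes(x[0], x[5], x[10], x[15]);
	quarterround_lanes(x[1], x[6], x[11], x[12]);
	quarterround_lanes(x[2], x[7], x[8],  x[13]);
	quarterround_lanes(x[3], x[4], x[9],  x[14]);
}
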
-
-       vzeroupper
-       # 4 * 32 byte stack, 32-byte aligned
-       lea             8(%rsp),%r10
-       and             $~31, %rsp
-       sub             $0x80, %rsp
-       mov             %rcx,%rax
-
-       # x0..15[0-7] = s[0..15]
-       vpbroadcastd    0x00(%rdi),%ymm0
-       vpbroadcastd    0x04(%rdi),%ymm1
-       vpbroadcastd    0x08(%rdi),%ymm2
-       vpbroadcastd    0x0c(%rdi),%ymm3
-       vpbroadcastd    0x10(%rdi),%ymm4
-       vpbroadcastd    0x14(%rdi),%ymm5
-       vpbroadcastd    0x18(%rdi),%ymm6
-       vpbroadcastd    0x1c(%rdi),%ymm7
-       vpbroadcastd    0x20(%rdi),%ymm8
-       vpbroadcastd    0x24(%rdi),%ymm9
-       vpbroadcastd    0x28(%rdi),%ymm10
-       vpbroadcastd    0x2c(%rdi),%ymm11
-       vpbroadcastd    0x30(%rdi),%ymm12
-       vpbroadcastd    0x34(%rdi),%ymm13
-       vpbroadcastd    0x38(%rdi),%ymm14
-       vpbroadcastd    0x3c(%rdi),%ymm15
-       # x0..3 on stack
-       vmovdqa         %ymm0,0x00(%rsp)
-       vmovdqa         %ymm1,0x20(%rsp)
-       vmovdqa         %ymm2,0x40(%rsp)
-       vmovdqa         %ymm3,0x60(%rsp)
-
-       vmovdqa         CTRINC(%rip),%ymm1
-       vmovdqa         ROT8(%rip),%ymm2
-       vmovdqa         ROT16(%rip),%ymm3
-
-       # x12 += counter values 0-3
-       vpaddd          %ymm1,%ymm12,%ymm12
-
-       mov             $10,%ecx
-
-.Ldoubleround8:
-       # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
-       vpaddd          0x00(%rsp),%ymm4,%ymm0
-       vmovdqa         %ymm0,0x00(%rsp)
-       vpxor           %ymm0,%ymm12,%ymm12
-       vpshufb         %ymm3,%ymm12,%ymm12
-       # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
-       vpaddd          0x20(%rsp),%ymm5,%ymm0
-       vmovdqa         %ymm0,0x20(%rsp)
-       vpxor           %ymm0,%ymm13,%ymm13
-       vpshufb         %ymm3,%ymm13,%ymm13
-       # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
-       vpaddd          0x40(%rsp),%ymm6,%ymm0
-       vmovdqa         %ymm0,0x40(%rsp)
-       vpxor           %ymm0,%ymm14,%ymm14
-       vpshufb         %ymm3,%ymm14,%ymm14
-       # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
-       vpaddd          0x60(%rsp),%ymm7,%ymm0
-       vmovdqa         %ymm0,0x60(%rsp)
-       vpxor           %ymm0,%ymm15,%ymm15
-       vpshufb         %ymm3,%ymm15,%ymm15
-
-       # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
-       vpaddd          %ymm12,%ymm8,%ymm8
-       vpxor           %ymm8,%ymm4,%ymm4
-       vpslld          $12,%ymm4,%ymm0
-       vpsrld          $20,%ymm4,%ymm4
-       vpor            %ymm0,%ymm4,%ymm4
-       # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
-       vpaddd          %ymm13,%ymm9,%ymm9
-       vpxor           %ymm9,%ymm5,%ymm5
-       vpslld          $12,%ymm5,%ymm0
-       vpsrld          $20,%ymm5,%ymm5
-       vpor            %ymm0,%ymm5,%ymm5
-       # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
-       vpaddd          %ymm14,%ymm10,%ymm10
-       vpxor           %ymm10,%ymm6,%ymm6
-       vpslld          $12,%ymm6,%ymm0
-       vpsrld          $20,%ymm6,%ymm6
-       vpor            %ymm0,%ymm6,%ymm6
-       # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
-       vpaddd          %ymm15,%ymm11,%ymm11
-       vpxor           %ymm11,%ymm7,%ymm7
-       vpslld          $12,%ymm7,%ymm0
-       vpsrld          $20,%ymm7,%ymm7
-       vpor            %ymm0,%ymm7,%ymm7
-
-       # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
-       vpaddd          0x00(%rsp),%ymm4,%ymm0
-       vmovdqa         %ymm0,0x00(%rsp)
-       vpxor           %ymm0,%ymm12,%ymm12
-       vpshufb         %ymm2,%ymm12,%ymm12
-       # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
-       vpaddd          0x20(%rsp),%ymm5,%ymm0
-       vmovdqa         %ymm0,0x20(%rsp)
-       vpxor           %ymm0,%ymm13,%ymm13
-       vpshufb         %ymm2,%ymm13,%ymm13
-       # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
-       vpaddd          0x40(%rsp),%ymm6,%ymm0
-       vmovdqa         %ymm0,0x40(%rsp)
-       vpxor           %ymm0,%ymm14,%ymm14
-       vpshufb         %ymm2,%ymm14,%ymm14
-       # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
-       vpaddd          0x60(%rsp),%ymm7,%ymm0
-       vmovdqa         %ymm0,0x60(%rsp)
-       vpxor           %ymm0,%ymm15,%ymm15
-       vpshufb         %ymm2,%ymm15,%ymm15
-
-       # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
-       vpaddd          %ymm12,%ymm8,%ymm8
-       vpxor           %ymm8,%ymm4,%ymm4
-       vpslld          $7,%ymm4,%ymm0
-       vpsrld          $25,%ymm4,%ymm4
-       vpor            %ymm0,%ymm4,%ymm4
-       # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
-       vpaddd          %ymm13,%ymm9,%ymm9
-       vpxor           %ymm9,%ymm5,%ymm5
-       vpslld          $7,%ymm5,%ymm0
-       vpsrld          $25,%ymm5,%ymm5
-       vpor            %ymm0,%ymm5,%ymm5
-       # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
-       vpaddd          %ymm14,%ymm10,%ymm10
-       vpxor           %ymm10,%ymm6,%ymm6
-       vpslld          $7,%ymm6,%ymm0
-       vpsrld          $25,%ymm6,%ymm6
-       vpor            %ymm0,%ymm6,%ymm6
-       # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
-       vpaddd          %ymm15,%ymm11,%ymm11
-       vpxor           %ymm11,%ymm7,%ymm7
-       vpslld          $7,%ymm7,%ymm0
-       vpsrld          $25,%ymm7,%ymm7
-       vpor            %ymm0,%ymm7,%ymm7
-
-       # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
-       vpaddd          0x00(%rsp),%ymm5,%ymm0
-       vmovdqa         %ymm0,0x00(%rsp)
-       vpxor           %ymm0,%ymm15,%ymm15
-       vpshufb         %ymm3,%ymm15,%ymm15
-       # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
-       vpaddd          0x20(%rsp),%ymm6,%ymm0
-       vmovdqa         %ymm0,0x20(%rsp)
-       vpxor           %ymm0,%ymm12,%ymm12
-       vpshufb         %ymm3,%ymm12,%ymm12
-       # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
-       vpaddd          0x40(%rsp),%ymm7,%ymm0
-       vmovdqa         %ymm0,0x40(%rsp)
-       vpxor           %ymm0,%ymm13,%ymm13
-       vpshufb         %ymm3,%ymm13,%ymm13
-       # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
-       vpaddd          0x60(%rsp),%ymm4,%ymm0
-       vmovdqa         %ymm0,0x60(%rsp)
-       vpxor           %ymm0,%ymm14,%ymm14
-       vpshufb         %ymm3,%ymm14,%ymm14
-
-       # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
-       vpaddd          %ymm15,%ymm10,%ymm10
-       vpxor           %ymm10,%ymm5,%ymm5
-       vpslld          $12,%ymm5,%ymm0
-       vpsrld          $20,%ymm5,%ymm5
-       vpor            %ymm0,%ymm5,%ymm5
-       # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
-       vpaddd          %ymm12,%ymm11,%ymm11
-       vpxor           %ymm11,%ymm6,%ymm6
-       vpslld          $12,%ymm6,%ymm0
-       vpsrld          $20,%ymm6,%ymm6
-       vpor            %ymm0,%ymm6,%ymm6
-       # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
-       vpaddd          %ymm13,%ymm8,%ymm8
-       vpxor           %ymm8,%ymm7,%ymm7
-       vpslld          $12,%ymm7,%ymm0
-       vpsrld          $20,%ymm7,%ymm7
-       vpor            %ymm0,%ymm7,%ymm7
-       # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
-       vpaddd          %ymm14,%ymm9,%ymm9
-       vpxor           %ymm9,%ymm4,%ymm4
-       vpslld          $12,%ymm4,%ymm0
-       vpsrld          $20,%ymm4,%ymm4
-       vpor            %ymm0,%ymm4,%ymm4
-
-       # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
-       vpaddd          0x00(%rsp),%ymm5,%ymm0
-       vmovdqa         %ymm0,0x00(%rsp)
-       vpxor           %ymm0,%ymm15,%ymm15
-       vpshufb         %ymm2,%ymm15,%ymm15
-       # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
-       vpaddd          0x20(%rsp),%ymm6,%ymm0
-       vmovdqa         %ymm0,0x20(%rsp)
-       vpxor           %ymm0,%ymm12,%ymm12
-       vpshufb         %ymm2,%ymm12,%ymm12
-       # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
-       vpaddd          0x40(%rsp),%ymm7,%ymm0
-       vmovdqa         %ymm0,0x40(%rsp)
-       vpxor           %ymm0,%ymm13,%ymm13
-       vpshufb         %ymm2,%ymm13,%ymm13
-       # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
-       vpaddd          0x60(%rsp),%ymm4,%ymm0
-       vmovdqa         %ymm0,0x60(%rsp)
-       vpxor           %ymm0,%ymm14,%ymm14
-       vpshufb         %ymm2,%ymm14,%ymm14
-
-       # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
-       vpaddd          %ymm15,%ymm10,%ymm10
-       vpxor           %ymm10,%ymm5,%ymm5
-       vpslld          $7,%ymm5,%ymm0
-       vpsrld          $25,%ymm5,%ymm5
-       vpor            %ymm0,%ymm5,%ymm5
-       # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
-       vpaddd          %ymm12,%ymm11,%ymm11
-       vpxor           %ymm11,%ymm6,%ymm6
-       vpslld          $7,%ymm6,%ymm0
-       vpsrld          $25,%ymm6,%ymm6
-       vpor            %ymm0,%ymm6,%ymm6
-       # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
-       vpaddd          %ymm13,%ymm8,%ymm8
-       vpxor           %ymm8,%ymm7,%ymm7
-       vpslld          $7,%ymm7,%ymm0
-       vpsrld          $25,%ymm7,%ymm7
-       vpor            %ymm0,%ymm7,%ymm7
-       # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
-       vpaddd          %ymm14,%ymm9,%ymm9
-       vpxor           %ymm9,%ymm4,%ymm4
-       vpslld          $7,%ymm4,%ymm0
-       vpsrld          $25,%ymm4,%ymm4
-       vpor            %ymm0,%ymm4,%ymm4
-
-       dec             %ecx
-       jnz             .Ldoubleround8
-
-       # x0..15[0-7] += s[0..15]
-       vpbroadcastd    0x00(%rdi),%ymm0
-       vpaddd          0x00(%rsp),%ymm0,%ymm0
-       vmovdqa         %ymm0,0x00(%rsp)
-       vpbroadcastd    0x04(%rdi),%ymm0
-       vpaddd          0x20(%rsp),%ymm0,%ymm0
-       vmovdqa         %ymm0,0x20(%rsp)
-       vpbroadcastd    0x08(%rdi),%ymm0
-       vpaddd          0x40(%rsp),%ymm0,%ymm0
-       vmovdqa         %ymm0,0x40(%rsp)
-       vpbroadcastd    0x0c(%rdi),%ymm0
-       vpaddd          0x60(%rsp),%ymm0,%ymm0
-       vmovdqa         %ymm0,0x60(%rsp)
-       vpbroadcastd    0x10(%rdi),%ymm0
-       vpaddd          %ymm0,%ymm4,%ymm4
-       vpbroadcastd    0x14(%rdi),%ymm0
-       vpaddd          %ymm0,%ymm5,%ymm5
-       vpbroadcastd    0x18(%rdi),%ymm0
-       vpaddd          %ymm0,%ymm6,%ymm6
-       vpbroadcastd    0x1c(%rdi),%ymm0
-       vpaddd          %ymm0,%ymm7,%ymm7
-       vpbroadcastd    0x20(%rdi),%ymm0
-       vpaddd          %ymm0,%ymm8,%ymm8
-       vpbroadcastd    0x24(%rdi),%ymm0
-       vpaddd          %ymm0,%ymm9,%ymm9
-       vpbroadcastd    0x28(%rdi),%ymm0
-       vpaddd          %ymm0,%ymm10,%ymm10
-       vpbroadcastd    0x2c(%rdi),%ymm0
-       vpaddd          %ymm0,%ymm11,%ymm11
-       vpbroadcastd    0x30(%rdi),%ymm0
-       vpaddd          %ymm0,%ymm12,%ymm12
-       vpbroadcastd    0x34(%rdi),%ymm0
-       vpaddd          %ymm0,%ymm13,%ymm13
-       vpbroadcastd    0x38(%rdi),%ymm0
-       vpaddd          %ymm0,%ymm14,%ymm14
-       vpbroadcastd    0x3c(%rdi),%ymm0
-       vpaddd          %ymm0,%ymm15,%ymm15
-
-       # x12 += counter values 0-3
-       vpaddd          %ymm1,%ymm12,%ymm12
-
-       # interleave 32-bit words in state n, n+1
-       vmovdqa         0x00(%rsp),%ymm0
-       vmovdqa         0x20(%rsp),%ymm1
-       vpunpckldq      %ymm1,%ymm0,%ymm2
-       vpunpckhdq      %ymm1,%ymm0,%ymm1
-       vmovdqa         %ymm2,0x00(%rsp)
-       vmovdqa         %ymm1,0x20(%rsp)
-       vmovdqa         0x40(%rsp),%ymm0
-       vmovdqa         0x60(%rsp),%ymm1
-       vpunpckldq      %ymm1,%ymm0,%ymm2
-       vpunpckhdq      %ymm1,%ymm0,%ymm1
-       vmovdqa         %ymm2,0x40(%rsp)
-       vmovdqa         %ymm1,0x60(%rsp)
-       vmovdqa         %ymm4,%ymm0
-       vpunpckldq      %ymm5,%ymm0,%ymm4
-       vpunpckhdq      %ymm5,%ymm0,%ymm5
-       vmovdqa         %ymm6,%ymm0
-       vpunpckldq      %ymm7,%ymm0,%ymm6
-       vpunpckhdq      %ymm7,%ymm0,%ymm7
-       vmovdqa         %ymm8,%ymm0
-       vpunpckldq      %ymm9,%ymm0,%ymm8
-       vpunpckhdq      %ymm9,%ymm0,%ymm9
-       vmovdqa         %ymm10,%ymm0
-       vpunpckldq      %ymm11,%ymm0,%ymm10
-       vpunpckhdq      %ymm11,%ymm0,%ymm11
-       vmovdqa         %ymm12,%ymm0
-       vpunpckldq      %ymm13,%ymm0,%ymm12
-       vpunpckhdq      %ymm13,%ymm0,%ymm13
-       vmovdqa         %ymm14,%ymm0
-       vpunpckldq      %ymm15,%ymm0,%ymm14
-       vpunpckhdq      %ymm15,%ymm0,%ymm15
-
-       # interleave 64-bit words in state n, n+2
-       vmovdqa         0x00(%rsp),%ymm0
-       vmovdqa         0x40(%rsp),%ymm2
-       vpunpcklqdq     %ymm2,%ymm0,%ymm1
-       vpunpckhqdq     %ymm2,%ymm0,%ymm2
-       vmovdqa         %ymm1,0x00(%rsp)
-       vmovdqa         %ymm2,0x40(%rsp)
-       vmovdqa         0x20(%rsp),%ymm0
-       vmovdqa         0x60(%rsp),%ymm2
-       vpunpcklqdq     %ymm2,%ymm0,%ymm1
-       vpunpckhqdq     %ymm2,%ymm0,%ymm2
-       vmovdqa         %ymm1,0x20(%rsp)
-       vmovdqa         %ymm2,0x60(%rsp)
-       vmovdqa         %ymm4,%ymm0
-       vpunpcklqdq     %ymm6,%ymm0,%ymm4
-       vpunpckhqdq     %ymm6,%ymm0,%ymm6
-       vmovdqa         %ymm5,%ymm0
-       vpunpcklqdq     %ymm7,%ymm0,%ymm5
-       vpunpckhqdq     %ymm7,%ymm0,%ymm7
-       vmovdqa         %ymm8,%ymm0
-       vpunpcklqdq     %ymm10,%ymm0,%ymm8
-       vpunpckhqdq     %ymm10,%ymm0,%ymm10
-       vmovdqa         %ymm9,%ymm0
-       vpunpcklqdq     %ymm11,%ymm0,%ymm9
-       vpunpckhqdq     %ymm11,%ymm0,%ymm11
-       vmovdqa         %ymm12,%ymm0
-       vpunpcklqdq     %ymm14,%ymm0,%ymm12
-       vpunpckhqdq     %ymm14,%ymm0,%ymm14
-       vmovdqa         %ymm13,%ymm0
-       vpunpcklqdq     %ymm15,%ymm0,%ymm13
-       vpunpckhqdq     %ymm15,%ymm0,%ymm15
-
-       # interleave 128-bit words in state n, n+4
-       # xor/write first four blocks
-       vmovdqa         0x00(%rsp),%ymm1
-       vperm2i128      $0x20,%ymm4,%ymm1,%ymm0
-       cmp             $0x0020,%rax
-       jl              .Lxorpart8
-       vpxor           0x0000(%rdx),%ymm0,%ymm0
-       vmovdqu         %ymm0,0x0000(%rsi)
-       vperm2i128      $0x31,%ymm4,%ymm1,%ymm4
-
-       vperm2i128      $0x20,%ymm12,%ymm8,%ymm0
-       cmp             $0x0040,%rax
-       jl              .Lxorpart8
-       vpxor           0x0020(%rdx),%ymm0,%ymm0
-       vmovdqu         %ymm0,0x0020(%rsi)
-       vperm2i128      $0x31,%ymm12,%ymm8,%ymm12
-
-       vmovdqa         0x40(%rsp),%ymm1
-       vperm2i128      $0x20,%ymm6,%ymm1,%ymm0
-       cmp             $0x0060,%rax
-       jl              .Lxorpart8
-       vpxor           0x0040(%rdx),%ymm0,%ymm0
-       vmovdqu         %ymm0,0x0040(%rsi)
-       vperm2i128      $0x31,%ymm6,%ymm1,%ymm6
-
-       vperm2i128      $0x20,%ymm14,%ymm10,%ymm0
-       cmp             $0x0080,%rax
-       jl              .Lxorpart8
-       vpxor           0x0060(%rdx),%ymm0,%ymm0
-       vmovdqu         %ymm0,0x0060(%rsi)
-       vperm2i128      $0x31,%ymm14,%ymm10,%ymm14
-
-       vmovdqa         0x20(%rsp),%ymm1
-       vperm2i128      $0x20,%ymm5,%ymm1,%ymm0
-       cmp             $0x00a0,%rax
-       jl              .Lxorpart8
-       vpxor           0x0080(%rdx),%ymm0,%ymm0
-       vmovdqu         %ymm0,0x0080(%rsi)
-       vperm2i128      $0x31,%ymm5,%ymm1,%ymm5
-
-       vperm2i128      $0x20,%ymm13,%ymm9,%ymm0
-       cmp             $0x00c0,%rax
-       jl              .Lxorpart8
-       vpxor           0x00a0(%rdx),%ymm0,%ymm0
-       vmovdqu         %ymm0,0x00a0(%rsi)
-       vperm2i128      $0x31,%ymm13,%ymm9,%ymm13
-
-       vmovdqa         0x60(%rsp),%ymm1
-       vperm2i128      $0x20,%ymm7,%ymm1,%ymm0
-       cmp             $0x00e0,%rax
-       jl              .Lxorpart8
-       vpxor           0x00c0(%rdx),%ymm0,%ymm0
-       vmovdqu         %ymm0,0x00c0(%rsi)
-       vperm2i128      $0x31,%ymm7,%ymm1,%ymm7
-
-       vperm2i128      $0x20,%ymm15,%ymm11,%ymm0
-       cmp             $0x0100,%rax
-       jl              .Lxorpart8
-       vpxor           0x00e0(%rdx),%ymm0,%ymm0
-       vmovdqu         %ymm0,0x00e0(%rsi)
-       vperm2i128      $0x31,%ymm15,%ymm11,%ymm15
-
-       # xor remaining blocks, write to output
-       vmovdqa         %ymm4,%ymm0
-       cmp             $0x0120,%rax
-       jl              .Lxorpart8
-       vpxor           0x0100(%rdx),%ymm0,%ymm0
-       vmovdqu         %ymm0,0x0100(%rsi)
-
-       vmovdqa         %ymm12,%ymm0
-       cmp             $0x0140,%rax
-       jl              .Lxorpart8
-       vpxor           0x0120(%rdx),%ymm0,%ymm0
-       vmovdqu         %ymm0,0x0120(%rsi)
-
-       vmovdqa         %ymm6,%ymm0
-       cmp             $0x0160,%rax
-       jl              .Lxorpart8
-       vpxor           0x0140(%rdx),%ymm0,%ymm0
-       vmovdqu         %ymm0,0x0140(%rsi)
-
-       vmovdqa         %ymm14,%ymm0
-       cmp             $0x0180,%rax
-       jl              .Lxorpart8
-       vpxor           0x0160(%rdx),%ymm0,%ymm0
-       vmovdqu         %ymm0,0x0160(%rsi)
-
-       vmovdqa         %ymm5,%ymm0
-       cmp             $0x01a0,%rax
-       jl              .Lxorpart8
-       vpxor           0x0180(%rdx),%ymm0,%ymm0
-       vmovdqu         %ymm0,0x0180(%rsi)
-
-       vmovdqa         %ymm13,%ymm0
-       cmp             $0x01c0,%rax
-       jl              .Lxorpart8
-       vpxor           0x01a0(%rdx),%ymm0,%ymm0
-       vmovdqu         %ymm0,0x01a0(%rsi)
-
-       vmovdqa         %ymm7,%ymm0
-       cmp             $0x01e0,%rax
-       jl              .Lxorpart8
-       vpxor           0x01c0(%rdx),%ymm0,%ymm0
-       vmovdqu         %ymm0,0x01c0(%rsi)
-
-       vmovdqa         %ymm15,%ymm0
-       cmp             $0x0200,%rax
-       jl              .Lxorpart8
-       vpxor           0x01e0(%rdx),%ymm0,%ymm0
-       vmovdqu         %ymm0,0x01e0(%rsi)
-
-.Ldone8:
-       vzeroupper
-       lea             -8(%r10),%rsp
-       ret
-
-.Lxorpart8:
-       # xor remaining bytes from partial register into output
-       mov             %rax,%r9
-       and             $0x1f,%r9
-       jz              .Ldone8
-       and             $~0x1f,%rax
-
-       mov             %rsi,%r11
-
-       lea             (%rdx,%rax),%rsi
-       mov             %rsp,%rdi
-       mov             %r9,%rcx
-       rep movsb
-
-       vpxor           0x00(%rsp),%ymm0,%ymm0
-       vmovdqa         %ymm0,0x00(%rsp)
-
-       mov             %rsp,%rsi
-       lea             (%r11,%rax),%rdi
-       mov             %r9,%rcx
-       rep movsb
-
-       jmp             .Ldone8
-
-ENDPROC(chacha20_8block_xor_avx2)
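
The interleave sequence in the function above (32-, 64- and then 128-bit unpacks and permutes) amounts, in effect, to the following transpose from the word-sliced layout back to contiguous blocks before the XOR. This is a hedged scalar sketch with illustrative array names, not the register-level algorithm itself.

#include <stdint.h>

/* x[word][block] holds the word-sliced state after the rounds and the
 * feed-forward; blocks[block][word] is the order the output stream needs.
 */
static void transpose_to_blocks(const uint32_t x[16][8], uint32_t blocks[8][16])
{
	for (int blk = 0; blk < 8; blk++)
		for (int word = 0; word < 16; word++)
			blocks[blk][word] = x[word][blk];
}
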
diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
deleted file mode 100644 (file)
index 55d34de..0000000
+++ /dev/null
@@ -1,839 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX-512VL functions
- *
- * Copyright (C) 2018 Martin Willi
- */
-
-#include <linux/linkage.h>
-
-.section       .rodata.cst32.CTR2BL, "aM", @progbits, 32
-.align 32
-CTR2BL:        .octa 0x00000000000000000000000000000000
-       .octa 0x00000000000000000000000000000001
-
-.section       .rodata.cst32.CTR4BL, "aM", @progbits, 32
-.align 32
-CTR4BL:        .octa 0x00000000000000000000000000000002
-       .octa 0x00000000000000000000000000000003
-
-.section       .rodata.cst32.CTR8BL, "aM", @progbits, 32
-.align 32
-CTR8BL:        .octa 0x00000003000000020000000100000000
-       .octa 0x00000007000000060000000500000004
-
-.text
-
-ENTRY(chacha20_2block_xor_avx512vl)
-       # %rdi: Input state matrix, s
-       # %rsi: up to 2 data blocks output, o
-       # %rdx: up to 2 data blocks input, i
-       # %rcx: input/output length in bytes
-
-       # This function encrypts two ChaCha20 blocks by loading the state
-       # matrix twice across four AVX registers. It performs matrix operations
-       # on four words in each matrix in parallel, but requires shuffling to
-       # rearrange the words after each round.
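
A hedged intrinsics rendering of the .Ldoubleround loop below, assuming a compiler with AVX-512VL support (built with, e.g., -mavx512vl); v[0..3] correspond to %ymm0-%ymm3, each holding one row of the state for both blocks, and the names are illustrative only.

#include <immintrin.h>

/* One ChaCha quarter-round on whole rows, using the VL rotate (vprold). */
static inline void qround_rows(__m256i *a, __m256i *b, __m256i *c, __m256i *d)
{
	*a = _mm256_add_epi32(*a, *b);
	*d = _mm256_rol_epi32(_mm256_xor_si256(*d, *a), 16);
	*c = _mm256_add_epi32(*c, *d);
	*b = _mm256_rol_epi32(_mm256_xor_si256(*b, *c), 12);
	*a = _mm256_add_epi32(*a, *b);
	*d = _mm256_rol_epi32(_mm256_xor_si256(*d, *a), 8);
	*c = _mm256_add_epi32(*c, *d);
	*b = _mm256_rol_epi32(_mm256_xor_si256(*b, *c), 7);
}

static void doubleround_2block(__m256i v[4])
{
	qround_rows(&v[0], &v[1], &v[2], &v[3]);	/* column round */
	v[1] = _mm256_shuffle_epi32(v[1], 0x39);	/* to diagonals */
	v[2] = _mm256_shuffle_epi32(v[2], 0x4e);
	v[3] = _mm256_shuffle_epi32(v[3], 0x93);
	qround_rows(&v[0], &v[1], &v[2], &v[3]);	/* diagonal round */
	v[1] = _mm256_shuffle_epi32(v[1], 0x93);	/* back to rows */
	v[2] = _mm256_shuffle_epi32(v[2], 0x4e);
	v[3] = _mm256_shuffle_epi32(v[3], 0x39);
}
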
-
-       vzeroupper
-
-       # x0..3[0-1] = s0..3
-       vbroadcasti128  0x00(%rdi),%ymm0
-       vbroadcasti128  0x10(%rdi),%ymm1
-       vbroadcasti128  0x20(%rdi),%ymm2
-       vbroadcasti128  0x30(%rdi),%ymm3
-
-       vpaddd          CTR2BL(%rip),%ymm3,%ymm3
-
-       vmovdqa         %ymm0,%ymm8
-       vmovdqa         %ymm1,%ymm9
-       vmovdqa         %ymm2,%ymm10
-       vmovdqa         %ymm3,%ymm11
-
-       mov             $10,%rax
-
-.Ldoubleround:
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-       vpaddd          %ymm1,%ymm0,%ymm0
-       vpxord          %ymm0,%ymm3,%ymm3
-       vprold          $16,%ymm3,%ymm3
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-       vpaddd          %ymm3,%ymm2,%ymm2
-       vpxord          %ymm2,%ymm1,%ymm1
-       vprold          $12,%ymm1,%ymm1
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-       vpaddd          %ymm1,%ymm0,%ymm0
-       vpxord          %ymm0,%ymm3,%ymm3
-       vprold          $8,%ymm3,%ymm3
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-       vpaddd          %ymm3,%ymm2,%ymm2
-       vpxord          %ymm2,%ymm1,%ymm1
-       vprold          $7,%ymm1,%ymm1
-
-       # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
-       vpshufd         $0x39,%ymm1,%ymm1
-       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-       vpshufd         $0x4e,%ymm2,%ymm2
-       # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
-       vpshufd         $0x93,%ymm3,%ymm3
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-       vpaddd          %ymm1,%ymm0,%ymm0
-       vpxord          %ymm0,%ymm3,%ymm3
-       vprold          $16,%ymm3,%ymm3
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-       vpaddd          %ymm3,%ymm2,%ymm2
-       vpxord          %ymm2,%ymm1,%ymm1
-       vprold          $12,%ymm1,%ymm1
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-       vpaddd          %ymm1,%ymm0,%ymm0
-       vpxord          %ymm0,%ymm3,%ymm3
-       vprold          $8,%ymm3,%ymm3
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-       vpaddd          %ymm3,%ymm2,%ymm2
-       vpxord          %ymm2,%ymm1,%ymm1
-       vprold          $7,%ymm1,%ymm1
-
-       # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
-       vpshufd         $0x93,%ymm1,%ymm1
-       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-       vpshufd         $0x4e,%ymm2,%ymm2
-       # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
-       vpshufd         $0x39,%ymm3,%ymm3
-
-       dec             %rax
-       jnz             .Ldoubleround
-
-       # o0 = i0 ^ (x0 + s0)
-       vpaddd          %ymm8,%ymm0,%ymm7
-       cmp             $0x10,%rcx
-       jl              .Lxorpart2
-       vpxord          0x00(%rdx),%xmm7,%xmm6
-       vmovdqu         %xmm6,0x00(%rsi)
-       vextracti128    $1,%ymm7,%xmm0
-       # o1 = i1 ^ (x1 + s1)
-       vpaddd          %ymm9,%ymm1,%ymm7
-       cmp             $0x20,%rcx
-       jl              .Lxorpart2
-       vpxord          0x10(%rdx),%xmm7,%xmm6
-       vmovdqu         %xmm6,0x10(%rsi)
-       vextracti128    $1,%ymm7,%xmm1
-       # o2 = i2 ^ (x2 + s2)
-       vpaddd          %ymm10,%ymm2,%ymm7
-       cmp             $0x30,%rcx
-       jl              .Lxorpart2
-       vpxord          0x20(%rdx),%xmm7,%xmm6
-       vmovdqu         %xmm6,0x20(%rsi)
-       vextracti128    $1,%ymm7,%xmm2
-       # o3 = i3 ^ (x3 + s3)
-       vpaddd          %ymm11,%ymm3,%ymm7
-       cmp             $0x40,%rcx
-       jl              .Lxorpart2
-       vpxord          0x30(%rdx),%xmm7,%xmm6
-       vmovdqu         %xmm6,0x30(%rsi)
-       vextracti128    $1,%ymm7,%xmm3
-
-       # xor and write second block
-       vmovdqa         %xmm0,%xmm7
-       cmp             $0x50,%rcx
-       jl              .Lxorpart2
-       vpxord          0x40(%rdx),%xmm7,%xmm6
-       vmovdqu         %xmm6,0x40(%rsi)
-
-       vmovdqa         %xmm1,%xmm7
-       cmp             $0x60,%rcx
-       jl              .Lxorpart2
-       vpxord          0x50(%rdx),%xmm7,%xmm6
-       vmovdqu         %xmm6,0x50(%rsi)
-
-       vmovdqa         %xmm2,%xmm7
-       cmp             $0x70,%rcx
-       jl              .Lxorpart2
-       vpxord          0x60(%rdx),%xmm7,%xmm6
-       vmovdqu         %xmm6,0x60(%rsi)
-
-       vmovdqa         %xmm3,%xmm7
-       cmp             $0x80,%rcx
-       jl              .Lxorpart2
-       vpxord          0x70(%rdx),%xmm7,%xmm6
-       vmovdqu         %xmm6,0x70(%rsi)
-
-.Ldone2:
-       vzeroupper
-       ret
-
-.Lxorpart2:
-       # xor remaining bytes from partial register into output
-       mov             %rcx,%rax
-       and             $0xf,%rcx
-       jz              .Ldone8
-       mov             %rax,%r9
-       and             $~0xf,%r9
-
-       mov             $1,%rax
-       shld            %cl,%rax,%rax
-       sub             $1,%rax
-       kmovq           %rax,%k1
-
-       vmovdqu8        (%rdx,%r9),%xmm1{%k1}{z}
-       vpxord          %xmm7,%xmm1,%xmm1
-       vmovdqu8        %xmm1,(%rsi,%r9){%k1}
-
-       jmp             .Ldone2
-
-ENDPROC(chacha20_2block_xor_avx512vl)
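
The .Lxorpart2 path above replaces the stack bounce buffer of the SSSE3/AVX2 code with an AVX-512 byte mask: shld/sub build a mask with the low (len mod 16) bits set, and vmovdqu8 with {%k1}{z} then loads and stores only those bytes. A hedged sketch of the resulting mask value in plain C, illustrative only:

#include <stdint.h>

/* Mask with the low 'len % 16' bits set; bit i enables byte i of the masked
 * 128-bit load/store.  A zero remainder is handled before this point (jz).
 */
static inline uint16_t tail_byte_mask(unsigned long len)
{
	unsigned int rem = len & 0x0f;		/* and $0xf,%rcx      */

	return (uint16_t)((1u << rem) - 1);	/* shld + sub $1,%rax */
}
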
-
-ENTRY(chacha20_4block_xor_avx512vl)
-       # %rdi: Input state matrix, s
-       # %rsi: up to 4 data blocks output, o
-       # %rdx: up to 4 data blocks input, i
-       # %rcx: input/output length in bytes
-
-       # This function encrypts four ChaCha20 blocks by loading the state
-       # matrix four times across eight AVX registers. It performs matrix
-       # operations on four words in two matrices in parallel, sequentially
-       # to the operations on the four words of the other two matrices. As
-       # the required word shuffling has a rather high latency, we can do
-       # the arithmetic on two matrix-pairs without much slowdown.
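
The CTR2BL/CTR4BL additions below give each 128-bit lane its own block counter: row 3 is broadcast into both lanes of a pair, then the first pair's lanes get +0/+1 and the second pair's lanes get +2/+3. A hedged scalar sketch of those per-dword increments, with illustrative names:

#include <stdint.h>

/* Dword views of the CTR2BL and CTR4BL constants defined at the top of this
 * file; only the block-counter word (first dword of each 128-bit lane) is
 * incremented.
 */
static const uint32_t ctr2bl[8] = { 0, 0, 0, 0, 1, 0, 0, 0 };
static const uint32_t ctr4bl[8] = { 2, 0, 0, 0, 3, 0, 0, 0 };

static void add_block_counters(uint32_t x3[8], uint32_t x7[8])
{
	for (int i = 0; i < 8; i++) {	/* vpaddd, one 32-bit lane at a time */
		x3[i] += ctr2bl[i];	/* blocks 0 and 1 */
		x7[i] += ctr4bl[i];	/* blocks 2 and 3 */
	}
}
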
-
-       vzeroupper
-
-       # x0..3[0-4] = s0..3
-       vbroadcasti128  0x00(%rdi),%ymm0
-       vbroadcasti128  0x10(%rdi),%ymm1
-       vbroadcasti128  0x20(%rdi),%ymm2
-       vbroadcasti128  0x30(%rdi),%ymm3
-
-       vmovdqa         %ymm0,%ymm4
-       vmovdqa         %ymm1,%ymm5
-       vmovdqa         %ymm2,%ymm6
-       vmovdqa         %ymm3,%ymm7
-
-       vpaddd          CTR2BL(%rip),%ymm3,%ymm3
-       vpaddd          CTR4BL(%rip),%ymm7,%ymm7
-
-       vmovdqa         %ymm0,%ymm11
-       vmovdqa         %ymm1,%ymm12
-       vmovdqa         %ymm2,%ymm13
-       vmovdqa         %ymm3,%ymm14
-       vmovdqa         %ymm7,%ymm15
-
-       mov             $10,%rax
-
-.Ldoubleround4:
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-       vpaddd          %ymm1,%ymm0,%ymm0
-       vpxord          %ymm0,%ymm3,%ymm3
-       vprold          $16,%ymm3,%ymm3
-
-       vpaddd          %ymm5,%ymm4,%ymm4
-       vpxord          %ymm4,%ymm7,%ymm7
-       vprold          $16,%ymm7,%ymm7
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-       vpaddd          %ymm3,%ymm2,%ymm2
-       vpxord          %ymm2,%ymm1,%ymm1
-       vprold          $12,%ymm1,%ymm1
-
-       vpaddd          %ymm7,%ymm6,%ymm6
-       vpxord          %ymm6,%ymm5,%ymm5
-       vprold          $12,%ymm5,%ymm5
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-       vpaddd          %ymm1,%ymm0,%ymm0
-       vpxord          %ymm0,%ymm3,%ymm3
-       vprold          $8,%ymm3,%ymm3
-
-       vpaddd          %ymm5,%ymm4,%ymm4
-       vpxord          %ymm4,%ymm7,%ymm7
-       vprold          $8,%ymm7,%ymm7
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-       vpaddd          %ymm3,%ymm2,%ymm2
-       vpxord          %ymm2,%ymm1,%ymm1
-       vprold          $7,%ymm1,%ymm1
-
-       vpaddd          %ymm7,%ymm6,%ymm6
-       vpxord          %ymm6,%ymm5,%ymm5
-       vprold          $7,%ymm5,%ymm5
-
-       # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
-       vpshufd         $0x39,%ymm1,%ymm1
-       vpshufd         $0x39,%ymm5,%ymm5
-       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-       vpshufd         $0x4e,%ymm2,%ymm2
-       vpshufd         $0x4e,%ymm6,%ymm6
-       # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
-       vpshufd         $0x93,%ymm3,%ymm3
-       vpshufd         $0x93,%ymm7,%ymm7
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-       vpaddd          %ymm1,%ymm0,%ymm0
-       vpxord          %ymm0,%ymm3,%ymm3
-       vprold          $16,%ymm3,%ymm3
-
-       vpaddd          %ymm5,%ymm4,%ymm4
-       vpxord          %ymm4,%ymm7,%ymm7
-       vprold          $16,%ymm7,%ymm7
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-       vpaddd          %ymm3,%ymm2,%ymm2
-       vpxord          %ymm2,%ymm1,%ymm1
-       vprold          $12,%ymm1,%ymm1
-
-       vpaddd          %ymm7,%ymm6,%ymm6
-       vpxord          %ymm6,%ymm5,%ymm5
-       vprold          $12,%ymm5,%ymm5
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-       vpaddd          %ymm1,%ymm0,%ymm0
-       vpxord          %ymm0,%ymm3,%ymm3
-       vprold          $8,%ymm3,%ymm3
-
-       vpaddd          %ymm5,%ymm4,%ymm4
-       vpxord          %ymm4,%ymm7,%ymm7
-       vprold          $8,%ymm7,%ymm7
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-       vpaddd          %ymm3,%ymm2,%ymm2
-       vpxord          %ymm2,%ymm1,%ymm1
-       vprold          $7,%ymm1,%ymm1
-
-       vpaddd          %ymm7,%ymm6,%ymm6
-       vpxord          %ymm6,%ymm5,%ymm5
-       vprold          $7,%ymm5,%ymm5
-
-       # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
-       vpshufd         $0x93,%ymm1,%ymm1
-       vpshufd         $0x93,%ymm5,%ymm5
-       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-       vpshufd         $0x4e,%ymm2,%ymm2
-       vpshufd         $0x4e,%ymm6,%ymm6
-       # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
-       vpshufd         $0x39,%ymm3,%ymm3
-       vpshufd         $0x39,%ymm7,%ymm7
-
-       dec             %rax
-       jnz             .Ldoubleround4
-
-       # o0 = i0 ^ (x0 + s0), first block
-       vpaddd          %ymm11,%ymm0,%ymm10
-       cmp             $0x10,%rcx
-       jl              .Lxorpart4
-       vpxord          0x00(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x00(%rsi)
-       vextracti128    $1,%ymm10,%xmm0
-       # o1 = i1 ^ (x1 + s1), first block
-       vpaddd          %ymm12,%ymm1,%ymm10
-       cmp             $0x20,%rcx
-       jl              .Lxorpart4
-       vpxord          0x10(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x10(%rsi)
-       vextracti128    $1,%ymm10,%xmm1
-       # o2 = i2 ^ (x2 + s2), first block
-       vpaddd          %ymm13,%ymm2,%ymm10
-       cmp             $0x30,%rcx
-       jl              .Lxorpart4
-       vpxord          0x20(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x20(%rsi)
-       vextracti128    $1,%ymm10,%xmm2
-       # o3 = i3 ^ (x3 + s3), first block
-       vpaddd          %ymm14,%ymm3,%ymm10
-       cmp             $0x40,%rcx
-       jl              .Lxorpart4
-       vpxord          0x30(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x30(%rsi)
-       vextracti128    $1,%ymm10,%xmm3
-
-       # xor and write second block
-       vmovdqa         %xmm0,%xmm10
-       cmp             $0x50,%rcx
-       jl              .Lxorpart4
-       vpxord          0x40(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x40(%rsi)
-
-       vmovdqa         %xmm1,%xmm10
-       cmp             $0x60,%rcx
-       jl              .Lxorpart4
-       vpxord          0x50(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x50(%rsi)
-
-       vmovdqa         %xmm2,%xmm10
-       cmp             $0x70,%rcx
-       jl              .Lxorpart4
-       vpxord          0x60(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x60(%rsi)
-
-       vmovdqa         %xmm3,%xmm10
-       cmp             $0x80,%rcx
-       jl              .Lxorpart4
-       vpxord          0x70(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x70(%rsi)
-
-       # o0 = i0 ^ (x0 + s0), third block
-       vpaddd          %ymm11,%ymm4,%ymm10
-       cmp             $0x90,%rcx
-       jl              .Lxorpart4
-       vpxord          0x80(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x80(%rsi)
-       vextracti128    $1,%ymm10,%xmm4
-       # o1 = i1 ^ (x1 + s1), third block
-       vpaddd          %ymm12,%ymm5,%ymm10
-       cmp             $0xa0,%rcx
-       jl              .Lxorpart4
-       vpxord          0x90(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0x90(%rsi)
-       vextracti128    $1,%ymm10,%xmm5
-       # o2 = i2 ^ (x2 + s2), third block
-       vpaddd          %ymm13,%ymm6,%ymm10
-       cmp             $0xb0,%rcx
-       jl              .Lxorpart4
-       vpxord          0xa0(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0xa0(%rsi)
-       vextracti128    $1,%ymm10,%xmm6
-       # o3 = i3 ^ (x3 + s3), third block
-       vpaddd          %ymm15,%ymm7,%ymm10
-       cmp             $0xc0,%rcx
-       jl              .Lxorpart4
-       vpxord          0xb0(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0xb0(%rsi)
-       vextracti128    $1,%ymm10,%xmm7
-
-       # xor and write fourth block
-       vmovdqa         %xmm4,%xmm10
-       cmp             $0xd0,%rcx
-       jl              .Lxorpart4
-       vpxord          0xc0(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0xc0(%rsi)
-
-       vmovdqa         %xmm5,%xmm10
-       cmp             $0xe0,%rcx
-       jl              .Lxorpart4
-       vpxord          0xd0(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0xd0(%rsi)
-
-       vmovdqa         %xmm6,%xmm10
-       cmp             $0xf0,%rcx
-       jl              .Lxorpart4
-       vpxord          0xe0(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0xe0(%rsi)
-
-       vmovdqa         %xmm7,%xmm10
-       cmp             $0x100,%rcx
-       jl              .Lxorpart4
-       vpxord          0xf0(%rdx),%xmm10,%xmm9
-       vmovdqu         %xmm9,0xf0(%rsi)
-
-.Ldone4:
-       vzeroupper
-       ret
-
-.Lxorpart4:
-       # xor remaining bytes from partial register into output
-       mov             %rcx,%rax
-       and             $0xf,%rcx
-       jz              .Ldone8
-       mov             %rax,%r9
-       and             $~0xf,%r9
-
-       mov             $1,%rax
-       shld            %cl,%rax,%rax
-       sub             $1,%rax
-       kmovq           %rax,%k1
-
-       vmovdqu8        (%rdx,%r9),%xmm1{%k1}{z}
-       vpxord          %xmm10,%xmm1,%xmm1
-       vmovdqu8        %xmm1,(%rsi,%r9){%k1}
-
-       jmp             .Ldone4
-
-ENDPROC(chacha20_4block_xor_avx512vl)
-
-ENTRY(chacha20_8block_xor_avx512vl)
-       # %rdi: Input state matrix, s
-       # %rsi: up to 8 data blocks output, o
-       # %rdx: up to 8 data blocks input, i
-       # %rcx: input/output length in bytes
-
-       # This function encrypts eight consecutive ChaCha20 blocks by loading
-       # the state matrix in AVX registers eight times. Compared to AVX2, this
-       # mostly benefits from the new AVX-512VL rotate instructions and the
-       # additional registers.
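
The rotate instructions referred to above collapse the three-instruction shift+shift+or rotate of the AVX2 path into a single vprold. A hedged side-by-side sketch in C intrinsics, in macro form so the rotate count stays a compile-time immediate (the first macro assumes AVX-512VL, e.g. -mavx512vl):

#include <immintrin.h>

/* AVX-512VL: one instruction per 32-bit lane rotate (vprold). */
#define CHACHA_ROTL32_VL(v, n)	_mm256_rol_epi32((v), (n))

/* AVX2 fallback used by chacha-avx2-x86_64.S for the 7/12-bit rotates:
 * vpslld + vpsrld + vpor.
 */
#define CHACHA_ROTL32_AVX2(v, n)					\
	_mm256_or_si256(_mm256_slli_epi32((v), (n)),			\
			_mm256_srli_epi32((v), 32 - (n)))
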
-
-       vzeroupper
-
-       # x0..15[0-7] = s[0..15]
-       vpbroadcastd    0x00(%rdi),%ymm0
-       vpbroadcastd    0x04(%rdi),%ymm1
-       vpbroadcastd    0x08(%rdi),%ymm2
-       vpbroadcastd    0x0c(%rdi),%ymm3
-       vpbroadcastd    0x10(%rdi),%ymm4
-       vpbroadcastd    0x14(%rdi),%ymm5
-       vpbroadcastd    0x18(%rdi),%ymm6
-       vpbroadcastd    0x1c(%rdi),%ymm7
-       vpbroadcastd    0x20(%rdi),%ymm8
-       vpbroadcastd    0x24(%rdi),%ymm9
-       vpbroadcastd    0x28(%rdi),%ymm10
-       vpbroadcastd    0x2c(%rdi),%ymm11
-       vpbroadcastd    0x30(%rdi),%ymm12
-       vpbroadcastd    0x34(%rdi),%ymm13
-       vpbroadcastd    0x38(%rdi),%ymm14
-       vpbroadcastd    0x3c(%rdi),%ymm15
-
-       # x12 += counter values 0-3
-       vpaddd          CTR8BL(%rip),%ymm12,%ymm12
-
-       vmovdqa64       %ymm0,%ymm16
-       vmovdqa64       %ymm1,%ymm17
-       vmovdqa64       %ymm2,%ymm18
-       vmovdqa64       %ymm3,%ymm19
-       vmovdqa64       %ymm4,%ymm20
-       vmovdqa64       %ymm5,%ymm21
-       vmovdqa64       %ymm6,%ymm22
-       vmovdqa64       %ymm7,%ymm23
-       vmovdqa64       %ymm8,%ymm24
-       vmovdqa64       %ymm9,%ymm25
-       vmovdqa64       %ymm10,%ymm26
-       vmovdqa64       %ymm11,%ymm27
-       vmovdqa64       %ymm12,%ymm28
-       vmovdqa64       %ymm13,%ymm29
-       vmovdqa64       %ymm14,%ymm30
-       vmovdqa64       %ymm15,%ymm31
-
-       mov             $10,%eax
-
-.Ldoubleround8:
-       # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
-       vpaddd          %ymm0,%ymm4,%ymm0
-       vpxord          %ymm0,%ymm12,%ymm12
-       vprold          $16,%ymm12,%ymm12
-       # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
-       vpaddd          %ymm1,%ymm5,%ymm1
-       vpxord          %ymm1,%ymm13,%ymm13
-       vprold          $16,%ymm13,%ymm13
-       # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
-       vpaddd          %ymm2,%ymm6,%ymm2
-       vpxord          %ymm2,%ymm14,%ymm14
-       vprold          $16,%ymm14,%ymm14
-       # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
-       vpaddd          %ymm3,%ymm7,%ymm3
-       vpxord          %ymm3,%ymm15,%ymm15
-       vprold          $16,%ymm15,%ymm15
-
-       # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
-       vpaddd          %ymm12,%ymm8,%ymm8
-       vpxord          %ymm8,%ymm4,%ymm4
-       vprold          $12,%ymm4,%ymm4
-       # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
-       vpaddd          %ymm13,%ymm9,%ymm9
-       vpxord          %ymm9,%ymm5,%ymm5
-       vprold          $12,%ymm5,%ymm5
-       # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
-       vpaddd          %ymm14,%ymm10,%ymm10
-       vpxord          %ymm10,%ymm6,%ymm6
-       vprold          $12,%ymm6,%ymm6
-       # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
-       vpaddd          %ymm15,%ymm11,%ymm11
-       vpxord          %ymm11,%ymm7,%ymm7
-       vprold          $12,%ymm7,%ymm7
-
-       # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
-       vpaddd          %ymm0,%ymm4,%ymm0
-       vpxord          %ymm0,%ymm12,%ymm12
-       vprold          $8,%ymm12,%ymm12
-       # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
-       vpaddd          %ymm1,%ymm5,%ymm1
-       vpxord          %ymm1,%ymm13,%ymm13
-       vprold          $8,%ymm13,%ymm13
-       # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
-       vpaddd          %ymm2,%ymm6,%ymm2
-       vpxord          %ymm2,%ymm14,%ymm14
-       vprold          $8,%ymm14,%ymm14
-       # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
-       vpaddd          %ymm3,%ymm7,%ymm3
-       vpxord          %ymm3,%ymm15,%ymm15
-       vprold          $8,%ymm15,%ymm15
-
-       # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
-       vpaddd          %ymm12,%ymm8,%ymm8
-       vpxord          %ymm8,%ymm4,%ymm4
-       vprold          $7,%ymm4,%ymm4
-       # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
-       vpaddd          %ymm13,%ymm9,%ymm9
-       vpxord          %ymm9,%ymm5,%ymm5
-       vprold          $7,%ymm5,%ymm5
-       # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
-       vpaddd          %ymm14,%ymm10,%ymm10
-       vpxord          %ymm10,%ymm6,%ymm6
-       vprold          $7,%ymm6,%ymm6
-       # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
-       vpaddd          %ymm15,%ymm11,%ymm11
-       vpxord          %ymm11,%ymm7,%ymm7
-       vprold          $7,%ymm7,%ymm7
-
-       # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
-       vpaddd          %ymm0,%ymm5,%ymm0
-       vpxord          %ymm0,%ymm15,%ymm15
-       vprold          $16,%ymm15,%ymm15
-       # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
-       vpaddd          %ymm1,%ymm6,%ymm1
-       vpxord          %ymm1,%ymm12,%ymm12
-       vprold          $16,%ymm12,%ymm12
-       # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
-       vpaddd          %ymm2,%ymm7,%ymm2
-       vpxord          %ymm2,%ymm13,%ymm13
-       vprold          $16,%ymm13,%ymm13
-       # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
-       vpaddd          %ymm3,%ymm4,%ymm3
-       vpxord          %ymm3,%ymm14,%ymm14
-       vprold          $16,%ymm14,%ymm14
-
-       # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
-       vpaddd          %ymm15,%ymm10,%ymm10
-       vpxord          %ymm10,%ymm5,%ymm5
-       vprold          $12,%ymm5,%ymm5
-       # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
-       vpaddd          %ymm12,%ymm11,%ymm11
-       vpxord          %ymm11,%ymm6,%ymm6
-       vprold          $12,%ymm6,%ymm6
-       # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
-       vpaddd          %ymm13,%ymm8,%ymm8
-       vpxord          %ymm8,%ymm7,%ymm7
-       vprold          $12,%ymm7,%ymm7
-       # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
-       vpaddd          %ymm14,%ymm9,%ymm9
-       vpxord          %ymm9,%ymm4,%ymm4
-       vprold          $12,%ymm4,%ymm4
-
-       # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
-       vpaddd          %ymm0,%ymm5,%ymm0
-       vpxord          %ymm0,%ymm15,%ymm15
-       vprold          $8,%ymm15,%ymm15
-       # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
-       vpaddd          %ymm1,%ymm6,%ymm1
-       vpxord          %ymm1,%ymm12,%ymm12
-       vprold          $8,%ymm12,%ymm12
-       # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
-       vpaddd          %ymm2,%ymm7,%ymm2
-       vpxord          %ymm2,%ymm13,%ymm13
-       vprold          $8,%ymm13,%ymm13
-       # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
-       vpaddd          %ymm3,%ymm4,%ymm3
-       vpxord          %ymm3,%ymm14,%ymm14
-       vprold          $8,%ymm14,%ymm14
-
-       # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
-       vpaddd          %ymm15,%ymm10,%ymm10
-       vpxord          %ymm10,%ymm5,%ymm5
-       vprold          $7,%ymm5,%ymm5
-       # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
-       vpaddd          %ymm12,%ymm11,%ymm11
-       vpxord          %ymm11,%ymm6,%ymm6
-       vprold          $7,%ymm6,%ymm6
-       # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
-       vpaddd          %ymm13,%ymm8,%ymm8
-       vpxord          %ymm8,%ymm7,%ymm7
-       vprold          $7,%ymm7,%ymm7
-       # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
-       vpaddd          %ymm14,%ymm9,%ymm9
-       vpxord          %ymm9,%ymm4,%ymm4
-       vprold          $7,%ymm4,%ymm4
-
-       dec             %eax
-       jnz             .Ldoubleround8
-
-       # x0..15[0-7] += s[0..15]
-       vpaddd          %ymm16,%ymm0,%ymm0
-       vpaddd          %ymm17,%ymm1,%ymm1
-       vpaddd          %ymm18,%ymm2,%ymm2
-       vpaddd          %ymm19,%ymm3,%ymm3
-       vpaddd          %ymm20,%ymm4,%ymm4
-       vpaddd          %ymm21,%ymm5,%ymm5
-       vpaddd          %ymm22,%ymm6,%ymm6
-       vpaddd          %ymm23,%ymm7,%ymm7
-       vpaddd          %ymm24,%ymm8,%ymm8
-       vpaddd          %ymm25,%ymm9,%ymm9
-       vpaddd          %ymm26,%ymm10,%ymm10
-       vpaddd          %ymm27,%ymm11,%ymm11
-       vpaddd          %ymm28,%ymm12,%ymm12
-       vpaddd          %ymm29,%ymm13,%ymm13
-       vpaddd          %ymm30,%ymm14,%ymm14
-       vpaddd          %ymm31,%ymm15,%ymm15
-
-       # interleave 32-bit words in state n, n+1
-       vpunpckldq      %ymm1,%ymm0,%ymm16
-       vpunpckhdq      %ymm1,%ymm0,%ymm17
-       vpunpckldq      %ymm3,%ymm2,%ymm18
-       vpunpckhdq      %ymm3,%ymm2,%ymm19
-       vpunpckldq      %ymm5,%ymm4,%ymm20
-       vpunpckhdq      %ymm5,%ymm4,%ymm21
-       vpunpckldq      %ymm7,%ymm6,%ymm22
-       vpunpckhdq      %ymm7,%ymm6,%ymm23
-       vpunpckldq      %ymm9,%ymm8,%ymm24
-       vpunpckhdq      %ymm9,%ymm8,%ymm25
-       vpunpckldq      %ymm11,%ymm10,%ymm26
-       vpunpckhdq      %ymm11,%ymm10,%ymm27
-       vpunpckldq      %ymm13,%ymm12,%ymm28
-       vpunpckhdq      %ymm13,%ymm12,%ymm29
-       vpunpckldq      %ymm15,%ymm14,%ymm30
-       vpunpckhdq      %ymm15,%ymm14,%ymm31
-
-       # interleave 64-bit words in state n, n+2
-       vpunpcklqdq     %ymm18,%ymm16,%ymm0
-       vpunpcklqdq     %ymm19,%ymm17,%ymm1
-       vpunpckhqdq     %ymm18,%ymm16,%ymm2
-       vpunpckhqdq     %ymm19,%ymm17,%ymm3
-       vpunpcklqdq     %ymm22,%ymm20,%ymm4
-       vpunpcklqdq     %ymm23,%ymm21,%ymm5
-       vpunpckhqdq     %ymm22,%ymm20,%ymm6
-       vpunpckhqdq     %ymm23,%ymm21,%ymm7
-       vpunpcklqdq     %ymm26,%ymm24,%ymm8
-       vpunpcklqdq     %ymm27,%ymm25,%ymm9
-       vpunpckhqdq     %ymm26,%ymm24,%ymm10
-       vpunpckhqdq     %ymm27,%ymm25,%ymm11
-       vpunpcklqdq     %ymm30,%ymm28,%ymm12
-       vpunpcklqdq     %ymm31,%ymm29,%ymm13
-       vpunpckhqdq     %ymm30,%ymm28,%ymm14
-       vpunpckhqdq     %ymm31,%ymm29,%ymm15
-
-       # interleave 128-bit words in state n, n+4
-       # xor/write first four blocks
-       vmovdqa64       %ymm0,%ymm16
-       vperm2i128      $0x20,%ymm4,%ymm0,%ymm0
-       cmp             $0x0020,%rcx
-       jl              .Lxorpart8
-       vpxord          0x0000(%rdx),%ymm0,%ymm0
-       vmovdqu64       %ymm0,0x0000(%rsi)
-       vmovdqa64       %ymm16,%ymm0
-       vperm2i128      $0x31,%ymm4,%ymm0,%ymm4
-
-       vperm2i128      $0x20,%ymm12,%ymm8,%ymm0
-       cmp             $0x0040,%rcx
-       jl              .Lxorpart8
-       vpxord          0x0020(%rdx),%ymm0,%ymm0
-       vmovdqu64       %ymm0,0x0020(%rsi)
-       vperm2i128      $0x31,%ymm12,%ymm8,%ymm12
-
-       vperm2i128      $0x20,%ymm6,%ymm2,%ymm0
-       cmp             $0x0060,%rcx
-       jl              .Lxorpart8
-       vpxord          0x0040(%rdx),%ymm0,%ymm0
-       vmovdqu64       %ymm0,0x0040(%rsi)
-       vperm2i128      $0x31,%ymm6,%ymm2,%ymm6
-
-       vperm2i128      $0x20,%ymm14,%ymm10,%ymm0
-       cmp             $0x0080,%rcx
-       jl              .Lxorpart8
-       vpxord          0x0060(%rdx),%ymm0,%ymm0
-       vmovdqu64       %ymm0,0x0060(%rsi)
-       vperm2i128      $0x31,%ymm14,%ymm10,%ymm14
-
-       vperm2i128      $0x20,%ymm5,%ymm1,%ymm0
-       cmp             $0x00a0,%rcx
-       jl              .Lxorpart8
-       vpxord          0x0080(%rdx),%ymm0,%ymm0
-       vmovdqu64       %ymm0,0x0080(%rsi)
-       vperm2i128      $0x31,%ymm5,%ymm1,%ymm5
-
-       vperm2i128      $0x20,%ymm13,%ymm9,%ymm0
-       cmp             $0x00c0,%rcx
-       jl              .Lxorpart8
-       vpxord          0x00a0(%rdx),%ymm0,%ymm0
-       vmovdqu64       %ymm0,0x00a0(%rsi)
-       vperm2i128      $0x31,%ymm13,%ymm9,%ymm13
-
-       vperm2i128      $0x20,%ymm7,%ymm3,%ymm0
-       cmp             $0x00e0,%rcx
-       jl              .Lxorpart8
-       vpxord          0x00c0(%rdx),%ymm0,%ymm0
-       vmovdqu64       %ymm0,0x00c0(%rsi)
-       vperm2i128      $0x31,%ymm7,%ymm3,%ymm7
-
-       vperm2i128      $0x20,%ymm15,%ymm11,%ymm0
-       cmp             $0x0100,%rcx
-       jl              .Lxorpart8
-       vpxord          0x00e0(%rdx),%ymm0,%ymm0
-       vmovdqu64       %ymm0,0x00e0(%rsi)
-       vperm2i128      $0x31,%ymm15,%ymm11,%ymm15
-
-       # xor remaining blocks, write to output
-       vmovdqa64       %ymm4,%ymm0
-       cmp             $0x0120,%rcx
-       jl              .Lxorpart8
-       vpxord          0x0100(%rdx),%ymm0,%ymm0
-       vmovdqu64       %ymm0,0x0100(%rsi)
-
-       vmovdqa64       %ymm12,%ymm0
-       cmp             $0x0140,%rcx
-       jl              .Lxorpart8
-       vpxord          0x0120(%rdx),%ymm0,%ymm0
-       vmovdqu64       %ymm0,0x0120(%rsi)
-
-       vmovdqa64       %ymm6,%ymm0
-       cmp             $0x0160,%rcx
-       jl              .Lxorpart8
-       vpxord          0x0140(%rdx),%ymm0,%ymm0
-       vmovdqu64       %ymm0,0x0140(%rsi)
-
-       vmovdqa64       %ymm14,%ymm0
-       cmp             $0x0180,%rcx
-       jl              .Lxorpart8
-       vpxord          0x0160(%rdx),%ymm0,%ymm0
-       vmovdqu64       %ymm0,0x0160(%rsi)
-
-       vmovdqa64       %ymm5,%ymm0
-       cmp             $0x01a0,%rcx
-       jl              .Lxorpart8
-       vpxord          0x0180(%rdx),%ymm0,%ymm0
-       vmovdqu64       %ymm0,0x0180(%rsi)
-
-       vmovdqa64       %ymm13,%ymm0
-       cmp             $0x01c0,%rcx
-       jl              .Lxorpart8
-       vpxord          0x01a0(%rdx),%ymm0,%ymm0
-       vmovdqu64       %ymm0,0x01a0(%rsi)
-
-       vmovdqa64       %ymm7,%ymm0
-       cmp             $0x01e0,%rcx
-       jl              .Lxorpart8
-       vpxord          0x01c0(%rdx),%ymm0,%ymm0
-       vmovdqu64       %ymm0,0x01c0(%rsi)
-
-       vmovdqa64       %ymm15,%ymm0
-       cmp             $0x0200,%rcx
-       jl              .Lxorpart8
-       vpxord          0x01e0(%rdx),%ymm0,%ymm0
-       vmovdqu64       %ymm0,0x01e0(%rsi)
-
-.Ldone8:
-       vzeroupper
-       ret
-
-.Lxorpart8:
-       # xor remaining bytes from partial register into output
-       mov             %rcx,%rax
-       and             $0x1f,%rcx
-       jz              .Ldone8
-       mov             %rax,%r9
-       and             $~0x1f,%r9
-
-       mov             $1,%rax
-       shld            %cl,%rax,%rax
-       sub             $1,%rax
-       kmovq           %rax,%k1
-
-       vmovdqu8        (%rdx,%r9),%ymm1{%k1}{z}
-       vpxord          %ymm0,%ymm1,%ymm1
-       vmovdqu8        %ymm1,(%rsi,%r9){%k1}
-
-       jmp             .Ldone8
-
-ENDPROC(chacha20_8block_xor_avx512vl)
diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S
deleted file mode 100644 (file)
index f679278..0000000
+++ /dev/null
@@ -1,792 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-.section       .rodata.cst16.ROT8, "aM", @progbits, 16
-.align 16
-ROT8:  .octa 0x0e0d0c0f0a09080b0605040702010003
-.section       .rodata.cst16.ROT16, "aM", @progbits, 16
-.align 16
-ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
-.section       .rodata.cst16.CTRINC, "aM", @progbits, 16
-.align 16
-CTRINC:        .octa 0x00000003000000020000000100000000
-
-.text
-
-/*
- * chacha20_permute - permute one block
- *
- * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3.  This
- * function performs matrix operations on four words in parallel, but requires
- * shuffling to rearrange the words after each round.  8/16-bit word rotation is
- * done with the slightly better performing SSSE3 byte shuffling, while 7/12-bit
- * word rotation uses the traditional shift+OR.
- *
- * Clobbers: %ecx, %xmm4-%xmm7
- */
-chacha20_permute:
-
-       movdqa          ROT8(%rip),%xmm4
-       movdqa          ROT16(%rip),%xmm5
-       mov             $10,%ecx
-
-.Ldoubleround:
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-       paddd           %xmm1,%xmm0
-       pxor            %xmm0,%xmm3
-       pshufb          %xmm5,%xmm3
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-       paddd           %xmm3,%xmm2
-       pxor            %xmm2,%xmm1
-       movdqa          %xmm1,%xmm6
-       pslld           $12,%xmm6
-       psrld           $20,%xmm1
-       por             %xmm6,%xmm1
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-       paddd           %xmm1,%xmm0
-       pxor            %xmm0,%xmm3
-       pshufb          %xmm4,%xmm3
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-       paddd           %xmm3,%xmm2
-       pxor            %xmm2,%xmm1
-       movdqa          %xmm1,%xmm7
-       pslld           $7,%xmm7
-       psrld           $25,%xmm1
-       por             %xmm7,%xmm1
-
-       # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
-       pshufd          $0x39,%xmm1,%xmm1
-       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-       pshufd          $0x4e,%xmm2,%xmm2
-       # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
-       pshufd          $0x93,%xmm3,%xmm3
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-       paddd           %xmm1,%xmm0
-       pxor            %xmm0,%xmm3
-       pshufb          %xmm5,%xmm3
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-       paddd           %xmm3,%xmm2
-       pxor            %xmm2,%xmm1
-       movdqa          %xmm1,%xmm6
-       pslld           $12,%xmm6
-       psrld           $20,%xmm1
-       por             %xmm6,%xmm1
-
-       # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-       paddd           %xmm1,%xmm0
-       pxor            %xmm0,%xmm3
-       pshufb          %xmm4,%xmm3
-
-       # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-       paddd           %xmm3,%xmm2
-       pxor            %xmm2,%xmm1
-       movdqa          %xmm1,%xmm7
-       pslld           $7,%xmm7
-       psrld           $25,%xmm1
-       por             %xmm7,%xmm1
-
-       # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
-       pshufd          $0x93,%xmm1,%xmm1
-       # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-       pshufd          $0x4e,%xmm2,%xmm2
-       # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
-       pshufd          $0x39,%xmm3,%xmm3
-
-       dec             %ecx
-       jnz             .Ldoubleround
-
-       ret
-ENDPROC(chacha20_permute)
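
For reference, a hedged C sketch of how the callers below use chacha20_permute: permute a copy of the state, add the original state back as the feed-forward, and XOR the resulting keystream over the input. A little-endian host is assumed and the names are illustrative, not part of this code.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void chacha20_block_xor_ref(const uint32_t state[16], uint8_t *dst,
				   const uint8_t *src, size_t len,
				   void (*permute)(uint32_t x[16]))
{
	uint32_t x[16];
	uint8_t stream[64];

	memcpy(x, state, sizeof(x));
	permute(x);			/* the 10 double rounds above */
	for (int i = 0; i < 16; i++)
		x[i] += state[i];	/* o = i ^ (x + s)            */
	memcpy(stream, x, sizeof(stream));
	if (len > sizeof(stream))
		len = sizeof(stream);
	for (size_t i = 0; i < len; i++)
		dst[i] = src[i] ^ stream[i];
}
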
-
-ENTRY(chacha20_block_xor_ssse3)
-       # %rdi: Input state matrix, s
-       # %rsi: up to 1 data block output, o
-       # %rdx: up to 1 data block input, i
-       # %rcx: input/output length in bytes
-       FRAME_BEGIN
-
-       # x0..3 = s0..3
-       movdqa          0x00(%rdi),%xmm0
-       movdqa          0x10(%rdi),%xmm1
-       movdqa          0x20(%rdi),%xmm2
-       movdqa          0x30(%rdi),%xmm3
-       movdqa          %xmm0,%xmm8
-       movdqa          %xmm1,%xmm9
-       movdqa          %xmm2,%xmm10
-       movdqa          %xmm3,%xmm11
-
-       mov             %rcx,%rax
-       call            chacha20_permute
-
-       # o0 = i0 ^ (x0 + s0)
-       paddd           %xmm8,%xmm0
-       cmp             $0x10,%rax
-       jl              .Lxorpart
-       movdqu          0x00(%rdx),%xmm4
-       pxor            %xmm4,%xmm0
-       movdqu          %xmm0,0x00(%rsi)
-       # o1 = i1 ^ (x1 + s1)
-       paddd           %xmm9,%xmm1
-       movdqa          %xmm1,%xmm0
-       cmp             $0x20,%rax
-       jl              .Lxorpart
-       movdqu          0x10(%rdx),%xmm0
-       pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0x10(%rsi)
-       # o2 = i2 ^ (x2 + s2)
-       paddd           %xmm10,%xmm2
-       movdqa          %xmm2,%xmm0
-       cmp             $0x30,%rax
-       jl              .Lxorpart
-       movdqu          0x20(%rdx),%xmm0
-       pxor            %xmm2,%xmm0
-       movdqu          %xmm0,0x20(%rsi)
-       # o3 = i3 ^ (x3 + s3)
-       paddd           %xmm11,%xmm3
-       movdqa          %xmm3,%xmm0
-       cmp             $0x40,%rax
-       jl              .Lxorpart
-       movdqu          0x30(%rdx),%xmm0
-       pxor            %xmm3,%xmm0
-       movdqu          %xmm0,0x30(%rsi)
-
-.Ldone:
-       FRAME_END
-       ret
-
-.Lxorpart:
-       # xor remaining bytes from partial register into output
-       mov             %rax,%r9
-       and             $0x0f,%r9
-       jz              .Ldone
-       and             $~0x0f,%rax
-
-       mov             %rsi,%r11
-
-       lea             8(%rsp),%r10
-       sub             $0x10,%rsp
-       and             $~31,%rsp
-
-       lea             (%rdx,%rax),%rsi
-       mov             %rsp,%rdi
-       mov             %r9,%rcx
-       rep movsb
-
-       pxor            0x00(%rsp),%xmm0
-       movdqa          %xmm0,0x00(%rsp)
-
-       mov             %rsp,%rsi
-       lea             (%r11,%rax),%rdi
-       mov             %r9,%rcx
-       rep movsb
-
-       lea             -8(%r10),%rsp
-       jmp             .Ldone
-
-ENDPROC(chacha20_block_xor_ssse3)
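
The .Lxorpart path above handles a trailing 1..15 bytes by bouncing them through an aligned stack buffer so the keystream XOR can still use a full 16-byte register. Roughly, in C; xor_tail() is a hypothetical helper, not something the patch defines:

#include <stdint.h>
#include <string.h>

static void xor_tail(uint8_t *dst, const uint8_t *src,
                     const uint8_t keystream[16], unsigned int tail /* 1..15 */)
{
        uint8_t buf[16] __attribute__((aligned(16))) = { 0 };
        unsigned int i;

        memcpy(buf, src, tail);                 /* rep movsb: input -> stack  */
        for (i = 0; i < 16; i++)
                buf[i] ^= keystream[i];         /* pxor on the aligned buffer */
        memcpy(dst, buf, tail);                 /* rep movsb: stack -> output */
}
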
-
-ENTRY(hchacha20_block_ssse3)
-       # %rdi: Input state matrix, s
-       # %rsi: output (8 32-bit words)
-       FRAME_BEGIN
-
-       movdqa          0x00(%rdi),%xmm0
-       movdqa          0x10(%rdi),%xmm1
-       movdqa          0x20(%rdi),%xmm2
-       movdqa          0x30(%rdi),%xmm3
-
-       call            chacha20_permute
-
-       movdqu          %xmm0,0x00(%rsi)
-       movdqu          %xmm3,0x10(%rsi)
-
-       FRAME_END
-       ret
-ENDPROC(hchacha20_block_ssse3)
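
hchacha20_block_ssse3 differs from the block-XOR routine in that the permuted state is not added back to the input and only the first and last rows (x0..x3 and x12..x15) are written out. A rough C outline, reusing the chacha_doubleround() sketch above; hchacha_block_ref() is an illustrative name:

static void hchacha_block_ref(const uint32_t state[16], uint32_t out[8])
{
        uint32_t x[16];
        int i;

        memcpy(x, state, sizeof(x));
        for (i = 0; i < 20; i += 2)             /* 10 double rounds = 20 rounds */
                chacha_doubleround(x);

        memcpy(&out[0], &x[0],  4 * sizeof(uint32_t));  /* x0..x3   */
        memcpy(&out[4], &x[12], 4 * sizeof(uint32_t));  /* x12..x15 */
}
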
-
-ENTRY(chacha20_4block_xor_ssse3)
-       # %rdi: Input state matrix, s
-       # %rsi: up to 4 data blocks output, o
-       # %rdx: up to 4 data blocks input, i
-       # %rcx: input/output length in bytes
-
-       # This function encrypts four consecutive ChaCha20 blocks by loading
-       # the state matrix in SSE registers four times. As we need some scratch
-       # registers, we save the first four registers on the stack. The
-       # algorithm performs each operation on the corresponding word of each
-       # state matrix, hence requires no word shuffling. For the final XORing
-       # step we transpose the matrix by interleaving 32- and then 64-bit
-       # words, which allows us to do XOR in SSE registers. 8/16-bit word
-       # rotation is done with the slightly better performing SSSE3 byte
-       # shuffling, 7/12-bit word rotation uses traditional shift+OR.
-
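
The transpose mentioned in the comment amounts to a 4x4 transpose of 32-bit words, built from punpckldq/punpckhdq followed by punpcklqdq/punpckhqdq: before it, each register holds one state-word position across blocks 0..3; after it, each register holds four consecutive words of a single block, ready to be XORed against 16 contiguous input bytes. A plain C picture of the effect (transpose_4x4() is illustrative only; <string.h> assumed as above):

static void transpose_4x4(uint32_t m[4][4])
{
        uint32_t t[4][4];
        int r, c;

        for (r = 0; r < 4; r++)
                for (c = 0; c < 4; c++)
                        t[c][r] = m[r][c];
        memcpy(m, t, sizeof(t));
}
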
-       lea             8(%rsp),%r10
-       sub             $0x80,%rsp
-       and             $~63,%rsp
-       mov             %rcx,%rax
-
-       # x0..15[0-3] = s0..3[0..3]
-       movq            0x00(%rdi),%xmm1
-       pshufd          $0x00,%xmm1,%xmm0
-       pshufd          $0x55,%xmm1,%xmm1
-       movq            0x08(%rdi),%xmm3
-       pshufd          $0x00,%xmm3,%xmm2
-       pshufd          $0x55,%xmm3,%xmm3
-       movq            0x10(%rdi),%xmm5
-       pshufd          $0x00,%xmm5,%xmm4
-       pshufd          $0x55,%xmm5,%xmm5
-       movq            0x18(%rdi),%xmm7
-       pshufd          $0x00,%xmm7,%xmm6
-       pshufd          $0x55,%xmm7,%xmm7
-       movq            0x20(%rdi),%xmm9
-       pshufd          $0x00,%xmm9,%xmm8
-       pshufd          $0x55,%xmm9,%xmm9
-       movq            0x28(%rdi),%xmm11
-       pshufd          $0x00,%xmm11,%xmm10
-       pshufd          $0x55,%xmm11,%xmm11
-       movq            0x30(%rdi),%xmm13
-       pshufd          $0x00,%xmm13,%xmm12
-       pshufd          $0x55,%xmm13,%xmm13
-       movq            0x38(%rdi),%xmm15
-       pshufd          $0x00,%xmm15,%xmm14
-       pshufd          $0x55,%xmm15,%xmm15
-       # x0..3 on stack
-       movdqa          %xmm0,0x00(%rsp)
-       movdqa          %xmm1,0x10(%rsp)
-       movdqa          %xmm2,0x20(%rsp)
-       movdqa          %xmm3,0x30(%rsp)
-
-       movdqa          CTRINC(%rip),%xmm1
-       movdqa          ROT8(%rip),%xmm2
-       movdqa          ROT16(%rip),%xmm3
-
-       # x12 += counter values 0-3
-       paddd           %xmm1,%xmm12
-
-       mov             $10,%ecx
-
-.Ldoubleround4:
-       # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
-       movdqa          0x00(%rsp),%xmm0
-       paddd           %xmm4,%xmm0
-       movdqa          %xmm0,0x00(%rsp)
-       pxor            %xmm0,%xmm12
-       pshufb          %xmm3,%xmm12
-       # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
-       movdqa          0x10(%rsp),%xmm0
-       paddd           %xmm5,%xmm0
-       movdqa          %xmm0,0x10(%rsp)
-       pxor            %xmm0,%xmm13
-       pshufb          %xmm3,%xmm13
-       # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
-       movdqa          0x20(%rsp),%xmm0
-       paddd           %xmm6,%xmm0
-       movdqa          %xmm0,0x20(%rsp)
-       pxor            %xmm0,%xmm14
-       pshufb          %xmm3,%xmm14
-       # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
-       movdqa          0x30(%rsp),%xmm0
-       paddd           %xmm7,%xmm0
-       movdqa          %xmm0,0x30(%rsp)
-       pxor            %xmm0,%xmm15
-       pshufb          %xmm3,%xmm15
-
-       # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
-       paddd           %xmm12,%xmm8
-       pxor            %xmm8,%xmm4
-       movdqa          %xmm4,%xmm0
-       pslld           $12,%xmm0
-       psrld           $20,%xmm4
-       por             %xmm0,%xmm4
-       # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
-       paddd           %xmm13,%xmm9
-       pxor            %xmm9,%xmm5
-       movdqa          %xmm5,%xmm0
-       pslld           $12,%xmm0
-       psrld           $20,%xmm5
-       por             %xmm0,%xmm5
-       # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
-       paddd           %xmm14,%xmm10
-       pxor            %xmm10,%xmm6
-       movdqa          %xmm6,%xmm0
-       pslld           $12,%xmm0
-       psrld           $20,%xmm6
-       por             %xmm0,%xmm6
-       # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
-       paddd           %xmm15,%xmm11
-       pxor            %xmm11,%xmm7
-       movdqa          %xmm7,%xmm0
-       pslld           $12,%xmm0
-       psrld           $20,%xmm7
-       por             %xmm0,%xmm7
-
-       # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
-       movdqa          0x00(%rsp),%xmm0
-       paddd           %xmm4,%xmm0
-       movdqa          %xmm0,0x00(%rsp)
-       pxor            %xmm0,%xmm12
-       pshufb          %xmm2,%xmm12
-       # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
-       movdqa          0x10(%rsp),%xmm0
-       paddd           %xmm5,%xmm0
-       movdqa          %xmm0,0x10(%rsp)
-       pxor            %xmm0,%xmm13
-       pshufb          %xmm2,%xmm13
-       # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
-       movdqa          0x20(%rsp),%xmm0
-       paddd           %xmm6,%xmm0
-       movdqa          %xmm0,0x20(%rsp)
-       pxor            %xmm0,%xmm14
-       pshufb          %xmm2,%xmm14
-       # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
-       movdqa          0x30(%rsp),%xmm0
-       paddd           %xmm7,%xmm0
-       movdqa          %xmm0,0x30(%rsp)
-       pxor            %xmm0,%xmm15
-       pshufb          %xmm2,%xmm15
-
-       # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
-       paddd           %xmm12,%xmm8
-       pxor            %xmm8,%xmm4
-       movdqa          %xmm4,%xmm0
-       pslld           $7,%xmm0
-       psrld           $25,%xmm4
-       por             %xmm0,%xmm4
-       # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
-       paddd           %xmm13,%xmm9
-       pxor            %xmm9,%xmm5
-       movdqa          %xmm5,%xmm0
-       pslld           $7,%xmm0
-       psrld           $25,%xmm5
-       por             %xmm0,%xmm5
-       # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
-       paddd           %xmm14,%xmm10
-       pxor            %xmm10,%xmm6
-       movdqa          %xmm6,%xmm0
-       pslld           $7,%xmm0
-       psrld           $25,%xmm6
-       por             %xmm0,%xmm6
-       # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
-       paddd           %xmm15,%xmm11
-       pxor            %xmm11,%xmm7
-       movdqa          %xmm7,%xmm0
-       pslld           $7,%xmm0
-       psrld           $25,%xmm7
-       por             %xmm0,%xmm7
-
-       # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
-       movdqa          0x00(%rsp),%xmm0
-       paddd           %xmm5,%xmm0
-       movdqa          %xmm0,0x00(%rsp)
-       pxor            %xmm0,%xmm15
-       pshufb          %xmm3,%xmm15
-       # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
-       movdqa          0x10(%rsp),%xmm0
-       paddd           %xmm6,%xmm0
-       movdqa          %xmm0,0x10(%rsp)
-       pxor            %xmm0,%xmm12
-       pshufb          %xmm3,%xmm12
-       # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
-       movdqa          0x20(%rsp),%xmm0
-       paddd           %xmm7,%xmm0
-       movdqa          %xmm0,0x20(%rsp)
-       pxor            %xmm0,%xmm13
-       pshufb          %xmm3,%xmm13
-       # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
-       movdqa          0x30(%rsp),%xmm0
-       paddd           %xmm4,%xmm0
-       movdqa          %xmm0,0x30(%rsp)
-       pxor            %xmm0,%xmm14
-       pshufb          %xmm3,%xmm14
-
-       # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
-       paddd           %xmm15,%xmm10
-       pxor            %xmm10,%xmm5
-       movdqa          %xmm5,%xmm0
-       pslld           $12,%xmm0
-       psrld           $20,%xmm5
-       por             %xmm0,%xmm5
-       # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
-       paddd           %xmm12,%xmm11
-       pxor            %xmm11,%xmm6
-       movdqa          %xmm6,%xmm0
-       pslld           $12,%xmm0
-       psrld           $20,%xmm6
-       por             %xmm0,%xmm6
-       # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
-       paddd           %xmm13,%xmm8
-       pxor            %xmm8,%xmm7
-       movdqa          %xmm7,%xmm0
-       pslld           $12,%xmm0
-       psrld           $20,%xmm7
-       por             %xmm0,%xmm7
-       # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
-       paddd           %xmm14,%xmm9
-       pxor            %xmm9,%xmm4
-       movdqa          %xmm4,%xmm0
-       pslld           $12,%xmm0
-       psrld           $20,%xmm4
-       por             %xmm0,%xmm4
-
-       # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
-       movdqa          0x00(%rsp),%xmm0
-       paddd           %xmm5,%xmm0
-       movdqa          %xmm0,0x00(%rsp)
-       pxor            %xmm0,%xmm15
-       pshufb          %xmm2,%xmm15
-       # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
-       movdqa          0x10(%rsp),%xmm0
-       paddd           %xmm6,%xmm0
-       movdqa          %xmm0,0x10(%rsp)
-       pxor            %xmm0,%xmm12
-       pshufb          %xmm2,%xmm12
-       # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
-       movdqa          0x20(%rsp),%xmm0
-       paddd           %xmm7,%xmm0
-       movdqa          %xmm0,0x20(%rsp)
-       pxor            %xmm0,%xmm13
-       pshufb          %xmm2,%xmm13
-       # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
-       movdqa          0x30(%rsp),%xmm0
-       paddd           %xmm4,%xmm0
-       movdqa          %xmm0,0x30(%rsp)
-       pxor            %xmm0,%xmm14
-       pshufb          %xmm2,%xmm14
-
-       # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
-       paddd           %xmm15,%xmm10
-       pxor            %xmm10,%xmm5
-       movdqa          %xmm5,%xmm0
-       pslld           $7,%xmm0
-       psrld           $25,%xmm5
-       por             %xmm0,%xmm5
-       # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
-       paddd           %xmm12,%xmm11
-       pxor            %xmm11,%xmm6
-       movdqa          %xmm6,%xmm0
-       pslld           $7,%xmm0
-       psrld           $25,%xmm6
-       por             %xmm0,%xmm6
-       # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
-       paddd           %xmm13,%xmm8
-       pxor            %xmm8,%xmm7
-       movdqa          %xmm7,%xmm0
-       pslld           $7,%xmm0
-       psrld           $25,%xmm7
-       por             %xmm0,%xmm7
-       # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
-       paddd           %xmm14,%xmm9
-       pxor            %xmm9,%xmm4
-       movdqa          %xmm4,%xmm0
-       pslld           $7,%xmm0
-       psrld           $25,%xmm4
-       por             %xmm0,%xmm4
-
-       dec             %ecx
-       jnz             .Ldoubleround4
-
-       # x0[0-3] += s0[0]
-       # x1[0-3] += s0[1]
-       movq            0x00(%rdi),%xmm3
-       pshufd          $0x00,%xmm3,%xmm2
-       pshufd          $0x55,%xmm3,%xmm3
-       paddd           0x00(%rsp),%xmm2
-       movdqa          %xmm2,0x00(%rsp)
-       paddd           0x10(%rsp),%xmm3
-       movdqa          %xmm3,0x10(%rsp)
-       # x2[0-3] += s0[2]
-       # x3[0-3] += s0[3]
-       movq            0x08(%rdi),%xmm3
-       pshufd          $0x00,%xmm3,%xmm2
-       pshufd          $0x55,%xmm3,%xmm3
-       paddd           0x20(%rsp),%xmm2
-       movdqa          %xmm2,0x20(%rsp)
-       paddd           0x30(%rsp),%xmm3
-       movdqa          %xmm3,0x30(%rsp)
-
-       # x4[0-3] += s1[0]
-       # x5[0-3] += s1[1]
-       movq            0x10(%rdi),%xmm3
-       pshufd          $0x00,%xmm3,%xmm2
-       pshufd          $0x55,%xmm3,%xmm3
-       paddd           %xmm2,%xmm4
-       paddd           %xmm3,%xmm5
-       # x6[0-3] += s1[2]
-       # x7[0-3] += s1[3]
-       movq            0x18(%rdi),%xmm3
-       pshufd          $0x00,%xmm3,%xmm2
-       pshufd          $0x55,%xmm3,%xmm3
-       paddd           %xmm2,%xmm6
-       paddd           %xmm3,%xmm7
-
-       # x8[0-3] += s2[0]
-       # x9[0-3] += s2[1]
-       movq            0x20(%rdi),%xmm3
-       pshufd          $0x00,%xmm3,%xmm2
-       pshufd          $0x55,%xmm3,%xmm3
-       paddd           %xmm2,%xmm8
-       paddd           %xmm3,%xmm9
-       # x10[0-3] += s2[2]
-       # x11[0-3] += s2[3]
-       movq            0x28(%rdi),%xmm3
-       pshufd          $0x00,%xmm3,%xmm2
-       pshufd          $0x55,%xmm3,%xmm3
-       paddd           %xmm2,%xmm10
-       paddd           %xmm3,%xmm11
-
-       # x12[0-3] += s3[0]
-       # x13[0-3] += s3[1]
-       movq            0x30(%rdi),%xmm3
-       pshufd          $0x00,%xmm3,%xmm2
-       pshufd          $0x55,%xmm3,%xmm3
-       paddd           %xmm2,%xmm12
-       paddd           %xmm3,%xmm13
-       # x14[0-3] += s3[2]
-       # x15[0-3] += s3[3]
-       movq            0x38(%rdi),%xmm3
-       pshufd          $0x00,%xmm3,%xmm2
-       pshufd          $0x55,%xmm3,%xmm3
-       paddd           %xmm2,%xmm14
-       paddd           %xmm3,%xmm15
-
-       # x12 += counter values 0-3
-       paddd           %xmm1,%xmm12
-
-       # interleave 32-bit words in state n, n+1
-       movdqa          0x00(%rsp),%xmm0
-       movdqa          0x10(%rsp),%xmm1
-       movdqa          %xmm0,%xmm2
-       punpckldq       %xmm1,%xmm2
-       punpckhdq       %xmm1,%xmm0
-       movdqa          %xmm2,0x00(%rsp)
-       movdqa          %xmm0,0x10(%rsp)
-       movdqa          0x20(%rsp),%xmm0
-       movdqa          0x30(%rsp),%xmm1
-       movdqa          %xmm0,%xmm2
-       punpckldq       %xmm1,%xmm2
-       punpckhdq       %xmm1,%xmm0
-       movdqa          %xmm2,0x20(%rsp)
-       movdqa          %xmm0,0x30(%rsp)
-       movdqa          %xmm4,%xmm0
-       punpckldq       %xmm5,%xmm4
-       punpckhdq       %xmm5,%xmm0
-       movdqa          %xmm0,%xmm5
-       movdqa          %xmm6,%xmm0
-       punpckldq       %xmm7,%xmm6
-       punpckhdq       %xmm7,%xmm0
-       movdqa          %xmm0,%xmm7
-       movdqa          %xmm8,%xmm0
-       punpckldq       %xmm9,%xmm8
-       punpckhdq       %xmm9,%xmm0
-       movdqa          %xmm0,%xmm9
-       movdqa          %xmm10,%xmm0
-       punpckldq       %xmm11,%xmm10
-       punpckhdq       %xmm11,%xmm0
-       movdqa          %xmm0,%xmm11
-       movdqa          %xmm12,%xmm0
-       punpckldq       %xmm13,%xmm12
-       punpckhdq       %xmm13,%xmm0
-       movdqa          %xmm0,%xmm13
-       movdqa          %xmm14,%xmm0
-       punpckldq       %xmm15,%xmm14
-       punpckhdq       %xmm15,%xmm0
-       movdqa          %xmm0,%xmm15
-
-       # interleave 64-bit words in state n, n+2
-       movdqa          0x00(%rsp),%xmm0
-       movdqa          0x20(%rsp),%xmm1
-       movdqa          %xmm0,%xmm2
-       punpcklqdq      %xmm1,%xmm2
-       punpckhqdq      %xmm1,%xmm0
-       movdqa          %xmm2,0x00(%rsp)
-       movdqa          %xmm0,0x20(%rsp)
-       movdqa          0x10(%rsp),%xmm0
-       movdqa          0x30(%rsp),%xmm1
-       movdqa          %xmm0,%xmm2
-       punpcklqdq      %xmm1,%xmm2
-       punpckhqdq      %xmm1,%xmm0
-       movdqa          %xmm2,0x10(%rsp)
-       movdqa          %xmm0,0x30(%rsp)
-       movdqa          %xmm4,%xmm0
-       punpcklqdq      %xmm6,%xmm4
-       punpckhqdq      %xmm6,%xmm0
-       movdqa          %xmm0,%xmm6
-       movdqa          %xmm5,%xmm0
-       punpcklqdq      %xmm7,%xmm5
-       punpckhqdq      %xmm7,%xmm0
-       movdqa          %xmm0,%xmm7
-       movdqa          %xmm8,%xmm0
-       punpcklqdq      %xmm10,%xmm8
-       punpckhqdq      %xmm10,%xmm0
-       movdqa          %xmm0,%xmm10
-       movdqa          %xmm9,%xmm0
-       punpcklqdq      %xmm11,%xmm9
-       punpckhqdq      %xmm11,%xmm0
-       movdqa          %xmm0,%xmm11
-       movdqa          %xmm12,%xmm0
-       punpcklqdq      %xmm14,%xmm12
-       punpckhqdq      %xmm14,%xmm0
-       movdqa          %xmm0,%xmm14
-       movdqa          %xmm13,%xmm0
-       punpcklqdq      %xmm15,%xmm13
-       punpckhqdq      %xmm15,%xmm0
-       movdqa          %xmm0,%xmm15
-
-       # xor with corresponding input, write to output
-       movdqa          0x00(%rsp),%xmm0
-       cmp             $0x10,%rax
-       jl              .Lxorpart4
-       movdqu          0x00(%rdx),%xmm1
-       pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0x00(%rsi)
-
-       movdqu          %xmm4,%xmm0
-       cmp             $0x20,%rax
-       jl              .Lxorpart4
-       movdqu          0x10(%rdx),%xmm1
-       pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0x10(%rsi)
-
-       movdqu          %xmm8,%xmm0
-       cmp             $0x30,%rax
-       jl              .Lxorpart4
-       movdqu          0x20(%rdx),%xmm1
-       pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0x20(%rsi)
-
-       movdqu          %xmm12,%xmm0
-       cmp             $0x40,%rax
-       jl              .Lxorpart4
-       movdqu          0x30(%rdx),%xmm1
-       pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0x30(%rsi)
-
-       movdqa          0x20(%rsp),%xmm0
-       cmp             $0x50,%rax
-       jl              .Lxorpart4
-       movdqu          0x40(%rdx),%xmm1
-       pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0x40(%rsi)
-
-       movdqu          %xmm6,%xmm0
-       cmp             $0x60,%rax
-       jl              .Lxorpart4
-       movdqu          0x50(%rdx),%xmm1
-       pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0x50(%rsi)
-
-       movdqu          %xmm10,%xmm0
-       cmp             $0x70,%rax
-       jl              .Lxorpart4
-       movdqu          0x60(%rdx),%xmm1
-       pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0x60(%rsi)
-
-       movdqu          %xmm14,%xmm0
-       cmp             $0x80,%rax
-       jl              .Lxorpart4
-       movdqu          0x70(%rdx),%xmm1
-       pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0x70(%rsi)
-
-       movdqa          0x10(%rsp),%xmm0
-       cmp             $0x90,%rax
-       jl              .Lxorpart4
-       movdqu          0x80(%rdx),%xmm1
-       pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0x80(%rsi)
-
-       movdqu          %xmm5,%xmm0
-       cmp             $0xa0,%rax
-       jl              .Lxorpart4
-       movdqu          0x90(%rdx),%xmm1
-       pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0x90(%rsi)
-
-       movdqu          %xmm9,%xmm0
-       cmp             $0xb0,%rax
-       jl              .Lxorpart4
-       movdqu          0xa0(%rdx),%xmm1
-       pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0xa0(%rsi)
-
-       movdqu          %xmm13,%xmm0
-       cmp             $0xc0,%rax
-       jl              .Lxorpart4
-       movdqu          0xb0(%rdx),%xmm1
-       pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0xb0(%rsi)
-
-       movdqa          0x30(%rsp),%xmm0
-       cmp             $0xd0,%rax
-       jl              .Lxorpart4
-       movdqu          0xc0(%rdx),%xmm1
-       pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0xc0(%rsi)
-
-       movdqu          %xmm7,%xmm0
-       cmp             $0xe0,%rax
-       jl              .Lxorpart4
-       movdqu          0xd0(%rdx),%xmm1
-       pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0xd0(%rsi)
-
-       movdqu          %xmm11,%xmm0
-       cmp             $0xf0,%rax
-       jl              .Lxorpart4
-       movdqu          0xe0(%rdx),%xmm1
-       pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0xe0(%rsi)
-
-       movdqu          %xmm15,%xmm0
-       cmp             $0x100,%rax
-       jl              .Lxorpart4
-       movdqu          0xf0(%rdx),%xmm1
-       pxor            %xmm1,%xmm0
-       movdqu          %xmm0,0xf0(%rsi)
-
-.Ldone4:
-       lea             -8(%r10),%rsp
-       ret
-
-.Lxorpart4:
-       # xor remaining bytes from partial register into output
-       mov             %rax,%r9
-       and             $0x0f,%r9
-       jz              .Ldone4
-       and             $~0x0f,%rax
-
-       mov             %rsi,%r11
-
-       lea             (%rdx,%rax),%rsi
-       mov             %rsp,%rdi
-       mov             %r9,%rcx
-       rep movsb
-
-       pxor            0x00(%rsp),%xmm0
-       movdqa          %xmm0,0x00(%rsp)
-
-       mov             %rsp,%rsi
-       lea             (%r11,%rax),%rdi
-       mov             %r9,%rcx
-       rep movsb
-
-       jmp             .Ldone4
-
-ENDPROC(chacha20_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
deleted file mode 100644 (file)
index 70d388e..0000000
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/chacha.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <asm/fpu/api.h>
-#include <asm/simd.h>
-
-#define CHACHA20_STATE_ALIGN 16
-
-asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
-                                        unsigned int len);
-asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
-                                         unsigned int len);
-asmlinkage void hchacha20_block_ssse3(const u32 *state, u32 *out);
-#ifdef CONFIG_AS_AVX2
-asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
-                                        unsigned int len);
-asmlinkage void chacha20_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
-                                        unsigned int len);
-asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
-                                        unsigned int len);
-static bool chacha20_use_avx2;
-#ifdef CONFIG_AS_AVX512
-asmlinkage void chacha20_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
-                                            unsigned int len);
-asmlinkage void chacha20_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
-                                            unsigned int len);
-asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
-                                            unsigned int len);
-static bool chacha20_use_avx512vl;
-#endif
-#endif
-
-static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks)
-{
-       len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
-       return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
-}
-
-static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
-                           unsigned int bytes)
-{
-#ifdef CONFIG_AS_AVX2
-#ifdef CONFIG_AS_AVX512
-       if (chacha20_use_avx512vl) {
-               while (bytes >= CHACHA_BLOCK_SIZE * 8) {
-                       chacha20_8block_xor_avx512vl(state, dst, src, bytes);
-                       bytes -= CHACHA_BLOCK_SIZE * 8;
-                       src += CHACHA_BLOCK_SIZE * 8;
-                       dst += CHACHA_BLOCK_SIZE * 8;
-                       state[12] += 8;
-               }
-               if (bytes > CHACHA_BLOCK_SIZE * 4) {
-                       chacha20_8block_xor_avx512vl(state, dst, src, bytes);
-                       state[12] += chacha20_advance(bytes, 8);
-                       return;
-               }
-               if (bytes > CHACHA_BLOCK_SIZE * 2) {
-                       chacha20_4block_xor_avx512vl(state, dst, src, bytes);
-                       state[12] += chacha20_advance(bytes, 4);
-                       return;
-               }
-               if (bytes) {
-                       chacha20_2block_xor_avx512vl(state, dst, src, bytes);
-                       state[12] += chacha20_advance(bytes, 2);
-                       return;
-               }
-       }
-#endif
-       if (chacha20_use_avx2) {
-               while (bytes >= CHACHA_BLOCK_SIZE * 8) {
-                       chacha20_8block_xor_avx2(state, dst, src, bytes);
-                       bytes -= CHACHA_BLOCK_SIZE * 8;
-                       src += CHACHA_BLOCK_SIZE * 8;
-                       dst += CHACHA_BLOCK_SIZE * 8;
-                       state[12] += 8;
-               }
-               if (bytes > CHACHA_BLOCK_SIZE * 4) {
-                       chacha20_8block_xor_avx2(state, dst, src, bytes);
-                       state[12] += chacha20_advance(bytes, 8);
-                       return;
-               }
-               if (bytes > CHACHA_BLOCK_SIZE * 2) {
-                       chacha20_4block_xor_avx2(state, dst, src, bytes);
-                       state[12] += chacha20_advance(bytes, 4);
-                       return;
-               }
-               if (bytes > CHACHA_BLOCK_SIZE) {
-                       chacha20_2block_xor_avx2(state, dst, src, bytes);
-                       state[12] += chacha20_advance(bytes, 2);
-                       return;
-               }
-       }
-#endif
-       while (bytes >= CHACHA_BLOCK_SIZE * 4) {
-               chacha20_4block_xor_ssse3(state, dst, src, bytes);
-               bytes -= CHACHA_BLOCK_SIZE * 4;
-               src += CHACHA_BLOCK_SIZE * 4;
-               dst += CHACHA_BLOCK_SIZE * 4;
-               state[12] += 4;
-       }
-       if (bytes > CHACHA_BLOCK_SIZE) {
-               chacha20_4block_xor_ssse3(state, dst, src, bytes);
-               state[12] += chacha20_advance(bytes, 4);
-               return;
-       }
-       if (bytes) {
-               chacha20_block_xor_ssse3(state, dst, src, bytes);
-               state[12]++;
-       }
-}
-
-static int chacha20_simd_stream_xor(struct skcipher_request *req,
-                                   struct chacha_ctx *ctx, u8 *iv)
-{
-       u32 *state, state_buf[16 + 2] __aligned(8);
-       struct skcipher_walk walk;
-       int err;
-
-       BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
-       state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
-
-       err = skcipher_walk_virt(&walk, req, true);
-
-       crypto_chacha_init(state, ctx, iv);
-
-       while (walk.nbytes > 0) {
-               unsigned int nbytes = walk.nbytes;
-
-               if (nbytes < walk.total)
-                       nbytes = round_down(nbytes, walk.stride);
-
-               chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
-                               nbytes);
-
-               err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-       }
-
-       return err;
-}
-
-static int chacha20_simd(struct skcipher_request *req)
-{
-       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-       struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-       int err;
-
-       if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
-               return crypto_chacha_crypt(req);
-
-       kernel_fpu_begin();
-       err = chacha20_simd_stream_xor(req, ctx, req->iv);
-       kernel_fpu_end();
-       return err;
-}
-
-static int xchacha20_simd(struct skcipher_request *req)
-{
-       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-       struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-       struct chacha_ctx subctx;
-       u32 *state, state_buf[16 + 2] __aligned(8);
-       u8 real_iv[16];
-       int err;
-
-       if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
-               return crypto_xchacha_crypt(req);
-
-       BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
-       state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
-       crypto_chacha_init(state, ctx, req->iv);
-
-       kernel_fpu_begin();
-
-       hchacha20_block_ssse3(state, subctx.key);
-
-       memcpy(&real_iv[0], req->iv + 24, 8);
-       memcpy(&real_iv[8], req->iv + 16, 8);
-       err = chacha20_simd_stream_xor(req, &subctx, real_iv);
-
-       kernel_fpu_end();
-
-       return err;
-}
-
-static struct skcipher_alg algs[] = {
-       {
-               .base.cra_name          = "chacha20",
-               .base.cra_driver_name   = "chacha20-simd",
-               .base.cra_priority      = 300,
-               .base.cra_blocksize     = 1,
-               .base.cra_ctxsize       = sizeof(struct chacha_ctx),
-               .base.cra_module        = THIS_MODULE,
-
-               .min_keysize            = CHACHA_KEY_SIZE,
-               .max_keysize            = CHACHA_KEY_SIZE,
-               .ivsize                 = CHACHA_IV_SIZE,
-               .chunksize              = CHACHA_BLOCK_SIZE,
-               .setkey                 = crypto_chacha20_setkey,
-               .encrypt                = chacha20_simd,
-               .decrypt                = chacha20_simd,
-       }, {
-               .base.cra_name          = "xchacha20",
-               .base.cra_driver_name   = "xchacha20-simd",
-               .base.cra_priority      = 300,
-               .base.cra_blocksize     = 1,
-               .base.cra_ctxsize       = sizeof(struct chacha_ctx),
-               .base.cra_module        = THIS_MODULE,
-
-               .min_keysize            = CHACHA_KEY_SIZE,
-               .max_keysize            = CHACHA_KEY_SIZE,
-               .ivsize                 = XCHACHA_IV_SIZE,
-               .chunksize              = CHACHA_BLOCK_SIZE,
-               .setkey                 = crypto_chacha20_setkey,
-               .encrypt                = xchacha20_simd,
-               .decrypt                = xchacha20_simd,
-       },
-};
-
-static int __init chacha20_simd_mod_init(void)
-{
-       if (!boot_cpu_has(X86_FEATURE_SSSE3))
-               return -ENODEV;
-
-#ifdef CONFIG_AS_AVX2
-       chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
-                           boot_cpu_has(X86_FEATURE_AVX2) &&
-                           cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
-#ifdef CONFIG_AS_AVX512
-       chacha20_use_avx512vl = chacha20_use_avx2 &&
-                               boot_cpu_has(X86_FEATURE_AVX512VL) &&
-                               boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
-#endif
-#endif
-       return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit chacha20_simd_mod_fini(void)
-{
-       crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-module_init(chacha20_simd_mod_init);
-module_exit(chacha20_simd_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-simd");
-MODULE_ALIAS_CRYPTO("xchacha20");
-MODULE_ALIAS_CRYPTO("xchacha20-simd");
diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
new file mode 100644 (file)
index 0000000..35fd02b
--- /dev/null
@@ -0,0 +1,270 @@
+/*
+ * x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/simd.h>
+
+#define CHACHA_STATE_ALIGN 16
+
+asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+                                      unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+                                       unsigned int len, int nrounds);
+asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
+#ifdef CONFIG_AS_AVX2
+asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+                                      unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+                                      unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+                                      unsigned int len, int nrounds);
+static bool chacha_use_avx2;
+#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+                                          unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+                                          unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+                                          unsigned int len, int nrounds);
+static bool chacha_use_avx512vl;
+#endif
+#endif
+
+static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
+{
+       len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
+       return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
+}
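
chacha_advance() converts the byte count handed to an N-block kernel into the number of 64-byte blocks actually consumed: cap at maxblocks, then round a trailing partial block up. A standalone user-space check of the arithmetic; advance() mirrors the helper above under the assumption CHACHA_BLOCK_SIZE == 64:

#include <assert.h>

#define BLK 64  /* CHACHA_BLOCK_SIZE */

static unsigned int advance(unsigned int len, unsigned int maxblocks)
{
        if (len > maxblocks * BLK)
                len = maxblocks * BLK;
        return (len + BLK - 1) / BLK;   /* round_up(len, BLK) / BLK */
}

int main(void)
{
        assert(advance(100, 4) == 2);   /* 1 full + 1 partial block     */
        assert(advance(256, 4) == 4);   /* exactly four blocks          */
        assert(advance(300, 8) == 5);   /* capped only by maxblocks = 8 */
        return 0;
}
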
+
+static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
+                         unsigned int bytes, int nrounds)
+{
+#ifdef CONFIG_AS_AVX2
+#ifdef CONFIG_AS_AVX512
+       if (chacha_use_avx512vl) {
+               while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+                       chacha_8block_xor_avx512vl(state, dst, src, bytes,
+                                                  nrounds);
+                       bytes -= CHACHA_BLOCK_SIZE * 8;
+                       src += CHACHA_BLOCK_SIZE * 8;
+                       dst += CHACHA_BLOCK_SIZE * 8;
+                       state[12] += 8;
+               }
+               if (bytes > CHACHA_BLOCK_SIZE * 4) {
+                       chacha_8block_xor_avx512vl(state, dst, src, bytes,
+                                                  nrounds);
+                       state[12] += chacha_advance(bytes, 8);
+                       return;
+               }
+               if (bytes > CHACHA_BLOCK_SIZE * 2) {
+                       chacha_4block_xor_avx512vl(state, dst, src, bytes,
+                                                  nrounds);
+                       state[12] += chacha_advance(bytes, 4);
+                       return;
+               }
+               if (bytes) {
+                       chacha_2block_xor_avx512vl(state, dst, src, bytes,
+                                                  nrounds);
+                       state[12] += chacha_advance(bytes, 2);
+                       return;
+               }
+       }
+#endif
+       if (chacha_use_avx2) {
+               while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+                       chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+                       bytes -= CHACHA_BLOCK_SIZE * 8;
+                       src += CHACHA_BLOCK_SIZE * 8;
+                       dst += CHACHA_BLOCK_SIZE * 8;
+                       state[12] += 8;
+               }
+               if (bytes > CHACHA_BLOCK_SIZE * 4) {
+                       chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+                       state[12] += chacha_advance(bytes, 8);
+                       return;
+               }
+               if (bytes > CHACHA_BLOCK_SIZE * 2) {
+                       chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
+                       state[12] += chacha_advance(bytes, 4);
+                       return;
+               }
+               if (bytes > CHACHA_BLOCK_SIZE) {
+                       chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
+                       state[12] += chacha_advance(bytes, 2);
+                       return;
+               }
+       }
+#endif
+       while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+               chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+               bytes -= CHACHA_BLOCK_SIZE * 4;
+               src += CHACHA_BLOCK_SIZE * 4;
+               dst += CHACHA_BLOCK_SIZE * 4;
+               state[12] += 4;
+       }
+       if (bytes > CHACHA_BLOCK_SIZE) {
+               chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+               state[12] += chacha_advance(bytes, 4);
+               return;
+       }
+       if (bytes) {
+               chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
+               state[12]++;
+       }
+}
+
+static int chacha_simd_stream_xor(struct skcipher_request *req,
+                                 struct chacha_ctx *ctx, u8 *iv)
+{
+       u32 *state, state_buf[16 + 2] __aligned(8);
+       struct skcipher_walk walk;
+       int err;
+
+       BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+       state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+
+       err = skcipher_walk_virt(&walk, req, true);
+
+       crypto_chacha_init(state, ctx, iv);
+
+       while (walk.nbytes > 0) {
+               unsigned int nbytes = walk.nbytes;
+
+               if (nbytes < walk.total)
+                       nbytes = round_down(nbytes, walk.stride);
+
+               chacha_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
+                             nbytes, ctx->nrounds);
+
+               err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+       }
+
+       return err;
+}
+
+static int chacha_simd(struct skcipher_request *req)
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+       int err;
+
+       if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
+               return crypto_chacha_crypt(req);
+
+       kernel_fpu_begin();
+       err = chacha_simd_stream_xor(req, ctx, req->iv);
+       kernel_fpu_end();
+       return err;
+}
+
+static int xchacha_simd(struct skcipher_request *req)
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct chacha_ctx subctx;
+       u32 *state, state_buf[16 + 2] __aligned(8);
+       u8 real_iv[16];
+       int err;
+
+       if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
+               return crypto_xchacha_crypt(req);
+
+       BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+       state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+       crypto_chacha_init(state, ctx, req->iv);
+
+       kernel_fpu_begin();
+
+       hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
+       subctx.nrounds = ctx->nrounds;
+
+       memcpy(&real_iv[0], req->iv + 24, 8);
+       memcpy(&real_iv[8], req->iv + 16, 8);
+       err = chacha_simd_stream_xor(req, &subctx, real_iv);
+
+       kernel_fpu_end();
+
+       return err;
+}
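
xchacha_simd() follows the same IV convention as the generic XChaCha code: the 32-byte IV is the 192-bit nonce (bytes 0..23) followed by the starting stream position (bytes 24..31); the first 16 nonce bytes feed HChaCha, and the remainder becomes the inner ChaCha IV. A hypothetical helper spelling out the two memcpy() calls above (illustration only, not part of the patch):

static void xchacha_build_real_iv(u8 real_iv[16], const u8 iv[32])
{
        memcpy(&real_iv[0], iv + 24, 8);        /* stream position (counter) */
        memcpy(&real_iv[8], iv + 16, 8);        /* remaining 64 nonce bits   */
}
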
+
+static struct skcipher_alg algs[] = {
+       {
+               .base.cra_name          = "chacha20",
+               .base.cra_driver_name   = "chacha20-simd",
+               .base.cra_priority      = 300,
+               .base.cra_blocksize     = 1,
+               .base.cra_ctxsize       = sizeof(struct chacha_ctx),
+               .base.cra_module        = THIS_MODULE,
+
+               .min_keysize            = CHACHA_KEY_SIZE,
+               .max_keysize            = CHACHA_KEY_SIZE,
+               .ivsize                 = CHACHA_IV_SIZE,
+               .chunksize              = CHACHA_BLOCK_SIZE,
+               .setkey                 = crypto_chacha20_setkey,
+               .encrypt                = chacha_simd,
+               .decrypt                = chacha_simd,
+       }, {
+               .base.cra_name          = "xchacha20",
+               .base.cra_driver_name   = "xchacha20-simd",
+               .base.cra_priority      = 300,
+               .base.cra_blocksize     = 1,
+               .base.cra_ctxsize       = sizeof(struct chacha_ctx),
+               .base.cra_module        = THIS_MODULE,
+
+               .min_keysize            = CHACHA_KEY_SIZE,
+               .max_keysize            = CHACHA_KEY_SIZE,
+               .ivsize                 = XCHACHA_IV_SIZE,
+               .chunksize              = CHACHA_BLOCK_SIZE,
+               .setkey                 = crypto_chacha20_setkey,
+               .encrypt                = xchacha_simd,
+               .decrypt                = xchacha_simd,
+       },
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+       if (!boot_cpu_has(X86_FEATURE_SSSE3))
+               return -ENODEV;
+
+#ifdef CONFIG_AS_AVX2
+       chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
+                         boot_cpu_has(X86_FEATURE_AVX2) &&
+                         cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#ifdef CONFIG_AS_AVX512
+       chacha_use_avx512vl = chacha_use_avx2 &&
+                             boot_cpu_has(X86_FEATURE_AVX512VL) &&
+                             boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
+#endif
+#endif
+       return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+       crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-simd");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-simd");