From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Wed, 8 Jul 2020 12:11:18 +0300
Subject: [PATCH] crypto: x86/chacha-sse3 - use unaligned loads for state array

commit e79a31715193686e92dadb4caedfbb1f5de3659c upstream.

Due to the fact that the x86 port does not support allocating objects
on the stack with an alignment that exceeds 8 bytes, we have a rather
ugly hack in the x86 code for ChaCha to ensure that the state array is
aligned to 16 bytes, allowing the SSE3 implementation of the algorithm
to use aligned loads.

Given that the performance benefit of using aligned loads appears to
be limited (~0.25% for 1k blocks using tcrypt on a Corei7-8650U), and
the fact that this hack has leaked into generic ChaCha code, let's just
remove the hack and switch to unaligned loads instead.
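For reference, the idiom being removed looks roughly like the sketch
below. This is a minimal, self-contained userspace illustration, not
kernel code: the PTR_ALIGN stand-in only mirrors the round-up behaviour
of the kernel macro, and the buffer size, alignment and variable names
are taken from the code deleted further down in this patch.

  #include <stdint.h>
  #include <stdio.h>

  /* Stand-in for the kernel's PTR_ALIGN(): round p up to an a-byte boundary. */
  #define PTR_ALIGN(p, a) \
          ((void *)(((uintptr_t)(p) + ((a) - 1)) & ~(uintptr_t)((a) - 1)))

  int main(void)
  {
          /*
           * The stack only guarantees 8-byte alignment, so the state buffer
           * is over-allocated by two extra 32-bit words (8 bytes of slack)
           * and the pointer is then rounded up to the next 16-byte boundary
           * before being handed to the movdqa-based SSSE3 code.
           */
          uint32_t state_buf[16 + 2] __attribute__((aligned(8)));
          uint32_t *state = PTR_ALIGN(state_buf, 16);

          printf("buf %p -> state %p (16-byte aligned: %s)\n",
                 (void *)state_buf, (void *)state,
                 ((uintptr_t)state % 16) ? "no" : "yes");
          return 0;
  }

With movdqu, the over-allocation and the PTR_ALIGN step become
unnecessary and a plain u32 state[CHACHA_STATE_WORDS] with natural
stack alignment suffices; recent x86 cores generally do not penalize
movdqu when the address happens to be aligned, which is consistent
with the ~0.25% delta measured above.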
Cc: Martin Willi <martin@strongswan.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Martin Willi <martin@strongswan.org>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/x86/crypto/chacha-ssse3-x86_64.S | 16 ++++++++--------
 arch/x86/crypto/chacha_glue.c         | 17 ++---------------
 include/crypto/chacha.h               |  4 ----
 3 files changed, 10 insertions(+), 27 deletions(-)

--- a/arch/x86/crypto/chacha-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha-ssse3-x86_64.S
@@ -120,10 +120,10 @@ ENTRY(chacha_block_xor_ssse3)
-	movdqa		0x00(%rdi),%xmm0
-	movdqa		0x10(%rdi),%xmm1
-	movdqa		0x20(%rdi),%xmm2
-	movdqa		0x30(%rdi),%xmm3
+	movdqu		0x00(%rdi),%xmm0
+	movdqu		0x10(%rdi),%xmm1
+	movdqu		0x20(%rdi),%xmm2
+	movdqu		0x30(%rdi),%xmm3
@@ -205,10 +205,10 @@ ENTRY(hchacha_block_ssse3)
-	movdqa		0x00(%rdi),%xmm0
-	movdqa		0x10(%rdi),%xmm1
-	movdqa		0x20(%rdi),%xmm2
-	movdqa		0x30(%rdi),%xmm3
+	movdqu		0x00(%rdi),%xmm0
+	movdqu		0x10(%rdi),%xmm1
+	movdqu		0x20(%rdi),%xmm2
+	movdqu		0x30(%rdi),%xmm3
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
 #include <linux/module.h>
-#define CHACHA_STATE_ALIGN 16
 asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 				       unsigned int len, int nrounds);
 asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
@@ -125,8 +123,6 @@ static void chacha_dosimd(u32 *state, u8
 void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
-	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
 	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
 		hchacha_block_generic(state, stream, nrounds);
@@ -139,8 +135,6 @@ EXPORT_SYMBOL(hchacha_block_arch);
 void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
-	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
 	chacha_init_generic(state, key, iv);
 EXPORT_SYMBOL(chacha_init_arch);
@@ -148,8 +142,6 @@ EXPORT_SYMBOL(chacha_init_arch);
 void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
-	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
 	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
 	    bytes <= CHACHA_BLOCK_SIZE)
 		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
@@ -171,15 +163,12 @@ EXPORT_SYMBOL(chacha_crypt_arch);
 static int chacha_simd_stream_xor(struct skcipher_request *req,
 				  const struct chacha_ctx *ctx, const u8 *iv)
-	u32 *state, state_buf[16 + 2] __aligned(8);
+	u32 state[CHACHA_STATE_WORDS] __aligned(8);
 	struct skcipher_walk walk;
 	err = skcipher_walk_virt(&walk, req, false);
-	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
-	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
 	chacha_init_generic(state, ctx->key, iv);
 	while (walk.nbytes > 0) {
@@ -218,12 +207,10 @@ static int xchacha_simd(struct skcipher_
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-	u32 *state, state_buf[16 + 2] __aligned(8);
+	u32 state[CHACHA_STATE_WORDS] __aligned(8);
 	struct chacha_ctx subctx;
-	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
-	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
 	chacha_init_generic(state, ctx->key, req->iv);
 	if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
--- a/include/crypto/chacha.h
+++ b/include/crypto/chacha.h
 #define CHACHA_BLOCK_SIZE	64
 #define CHACHAPOLY_IV_SIZE	12
-#ifdef CONFIG_X86_64
-#define CHACHA_STATE_WORDS	((CHACHA_BLOCK_SIZE + 12) / sizeof(u32))
-#else
 #define CHACHA_STATE_WORDS	(CHACHA_BLOCK_SIZE / sizeof(u32))
-#endif
 /* 192-bit nonce, then 64-bit stream position */
 #define XCHACHA_IV_SIZE		32