1 From 01c1104f551dae77125bb3d0f461f4084f2a98df Mon Sep 17 00:00:00 2001
2 From: Ard Biesheuvel <ardb@kernel.org>
3 Date: Fri, 8 Nov 2019 13:22:17 +0100
4 Subject: [PATCH 011/124] crypto: mips/chacha - wire up accelerated 32r2 code
7 Content-Type: text/plain; charset=UTF-8
8 Content-Transfer-Encoding: 8bit
10 commit 3a2f58f3ba4f6f44e33d1a48240d5eadb882cb59 upstream.
12 This integrates the accelerated MIPS 32r2 implementation of ChaCha
13 into both the API and library interfaces of the kernel crypto stack.
15 The significance of this is that, in addition to becoming available
16 as an accelerated library implementation, it can also be used by
17 existing crypto API code such as Adiantum (for block encryption on
18 ultra low performance cores) or IPsec using chacha20poly1305. These
19 are use cases that have already opted into using the abstract crypto
20 API. In order to support Adiantum, the core assembler routine has
21 been adapted to take the round count as a function argument rather
22 than hardcoding it to 20.
24 Co-developed-by: René van Dorst <opensource@vdorst.com>
25 Signed-off-by: René van Dorst <opensource@vdorst.com>
26 Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
27 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
28 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
30 arch/mips/Makefile | 2 +-
31 arch/mips/crypto/Makefile | 4 +
32 arch/mips/crypto/chacha-core.S | 159 ++++++++++++++++++++++++---------
33 arch/mips/crypto/chacha-glue.c | 150 +++++++++++++++++++++++++++++++
35 5 files changed, 277 insertions(+), 44 deletions(-)
36 create mode 100644 arch/mips/crypto/chacha-glue.c
38 --- a/arch/mips/Makefile
39 +++ b/arch/mips/Makefile
40 @@ -334,7 +334,7 @@ libs-$(CONFIG_MIPS_FP_SUPPORT) += arch/m
41 # See arch/mips/Kbuild for content of core part of the kernel
44 -drivers-$(CONFIG_MIPS_CRC_SUPPORT) += arch/mips/crypto/
45 +drivers-y += arch/mips/crypto/
46 drivers-$(CONFIG_OPROFILE) += arch/mips/oprofile/
48 # suspend and hibernation support
49 --- a/arch/mips/crypto/Makefile
50 +++ b/arch/mips/crypto/Makefile
54 obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32-mips.o
56 +obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
57 +chacha-mips-y := chacha-core.o chacha-glue.o
58 +AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
59 --- a/arch/mips/crypto/chacha-core.S
60 +++ b/arch/mips/crypto/chacha-core.S
62 #define CONCAT3(a,b,c) _CONCAT3(a,b,c)
64 #define STORE_UNALIGNED(x) \
65 -CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
66 +CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
68 lw T0, (x*4)(STATE); \
70 @@ -142,7 +142,7 @@ CONCAT3(.Lchacha20_mips_xor_unaligned_,
71 swr X ## x, (x*4)+LSB ## (OUT);
73 #define STORE_ALIGNED(x) \
74 -CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
75 +CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
77 lw T0, (x*4)(STATE); \
79 @@ -162,9 +162,9 @@ CONCAT3(.Lchacha20_mips_xor_aligned_, PL
80 * Every jumptable entry must be equal in size.
82 #define JMPTBL_ALIGNED(x) \
83 -.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
84 +.Lchacha_mips_jmptbl_aligned_ ## x: ; \
86 - b .Lchacha20_mips_xor_aligned_ ## x ## _b; \
87 + b .Lchacha_mips_xor_aligned_ ## x ## _b; \
89 addu SAVED_X, X ## x, NONCE_0; \
91 @@ -173,9 +173,9 @@ CONCAT3(.Lchacha20_mips_xor_aligned_, PL
94 #define JMPTBL_UNALIGNED(x) \
95 -.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
96 +.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
98 - b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
99 + b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
101 addu SAVED_X, X ## x, NONCE_0; \
103 @@ -200,15 +200,18 @@ CONCAT3(.Lchacha20_mips_xor_aligned_, PL
107 -.globl chacha20_mips
110 +.globl chacha_crypt_arch
111 +.ent chacha_crypt_arch
113 .frame $sp, STACK_SIZE, $ra
115 + /* Load number of rounds */
118 addiu $sp, -STACK_SIZE
120 /* Return bytes = 0. */
121 - beqz BYTES, .Lchacha20_mips_end
122 + beqz BYTES, .Lchacha_mips_end
124 lw NONCE_0, 48(STATE)
126 @@ -228,18 +231,15 @@ chacha20_mips:
127 or IS_UNALIGNED, IN, OUT
128 andi IS_UNALIGNED, 0x3
130 - /* Set number of rounds */
133 - b .Lchacha20_rounds_start
134 + b .Lchacha_rounds_start
137 -.Loop_chacha20_rounds:
138 +.Loop_chacha_rounds:
139 addiu IN, CHACHA20_BLOCK_SIZE
140 addiu OUT, CHACHA20_BLOCK_SIZE
143 -.Lchacha20_rounds_start:
144 +.Lchacha_rounds_start:
148 @@ -259,7 +259,7 @@ chacha20_mips:
152 -.Loop_chacha20_xor_rounds:
153 +.Loop_chacha_xor_rounds:
155 AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
156 AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
157 @@ -269,31 +269,31 @@ chacha20_mips:
158 AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
159 AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
160 AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
161 - bnez $at, .Loop_chacha20_xor_rounds
162 + bnez $at, .Loop_chacha_xor_rounds
164 addiu BYTES, -(CHACHA20_BLOCK_SIZE)
166 /* Is data src/dst unaligned? Jump */
167 - bnez IS_UNALIGNED, .Loop_chacha20_unaligned
168 + bnez IS_UNALIGNED, .Loop_chacha_unaligned
170 /* Set number rounds here to fill delayslot. */
172 + lw $at, (STACK_SIZE+16)($sp)
174 /* BYTES < 0, it has no full block. */
175 - bltz BYTES, .Lchacha20_mips_no_full_block_aligned
176 + bltz BYTES, .Lchacha_mips_no_full_block_aligned
178 FOR_EACH_WORD_REV(STORE_ALIGNED)
180 /* BYTES > 0? Loop again. */
181 - bgtz BYTES, .Loop_chacha20_rounds
182 + bgtz BYTES, .Loop_chacha_rounds
184 /* Place this here to fill delay slot */
187 /* BYTES < 0? Handle last bytes */
188 - bltz BYTES, .Lchacha20_mips_xor_bytes
189 + bltz BYTES, .Lchacha_mips_xor_bytes
191 -.Lchacha20_mips_xor_done:
192 +.Lchacha_mips_xor_done:
193 /* Restore used registers */
196 @@ -307,11 +307,11 @@ chacha20_mips:
197 /* Write NONCE_0 back to right location in state */
198 sw NONCE_0, 48(STATE)
200 -.Lchacha20_mips_end:
202 addiu $sp, STACK_SIZE
205 -.Lchacha20_mips_no_full_block_aligned:
206 +.Lchacha_mips_no_full_block_aligned:
207 /* Restore the offset on BYTES */
208 addiu BYTES, CHACHA20_BLOCK_SIZE
210 @@ -319,7 +319,7 @@ chacha20_mips:
211 andi $at, BYTES, MASK_U32
213 /* Load upper half of jump table addr */
214 - lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
215 + lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
217 /* Calculate lower half jump table offset */
219 @@ -328,7 +328,7 @@ chacha20_mips:
222 /* Add lower half jump table addr */
223 - addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
224 + addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
226 /* Read value from STATE */
228 @@ -342,31 +342,31 @@ chacha20_mips:
229 FOR_EACH_WORD(JMPTBL_ALIGNED)
232 -.Loop_chacha20_unaligned:
233 +.Loop_chacha_unaligned:
234 /* Set number rounds here to fill delayslot. */
236 + lw $at, (STACK_SIZE+16)($sp)
238 /* BYTES > 0, it has no full block. */
239 - bltz BYTES, .Lchacha20_mips_no_full_block_unaligned
240 + bltz BYTES, .Lchacha_mips_no_full_block_unaligned
242 FOR_EACH_WORD_REV(STORE_UNALIGNED)
244 /* BYTES > 0? Loop again. */
245 - bgtz BYTES, .Loop_chacha20_rounds
246 + bgtz BYTES, .Loop_chacha_rounds
248 /* Write NONCE_0 back to right location in state */
249 sw NONCE_0, 48(STATE)
252 /* Fall through to byte handling */
253 - bgez BYTES, .Lchacha20_mips_xor_done
254 -.Lchacha20_mips_xor_unaligned_0_b:
255 -.Lchacha20_mips_xor_aligned_0_b:
256 + bgez BYTES, .Lchacha_mips_xor_done
257 +.Lchacha_mips_xor_unaligned_0_b:
258 +.Lchacha_mips_xor_aligned_0_b:
259 /* Place this here to fill delay slot */
263 -.Lchacha20_mips_xor_bytes:
264 +.Lchacha_mips_xor_bytes:
268 @@ -376,22 +376,22 @@ chacha20_mips:
272 - beqz $at, .Lchacha20_mips_xor_done
273 + beqz $at, .Lchacha_mips_xor_done
280 - beqz $at, .Lchacha20_mips_xor_done
281 + beqz $at, .Lchacha_mips_xor_done
287 - b .Lchacha20_mips_xor_done
288 + b .Lchacha_mips_xor_done
290 -.Lchacha20_mips_no_full_block_unaligned:
291 +.Lchacha_mips_no_full_block_unaligned:
292 /* Restore the offset on BYTES */
293 addiu BYTES, CHACHA20_BLOCK_SIZE
295 @@ -399,7 +399,7 @@ chacha20_mips:
296 andi $at, BYTES, MASK_U32
298 /* Load upper half of jump table addr */
299 - lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
300 + lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
302 /* Calculate lower half jump table offset */
304 @@ -408,7 +408,7 @@ chacha20_mips:
307 /* Add lower half jump table addr */
308 - addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
309 + addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
311 /* Read value from STATE */
313 @@ -420,5 +420,78 @@ chacha20_mips:
316 FOR_EACH_WORD(JMPTBL_UNALIGNED)
318 +.end chacha_crypt_arch
338 +.globl hchacha_block_arch
339 +.ent hchacha_block_arch
341 + .frame $sp, STACK_SIZE, $ra
343 + addiu $sp, -STACK_SIZE
365 +.Loop_hchacha_xor_rounds:
367 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
368 + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
369 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
370 + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
371 + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
372 + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
373 + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
374 + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
375 + bnez $a2, .Loop_hchacha_xor_rounds
377 + /* Restore used register */
389 + addiu $sp, STACK_SIZE
391 +.end hchacha_block_arch
394 +++ b/arch/mips/crypto/chacha-glue.c
396 +// SPDX-License-Identifier: GPL-2.0
398 + * MIPS accelerated ChaCha and XChaCha stream ciphers,
399 + * including ChaCha20 (RFC7539)
401 + * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
404 +#include <asm/byteorder.h>
405 +#include <crypto/algapi.h>
406 +#include <crypto/internal/chacha.h>
407 +#include <crypto/internal/skcipher.h>
408 +#include <linux/kernel.h>
409 +#include <linux/module.h>
411 +asmlinkage void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
412 + unsigned int bytes, int nrounds);
413 +EXPORT_SYMBOL(chacha_crypt_arch);
415 +asmlinkage void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds);
416 +EXPORT_SYMBOL(hchacha_block_arch);
418 +void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
420 + chacha_init_generic(state, key, iv);
422 +EXPORT_SYMBOL(chacha_init_arch);
424 +static int chacha_mips_stream_xor(struct skcipher_request *req,
425 + const struct chacha_ctx *ctx, const u8 *iv)
427 + struct skcipher_walk walk;
431 + err = skcipher_walk_virt(&walk, req, false);
433 + chacha_init_generic(state, ctx->key, iv);
435 + while (walk.nbytes > 0) {
436 + unsigned int nbytes = walk.nbytes;
438 + if (nbytes < walk.total)
439 + nbytes = round_down(nbytes, walk.stride);
441 + chacha_crypt(state, walk.dst.virt.addr, walk.src.virt.addr,
442 + nbytes, ctx->nrounds);
443 + err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
449 +static int chacha_mips(struct skcipher_request *req)
451 + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
452 + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
454 + return chacha_mips_stream_xor(req, ctx, req->iv);
457 +static int xchacha_mips(struct skcipher_request *req)
459 + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
460 + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
461 + struct chacha_ctx subctx;
465 + chacha_init_generic(state, ctx->key, req->iv);
467 + hchacha_block(state, subctx.key, ctx->nrounds);
468 + subctx.nrounds = ctx->nrounds;
470 + memcpy(&real_iv[0], req->iv + 24, 8);
471 + memcpy(&real_iv[8], req->iv + 16, 8);
472 + return chacha_mips_stream_xor(req, &subctx, real_iv);
475 +static struct skcipher_alg algs[] = {
477 + .base.cra_name = "chacha20",
478 + .base.cra_driver_name = "chacha20-mips",
479 + .base.cra_priority = 200,
480 + .base.cra_blocksize = 1,
481 + .base.cra_ctxsize = sizeof(struct chacha_ctx),
482 + .base.cra_module = THIS_MODULE,
484 + .min_keysize = CHACHA_KEY_SIZE,
485 + .max_keysize = CHACHA_KEY_SIZE,
486 + .ivsize = CHACHA_IV_SIZE,
487 + .chunksize = CHACHA_BLOCK_SIZE,
488 + .setkey = chacha20_setkey,
489 + .encrypt = chacha_mips,
490 + .decrypt = chacha_mips,
492 + .base.cra_name = "xchacha20",
493 + .base.cra_driver_name = "xchacha20-mips",
494 + .base.cra_priority = 200,
495 + .base.cra_blocksize = 1,
496 + .base.cra_ctxsize = sizeof(struct chacha_ctx),
497 + .base.cra_module = THIS_MODULE,
499 + .min_keysize = CHACHA_KEY_SIZE,
500 + .max_keysize = CHACHA_KEY_SIZE,
501 + .ivsize = XCHACHA_IV_SIZE,
502 + .chunksize = CHACHA_BLOCK_SIZE,
503 + .setkey = chacha20_setkey,
504 + .encrypt = xchacha_mips,
505 + .decrypt = xchacha_mips,
507 + .base.cra_name = "xchacha12",
508 + .base.cra_driver_name = "xchacha12-mips",
509 + .base.cra_priority = 200,
510 + .base.cra_blocksize = 1,
511 + .base.cra_ctxsize = sizeof(struct chacha_ctx),
512 + .base.cra_module = THIS_MODULE,
514 + .min_keysize = CHACHA_KEY_SIZE,
515 + .max_keysize = CHACHA_KEY_SIZE,
516 + .ivsize = XCHACHA_IV_SIZE,
517 + .chunksize = CHACHA_BLOCK_SIZE,
518 + .setkey = chacha12_setkey,
519 + .encrypt = xchacha_mips,
520 + .decrypt = xchacha_mips,
524 +static int __init chacha_simd_mod_init(void)
526 + return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
529 +static void __exit chacha_simd_mod_fini(void)
531 + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
534 +module_init(chacha_simd_mod_init);
535 +module_exit(chacha_simd_mod_fini);
537 +MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (MIPS accelerated)");
538 +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
539 +MODULE_LICENSE("GPL v2");
540 +MODULE_ALIAS_CRYPTO("chacha20");
541 +MODULE_ALIAS_CRYPTO("chacha20-mips");
542 +MODULE_ALIAS_CRYPTO("xchacha20");
543 +MODULE_ALIAS_CRYPTO("xchacha20-mips");
544 +MODULE_ALIAS_CRYPTO("xchacha12");
545 +MODULE_ALIAS_CRYPTO("xchacha12-mips");
548 @@ -1423,6 +1423,12 @@ config CRYPTO_CHACHA20_X86_64
549 SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
550 XChaCha20, and XChaCha12 stream ciphers.
552 +config CRYPTO_CHACHA_MIPS
553 + tristate "ChaCha stream cipher algorithms (MIPS 32r2 optimized)"
554 + depends on CPU_MIPS32_R2
555 + select CRYPTO_BLKCIPHER
556 + select CRYPTO_ARCH_HAVE_LIB_CHACHA
559 tristate "SEED cipher algorithm"