From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 8 Nov 2019 13:22:16 +0100
Subject: [PATCH] crypto: mips/chacha - import 32r2 ChaCha code from Zinc
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 49aa7c00eddf8d8f462b0256bd82e81762d7b0c6 upstream.
This imports the accelerated MIPS 32r2 ChaCha20 implementation from the
Zinc library.
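
For reference, the AXR macro in this file sequences the standard ChaCha
quarter-round one add/xor/rotate step at a time, four quarter-rounds in
parallel. As a rough C sketch (illustrative only, not code from this
patch):

    #include <stdint.h>

    #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

    /* One ChaCha quarter-round over four words of the 16-word state. */
    static void chacha_qr(uint32_t x[16], int a, int b, int c, int d)
    {
            x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d], 16);
            x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b], 12);
            x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d], 8);
            x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b], 7);
    }
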
Co-developed-by: René van Dorst <opensource@vdorst.com>
Signed-off-by: René van Dorst <opensource@vdorst.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/mips/crypto/chacha-core.S | 424 +++++++++++++++++++++++++++++++++
 1 file changed, 424 insertions(+)
 create mode 100644 arch/mips/crypto/chacha-core.S
+++ b/arch/mips/crypto/chacha-core.S
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+#define MASK_U32 0x3c
+#define CHACHA20_BLOCK_SIZE 64
+#define STACK_SIZE 32
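+/* MASK_U32 masks a partial-block byte count down to the bytes covered
+ * by whole 32-bit words (0x3c = 60, i.e. at most 15 full words).
+ */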
+/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
+/* Input arguments */
+/* Output argument */
+/* NONCE[0] is kept in a register and not in memory.
+ * We don't want to touch the original value in memory.
+ * Must be incremented every loop iteration.
+/* SAVED_X and SAVED_CA are set in the jump table.
+ * Use regs which are overwritten on exit so we don't leak clear data.
+ * They are used to handle the last bytes, which are not a multiple of 4.
+#define IS_UNALIGNED $s7
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define ROTR(n) rotr n, 24
+#define CPU_TO_LE32(n) \
+#define CPU_TO_LE32(n)
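+/* The ChaCha key stream is defined in little-endian byte order, so on
+ * big endian each word must be byte-swapped before the XOR; on little
+ * endian CPU_TO_LE32 expands to nothing.
+ */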
+#define FOR_EACH_WORD(x) \
+#define FOR_EACH_WORD_REV(x) \
+#define PLUS_ONE_0 1
+#define PLUS_ONE_1 2
+#define PLUS_ONE_2 3
+#define PLUS_ONE_3 4
+#define PLUS_ONE_4 5
+#define PLUS_ONE_5 6
+#define PLUS_ONE_6 7
+#define PLUS_ONE_7 8
+#define PLUS_ONE_8 9
+#define PLUS_ONE_9 10
+#define PLUS_ONE_10 11
+#define PLUS_ONE_11 12
+#define PLUS_ONE_12 13
+#define PLUS_ONE_13 14
+#define PLUS_ONE_14 15
+#define PLUS_ONE_15 16
+#define PLUS_ONE(x) PLUS_ONE_ ## x
+#define _CONCAT3(a,b,c) a ## b ## c
+#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
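+/* The preprocessor cannot do arithmetic inside token pasting, so the
+ * PLUS_ONE_x table supplies the successor of x when CONCAT3 builds the
+ * per-word label names.
+ */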
+#define STORE_UNALIGNED(x) \
+CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
+ lw T0, (x*4)(STATE); \
+ lwl T1, (x*4)+MSB ## (IN); \
+ lwr T1, (x*4)+LSB ## (IN); \
+ addu X ## x, NONCE_0; \
+ CPU_TO_LE32(X ## x); \
+ swl X ## x, (x*4)+MSB ## (OUT); \
+ swr X ## x, (x*4)+LSB ## (OUT);
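+/* The lwl/lwr and swl/swr pairs access a 32-bit word at an unaligned
+ * address; MSB and LSB pick the byte offsets matching the endianness.
+ */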
+#define STORE_ALIGNED(x) \
+CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
+ lw T0, (x*4)(STATE); \
+ lw T1, (x*4) ## (IN); \
+ addu X ## x, NONCE_0; \
+ CPU_TO_LE32(X ## x); \
+ sw X ## x, (x*4) ## (OUT);
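+/* The aligned variant can use plain lw/sw, since IN and OUT are known
+ * to be word-aligned on this path.
+ */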
+/* Jump table macro.
+ * Used for setup and for handling the last bytes, which are not a multiple of 4.
+ * X15 is free to store Xn
+ * Every jump table entry must be equal in size.
+#define JMPTBL_ALIGNED(x) \
+.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
+ b .Lchacha20_mips_xor_aligned_ ## x ## _b; \
+ addu SAVED_X, X ## x, NONCE_0; \
+ addu SAVED_X, X ## x, SAVED_CA; \
+#define JMPTBL_UNALIGNED(x) \
+.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
+ b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
+ addu SAVED_X, X ## x, NONCE_0; \
+ addu SAVED_X, X ## x, SAVED_CA; \
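+/* With equal-sized entries, the entry that handles n remaining full
+ * words sits at a fixed multiple of n past entry 0, which turns the
+ * dispatch below into a single computed jump.
+ */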
+#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
+.globl chacha20_mips
+ .frame $sp, STACK_SIZE, $ra
+ addiu $sp, -STACK_SIZE
+ /* Return if BYTES = 0. */
+ beqz BYTES, .Lchacha20_mips_end
+ lw NONCE_0, 48(STATE)
+ /* Test whether IN or OUT is unaligned.
+ * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
+ or IS_UNALIGNED, IN, OUT
+ andi IS_UNALIGNED, 0x3
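+ /* Word loads and stores require 4-byte alignment on MIPS, so when
+ * either pointer has a low bit set, the lwl/lwr-based unaligned
+ * path below must be used.
+ */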
+ /* Set number of rounds */
+ b .Lchacha20_rounds_start
+.Loop_chacha20_rounds:
+ addiu IN, CHACHA20_BLOCK_SIZE
+ addiu OUT, CHACHA20_BLOCK_SIZE
+.Lchacha20_rounds_start:
+.Loop_chacha20_xor_rounds:
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
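+ /* Each AXR line is one add-xor-rotate step of four quarter-rounds
+ * run in parallel: the first four lines form the column round
+ * (rotations 16, 12, 8, 7), the next four the diagonal round, so
+ * each pass of this loop is one ChaCha double round.
+ */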
+ bnez $at, .Loop_chacha20_xor_rounds
+ addiu BYTES, -(CHACHA20_BLOCK_SIZE)
+ /* Is data src/dst unaligned? Jump. */
+ bnez IS_UNALIGNED, .Loop_chacha20_unaligned
+ /* Set number of rounds here to fill the delay slot. */
+ /* If BYTES < 0, there is no full block. */
+ bltz BYTES, .Lchacha20_mips_no_full_block_aligned
+ FOR_EACH_WORD_REV(STORE_ALIGNED)
+ /* BYTES > 0? Loop again. */
+ bgtz BYTES, .Loop_chacha20_rounds
+ /* Placed here to fill the delay slot. */
+ /* BYTES < 0? Handle the last bytes. */
+ bltz BYTES, .Lchacha20_mips_xor_bytes
+.Lchacha20_mips_xor_done:
+ /* Restore used registers */
+ /* Write NONCE_0 back to the right location in the state */
+ sw NONCE_0, 48(STATE)
+.Lchacha20_mips_end:
+ addiu $sp, STACK_SIZE
+.Lchacha20_mips_no_full_block_aligned:
+ /* Restore the offset on BYTES */
+ addiu BYTES, CHACHA20_BLOCK_SIZE
+ /* Get number of full WORDS */
+ andi $at, BYTES, MASK_U32
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
+ /* Calculate lower half jump table offset */
+ /* Add offset to STATE */
+ addu T1, STATE, $at
+ /* Add lower half jump table addr */
+ addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
+ /* Read value from STATE */
+ /* Store the remaining byte counter as a negative value */
+ subu BYTES, $at, BYTES
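+ /* $at now holds the bytes covered by full words and BYTES the 0..3
+ * trailing bytes as a negative value; since every jump table entry
+ * has the same size, the computed jump lands on the entry that
+ * stores exactly $at/4 words.
+ */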
+ FOR_EACH_WORD(JMPTBL_ALIGNED)
+.Loop_chacha20_unaligned:
+ /* Set number of rounds here to fill the delay slot. */
+ /* If BYTES < 0, there is no full block. */
+ bltz BYTES, .Lchacha20_mips_no_full_block_unaligned
+ FOR_EACH_WORD_REV(STORE_UNALIGNED)
+ /* BYTES > 0? Loop again. */
+ bgtz BYTES, .Loop_chacha20_rounds
+ /* Write NONCE_0 back to the right location in the state */
+ sw NONCE_0, 48(STATE)
+ /* Fall through to byte handling */
+ bgez BYTES, .Lchacha20_mips_xor_done
+.Lchacha20_mips_xor_unaligned_0_b:
+.Lchacha20_mips_xor_aligned_0_b:
+ /* Placed here to fill the delay slot. */
+.Lchacha20_mips_xor_bytes:
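+ /* XOR the final 1..3 bytes one at a time with successive bytes of
+ * SAVED_X, the keystream word saved by the jump table; each
+ * $at = BYTES + n test reaching zero stops after the right count.
+ */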
+ addiu $at, BYTES, 1
+ CPU_TO_LE32(SAVED_X)
+ beqz $at, .Lchacha20_mips_xor_done
+ addiu $at, BYTES, 2
+ beqz $at, .Lchacha20_mips_xor_done
+ b .Lchacha20_mips_xor_done
+.Lchacha20_mips_no_full_block_unaligned:
+ /* Restore the offset on BYTES */
+ addiu BYTES, CHACHA20_BLOCK_SIZE
+ /* Get number of full WORDS */
+ andi $at, BYTES, MASK_U32
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
+ /* Calculate lower half jump table offset */
+ /* Add offset to STATE */
+ addu T1, STATE, $at
+ /* Add lower half jump table addr */
+ addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
+ /* Read value from STATE */
+ /* Store the remaining byte counter as a negative value */
+ subu BYTES, $at, BYTES
+ FOR_EACH_WORD(JMPTBL_UNALIGNED)