From a9f240ba1206fb080c1b3f727dfba1512035a82b Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Sun, 5 Jan 2020 22:40:46 -0500
Subject: [PATCH 041/124] crypto: poly1305 - add new 32 and 64-bit generic
 versions

commit 1c08a104360f3e18f4ee6346c21cc3923efb952e upstream.
These two C implementations from Zinc -- a 32x32 one and a 64x64 one,
depending on the platform -- come from Andrew Moon's public domain
poly1305-donna portable code, modified for usage in the kernel. The
precomputation in the 32-bit version and the use of 64x64 multiplies in
the 64-bit version make these perform better than the code it replaces.
Moon's code is also very widespread and has received many eyeballs of
scrutiny.
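
The precomputation hinges on one identity: with p = 2^130 - 5, anything
that overflows past 2^130 folds back in after a multiply by 5, so
computing s[i] = 5 * r[i] once at setkey time turns each per-block
modular multiplication into plain multiply-accumulates. As a minimal
sketch, one column of the base-2^26 schoolbook multiply from the 32x32
version added below (h and r are the five 26-bit limbs, s the
precomputed 5*r limbs):

	d0 = ((u64)h0 * r0) + ((u64)h1 * s4) + ((u64)h2 * s3) +
	     ((u64)h3 * s2) + ((u64)h4 * s1);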
There's a bit of interference with the x86 implementation, which
relies on internal details of the old scalar implementation. In the next
commit, the x86 implementation will be replaced with a faster one that
doesn't rely on this, so none of this matters much. But for now, to keep
this passing the tests, we inline the bits of the old implementation
that the x86 implementation relied on. Also, since we now support a
slightly larger key space, via the union, some offsets had to be fixed
up.
Nonce calculation was folded in with the emit function, to take
advantage of 64x64 arithmetic. However, Adiantum appeared to rely on no
nonce handling in emit, so this path was conditionalized. We also
introduced a new struct, poly1305_core_key, to represent the precise
amount of space that particular implementation uses.
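
In code terms, emit with a nonce produces the finished MAC, while emit
with NULL stops at the ε-almost-∆-universal hash value that Adiantum and
NHPoly1305 consume. Both call sites appear in the hunks below:

	/* full MAC: fold the encrypted nonce ("s" words) in during emit */
	poly1305_core_emit(&desc->h, desc->s, dst);

	/* Adiantum / NHPoly1305: hash only, no nonce handling */
	poly1305_core_emit(&state->poly_state, NULL, dst);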
Testing with kbench9000, depending on the CPU, the update function for
the 32x32 version has been improved by 4%-7%, and for the 64x64 by
19%-30%. The 32x32 gains are small, but I think there's great value in
having a parallel implementation to the 64x64 one so that the two can be
compared side-by-side as nice stand-alone units.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/x86/crypto/poly1305-avx2-x86_64.S |  20 +--
 arch/x86/crypto/poly1305_glue.c        | 215 +++++++++++++++++++++++--
 crypto/adiantum.c                      |   4 +-
 crypto/nhpoly1305.c                    |   2 +-
 crypto/poly1305_generic.c              |  25 ++-
 include/crypto/internal/poly1305.h     |  45 ++----
 include/crypto/nhpoly1305.h            |   4 +-
 include/crypto/poly1305.h              |  26 ++-
 lib/crypto/Makefile                    |   4 +-
 lib/crypto/poly1305-donna32.c          | 204 +++++++++++++++++++++++
 lib/crypto/poly1305-donna64.c          | 185 +++++++++++++++++++++
 lib/crypto/poly1305.c                  | 169 +------------------
 12 files changed, 675 insertions(+), 228 deletions(-)
 create mode 100644 lib/crypto/poly1305-donna32.c
 create mode 100644 lib/crypto/poly1305-donna64.c
--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
+++ b/arch/x86/crypto/poly1305-avx2-x86_64.S
@@ -34,16 +34,16 @@ ORMASK: .octa 0x000000000100000000000000
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -25,6 +25,21 @@ asmlinkage void poly1305_4block_avx2(u32
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
 
+static inline u64 mlt(u64 a, u64 b)
+{
+	return a * b;
+}
+
+static inline u32 sr(u64 v, u_char n)
+{
+	return v >> n;
+}
+
+static inline u32 and(u32 v, u32 mask)
+{
+	return v & mask;
+}
+
 static void poly1305_simd_mult(u32 *a, const u32 *b)
 {
 	u8 m[POLY1305_BLOCK_SIZE];
@@ -36,6 +51,168 @@ static void poly1305_simd_mult(u32 *a, c
 	poly1305_block_sse2(a, m, b, 1);
 }
 
+static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key)
+{
+	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+	key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff;
+	key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03;
+	key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff;
+	key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff;
+	key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
+}
+
+static void poly1305_integer_blocks(struct poly1305_state *state,
+				    const struct poly1305_key *key,
+				    const void *src,
+				    unsigned int nblocks, u32 hibit)
+{
+	u32 r0, r1, r2, r3, r4;
+	u32 s1, s2, s3, s4;
+	u32 h0, h1, h2, h3, h4;
+	u64 d0, d1, d2, d3, d4;
+
+	if (!nblocks)
+		return;
+
+	r0 = key->r[0];
+	r1 = key->r[1];
+	r2 = key->r[2];
+	r3 = key->r[3];
+	r4 = key->r[4];
+
+	s1 = r1 * 5;
+	s2 = r2 * 5;
+	s3 = r3 * 5;
+	s4 = r4 * 5;
+
+	h0 = state->h[0];
+	h1 = state->h[1];
+	h2 = state->h[2];
+	h3 = state->h[3];
+	h4 = state->h[4];
+
+	do {
+		/* h += m[i] */
+		h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff;
+		h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff;
+		h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff;
+		h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff;
+		h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24);
+
+		/* h *= r */
+		d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
+		     mlt(h3, s2) + mlt(h4, s1);
+		d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
+		     mlt(h3, s3) + mlt(h4, s2);
+		d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
+		     mlt(h3, s4) + mlt(h4, s3);
+		d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
+		     mlt(h3, r0) + mlt(h4, s4);
+		d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
+		     mlt(h3, r1) + mlt(h4, r0);
+
+		/* (partial) h %= p */
+		d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff);
+		d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff);
+		d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff);
+		d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff);
+		h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
+		h1 += h0 >> 26; h0 = h0 & 0x3ffffff;
+
+		src += POLY1305_BLOCK_SIZE;
+	} while (--nblocks);
+
+	state->h[0] = h0;
+	state->h[1] = h1;
+	state->h[2] = h2;
+	state->h[3] = h3;
+	state->h[4] = h4;
+}
+
+static void poly1305_integer_emit(const struct poly1305_state *state, void *dst)
+{
+	u32 h0, h1, h2, h3, h4;
+	u32 g0, g1, g2, g3, g4;
+	u32 mask;
+
+	/* fully carry h */
+	h0 = state->h[0];
+	h1 = state->h[1];
+	h2 = state->h[2];
+	h3 = state->h[3];
+	h4 = state->h[4];
+
+	h2 += (h1 >> 26); h1 = h1 & 0x3ffffff;
+	h3 += (h2 >> 26); h2 = h2 & 0x3ffffff;
+	h4 += (h3 >> 26); h3 = h3 & 0x3ffffff;
+	h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
+	h1 += (h0 >> 26); h0 = h0 & 0x3ffffff;
+
+	/* compute h + -p */
+	g0 = h0 + 5;
+	g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff;
+	g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff;
+	g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff;
+	g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
+
+	/* select h if h < p, or h + -p if h >= p */
+	mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
+	g0 &= mask;
+	g1 &= mask;
+	g2 &= mask;
+	g3 &= mask;
+	g4 &= mask;
+	mask = ~mask;
+	h0 = (h0 & mask) | g0;
+	h1 = (h1 & mask) | g1;
+	h2 = (h2 & mask) | g2;
+	h3 = (h3 & mask) | g3;
+	h4 = (h4 & mask) | g4;
+
+	/* h = h % (2^128) */
+	put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0);
+	put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4);
+	put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8);
+	put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12);
+}
+
+void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
+{
+	poly1305_integer_setkey(desc->opaque_r, key);
+	desc->s[0] = get_unaligned_le32(key + 16);
+	desc->s[1] = get_unaligned_le32(key + 20);
+	desc->s[2] = get_unaligned_le32(key + 24);
+	desc->s[3] = get_unaligned_le32(key + 28);
+	poly1305_core_init(&desc->h);
+	desc->buflen = 0;
+	desc->sset = true;
+	desc->rset = 1;
+}
+EXPORT_SYMBOL(poly1305_init_arch);
+
+static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
+					       const u8 *src, unsigned int srclen)
+{
+	if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
+		poly1305_integer_setkey(dctx->opaque_r, src);
+		src += POLY1305_BLOCK_SIZE;
+		srclen -= POLY1305_BLOCK_SIZE;
+		dctx->rset = 1;
+	}
+	if (srclen >= POLY1305_BLOCK_SIZE) {
+		dctx->s[0] = get_unaligned_le32(src + 0);
+		dctx->s[1] = get_unaligned_le32(src + 4);
+		dctx->s[2] = get_unaligned_le32(src + 8);
+		dctx->s[3] = get_unaligned_le32(src + 12);
+		src += POLY1305_BLOCK_SIZE;
+		srclen -= POLY1305_BLOCK_SIZE;
+		dctx->sset = true;
+	}
+	return srclen;
+}
+
 static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx,
 					   const u8 *src, unsigned int srclen)
 {
@@ -47,8 +224,8 @@ static unsigned int poly1305_scalar_bloc
 	if (srclen >= POLY1305_BLOCK_SIZE) {
-		poly1305_core_blocks(&dctx->h, dctx->r, src,
-				     srclen / POLY1305_BLOCK_SIZE, 1);
+		poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src,
+					srclen / POLY1305_BLOCK_SIZE, 1);
 		srclen %= POLY1305_BLOCK_SIZE;
@@ -105,12 +282,6 @@ static unsigned int poly1305_simd_blocks
-void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
-{
-	poly1305_init_generic(desc, key);
-}
-EXPORT_SYMBOL(poly1305_init_arch);
-
 void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
			   unsigned int srclen)
@@ -158,9 +329,31 @@ void poly1305_update_arch(struct poly130
 EXPORT_SYMBOL(poly1305_update_arch);
 
-void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *digest)
+void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst)
 {
-	poly1305_final_generic(desc, digest);
+	__le32 digest[4];
+	u64 f = 0;
+
+	if (unlikely(desc->buflen)) {
+		desc->buf[desc->buflen++] = 1;
+		memset(desc->buf + desc->buflen, 0,
+		       POLY1305_BLOCK_SIZE - desc->buflen);
+		poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0);
+	}
+
+	poly1305_integer_emit(&desc->h, digest);
+
+	/* mac = (h + s) % (2^128) */
+	f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0];
+	put_unaligned_le32(f, dst + 0);
+	f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1];
+	put_unaligned_le32(f, dst + 4);
+	f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2];
+	put_unaligned_le32(f, dst + 8);
+	f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3];
+	put_unaligned_le32(f, dst + 12);
+
+	*desc = (struct poly1305_desc_ctx){};
 }
 EXPORT_SYMBOL(poly1305_final_arch);
@@ -183,7 +376,7 @@ static int crypto_poly1305_final(struct
 	if (unlikely(!dctx->sset))
 		return -ENOKEY;
 
-	poly1305_final_generic(dctx, dst);
+	poly1305_final_arch(dctx, dst);
 	return 0;
 }
--- a/crypto/adiantum.c
+++ b/crypto/adiantum.c
@@ -72,7 +72,7 @@ struct adiantum_tfm_ctx {
 	struct crypto_skcipher *streamcipher;
 	struct crypto_cipher *blockcipher;
 	struct crypto_shash *hash;
-	struct poly1305_key header_hash_key;
+	struct poly1305_core_key header_hash_key;
 };
 
 struct adiantum_request_ctx {
@@ -249,7 +249,7 @@ static void adiantum_hash_header(struct
 	poly1305_core_blocks(&state, &tctx->header_hash_key, req->iv,
 			     TWEAK_SIZE / POLY1305_BLOCK_SIZE, 1);
 
-	poly1305_core_emit(&state, &rctx->header_hash);
+	poly1305_core_emit(&state, NULL, &rctx->header_hash);
 }
 
 /* Hash the left-hand part (the "bulk") of the message using NHPoly1305 */
--- a/crypto/nhpoly1305.c
+++ b/crypto/nhpoly1305.c
@@ -210,7 +210,7 @@ int crypto_nhpoly1305_final_helper(struc
 	if (state->nh_remaining)
 		process_nh_hash_value(state, key);
 
-	poly1305_core_emit(&state->poly_state, dst);
+	poly1305_core_emit(&state->poly_state, NULL, dst);
 
 	return 0;
 }
 EXPORT_SYMBOL(crypto_nhpoly1305_final_helper);
--- a/crypto/poly1305_generic.c
+++ b/crypto/poly1305_generic.c
@@ -31,6 +31,29 @@ static int crypto_poly1305_init(struct s
 	return 0;
 }
 
+static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
+					       const u8 *src, unsigned int srclen)
+{
+	if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
+		poly1305_core_setkey(&dctx->core_r, src);
+		src += POLY1305_BLOCK_SIZE;
+		srclen -= POLY1305_BLOCK_SIZE;
+		dctx->rset = 1;
+	}
+	if (srclen >= POLY1305_BLOCK_SIZE) {
+		dctx->s[0] = get_unaligned_le32(src + 0);
+		dctx->s[1] = get_unaligned_le32(src + 4);
+		dctx->s[2] = get_unaligned_le32(src + 8);
+		dctx->s[3] = get_unaligned_le32(src + 12);
+		src += POLY1305_BLOCK_SIZE;
+		srclen -= POLY1305_BLOCK_SIZE;
+		dctx->sset = true;
+	}
+	return srclen;
+}
+
 static void poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
 			    unsigned int srclen)
 {
@@ -42,7 +65,7 @@ static void poly1305_blocks(struct poly1
 	}
 
-	poly1305_core_blocks(&dctx->h, dctx->r, src,
+	poly1305_core_blocks(&dctx->h, &dctx->core_r, src,
 			     srclen / POLY1305_BLOCK_SIZE, 1);
 }
--- a/include/crypto/internal/poly1305.h
+++ b/include/crypto/internal/poly1305.h
 #include <crypto/poly1305.h>
 
 /*
- * Poly1305 core functions. These implement the ε-almost-∆-universal hash
- * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce
- * ("s key") at the end. They also only support block-aligned inputs.
+ * Poly1305 core functions. These only accept whole blocks; the caller must
+ * handle any needed block buffering and padding. 'hibit' must be 1 for any
+ * full blocks, or 0 for the final block if it had to be padded. If 'nonce' is
+ * non-NULL, then it's added at the end to compute the Poly1305 MAC. Otherwise,
+ * only the ε-almost-∆-universal hash function (not the full MAC) is computed.
  */
-void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key);
+void poly1305_core_setkey(struct poly1305_core_key *key, const u8 *raw_key);
 static inline void poly1305_core_init(struct poly1305_state *state)
 {
 	*state = (struct poly1305_state){};
 }
 
 void poly1305_core_blocks(struct poly1305_state *state,
-			  const struct poly1305_key *key, const void *src,
+			  const struct poly1305_core_key *key, const void *src,
 			  unsigned int nblocks, u32 hibit);
-void poly1305_core_emit(const struct poly1305_state *state, void *dst);
-
-/*
- * Poly1305 requires a unique key for each tag, which implies that we can't set
- * it on the tfm that gets accessed by multiple users simultaneously. Instead we
- * expect the key as the first 32 bytes in the update() call.
- */
-static inline
-unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
-					const u8 *src, unsigned int srclen)
-{
-	if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
-		poly1305_core_setkey(dctx->r, src);
-		src += POLY1305_BLOCK_SIZE;
-		srclen -= POLY1305_BLOCK_SIZE;
-		dctx->rset = 1;
-	}
-	if (srclen >= POLY1305_BLOCK_SIZE) {
-		dctx->s[0] = get_unaligned_le32(src + 0);
-		dctx->s[1] = get_unaligned_le32(src + 4);
-		dctx->s[2] = get_unaligned_le32(src + 8);
-		dctx->s[3] = get_unaligned_le32(src + 12);
-		src += POLY1305_BLOCK_SIZE;
-		srclen -= POLY1305_BLOCK_SIZE;
-		dctx->sset = true;
-	}
-	return srclen;
-}
-
+void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4],
+			void *dst);
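
To make the contract above concrete, here is a hypothetical caller of
the core API (the raw_key/msg/nfull/rem/nonce variables are assumed for
illustration, not part of the patch), mirroring what
poly1305_final_generic() does with its internal buffer:

	struct poly1305_core_key key;
	struct poly1305_state state;
	u8 tail[POLY1305_BLOCK_SIZE] = { 0 };
	u8 mac[POLY1305_DIGEST_SIZE];

	poly1305_core_setkey(&key, raw_key);	/* clamp r, precompute 5*r */
	poly1305_core_init(&state);
	poly1305_core_blocks(&state, &key, msg, nfull, 1); /* full blocks: hibit=1 */
	if (rem) {					/* padded tail: hibit=0 */
		memcpy(tail, msg + nfull * POLY1305_BLOCK_SIZE, rem);
		tail[rem] = 1;
		poly1305_core_blocks(&state, &key, tail, 1, 0);
	}
	poly1305_core_emit(&state, nonce, mac);	/* NULL nonce: hash only */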
--- a/include/crypto/nhpoly1305.h
+++ b/include/crypto/nhpoly1305.h
 #define _NHPOLY1305_H
 
 #include <crypto/hash.h>
-#include <crypto/poly1305.h>
+#include <crypto/internal/poly1305.h>
 
 /* NH parameterization: */
#define NHPOLY1305_KEY_SIZE	(POLY1305_BLOCK_SIZE + NH_KEY_BYTES)
 
 struct nhpoly1305_key {
-	struct poly1305_key poly_key;
+	struct poly1305_core_key poly_key;
 	u32 nh_key[NH_KEY_WORDS];
 };
--- a/include/crypto/poly1305.h
+++ b/include/crypto/poly1305.h
 #define POLY1305_KEY_SIZE	32
 #define POLY1305_DIGEST_SIZE	16
 
+/* The poly1305_key and poly1305_state types are mostly opaque and
+ * implementation-defined. Limbs might be in base 2^64 or base 2^26, or
+ * different yet. The union type provided keeps these 64-bit aligned for the
+ * case in which this is implemented using 64x64 multiplies.
+ */
+
 struct poly1305_key {
-	u32 r[5];	/* key, base 2^26 */
+	union {
+		u32 r[5];
+		u64 r64[3];
+	};
+};
+
+struct poly1305_core_key {
+	struct poly1305_key key;
+	struct poly1305_key precomputed_s;
 };
 
 struct poly1305_state {
-	u32 h[5];	/* accumulator, base 2^26 */
+	union {
+		u32 h[5];
+		u64 h64[3];
+	};
 };
 
 struct poly1305_desc_ctx {
@@ -35,7 +52,10 @@ struct poly1305_desc_ctx {
 	struct poly1305_state h;
-	struct poly1305_key r[CONFIG_CRYPTO_LIB_POLY1305_RSIZE];
+	union {
+		struct poly1305_key opaque_r[CONFIG_CRYPTO_LIB_POLY1305_RSIZE];
+		struct poly1305_core_key core_r;
+	};
 };
 
 void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key);
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -28,7 +28,9 @@ obj-$(CONFIG_CRYPTO_LIB_DES)		+= libdes
 
 obj-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC)	+= libpoly1305.o
-libpoly1305-y					:= poly1305.o
+libpoly1305-y					:= poly1305-donna32.o
+libpoly1305-$(CONFIG_ARCH_SUPPORTS_INT128)	:= poly1305-donna64.o
+libpoly1305-y					+= poly1305.o
 
 obj-$(CONFIG_CRYPTO_LIB_SHA256)			+= libsha256.o
 libsha256-y					:= sha256.o
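
Note the deliberate `:=` on the CONFIG_ARCH_SUPPORTS_INT128 line: when
that option is y, the line expands to `libpoly1305-y := poly1305-donna64.o`
and overwrites the donna32 assignment just above it, so exactly one of
the two donna objects is ever built into libpoly1305.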
--- /dev/null
+++ b/lib/crypto/poly1305-donna32.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is based in part on Andrew Moon's poly1305-donna, which is in the
+ * public domain.
+ */
+
+#include <linux/kernel.h>
+#include <asm/unaligned.h>
+#include <crypto/internal/poly1305.h>
+
+void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[16])
+{
+	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+	key->key.r[0] = (get_unaligned_le32(&raw_key[0])) & 0x3ffffff;
+	key->key.r[1] = (get_unaligned_le32(&raw_key[3]) >> 2) & 0x3ffff03;
+	key->key.r[2] = (get_unaligned_le32(&raw_key[6]) >> 4) & 0x3ffc0ff;
+	key->key.r[3] = (get_unaligned_le32(&raw_key[9]) >> 6) & 0x3f03fff;
+	key->key.r[4] = (get_unaligned_le32(&raw_key[12]) >> 8) & 0x00fffff;
+
+	key->precomputed_s.r[0] = key->key.r[1] * 5;
+	key->precomputed_s.r[1] = key->key.r[2] * 5;
+	key->precomputed_s.r[2] = key->key.r[3] * 5;
+	key->precomputed_s.r[3] = key->key.r[4] * 5;
+}
+EXPORT_SYMBOL(poly1305_core_setkey);
+
+void poly1305_core_blocks(struct poly1305_state *state,
+			  const struct poly1305_core_key *key, const void *src,
+			  unsigned int nblocks, u32 hibit)
+{
+	const u8 *input = src;
+	u32 r0, r1, r2, r3, r4;
+	u32 s1, s2, s3, s4;
+	u32 h0, h1, h2, h3, h4;
+	u64 d0, d1, d2, d3, d4;
+	u32 c;
+
+	if (!nblocks)
+		return;
+
+	hibit <<= 24;
+
+	r0 = key->key.r[0];
+	r1 = key->key.r[1];
+	r2 = key->key.r[2];
+	r3 = key->key.r[3];
+	r4 = key->key.r[4];
+
+	s1 = key->precomputed_s.r[0];
+	s2 = key->precomputed_s.r[1];
+	s3 = key->precomputed_s.r[2];
+	s4 = key->precomputed_s.r[3];
+
+	h0 = state->h[0];
+	h1 = state->h[1];
+	h2 = state->h[2];
+	h3 = state->h[3];
+	h4 = state->h[4];
+
+	do {
+		/* h += m[i] */
+		h0 += (get_unaligned_le32(&input[0])) & 0x3ffffff;
+		h1 += (get_unaligned_le32(&input[3]) >> 2) & 0x3ffffff;
+		h2 += (get_unaligned_le32(&input[6]) >> 4) & 0x3ffffff;
+		h3 += (get_unaligned_le32(&input[9]) >> 6) & 0x3ffffff;
+		h4 += (get_unaligned_le32(&input[12]) >> 8) | hibit;
+
+		/* h *= r */
+		d0 = ((u64)h0 * r0) + ((u64)h1 * s4) +
+		     ((u64)h2 * s3) + ((u64)h3 * s2) +
+		     ((u64)h4 * s1);
+		d1 = ((u64)h0 * r1) + ((u64)h1 * r0) +
+		     ((u64)h2 * s4) + ((u64)h3 * s3) +
+		     ((u64)h4 * s2);
+		d2 = ((u64)h0 * r2) + ((u64)h1 * r1) +
+		     ((u64)h2 * r0) + ((u64)h3 * s4) +
+		     ((u64)h4 * s3);
+		d3 = ((u64)h0 * r3) + ((u64)h1 * r2) +
+		     ((u64)h2 * r1) + ((u64)h3 * r0) +
+		     ((u64)h4 * s4);
+		d4 = ((u64)h0 * r4) + ((u64)h1 * r3) +
+		     ((u64)h2 * r2) + ((u64)h3 * r1) +
+		     ((u64)h4 * r0);
+
+		/* (partial) h %= p */
+		c = (u32)(d0 >> 26);
+		h0 = (u32)d0 & 0x3ffffff;
+		d1 += c;
+		c = (u32)(d1 >> 26);
+		h1 = (u32)d1 & 0x3ffffff;
+		d2 += c;
+		c = (u32)(d2 >> 26);
+		h2 = (u32)d2 & 0x3ffffff;
+		d3 += c;
+		c = (u32)(d3 >> 26);
+		h3 = (u32)d3 & 0x3ffffff;
+		d4 += c;
+		c = (u32)(d4 >> 26);
+		h4 = (u32)d4 & 0x3ffffff;
+		h0 += c * 5;
+		c = (h0 >> 26);
+		h0 = h0 & 0x3ffffff;
+		h1 += c;
+
+		input += POLY1305_BLOCK_SIZE;
+	} while (--nblocks);
+
+	state->h[0] = h0;
+	state->h[1] = h1;
+	state->h[2] = h2;
+	state->h[3] = h3;
+	state->h[4] = h4;
+}
+EXPORT_SYMBOL(poly1305_core_blocks);
+
+void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4],
+			void *dst)
+{
+	u8 *mac = dst;
+	u32 h0, h1, h2, h3, h4, c;
+	u32 g0, g1, g2, g3, g4;
+	u64 f;
+	u32 mask;
+
+	/* fully carry h */
+	h0 = state->h[0];
+	h1 = state->h[1];
+	h2 = state->h[2];
+	h3 = state->h[3];
+	h4 = state->h[4];
+
+	c = h1 >> 26;
+	h1 = h1 & 0x3ffffff;
+	h2 += c;
+	c = h2 >> 26;
+	h2 = h2 & 0x3ffffff;
+	h3 += c;
+	c = h3 >> 26;
+	h3 = h3 & 0x3ffffff;
+	h4 += c;
+	c = h4 >> 26;
+	h4 = h4 & 0x3ffffff;
+	h0 += c * 5;
+	c = h0 >> 26;
+	h0 = h0 & 0x3ffffff;
+	h1 += c;
+
+	/* compute h + -p */
+	g0 = h0 + 5;
+	c = g0 >> 26;
+	g0 &= 0x3ffffff;
+	g1 = h1 + c;
+	c = g1 >> 26;
+	g1 &= 0x3ffffff;
+	g2 = h2 + c;
+	c = g2 >> 26;
+	g2 &= 0x3ffffff;
+	g3 = h3 + c;
+	c = g3 >> 26;
+	g3 &= 0x3ffffff;
+	g4 = h4 + c - (1UL << 26);
+
+	/* select h if h < p, or h + -p if h >= p */
+	mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
+	g0 &= mask;
+	g1 &= mask;
+	g2 &= mask;
+	g3 &= mask;
+	g4 &= mask;
+	mask = ~mask;
+
+	h0 = (h0 & mask) | g0;
+	h1 = (h1 & mask) | g1;
+	h2 = (h2 & mask) | g2;
+	h3 = (h3 & mask) | g3;
+	h4 = (h4 & mask) | g4;
+
+	/* h = h % (2^128) */
+	h0 = ((h0) | (h1 << 26)) & 0xffffffff;
+	h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
+	h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
+	h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;
+
+	if (likely(nonce)) {
+		/* mac = (h + nonce) % (2^128) */
+		f = (u64)h0 + nonce[0];
+		h0 = (u32)f;
+		f = (u64)h1 + nonce[1] + (f >> 32);
+		h1 = (u32)f;
+		f = (u64)h2 + nonce[2] + (f >> 32);
+		h2 = (u32)f;
+		f = (u64)h3 + nonce[3] + (f >> 32);
+		h3 = (u32)f;
+	}
+
+	put_unaligned_le32(h0, &mac[0]);
+	put_unaligned_le32(h1, &mac[4]);
+	put_unaligned_le32(h2, &mac[8]);
+	put_unaligned_le32(h3, &mac[12]);
+}
+EXPORT_SYMBOL(poly1305_core_emit);
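
The mask dance in the emit function above is the standard constant-time
selection: g = h + 5 - 2^130 is h - p, so after the carry chain the top
bit of g4 records the borrow, i.e. whether h < p. Broadcasting that bit
into a full-word mask selects h or h - p using only AND/OR, never
branching on secret data:

	mask = (g4 >> 31) - 1;	/* 0 if h < p, all-ones if h >= p */
	g0 &= mask;		/* g survives only when h >= p */
	mask = ~mask;
	h0 = (h0 & mask) | g0;	/* h when h < p, h - p otherwise */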
--- /dev/null
+++ b/lib/crypto/poly1305-donna64.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is based in part on Andrew Moon's poly1305-donna, which is in the
+ * public domain.
+ */
+
+#include <linux/kernel.h>
+#include <asm/unaligned.h>
+#include <crypto/internal/poly1305.h>
+
+typedef __uint128_t u128;
+
+void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[16])
+{
+	u64 t0, t1;
+
+	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+	t0 = get_unaligned_le64(&raw_key[0]);
+	t1 = get_unaligned_le64(&raw_key[8]);
+
+	key->key.r64[0] = t0 & 0xffc0fffffffULL;
+	key->key.r64[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffffULL;
+	key->key.r64[2] = ((t1 >> 24)) & 0x00ffffffc0fULL;
+
+	key->precomputed_s.r64[0] = key->key.r64[1] * 20;
+	key->precomputed_s.r64[1] = key->key.r64[2] * 20;
+}
+EXPORT_SYMBOL(poly1305_core_setkey);
+
+void poly1305_core_blocks(struct poly1305_state *state,
+			  const struct poly1305_core_key *key, const void *src,
+			  unsigned int nblocks, u32 hibit)
+{
+	const u8 *input = src;
+	u64 hibit64;
+	u64 r0, r1, r2;
+	u64 s1, s2;
+	u64 h0, h1, h2;
+	u64 c;
+	u128 d0, d1, d2, d;
+	u64 t0, t1;
+
+	if (!nblocks)
+		return;
+
+	hibit64 = ((u64)hibit) << 40;
+
+	r0 = key->key.r64[0];
+	r1 = key->key.r64[1];
+	r2 = key->key.r64[2];
+
+	h0 = state->h64[0];
+	h1 = state->h64[1];
+	h2 = state->h64[2];
+
+	s1 = key->precomputed_s.r64[0];
+	s2 = key->precomputed_s.r64[1];
+
+	do {
+		/* h += m[i] */
+		t0 = get_unaligned_le64(&input[0]);
+		t1 = get_unaligned_le64(&input[8]);
+
+		h0 += t0 & 0xfffffffffffULL;
+		h1 += ((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL;
+		h2 += (((t1 >> 24)) & 0x3ffffffffffULL) | hibit64;
+
+		/* h *= r */
+		d0 = (u128)h0 * r0;
+		d = (u128)h1 * s2;
+		d0 += d;
+		d = (u128)h2 * s1;
+		d0 += d;
+		d1 = (u128)h0 * r1;
+		d = (u128)h1 * r0;
+		d1 += d;
+		d = (u128)h2 * s2;
+		d1 += d;
+		d2 = (u128)h0 * r2;
+		d = (u128)h1 * r1;
+		d2 += d;
+		d = (u128)h2 * r0;
+		d2 += d;
+
+		/* (partial) h %= p */
+		c = (u64)(d0 >> 44);
+		h0 = (u64)d0 & 0xfffffffffffULL;
+		d1 += c;
+		c = (u64)(d1 >> 44);
+		h1 = (u64)d1 & 0xfffffffffffULL;
+		d2 += c;
+		c = (u64)(d2 >> 42);
+		h2 = (u64)d2 & 0x3ffffffffffULL;
+		h0 += c * 5;
+		c = h0 >> 44;
+		h0 = h0 & 0xfffffffffffULL;
+		h1 += c;
+
+		input += POLY1305_BLOCK_SIZE;
+	} while (--nblocks);
+
+	state->h64[0] = h0;
+	state->h64[1] = h1;
+	state->h64[2] = h2;
+}
+EXPORT_SYMBOL(poly1305_core_blocks);
+
+void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4],
+			void *dst)
+{
+	u8 *mac = dst;
+	u64 h0, h1, h2, c;
+	u64 g0, g1, g2;
+	u64 t0, t1;
+
+	/* fully carry h */
+	h0 = state->h64[0];
+	h1 = state->h64[1];
+	h2 = state->h64[2];
+
+	c = h1 >> 44;
+	h1 &= 0xfffffffffffULL;
+	h2 += c;
+	c = h2 >> 42;
+	h2 &= 0x3ffffffffffULL;
+	h0 += c * 5;
+	c = h0 >> 44;
+	h0 &= 0xfffffffffffULL;
+	h1 += c;
+	c = h1 >> 44;
+	h1 &= 0xfffffffffffULL;
+	h2 += c;
+	c = h2 >> 42;
+	h2 &= 0x3ffffffffffULL;
+	h0 += c * 5;
+	c = h0 >> 44;
+	h0 &= 0xfffffffffffULL;
+	h1 += c;
+
+	/* compute h + -p */
+	g0 = h0 + 5;
+	c = g0 >> 44;
+	g0 &= 0xfffffffffffULL;
+	g1 = h1 + c;
+	c = g1 >> 44;
+	g1 &= 0xfffffffffffULL;
+	g2 = h2 + c - (1ULL << 42);
+
+	/* select h if h < p, or h + -p if h >= p */
+	c = (g2 >> ((sizeof(u64) * 8) - 1)) - 1;
+	g0 &= c;
+	g1 &= c;
+	g2 &= c;
+	c = ~c;
+	h0 = (h0 & c) | g0;
+	h1 = (h1 & c) | g1;
+	h2 = (h2 & c) | g2;
+
+	if (likely(nonce)) {
+		/* h = (h + nonce) */
+		t0 = ((u64)nonce[1] << 32) | nonce[0];
+		t1 = ((u64)nonce[3] << 32) | nonce[2];
+
+		h0 += t0 & 0xfffffffffffULL;
+		c = h0 >> 44;
+		h0 &= 0xfffffffffffULL;
+		h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL) + c;
+		c = h1 >> 44;
+		h1 &= 0xfffffffffffULL;
+		h2 += (((t1 >> 24)) & 0x3ffffffffffULL) + c;
+		h2 &= 0x3ffffffffffULL;
+	}
+
+	/* mac = h % (2^128) */
+	h0 = h0 | (h1 << 44);
+	h1 = (h1 >> 20) | (h2 << 24);
+
+	put_unaligned_le64(h0, &mac[0]);
+	put_unaligned_le64(h1, &mac[8]);
+}
+EXPORT_SYMBOL(poly1305_core_emit);
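
One subtlety in the 64-bit flavor above: the precomputed s limbs are
r * 20, not r * 5. With limbs at 2^0, 2^44 and 2^88, a product term that
lands at limb index 3 carries weight 2^132 = 4 * 2^130 ≡ 4 * 5 = 20
(mod 2^130 - 5). As a minimal sketch, the d0 column collapses to (names
as in the file above):

	d0 = (u128)h0 * r0 + (u128)h1 * s2 + (u128)h2 * s1;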
--- a/lib/crypto/poly1305.c
+++ b/lib/crypto/poly1305.c
 #include <linux/module.h>
 #include <asm/unaligned.h>
 
-static inline u64 mlt(u64 a, u64 b)
-{
-	return a * b;
-}
-
-static inline u32 sr(u64 v, u_char n)
-{
-	return v >> n;
-}
-
-static inline u32 and(u32 v, u32 mask)
-{
-	return v & mask;
-}
-
-void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key)
-{
-	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
-	key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff;
-	key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03;
-	key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff;
-	key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff;
-	key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
-}
-EXPORT_SYMBOL_GPL(poly1305_core_setkey);
-
-void poly1305_core_blocks(struct poly1305_state *state,
-			  const struct poly1305_key *key, const void *src,
-			  unsigned int nblocks, u32 hibit)
-{
-	u32 r0, r1, r2, r3, r4;
-	u32 s1, s2, s3, s4;
-	u32 h0, h1, h2, h3, h4;
-	u64 d0, d1, d2, d3, d4;
-
-	if (!nblocks)
-		return;
-
-	r0 = key->r[0];
-	r1 = key->r[1];
-	r2 = key->r[2];
-	r3 = key->r[3];
-	r4 = key->r[4];
-
-	s1 = r1 * 5;
-	s2 = r2 * 5;
-	s3 = r3 * 5;
-	s4 = r4 * 5;
-
-	h0 = state->h[0];
-	h1 = state->h[1];
-	h2 = state->h[2];
-	h3 = state->h[3];
-	h4 = state->h[4];
-
-	do {
-		/* h += m[i] */
-		h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff;
-		h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff;
-		h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff;
-		h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff;
-		h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24);
-
-		/* h *= r */
-		d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
-		     mlt(h3, s2) + mlt(h4, s1);
-		d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
-		     mlt(h3, s3) + mlt(h4, s2);
-		d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
-		     mlt(h3, s4) + mlt(h4, s3);
-		d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
-		     mlt(h3, r0) + mlt(h4, s4);
-		d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
-		     mlt(h3, r1) + mlt(h4, r0);
-
-		/* (partial) h %= p */
-		d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff);
-		d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff);
-		d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff);
-		d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff);
-		h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
-		h1 += h0 >> 26; h0 = h0 & 0x3ffffff;
-
-		src += POLY1305_BLOCK_SIZE;
-	} while (--nblocks);
-
-	state->h[0] = h0;
-	state->h[1] = h1;
-	state->h[2] = h2;
-	state->h[3] = h3;
-	state->h[4] = h4;
-}
-EXPORT_SYMBOL_GPL(poly1305_core_blocks);
-
-void poly1305_core_emit(const struct poly1305_state *state, void *dst)
-{
-	u32 h0, h1, h2, h3, h4;
-	u32 g0, g1, g2, g3, g4;
-	u32 mask;
-
-	/* fully carry h */
-	h0 = state->h[0];
-	h1 = state->h[1];
-	h2 = state->h[2];
-	h3 = state->h[3];
-	h4 = state->h[4];
-
-	h2 += (h1 >> 26); h1 = h1 & 0x3ffffff;
-	h3 += (h2 >> 26); h2 = h2 & 0x3ffffff;
-	h4 += (h3 >> 26); h3 = h3 & 0x3ffffff;
-	h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
-	h1 += (h0 >> 26); h0 = h0 & 0x3ffffff;
-
-	/* compute h + -p */
-	g0 = h0 + 5;
-	g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff;
-	g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff;
-	g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff;
-	g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
-
-	/* select h if h < p, or h + -p if h >= p */
-	mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
-	g0 &= mask;
-	g1 &= mask;
-	g2 &= mask;
-	g3 &= mask;
-	g4 &= mask;
-	mask = ~mask;
-	h0 = (h0 & mask) | g0;
-	h1 = (h1 & mask) | g1;
-	h2 = (h2 & mask) | g2;
-	h3 = (h3 & mask) | g3;
-	h4 = (h4 & mask) | g4;
-
-	/* h = h % (2^128) */
-	put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0);
-	put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4);
-	put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8);
-	put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12);
-}
-EXPORT_SYMBOL_GPL(poly1305_core_emit);
-
 void poly1305_init_generic(struct poly1305_desc_ctx *desc, const u8 *key)
 {
-	poly1305_core_setkey(desc->r, key);
+	poly1305_core_setkey(&desc->core_r, key);
 	desc->s[0] = get_unaligned_le32(key + 16);
 	desc->s[1] = get_unaligned_le32(key + 20);
 	desc->s[2] = get_unaligned_le32(key + 24);
@@ -164,7 +22,7 @@ void poly1305_init_generic(struct poly13
 	poly1305_core_init(&desc->h);
 }
 EXPORT_SYMBOL_GPL(poly1305_init_generic);
@@ -181,13 +39,14 @@ void poly1305_update_generic(struct poly
 		desc->buflen += bytes;
 
 		if (desc->buflen == POLY1305_BLOCK_SIZE) {
-			poly1305_core_blocks(&desc->h, desc->r, desc->buf, 1, 1);
+			poly1305_core_blocks(&desc->h, &desc->core_r, desc->buf,
+					     1, 1);
 			desc->buflen = 0;
 		}
 	}
 
 	if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
-		poly1305_core_blocks(&desc->h, desc->r, src,
+		poly1305_core_blocks(&desc->h, &desc->core_r, src,
 				     nbytes / POLY1305_BLOCK_SIZE, 1);
 		src += nbytes - (nbytes % POLY1305_BLOCK_SIZE);
 		nbytes %= POLY1305_BLOCK_SIZE;
@@ -202,28 +61,14 @@ EXPORT_SYMBOL_GPL(poly1305_update_generi
 
 void poly1305_final_generic(struct poly1305_desc_ctx *desc, u8 *dst)
 {
-	__le32 digest[4];
-	u64 f = 0;
-
 	if (unlikely(desc->buflen)) {
 		desc->buf[desc->buflen++] = 1;
 		memset(desc->buf + desc->buflen, 0,
 		       POLY1305_BLOCK_SIZE - desc->buflen);
-		poly1305_core_blocks(&desc->h, desc->r, desc->buf, 1, 0);
+		poly1305_core_blocks(&desc->h, &desc->core_r, desc->buf, 1, 0);
 	}
 
-	poly1305_core_emit(&desc->h, digest);
-
-	/* mac = (h + s) % (2^128) */
-	f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0];
-	put_unaligned_le32(f, dst + 0);
-	f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1];
-	put_unaligned_le32(f, dst + 4);
-	f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2];
-	put_unaligned_le32(f, dst + 8);
-	f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3];
-	put_unaligned_le32(f, dst + 12);
-
+	poly1305_core_emit(&desc->h, desc->s, dst);
 	*desc = (struct poly1305_desc_ctx){};
 }
 EXPORT_SYMBOL_GPL(poly1305_final_generic);