938d700da2b67178f863078cd663bc9084215b0c
[openwrt/openwrt.git] /
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3 Date: Mon, 20 Jan 2020 18:18:15 +0100
4 Subject: [PATCH] crypto: x86/curve25519 - replace with formally verified
5 implementation
6
7 commit 07b586fe06625b0b610dc3d3a969c51913d143d4 upstream.
8
9 This comes from INRIA's HACL*/Vale. It implements the same algorithm and
10 implementation strategy as the code it replaces, only this code has been
11 formally verified, sans the base point multiplication, which uses code
12 similar to prior, only it uses the formally verified field arithmetic
13 alongside reproducible ladder generation steps. This doesn't have a
14 pure-bmi2 version, which means haswell no longer benefits, but the
15 increased (doubled) code complexity is not worth it for a single
16 generation of chips that's already old.
17
18 Performance-wise, this is around 1% slower on older microarchitectures,
19 and slightly faster on newer microarchitectures, mainly 10nm ones or
20 backports of 10nm to 14nm. This implementation is "everest" below:
21
22 Xeon E5-2680 v4 (Broadwell)
23
24 armfazh: 133340 cycles per call
25 everest: 133436 cycles per call
26
27 Xeon Gold 5120 (Sky Lake Server)
28
29 armfazh: 112636 cycles per call
30 everest: 113906 cycles per call
31
32 Core i5-6300U (Sky Lake Client)
33
34 armfazh: 116810 cycles per call
35 everest: 117916 cycles per call
36
37 Core i7-7600U (Kaby Lake)
38
39 armfazh: 119523 cycles per call
40 everest: 119040 cycles per call
41
42 Core i7-8750H (Coffee Lake)
43
44 armfazh: 113914 cycles per call
45 everest: 113650 cycles per call
46
47 Core i9-9880H (Coffee Lake Refresh)
48
49 armfazh: 112616 cycles per call
50 everest: 114082 cycles per call
51
52 Core i3-8121U (Cannon Lake)
53
54 armfazh: 113202 cycles per call
55 everest: 111382 cycles per call
56
57 Core i7-8265U (Whiskey Lake)
58
59 armfazh: 127307 cycles per call
60 everest: 127697 cycles per call
61
62 Core i7-8550U (Kaby Lake Refresh)
63
64 armfazh: 127522 cycles per call
65 everest: 127083 cycles per call
66
67 Xeon Platinum 8275CL (Cascade Lake)
68
69 armfazh: 114380 cycles per call
70 everest: 114656 cycles per call
71
72 Achieving these kinds of results with formally verified code is quite
73 remarkable, especially considering that performance is favorable for
74 newer chips.
75
76 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
77 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
78 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
79 ---
80 arch/x86/crypto/curve25519-x86_64.c | 3546 ++++++++++-----------------
81 1 file changed, 1292 insertions(+), 2254 deletions(-)
82
83 --- a/arch/x86/crypto/curve25519-x86_64.c
84 +++ b/arch/x86/crypto/curve25519-x86_64.c
85 @@ -1,8 +1,7 @@
86 -// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
87 +// SPDX-License-Identifier: GPL-2.0 OR MIT
88 /*
89 - * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved.
90 - * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
91 - * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
92 + * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
93 + * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
94 */
95
96 #include <crypto/curve25519.h>
97 @@ -16,2337 +15,1378 @@
98 #include <asm/cpufeature.h>
99 #include <asm/processor.h>
100
101 -static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2);
102 -static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_adx);
103 -
104 -enum { NUM_WORDS_ELTFP25519 = 4 };
105 -typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519];
106 -typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519];
107 -
108 -#define mul_eltfp25519_1w_adx(c, a, b) do { \
109 - mul_256x256_integer_adx(m.buffer, a, b); \
110 - red_eltfp25519_1w_adx(c, m.buffer); \
111 -} while (0)
112 -
113 -#define mul_eltfp25519_1w_bmi2(c, a, b) do { \
114 - mul_256x256_integer_bmi2(m.buffer, a, b); \
115 - red_eltfp25519_1w_bmi2(c, m.buffer); \
116 -} while (0)
117 -
118 -#define sqr_eltfp25519_1w_adx(a) do { \
119 - sqr_256x256_integer_adx(m.buffer, a); \
120 - red_eltfp25519_1w_adx(a, m.buffer); \
121 -} while (0)
122 -
123 -#define sqr_eltfp25519_1w_bmi2(a) do { \
124 - sqr_256x256_integer_bmi2(m.buffer, a); \
125 - red_eltfp25519_1w_bmi2(a, m.buffer); \
126 -} while (0)
127 -
128 -#define mul_eltfp25519_2w_adx(c, a, b) do { \
129 - mul2_256x256_integer_adx(m.buffer, a, b); \
130 - red_eltfp25519_2w_adx(c, m.buffer); \
131 -} while (0)
132 -
133 -#define mul_eltfp25519_2w_bmi2(c, a, b) do { \
134 - mul2_256x256_integer_bmi2(m.buffer, a, b); \
135 - red_eltfp25519_2w_bmi2(c, m.buffer); \
136 -} while (0)
137 -
138 -#define sqr_eltfp25519_2w_adx(a) do { \
139 - sqr2_256x256_integer_adx(m.buffer, a); \
140 - red_eltfp25519_2w_adx(a, m.buffer); \
141 -} while (0)
142 -
143 -#define sqr_eltfp25519_2w_bmi2(a) do { \
144 - sqr2_256x256_integer_bmi2(m.buffer, a); \
145 - red_eltfp25519_2w_bmi2(a, m.buffer); \
146 -} while (0)
147 -
148 -#define sqrn_eltfp25519_1w_adx(a, times) do { \
149 - int ____counter = (times); \
150 - while (____counter-- > 0) \
151 - sqr_eltfp25519_1w_adx(a); \
152 -} while (0)
153 -
154 -#define sqrn_eltfp25519_1w_bmi2(a, times) do { \
155 - int ____counter = (times); \
156 - while (____counter-- > 0) \
157 - sqr_eltfp25519_1w_bmi2(a); \
158 -} while (0)
159 -
160 -#define copy_eltfp25519_1w(C, A) do { \
161 - (C)[0] = (A)[0]; \
162 - (C)[1] = (A)[1]; \
163 - (C)[2] = (A)[2]; \
164 - (C)[3] = (A)[3]; \
165 -} while (0)
166 -
167 -#define setzero_eltfp25519_1w(C) do { \
168 - (C)[0] = 0; \
169 - (C)[1] = 0; \
170 - (C)[2] = 0; \
171 - (C)[3] = 0; \
172 -} while (0)
173 -
174 -__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = {
175 - /* 1 */ 0xfffffffffffffff3UL, 0xffffffffffffffffUL,
176 - 0xffffffffffffffffUL, 0x5fffffffffffffffUL,
177 - /* 2 */ 0x6b8220f416aafe96UL, 0x82ebeb2b4f566a34UL,
178 - 0xd5a9a5b075a5950fUL, 0x5142b2cf4b2488f4UL,
179 - /* 3 */ 0x6aaebc750069680cUL, 0x89cf7820a0f99c41UL,
180 - 0x2a58d9183b56d0f4UL, 0x4b5aca80e36011a4UL,
181 - /* 4 */ 0x329132348c29745dUL, 0xf4a2e616e1642fd7UL,
182 - 0x1e45bb03ff67bc34UL, 0x306912d0f42a9b4aUL,
183 - /* 5 */ 0xff886507e6af7154UL, 0x04f50e13dfeec82fUL,
184 - 0xaa512fe82abab5ceUL, 0x174e251a68d5f222UL,
185 - /* 6 */ 0xcf96700d82028898UL, 0x1743e3370a2c02c5UL,
186 - 0x379eec98b4e86eaaUL, 0x0c59888a51e0482eUL,
187 - /* 7 */ 0xfbcbf1d699b5d189UL, 0xacaef0d58e9fdc84UL,
188 - 0xc1c20d06231f7614UL, 0x2938218da274f972UL,
189 - /* 8 */ 0xf6af49beff1d7f18UL, 0xcc541c22387ac9c2UL,
190 - 0x96fcc9ef4015c56bUL, 0x69c1627c690913a9UL,
191 - /* 9 */ 0x7a86fd2f4733db0eUL, 0xfdb8c4f29e087de9UL,
192 - 0x095e4b1a8ea2a229UL, 0x1ad7a7c829b37a79UL,
193 - /* 10 */ 0x342d89cad17ea0c0UL, 0x67bedda6cced2051UL,
194 - 0x19ca31bf2bb42f74UL, 0x3df7b4c84980acbbUL,
195 - /* 11 */ 0xa8c6444dc80ad883UL, 0xb91e440366e3ab85UL,
196 - 0xc215cda00164f6d8UL, 0x3d867c6ef247e668UL,
197 - /* 12 */ 0xc7dd582bcc3e658cUL, 0xfd2c4748ee0e5528UL,
198 - 0xa0fd9b95cc9f4f71UL, 0x7529d871b0675ddfUL,
199 - /* 13 */ 0xb8f568b42d3cbd78UL, 0x1233011b91f3da82UL,
200 - 0x2dce6ccd4a7c3b62UL, 0x75e7fc8e9e498603UL,
201 - /* 14 */ 0x2f4f13f1fcd0b6ecUL, 0xf1a8ca1f29ff7a45UL,
202 - 0xc249c1a72981e29bUL, 0x6ebe0dbb8c83b56aUL,
203 - /* 15 */ 0x7114fa8d170bb222UL, 0x65a2dcd5bf93935fUL,
204 - 0xbdc41f68b59c979aUL, 0x2f0eef79a2ce9289UL,
205 - /* 16 */ 0x42ecbf0c083c37ceUL, 0x2930bc09ec496322UL,
206 - 0xf294b0c19cfeac0dUL, 0x3780aa4bedfabb80UL,
207 - /* 17 */ 0x56c17d3e7cead929UL, 0xe7cb4beb2e5722c5UL,
208 - 0x0ce931732dbfe15aUL, 0x41b883c7621052f8UL,
209 - /* 18 */ 0xdbf75ca0c3d25350UL, 0x2936be086eb1e351UL,
210 - 0xc936e03cb4a9b212UL, 0x1d45bf82322225aaUL,
211 - /* 19 */ 0xe81ab1036a024cc5UL, 0xe212201c304c9a72UL,
212 - 0xc5d73fba6832b1fcUL, 0x20ffdb5a4d839581UL,
213 - /* 20 */ 0xa283d367be5d0fadUL, 0x6c2b25ca8b164475UL,
214 - 0x9d4935467caaf22eUL, 0x5166408eee85ff49UL,
215 - /* 21 */ 0x3c67baa2fab4e361UL, 0xb3e433c67ef35cefUL,
216 - 0x5259729241159b1cUL, 0x6a621892d5b0ab33UL,
217 - /* 22 */ 0x20b74a387555cdcbUL, 0x532aa10e1208923fUL,
218 - 0xeaa17b7762281dd1UL, 0x61ab3443f05c44bfUL,
219 - /* 23 */ 0x257a6c422324def8UL, 0x131c6c1017e3cf7fUL,
220 - 0x23758739f630a257UL, 0x295a407a01a78580UL,
221 - /* 24 */ 0xf8c443246d5da8d9UL, 0x19d775450c52fa5dUL,
222 - 0x2afcfc92731bf83dUL, 0x7d10c8e81b2b4700UL,
223 - /* 25 */ 0xc8e0271f70baa20bUL, 0x993748867ca63957UL,
224 - 0x5412efb3cb7ed4bbUL, 0x3196d36173e62975UL,
225 - /* 26 */ 0xde5bcad141c7dffcUL, 0x47cc8cd2b395c848UL,
226 - 0xa34cd942e11af3cbUL, 0x0256dbf2d04ecec2UL,
227 - /* 27 */ 0x875ab7e94b0e667fUL, 0xcad4dd83c0850d10UL,
228 - 0x47f12e8f4e72c79fUL, 0x5f1a87bb8c85b19bUL,
229 - /* 28 */ 0x7ae9d0b6437f51b8UL, 0x12c7ce5518879065UL,
230 - 0x2ade09fe5cf77aeeUL, 0x23a05a2f7d2c5627UL,
231 - /* 29 */ 0x5908e128f17c169aUL, 0xf77498dd8ad0852dUL,
232 - 0x74b4c4ceab102f64UL, 0x183abadd10139845UL,
233 - /* 30 */ 0xb165ba8daa92aaacUL, 0xd5c5ef9599386705UL,
234 - 0xbe2f8f0cf8fc40d1UL, 0x2701e635ee204514UL,
235 - /* 31 */ 0x629fa80020156514UL, 0xf223868764a8c1ceUL,
236 - 0x5b894fff0b3f060eUL, 0x60d9944cf708a3faUL,
237 - /* 32 */ 0xaeea001a1c7a201fUL, 0xebf16a633ee2ce63UL,
238 - 0x6f7709594c7a07e1UL, 0x79b958150d0208cbUL,
239 - /* 33 */ 0x24b55e5301d410e7UL, 0xe3a34edff3fdc84dUL,
240 - 0xd88768e4904032d8UL, 0x131384427b3aaeecUL,
241 - /* 34 */ 0x8405e51286234f14UL, 0x14dc4739adb4c529UL,
242 - 0xb8a2b5b250634ffdUL, 0x2fe2a94ad8a7ff93UL,
243 - /* 35 */ 0xec5c57efe843faddUL, 0x2843ce40f0bb9918UL,
244 - 0xa4b561d6cf3d6305UL, 0x743629bde8fb777eUL,
245 - /* 36 */ 0x343edd46bbaf738fUL, 0xed981828b101a651UL,
246 - 0xa401760b882c797aUL, 0x1fc223e28dc88730UL,
247 - /* 37 */ 0x48604e91fc0fba0eUL, 0xb637f78f052c6fa4UL,
248 - 0x91ccac3d09e9239cUL, 0x23f7eed4437a687cUL,
249 - /* 38 */ 0x5173b1118d9bd800UL, 0x29d641b63189d4a7UL,
250 - 0xfdbf177988bbc586UL, 0x2959894fcad81df5UL,
251 - /* 39 */ 0xaebc8ef3b4bbc899UL, 0x4148995ab26992b9UL,
252 - 0x24e20b0134f92cfbUL, 0x40d158894a05dee8UL,
253 - /* 40 */ 0x46b00b1185af76f6UL, 0x26bac77873187a79UL,
254 - 0x3dc0bf95ab8fff5fUL, 0x2a608bd8945524d7UL,
255 - /* 41 */ 0x26449588bd446302UL, 0x7c4bc21c0388439cUL,
256 - 0x8e98a4f383bd11b2UL, 0x26218d7bc9d876b9UL,
257 - /* 42 */ 0xe3081542997c178aUL, 0x3c2d29a86fb6606fUL,
258 - 0x5c217736fa279374UL, 0x7dde05734afeb1faUL,
259 - /* 43 */ 0x3bf10e3906d42babUL, 0xe4f7803e1980649cUL,
260 - 0xe6053bf89595bf7aUL, 0x394faf38da245530UL,
261 - /* 44 */ 0x7a8efb58896928f4UL, 0xfbc778e9cc6a113cUL,
262 - 0x72670ce330af596fUL, 0x48f222a81d3d6cf7UL,
263 - /* 45 */ 0xf01fce410d72caa7UL, 0x5a20ecc7213b5595UL,
264 - 0x7bc21165c1fa1483UL, 0x07f89ae31da8a741UL,
265 - /* 46 */ 0x05d2c2b4c6830ff9UL, 0xd43e330fc6316293UL,
266 - 0xa5a5590a96d3a904UL, 0x705edb91a65333b6UL,
267 - /* 47 */ 0x048ee15e0bb9a5f7UL, 0x3240cfca9e0aaf5dUL,
268 - 0x8f4b71ceedc4a40bUL, 0x621c0da3de544a6dUL,
269 - /* 48 */ 0x92872836a08c4091UL, 0xce8375b010c91445UL,
270 - 0x8a72eb524f276394UL, 0x2667fcfa7ec83635UL,
271 - /* 49 */ 0x7f4c173345e8752aUL, 0x061b47feee7079a5UL,
272 - 0x25dd9afa9f86ff34UL, 0x3780cef5425dc89cUL,
273 - /* 50 */ 0x1a46035a513bb4e9UL, 0x3e1ef379ac575adaUL,
274 - 0xc78c5f1c5fa24b50UL, 0x321a967634fd9f22UL,
275 - /* 51 */ 0x946707b8826e27faUL, 0x3dca84d64c506fd0UL,
276 - 0xc189218075e91436UL, 0x6d9284169b3b8484UL,
277 - /* 52 */ 0x3a67e840383f2ddfUL, 0x33eec9a30c4f9b75UL,
278 - 0x3ec7c86fa783ef47UL, 0x26ec449fbac9fbc4UL,
279 - /* 53 */ 0x5c0f38cba09b9e7dUL, 0x81168cc762a3478cUL,
280 - 0x3e23b0d306fc121cUL, 0x5a238aa0a5efdcddUL,
281 - /* 54 */ 0x1ba26121c4ea43ffUL, 0x36f8c77f7c8832b5UL,
282 - 0x88fbea0b0adcf99aUL, 0x5ca9938ec25bebf9UL,
283 - /* 55 */ 0xd5436a5e51fccda0UL, 0x1dbc4797c2cd893bUL,
284 - 0x19346a65d3224a08UL, 0x0f5034e49b9af466UL,
285 - /* 56 */ 0xf23c3967a1e0b96eUL, 0xe58b08fa867a4d88UL,
286 - 0xfb2fabc6a7341679UL, 0x2a75381eb6026946UL,
287 - /* 57 */ 0xc80a3be4c19420acUL, 0x66b1f6c681f2b6dcUL,
288 - 0x7cf7036761e93388UL, 0x25abbbd8a660a4c4UL,
289 - /* 58 */ 0x91ea12ba14fd5198UL, 0x684950fc4a3cffa9UL,
290 - 0xf826842130f5ad28UL, 0x3ea988f75301a441UL,
291 - /* 59 */ 0xc978109a695f8c6fUL, 0x1746eb4a0530c3f3UL,
292 - 0x444d6d77b4459995UL, 0x75952b8c054e5cc7UL,
293 - /* 60 */ 0xa3703f7915f4d6aaUL, 0x66c346202f2647d8UL,
294 - 0xd01469df811d644bUL, 0x77fea47d81a5d71fUL,
295 - /* 61 */ 0xc5e9529ef57ca381UL, 0x6eeeb4b9ce2f881aUL,
296 - 0xb6e91a28e8009bd6UL, 0x4b80be3e9afc3fecUL,
297 - /* 62 */ 0x7e3773c526aed2c5UL, 0x1b4afcb453c9a49dUL,
298 - 0xa920bdd7baffb24dUL, 0x7c54699f122d400eUL,
299 - /* 63 */ 0xef46c8e14fa94bc8UL, 0xe0b074ce2952ed5eUL,
300 - 0xbea450e1dbd885d5UL, 0x61b68649320f712cUL,
301 - /* 64 */ 0x8a485f7309ccbdd1UL, 0xbd06320d7d4d1a2dUL,
302 - 0x25232973322dbef4UL, 0x445dc4758c17f770UL,
303 - /* 65 */ 0xdb0434177cc8933cUL, 0xed6fe82175ea059fUL,
304 - 0x1efebefdc053db34UL, 0x4adbe867c65daf99UL,
305 - /* 66 */ 0x3acd71a2a90609dfUL, 0xe5e991856dd04050UL,
306 - 0x1ec69b688157c23cUL, 0x697427f6885cfe4dUL,
307 - /* 67 */ 0xd7be7b9b65e1a851UL, 0xa03d28d522c536ddUL,
308 - 0x28399d658fd2b645UL, 0x49e5b7e17c2641e1UL,
309 - /* 68 */ 0x6f8c3a98700457a4UL, 0x5078f0a25ebb6778UL,
310 - 0xd13c3ccbc382960fUL, 0x2e003258a7df84b1UL,
311 - /* 69 */ 0x8ad1f39be6296a1cUL, 0xc1eeaa652a5fbfb2UL,
312 - 0x33ee0673fd26f3cbUL, 0x59256173a69d2cccUL,
313 - /* 70 */ 0x41ea07aa4e18fc41UL, 0xd9fc19527c87a51eUL,
314 - 0xbdaacb805831ca6fUL, 0x445b652dc916694fUL,
315 - /* 71 */ 0xce92a3a7f2172315UL, 0x1edc282de11b9964UL,
316 - 0xa1823aafe04c314aUL, 0x790a2d94437cf586UL,
317 - /* 72 */ 0x71c447fb93f6e009UL, 0x8922a56722845276UL,
318 - 0xbf70903b204f5169UL, 0x2f7a89891ba319feUL,
319 - /* 73 */ 0x02a08eb577e2140cUL, 0xed9a4ed4427bdcf4UL,
320 - 0x5253ec44e4323cd1UL, 0x3e88363c14e9355bUL,
321 - /* 74 */ 0xaa66c14277110b8cUL, 0x1ae0391610a23390UL,
322 - 0x2030bd12c93fc2a2UL, 0x3ee141579555c7abUL,
323 - /* 75 */ 0x9214de3a6d6e7d41UL, 0x3ccdd88607f17efeUL,
324 - 0x674f1288f8e11217UL, 0x5682250f329f93d0UL,
325 - /* 76 */ 0x6cf00b136d2e396eUL, 0x6e4cf86f1014debfUL,
326 - 0x5930b1b5bfcc4e83UL, 0x047069b48aba16b6UL,
327 - /* 77 */ 0x0d4ce4ab69b20793UL, 0xb24db91a97d0fb9eUL,
328 - 0xcdfa50f54e00d01dUL, 0x221b1085368bddb5UL,
329 - /* 78 */ 0xe7e59468b1e3d8d2UL, 0x53c56563bd122f93UL,
330 - 0xeee8a903e0663f09UL, 0x61efa662cbbe3d42UL,
331 - /* 79 */ 0x2cf8ddddde6eab2aUL, 0x9bf80ad51435f231UL,
332 - 0x5deadacec9f04973UL, 0x29275b5d41d29b27UL,
333 - /* 80 */ 0xcfde0f0895ebf14fUL, 0xb9aab96b054905a7UL,
334 - 0xcae80dd9a1c420fdUL, 0x0a63bf2f1673bbc7UL,
335 - /* 81 */ 0x092f6e11958fbc8cUL, 0x672a81e804822fadUL,
336 - 0xcac8351560d52517UL, 0x6f3f7722c8f192f8UL,
337 - /* 82 */ 0xf8ba90ccc2e894b7UL, 0x2c7557a438ff9f0dUL,
338 - 0x894d1d855ae52359UL, 0x68e122157b743d69UL,
339 - /* 83 */ 0xd87e5570cfb919f3UL, 0x3f2cdecd95798db9UL,
340 - 0x2121154710c0a2ceUL, 0x3c66a115246dc5b2UL,
341 - /* 84 */ 0xcbedc562294ecb72UL, 0xba7143c36a280b16UL,
342 - 0x9610c2efd4078b67UL, 0x6144735d946a4b1eUL,
343 - /* 85 */ 0x536f111ed75b3350UL, 0x0211db8c2041d81bUL,
344 - 0xf93cb1000e10413cUL, 0x149dfd3c039e8876UL,
345 - /* 86 */ 0xd479dde46b63155bUL, 0xb66e15e93c837976UL,
346 - 0xdafde43b1f13e038UL, 0x5fafda1a2e4b0b35UL,
347 - /* 87 */ 0x3600bbdf17197581UL, 0x3972050bbe3cd2c2UL,
348 - 0x5938906dbdd5be86UL, 0x34fce5e43f9b860fUL,
349 - /* 88 */ 0x75a8a4cd42d14d02UL, 0x828dabc53441df65UL,
350 - 0x33dcabedd2e131d3UL, 0x3ebad76fb814d25fUL,
351 - /* 89 */ 0xd4906f566f70e10fUL, 0x5d12f7aa51690f5aUL,
352 - 0x45adb16e76cefcf2UL, 0x01f768aead232999UL,
353 - /* 90 */ 0x2b6cc77b6248febdUL, 0x3cd30628ec3aaffdUL,
354 - 0xce1c0b80d4ef486aUL, 0x4c3bff2ea6f66c23UL,
355 - /* 91 */ 0x3f2ec4094aeaeb5fUL, 0x61b19b286e372ca7UL,
356 - 0x5eefa966de2a701dUL, 0x23b20565de55e3efUL,
357 - /* 92 */ 0xe301ca5279d58557UL, 0x07b2d4ce27c2874fUL,
358 - 0xa532cd8a9dcf1d67UL, 0x2a52fee23f2bff56UL,
359 - /* 93 */ 0x8624efb37cd8663dUL, 0xbbc7ac20ffbd7594UL,
360 - 0x57b85e9c82d37445UL, 0x7b3052cb86a6ec66UL,
361 - /* 94 */ 0x3482f0ad2525e91eUL, 0x2cb68043d28edca0UL,
362 - 0xaf4f6d052e1b003aUL, 0x185f8c2529781b0aUL,
363 - /* 95 */ 0xaa41de5bd80ce0d6UL, 0x9407b2416853e9d6UL,
364 - 0x563ec36e357f4c3aUL, 0x4cc4b8dd0e297bceUL,
365 - /* 96 */ 0xa2fc1a52ffb8730eUL, 0x1811f16e67058e37UL,
366 - 0x10f9a366cddf4ee1UL, 0x72f4a0c4a0b9f099UL,
367 - /* 97 */ 0x8c16c06f663f4ea7UL, 0x693b3af74e970fbaUL,
368 - 0x2102e7f1d69ec345UL, 0x0ba53cbc968a8089UL,
369 - /* 98 */ 0xca3d9dc7fea15537UL, 0x4c6824bb51536493UL,
370 - 0xb9886314844006b1UL, 0x40d2a72ab454cc60UL,
371 - /* 99 */ 0x5936a1b712570975UL, 0x91b9d648debda657UL,
372 - 0x3344094bb64330eaUL, 0x006ba10d12ee51d0UL,
373 - /* 100 */ 0x19228468f5de5d58UL, 0x0eb12f4c38cc05b0UL,
374 - 0xa1039f9dd5601990UL, 0x4502d4ce4fff0e0bUL,
375 - /* 101 */ 0xeb2054106837c189UL, 0xd0f6544c6dd3b93cUL,
376 - 0x40727064c416d74fUL, 0x6e15c6114b502ef0UL,
377 - /* 102 */ 0x4df2a398cfb1a76bUL, 0x11256c7419f2f6b1UL,
378 - 0x4a497962066e6043UL, 0x705b3aab41355b44UL,
379 - /* 103 */ 0x365ef536d797b1d8UL, 0x00076bd622ddf0dbUL,
380 - 0x3bbf33b0e0575a88UL, 0x3777aa05c8e4ca4dUL,
381 - /* 104 */ 0x392745c85578db5fUL, 0x6fda4149dbae5ae2UL,
382 - 0xb1f0b00b8adc9867UL, 0x09963437d36f1da3UL,
383 - /* 105 */ 0x7e824e90a5dc3853UL, 0xccb5f6641f135cbdUL,
384 - 0x6736d86c87ce8fccUL, 0x625f3ce26604249fUL,
385 - /* 106 */ 0xaf8ac8059502f63fUL, 0x0c05e70a2e351469UL,
386 - 0x35292e9c764b6305UL, 0x1a394360c7e23ac3UL,
387 - /* 107 */ 0xd5c6d53251183264UL, 0x62065abd43c2b74fUL,
388 - 0xb5fbf5d03b973f9bUL, 0x13a3da3661206e5eUL,
389 - /* 108 */ 0xc6bd5837725d94e5UL, 0x18e30912205016c5UL,
390 - 0x2088ce1570033c68UL, 0x7fba1f495c837987UL,
391 - /* 109 */ 0x5a8c7423f2f9079dUL, 0x1735157b34023fc5UL,
392 - 0xe4f9b49ad2fab351UL, 0x6691ff72c878e33cUL,
393 - /* 110 */ 0x122c2adedc5eff3eUL, 0xf8dd4bf1d8956cf4UL,
394 - 0xeb86205d9e9e5bdaUL, 0x049b92b9d975c743UL,
395 - /* 111 */ 0xa5379730b0f6c05aUL, 0x72a0ffacc6f3a553UL,
396 - 0xb0032c34b20dcd6dUL, 0x470e9dbc88d5164aUL,
397 - /* 112 */ 0xb19cf10ca237c047UL, 0xb65466711f6c81a2UL,
398 - 0xb3321bd16dd80b43UL, 0x48c14f600c5fbe8eUL,
399 - /* 113 */ 0x66451c264aa6c803UL, 0xb66e3904a4fa7da6UL,
400 - 0xd45f19b0b3128395UL, 0x31602627c3c9bc10UL,
401 - /* 114 */ 0x3120dc4832e4e10dUL, 0xeb20c46756c717f7UL,
402 - 0x00f52e3f67280294UL, 0x566d4fc14730c509UL,
403 - /* 115 */ 0x7e3a5d40fd837206UL, 0xc1e926dc7159547aUL,
404 - 0x216730fba68d6095UL, 0x22e8c3843f69cea7UL,
405 - /* 116 */ 0x33d074e8930e4b2bUL, 0xb6e4350e84d15816UL,
406 - 0x5534c26ad6ba2365UL, 0x7773c12f89f1f3f3UL,
407 - /* 117 */ 0x8cba404da57962aaUL, 0x5b9897a81999ce56UL,
408 - 0x508e862f121692fcUL, 0x3a81907fa093c291UL,
409 - /* 118 */ 0x0dded0ff4725a510UL, 0x10d8cc10673fc503UL,
410 - 0x5b9d151c9f1f4e89UL, 0x32a5c1d5cb09a44cUL,
411 - /* 119 */ 0x1e0aa442b90541fbUL, 0x5f85eb7cc1b485dbUL,
412 - 0xbee595ce8a9df2e5UL, 0x25e496c722422236UL,
413 - /* 120 */ 0x5edf3c46cd0fe5b9UL, 0x34e75a7ed2a43388UL,
414 - 0xe488de11d761e352UL, 0x0e878a01a085545cUL,
415 - /* 121 */ 0xba493c77e021bb04UL, 0x2b4d1843c7df899aUL,
416 - 0x9ea37a487ae80d67UL, 0x67a9958011e41794UL,
417 - /* 122 */ 0x4b58051a6697b065UL, 0x47e33f7d8d6ba6d4UL,
418 - 0xbb4da8d483ca46c1UL, 0x68becaa181c2db0dUL,
419 - /* 123 */ 0x8d8980e90b989aa5UL, 0xf95eb14a2c93c99bUL,
420 - 0x51c6c7c4796e73a2UL, 0x6e228363b5efb569UL,
421 - /* 124 */ 0xc6bbc0b02dd624c8UL, 0x777eb47dec8170eeUL,
422 - 0x3cde15a004cfafa9UL, 0x1dc6bc087160bf9bUL,
423 - /* 125 */ 0x2e07e043eec34002UL, 0x18e9fc677a68dc7fUL,
424 - 0xd8da03188bd15b9aUL, 0x48fbc3bb00568253UL,
425 - /* 126 */ 0x57547d4cfb654ce1UL, 0xd3565b82a058e2adUL,
426 - 0xf63eaf0bbf154478UL, 0x47531ef114dfbb18UL,
427 - /* 127 */ 0xe1ec630a4278c587UL, 0x5507d546ca8e83f3UL,
428 - 0x85e135c63adc0c2bUL, 0x0aa7efa85682844eUL,
429 - /* 128 */ 0x72691ba8b3e1f615UL, 0x32b4e9701fbe3ffaUL,
430 - 0x97b6d92e39bb7868UL, 0x2cfe53dea02e39e8UL,
431 - /* 129 */ 0x687392cd85cd52b0UL, 0x27ff66c910e29831UL,
432 - 0x97134556a9832d06UL, 0x269bb0360a84f8a0UL,
433 - /* 130 */ 0x706e55457643f85cUL, 0x3734a48c9b597d1bUL,
434 - 0x7aee91e8c6efa472UL, 0x5cd6abc198a9d9e0UL,
435 - /* 131 */ 0x0e04de06cb3ce41aUL, 0xd8c6eb893402e138UL,
436 - 0x904659bb686e3772UL, 0x7215c371746ba8c8UL,
437 - /* 132 */ 0xfd12a97eeae4a2d9UL, 0x9514b7516394f2c5UL,
438 - 0x266fd5809208f294UL, 0x5c847085619a26b9UL,
439 - /* 133 */ 0x52985410fed694eaUL, 0x3c905b934a2ed254UL,
440 - 0x10bb47692d3be467UL, 0x063b3d2d69e5e9e1UL,
441 - /* 134 */ 0x472726eedda57debUL, 0xefb6c4ae10f41891UL,
442 - 0x2b1641917b307614UL, 0x117c554fc4f45b7cUL,
443 - /* 135 */ 0xc07cf3118f9d8812UL, 0x01dbd82050017939UL,
444 - 0xd7e803f4171b2827UL, 0x1015e87487d225eaUL,
445 - /* 136 */ 0xc58de3fed23acc4dUL, 0x50db91c294a7be2dUL,
446 - 0x0b94d43d1c9cf457UL, 0x6b1640fa6e37524aUL,
447 - /* 137 */ 0x692f346c5fda0d09UL, 0x200b1c59fa4d3151UL,
448 - 0xb8c46f760777a296UL, 0x4b38395f3ffdfbcfUL,
449 - /* 138 */ 0x18d25e00be54d671UL, 0x60d50582bec8aba6UL,
450 - 0x87ad8f263b78b982UL, 0x50fdf64e9cda0432UL,
451 - /* 139 */ 0x90f567aac578dcf0UL, 0xef1e9b0ef2a3133bUL,
452 - 0x0eebba9242d9de71UL, 0x15473c9bf03101c7UL,
453 - /* 140 */ 0x7c77e8ae56b78095UL, 0xb678e7666e6f078eUL,
454 - 0x2da0b9615348ba1fUL, 0x7cf931c1ff733f0bUL,
455 - /* 141 */ 0x26b357f50a0a366cUL, 0xe9708cf42b87d732UL,
456 - 0xc13aeea5f91cb2c0UL, 0x35d90c991143bb4cUL,
457 - /* 142 */ 0x47c1c404a9a0d9dcUL, 0x659e58451972d251UL,
458 - 0x3875a8c473b38c31UL, 0x1fbd9ed379561f24UL,
459 - /* 143 */ 0x11fabc6fd41ec28dUL, 0x7ef8dfe3cd2a2dcaUL,
460 - 0x72e73b5d8c404595UL, 0x6135fa4954b72f27UL,
461 - /* 144 */ 0xccfc32a2de24b69cUL, 0x3f55698c1f095d88UL,
462 - 0xbe3350ed5ac3f929UL, 0x5e9bf806ca477eebUL,
463 - /* 145 */ 0xe9ce8fb63c309f68UL, 0x5376f63565e1f9f4UL,
464 - 0xd1afcfb35a6393f1UL, 0x6632a1ede5623506UL,
465 - /* 146 */ 0x0b7d6c390c2ded4cUL, 0x56cb3281df04cb1fUL,
466 - 0x66305a1249ecc3c7UL, 0x5d588b60a38ca72aUL,
467 - /* 147 */ 0xa6ecbf78e8e5f42dUL, 0x86eeb44b3c8a3eecUL,
468 - 0xec219c48fbd21604UL, 0x1aaf1af517c36731UL,
469 - /* 148 */ 0xc306a2836769bde7UL, 0x208280622b1e2adbUL,
470 - 0x8027f51ffbff94a6UL, 0x76cfa1ce1124f26bUL,
471 - /* 149 */ 0x18eb00562422abb6UL, 0xf377c4d58f8c29c3UL,
472 - 0x4dbbc207f531561aUL, 0x0253b7f082128a27UL,
473 - /* 150 */ 0x3d1f091cb62c17e0UL, 0x4860e1abd64628a9UL,
474 - 0x52d17436309d4253UL, 0x356f97e13efae576UL,
475 - /* 151 */ 0xd351e11aa150535bUL, 0x3e6b45bb1dd878ccUL,
476 - 0x0c776128bed92c98UL, 0x1d34ae93032885b8UL,
477 - /* 152 */ 0x4ba0488ca85ba4c3UL, 0x985348c33c9ce6ceUL,
478 - 0x66124c6f97bda770UL, 0x0f81a0290654124aUL,
479 - /* 153 */ 0x9ed09ca6569b86fdUL, 0x811009fd18af9a2dUL,
480 - 0xff08d03f93d8c20aUL, 0x52a148199faef26bUL,
481 - /* 154 */ 0x3e03f9dc2d8d1b73UL, 0x4205801873961a70UL,
482 - 0xc0d987f041a35970UL, 0x07aa1f15a1c0d549UL,
483 - /* 155 */ 0xdfd46ce08cd27224UL, 0x6d0a024f934e4239UL,
484 - 0x808a7a6399897b59UL, 0x0a4556e9e13d95a2UL,
485 - /* 156 */ 0xd21a991fe9c13045UL, 0x9b0e8548fe7751b8UL,
486 - 0x5da643cb4bf30035UL, 0x77db28d63940f721UL,
487 - /* 157 */ 0xfc5eeb614adc9011UL, 0x5229419ae8c411ebUL,
488 - 0x9ec3e7787d1dcf74UL, 0x340d053e216e4cb5UL,
489 - /* 158 */ 0xcac7af39b48df2b4UL, 0xc0faec2871a10a94UL,
490 - 0x140a69245ca575edUL, 0x0cf1c37134273a4cUL,
491 - /* 159 */ 0xc8ee306ac224b8a5UL, 0x57eaee7ccb4930b0UL,
492 - 0xa1e806bdaacbe74fUL, 0x7d9a62742eeb657dUL,
493 - /* 160 */ 0x9eb6b6ef546c4830UL, 0x885cca1fddb36e2eUL,
494 - 0xe6b9f383ef0d7105UL, 0x58654fef9d2e0412UL,
495 - /* 161 */ 0xa905c4ffbe0e8e26UL, 0x942de5df9b31816eUL,
496 - 0x497d723f802e88e1UL, 0x30684dea602f408dUL,
497 - /* 162 */ 0x21e5a278a3e6cb34UL, 0xaefb6e6f5b151dc4UL,
498 - 0xb30b8e049d77ca15UL, 0x28c3c9cf53b98981UL,
499 - /* 163 */ 0x287fb721556cdd2aUL, 0x0d317ca897022274UL,
500 - 0x7468c7423a543258UL, 0x4a7f11464eb5642fUL,
501 - /* 164 */ 0xa237a4774d193aa6UL, 0xd865986ea92129a1UL,
502 - 0x24c515ecf87c1a88UL, 0x604003575f39f5ebUL,
503 - /* 165 */ 0x47b9f189570a9b27UL, 0x2b98cede465e4b78UL,
504 - 0x026df551dbb85c20UL, 0x74fcd91047e21901UL,
505 - /* 166 */ 0x13e2a90a23c1bfa3UL, 0x0cb0074e478519f6UL,
506 - 0x5ff1cbbe3af6cf44UL, 0x67fe5438be812dbeUL,
507 - /* 167 */ 0xd13cf64fa40f05b0UL, 0x054dfb2f32283787UL,
508 - 0x4173915b7f0d2aeaUL, 0x482f144f1f610d4eUL,
509 - /* 168 */ 0xf6210201b47f8234UL, 0x5d0ae1929e70b990UL,
510 - 0xdcd7f455b049567cUL, 0x7e93d0f1f0916f01UL,
511 - /* 169 */ 0xdd79cbf18a7db4faUL, 0xbe8391bf6f74c62fUL,
512 - 0x027145d14b8291bdUL, 0x585a73ea2cbf1705UL,
513 - /* 170 */ 0x485ca03e928a0db2UL, 0x10fc01a5742857e7UL,
514 - 0x2f482edbd6d551a7UL, 0x0f0433b5048fdb8aUL,
515 - /* 171 */ 0x60da2e8dd7dc6247UL, 0x88b4c9d38cd4819aUL,
516 - 0x13033ac001f66697UL, 0x273b24fe3b367d75UL,
517 - /* 172 */ 0xc6e8f66a31b3b9d4UL, 0x281514a494df49d5UL,
518 - 0xd1726fdfc8b23da7UL, 0x4b3ae7d103dee548UL,
519 - /* 173 */ 0xc6256e19ce4b9d7eUL, 0xff5c5cf186e3c61cUL,
520 - 0xacc63ca34b8ec145UL, 0x74621888fee66574UL,
521 - /* 174 */ 0x956f409645290a1eUL, 0xef0bf8e3263a962eUL,
522 - 0xed6a50eb5ec2647bUL, 0x0694283a9dca7502UL,
523 - /* 175 */ 0x769b963643a2dcd1UL, 0x42b7c8ea09fc5353UL,
524 - 0x4f002aee13397eabUL, 0x63005e2c19b7d63aUL,
525 - /* 176 */ 0xca6736da63023beaUL, 0x966c7f6db12a99b7UL,
526 - 0xace09390c537c5e1UL, 0x0b696063a1aa89eeUL,
527 - /* 177 */ 0xebb03e97288c56e5UL, 0x432a9f9f938c8be8UL,
528 - 0xa6a5a93d5b717f71UL, 0x1a5fb4c3e18f9d97UL,
529 - /* 178 */ 0x1c94e7ad1c60cdceUL, 0xee202a43fc02c4a0UL,
530 - 0x8dafe4d867c46a20UL, 0x0a10263c8ac27b58UL,
531 - /* 179 */ 0xd0dea9dfe4432a4aUL, 0x856af87bbe9277c5UL,
532 - 0xce8472acc212c71aUL, 0x6f151b6d9bbb1e91UL,
533 - /* 180 */ 0x26776c527ceed56aUL, 0x7d211cb7fbf8faecUL,
534 - 0x37ae66a6fd4609ccUL, 0x1f81b702d2770c42UL,
535 - /* 181 */ 0x2fb0b057eac58392UL, 0xe1dd89fe29744e9dUL,
536 - 0xc964f8eb17beb4f8UL, 0x29571073c9a2d41eUL,
537 - /* 182 */ 0xa948a18981c0e254UL, 0x2df6369b65b22830UL,
538 - 0xa33eb2d75fcfd3c6UL, 0x078cd6ec4199a01fUL,
539 - /* 183 */ 0x4a584a41ad900d2fUL, 0x32142b78e2c74c52UL,
540 - 0x68c4e8338431c978UL, 0x7f69ea9008689fc2UL,
541 - /* 184 */ 0x52f2c81e46a38265UL, 0xfd78072d04a832fdUL,
542 - 0x8cd7d5fa25359e94UL, 0x4de71b7454cc29d2UL,
543 - /* 185 */ 0x42eb60ad1eda6ac9UL, 0x0aad37dfdbc09c3aUL,
544 - 0x81004b71e33cc191UL, 0x44e6be345122803cUL,
545 - /* 186 */ 0x03fe8388ba1920dbUL, 0xf5d57c32150db008UL,
546 - 0x49c8c4281af60c29UL, 0x21edb518de701aeeUL,
547 - /* 187 */ 0x7fb63e418f06dc99UL, 0xa4460d99c166d7b8UL,
548 - 0x24dd5248ce520a83UL, 0x5ec3ad712b928358UL,
549 - /* 188 */ 0x15022a5fbd17930fUL, 0xa4f64a77d82570e3UL,
550 - 0x12bc8d6915783712UL, 0x498194c0fc620abbUL,
551 - /* 189 */ 0x38a2d9d255686c82UL, 0x785c6bd9193e21f0UL,
552 - 0xe4d5c81ab24a5484UL, 0x56307860b2e20989UL,
553 - /* 190 */ 0x429d55f78b4d74c4UL, 0x22f1834643350131UL,
554 - 0x1e60c24598c71fffUL, 0x59f2f014979983efUL,
555 - /* 191 */ 0x46a47d56eb494a44UL, 0x3e22a854d636a18eUL,
556 - 0xb346e15274491c3bUL, 0x2ceafd4e5390cde7UL,
557 - /* 192 */ 0xba8a8538be0d6675UL, 0x4b9074bb50818e23UL,
558 - 0xcbdab89085d304c3UL, 0x61a24fe0e56192c4UL,
559 - /* 193 */ 0xcb7615e6db525bcbUL, 0xdd7d8c35a567e4caUL,
560 - 0xe6b4153acafcdd69UL, 0x2d668e097f3c9766UL,
561 - /* 194 */ 0xa57e7e265ce55ef0UL, 0x5d9f4e527cd4b967UL,
562 - 0xfbc83606492fd1e5UL, 0x090d52beb7c3f7aeUL,
563 - /* 195 */ 0x09b9515a1e7b4d7cUL, 0x1f266a2599da44c0UL,
564 - 0xa1c49548e2c55504UL, 0x7ef04287126f15ccUL,
565 - /* 196 */ 0xfed1659dbd30ef15UL, 0x8b4ab9eec4e0277bUL,
566 - 0x884d6236a5df3291UL, 0x1fd96ea6bf5cf788UL,
567 - /* 197 */ 0x42a161981f190d9aUL, 0x61d849507e6052c1UL,
568 - 0x9fe113bf285a2cd5UL, 0x7c22d676dbad85d8UL,
569 - /* 198 */ 0x82e770ed2bfbd27dUL, 0x4c05b2ece996f5a5UL,
570 - 0xcd40a9c2b0900150UL, 0x5895319213d9bf64UL,
571 - /* 199 */ 0xe7cc5d703fea2e08UL, 0xb50c491258e2188cUL,
572 - 0xcce30baa48205bf0UL, 0x537c659ccfa32d62UL,
573 - /* 200 */ 0x37b6623a98cfc088UL, 0xfe9bed1fa4d6aca4UL,
574 - 0x04d29b8e56a8d1b0UL, 0x725f71c40b519575UL,
575 - /* 201 */ 0x28c7f89cd0339ce6UL, 0x8367b14469ddc18bUL,
576 - 0x883ada83a6a1652cUL, 0x585f1974034d6c17UL,
577 - /* 202 */ 0x89cfb266f1b19188UL, 0xe63b4863e7c35217UL,
578 - 0xd88c9da6b4c0526aUL, 0x3e035c9df0954635UL,
579 - /* 203 */ 0xdd9d5412fb45de9dUL, 0xdd684532e4cff40dUL,
580 - 0x4b5c999b151d671cUL, 0x2d8c2cc811e7f690UL,
581 - /* 204 */ 0x7f54be1d90055d40UL, 0xa464c5df464aaf40UL,
582 - 0x33979624f0e917beUL, 0x2c018dc527356b30UL,
583 - /* 205 */ 0xa5415024e330b3d4UL, 0x73ff3d96691652d3UL,
584 - 0x94ec42c4ef9b59f1UL, 0x0747201618d08e5aUL,
585 - /* 206 */ 0x4d6ca48aca411c53UL, 0x66415f2fcfa66119UL,
586 - 0x9c4dd40051e227ffUL, 0x59810bc09a02f7ebUL,
587 - /* 207 */ 0x2a7eb171b3dc101dUL, 0x441c5ab99ffef68eUL,
588 - 0x32025c9b93b359eaUL, 0x5e8ce0a71e9d112fUL,
589 - /* 208 */ 0xbfcccb92429503fdUL, 0xd271ba752f095d55UL,
590 - 0x345ead5e972d091eUL, 0x18c8df11a83103baUL,
591 - /* 209 */ 0x90cd949a9aed0f4cUL, 0xc5d1f4cb6660e37eUL,
592 - 0xb8cac52d56c52e0bUL, 0x6e42e400c5808e0dUL,
593 - /* 210 */ 0xa3b46966eeaefd23UL, 0x0c4f1f0be39ecdcaUL,
594 - 0x189dc8c9d683a51dUL, 0x51f27f054c09351bUL,
595 - /* 211 */ 0x4c487ccd2a320682UL, 0x587ea95bb3df1c96UL,
596 - 0xc8ccf79e555cb8e8UL, 0x547dc829a206d73dUL,
597 - /* 212 */ 0xb822a6cd80c39b06UL, 0xe96d54732000d4c6UL,
598 - 0x28535b6f91463b4dUL, 0x228f4660e2486e1dUL,
599 - /* 213 */ 0x98799538de8d3abfUL, 0x8cd8330045ebca6eUL,
600 - 0x79952a008221e738UL, 0x4322e1a7535cd2bbUL,
601 - /* 214 */ 0xb114c11819d1801cUL, 0x2016e4d84f3f5ec7UL,
602 - 0xdd0e2df409260f4cUL, 0x5ec362c0ae5f7266UL,
603 - /* 215 */ 0xc0462b18b8b2b4eeUL, 0x7cc8d950274d1afbUL,
604 - 0xf25f7105436b02d2UL, 0x43bbf8dcbff9ccd3UL,
605 - /* 216 */ 0xb6ad1767a039e9dfUL, 0xb0714da8f69d3583UL,
606 - 0x5e55fa18b42931f5UL, 0x4ed5558f33c60961UL,
607 - /* 217 */ 0x1fe37901c647a5ddUL, 0x593ddf1f8081d357UL,
608 - 0x0249a4fd813fd7a6UL, 0x69acca274e9caf61UL,
609 - /* 218 */ 0x047ba3ea330721c9UL, 0x83423fc20e7e1ea0UL,
610 - 0x1df4c0af01314a60UL, 0x09a62dab89289527UL,
611 - /* 219 */ 0xa5b325a49cc6cb00UL, 0xe94b5dc654b56cb6UL,
612 - 0x3be28779adc994a0UL, 0x4296e8f8ba3a4aadUL,
613 - /* 220 */ 0x328689761e451eabUL, 0x2e4d598bff59594aUL,
614 - 0x49b96853d7a7084aUL, 0x4980a319601420a8UL,
615 - /* 221 */ 0x9565b9e12f552c42UL, 0x8a5318db7100fe96UL,
616 - 0x05c90b4d43add0d7UL, 0x538b4cd66a5d4edaUL,
617 - /* 222 */ 0xf4e94fc3e89f039fUL, 0x592c9af26f618045UL,
618 - 0x08a36eb5fd4b9550UL, 0x25fffaf6c2ed1419UL,
619 - /* 223 */ 0x34434459cc79d354UL, 0xeeecbfb4b1d5476bUL,
620 - 0xddeb34a061615d99UL, 0x5129cecceb64b773UL,
621 - /* 224 */ 0xee43215894993520UL, 0x772f9c7cf14c0b3bUL,
622 - 0xd2e2fce306bedad5UL, 0x715f42b546f06a97UL,
623 - /* 225 */ 0x434ecdceda5b5f1aUL, 0x0da17115a49741a9UL,
624 - 0x680bd77c73edad2eUL, 0x487c02354edd9041UL,
625 - /* 226 */ 0xb8efeff3a70ed9c4UL, 0x56a32aa3e857e302UL,
626 - 0xdf3a68bd48a2a5a0UL, 0x07f650b73176c444UL,
627 - /* 227 */ 0xe38b9b1626e0ccb1UL, 0x79e053c18b09fb36UL,
628 - 0x56d90319c9f94964UL, 0x1ca941e7ac9ff5c4UL,
629 - /* 228 */ 0x49c4df29162fa0bbUL, 0x8488cf3282b33305UL,
630 - 0x95dfda14cabb437dUL, 0x3391f78264d5ad86UL,
631 - /* 229 */ 0x729ae06ae2b5095dUL, 0xd58a58d73259a946UL,
632 - 0xe9834262d13921edUL, 0x27fedafaa54bb592UL,
633 - /* 230 */ 0xa99dc5b829ad48bbUL, 0x5f025742499ee260UL,
634 - 0x802c8ecd5d7513fdUL, 0x78ceb3ef3f6dd938UL,
635 - /* 231 */ 0xc342f44f8a135d94UL, 0x7b9edb44828cdda3UL,
636 - 0x9436d11a0537cfe7UL, 0x5064b164ec1ab4c8UL,
637 - /* 232 */ 0x7020eccfd37eb2fcUL, 0x1f31ea3ed90d25fcUL,
638 - 0x1b930d7bdfa1bb34UL, 0x5344467a48113044UL,
639 - /* 233 */ 0x70073170f25e6dfbUL, 0xe385dc1a50114cc8UL,
640 - 0x2348698ac8fc4f00UL, 0x2a77a55284dd40d8UL,
641 - /* 234 */ 0xfe06afe0c98c6ce4UL, 0xc235df96dddfd6e4UL,
642 - 0x1428d01e33bf1ed3UL, 0x785768ec9300bdafUL,
643 - /* 235 */ 0x9702e57a91deb63bUL, 0x61bdb8bfe5ce8b80UL,
644 - 0x645b426f3d1d58acUL, 0x4804a82227a557bcUL,
645 - /* 236 */ 0x8e57048ab44d2601UL, 0x68d6501a4b3a6935UL,
646 - 0xc39c9ec3f9e1c293UL, 0x4172f257d4de63e2UL,
647 - /* 237 */ 0xd368b450330c6401UL, 0x040d3017418f2391UL,
648 - 0x2c34bb6090b7d90dUL, 0x16f649228fdfd51fUL,
649 - /* 238 */ 0xbea6818e2b928ef5UL, 0xe28ccf91cdc11e72UL,
650 - 0x594aaa68e77a36cdUL, 0x313034806c7ffd0fUL,
651 - /* 239 */ 0x8a9d27ac2249bd65UL, 0x19a3b464018e9512UL,
652 - 0xc26ccff352b37ec7UL, 0x056f68341d797b21UL,
653 - /* 240 */ 0x5e79d6757efd2327UL, 0xfabdbcb6553afe15UL,
654 - 0xd3e7222c6eaf5a60UL, 0x7046c76d4dae743bUL,
655 - /* 241 */ 0x660be872b18d4a55UL, 0x19992518574e1496UL,
656 - 0xc103053a302bdcbbUL, 0x3ed8e9800b218e8eUL,
657 - /* 242 */ 0x7b0b9239fa75e03eUL, 0xefe9fb684633c083UL,
658 - 0x98a35fbe391a7793UL, 0x6065510fe2d0fe34UL,
659 - /* 243 */ 0x55cb668548abad0cUL, 0xb4584548da87e527UL,
660 - 0x2c43ecea0107c1ddUL, 0x526028809372de35UL,
661 - /* 244 */ 0x3415c56af9213b1fUL, 0x5bee1a4d017e98dbUL,
662 - 0x13f6b105b5cf709bUL, 0x5ff20e3482b29ab6UL,
663 - /* 245 */ 0x0aa29c75cc2e6c90UL, 0xfc7d73ca3a70e206UL,
664 - 0x899fc38fc4b5c515UL, 0x250386b124ffc207UL,
665 - /* 246 */ 0x54ea28d5ae3d2b56UL, 0x9913149dd6de60ceUL,
666 - 0x16694fc58f06d6c1UL, 0x46b23975eb018fc7UL,
667 - /* 247 */ 0x470a6a0fb4b7b4e2UL, 0x5d92475a8f7253deUL,
668 - 0xabeee5b52fbd3adbUL, 0x7fa20801a0806968UL,
669 - /* 248 */ 0x76f3faf19f7714d2UL, 0xb3e840c12f4660c3UL,
670 - 0x0fb4cd8df212744eUL, 0x4b065a251d3a2dd2UL,
671 - /* 249 */ 0x5cebde383d77cd4aUL, 0x6adf39df882c9cb1UL,
672 - 0xa2dd242eb09af759UL, 0x3147c0e50e5f6422UL,
673 - /* 250 */ 0x164ca5101d1350dbUL, 0xf8d13479c33fc962UL,
674 - 0xe640ce4d13e5da08UL, 0x4bdee0c45061f8baUL,
675 - /* 251 */ 0xd7c46dc1a4edb1c9UL, 0x5514d7b6437fd98aUL,
676 - 0x58942f6bb2a1c00bUL, 0x2dffb2ab1d70710eUL,
677 - /* 252 */ 0xccdfcf2fc18b6d68UL, 0xa8ebcba8b7806167UL,
678 - 0x980697f95e2937e3UL, 0x02fbba1cd0126e8cUL
679 -};
680 -
681 -/* c is two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7]
682 - * a is two 256-bit integers: a0[0:3] and a1[4:7]
683 - * b is two 256-bit integers: b0[0:3] and b1[4:7]
684 - */
685 -static void mul2_256x256_integer_adx(u64 *const c, const u64 *const a,
686 - const u64 *const b)
687 -{
688 - asm volatile(
689 - "xorl %%r14d, %%r14d ;"
690 - "movq (%1), %%rdx; " /* A[0] */
691 - "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
692 - "xorl %%r10d, %%r10d ;"
693 - "movq %%r8, (%0) ;"
694 - "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
695 - "adox %%r10, %%r15 ;"
696 - "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
697 - "adox %%r8, %%rax ;"
698 - "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
699 - "adox %%r10, %%rbx ;"
700 - /******************************************/
701 - "adox %%r14, %%rcx ;"
702 -
703 - "movq 8(%1), %%rdx; " /* A[1] */
704 - "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
705 - "adox %%r15, %%r8 ;"
706 - "movq %%r8, 8(%0) ;"
707 - "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
708 - "adox %%r10, %%r9 ;"
709 - "adcx %%r9, %%rax ;"
710 - "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
711 - "adox %%r8, %%r11 ;"
712 - "adcx %%r11, %%rbx ;"
713 - "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
714 - "adox %%r10, %%r13 ;"
715 - "adcx %%r13, %%rcx ;"
716 - /******************************************/
717 - "adox %%r14, %%r15 ;"
718 - "adcx %%r14, %%r15 ;"
719 -
720 - "movq 16(%1), %%rdx; " /* A[2] */
721 - "xorl %%r10d, %%r10d ;"
722 - "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
723 - "adox %%rax, %%r8 ;"
724 - "movq %%r8, 16(%0) ;"
725 - "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
726 - "adox %%r10, %%r9 ;"
727 - "adcx %%r9, %%rbx ;"
728 - "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
729 - "adox %%r8, %%r11 ;"
730 - "adcx %%r11, %%rcx ;"
731 - "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
732 - "adox %%r10, %%r13 ;"
733 - "adcx %%r13, %%r15 ;"
734 - /******************************************/
735 - "adox %%r14, %%rax ;"
736 - "adcx %%r14, %%rax ;"
737 -
738 - "movq 24(%1), %%rdx; " /* A[3] */
739 - "xorl %%r10d, %%r10d ;"
740 - "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
741 - "adox %%rbx, %%r8 ;"
742 - "movq %%r8, 24(%0) ;"
743 - "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
744 - "adox %%r10, %%r9 ;"
745 - "adcx %%r9, %%rcx ;"
746 - "movq %%rcx, 32(%0) ;"
747 - "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
748 - "adox %%r8, %%r11 ;"
749 - "adcx %%r11, %%r15 ;"
750 - "movq %%r15, 40(%0) ;"
751 - "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
752 - "adox %%r10, %%r13 ;"
753 - "adcx %%r13, %%rax ;"
754 - "movq %%rax, 48(%0) ;"
755 - /******************************************/
756 - "adox %%r14, %%rbx ;"
757 - "adcx %%r14, %%rbx ;"
758 - "movq %%rbx, 56(%0) ;"
759 -
760 - "movq 32(%1), %%rdx; " /* C[0] */
761 - "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */
762 - "xorl %%r10d, %%r10d ;"
763 - "movq %%r8, 64(%0);"
764 - "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
765 - "adox %%r10, %%r15 ;"
766 - "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */
767 - "adox %%r8, %%rax ;"
768 - "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
769 - "adox %%r10, %%rbx ;"
770 - /******************************************/
771 - "adox %%r14, %%rcx ;"
772 -
773 - "movq 40(%1), %%rdx; " /* C[1] */
774 - "xorl %%r10d, %%r10d ;"
775 - "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */
776 - "adox %%r15, %%r8 ;"
777 - "movq %%r8, 72(%0);"
778 - "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
779 - "adox %%r10, %%r9 ;"
780 - "adcx %%r9, %%rax ;"
781 - "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */
782 - "adox %%r8, %%r11 ;"
783 - "adcx %%r11, %%rbx ;"
784 - "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
785 - "adox %%r10, %%r13 ;"
786 - "adcx %%r13, %%rcx ;"
787 - /******************************************/
788 - "adox %%r14, %%r15 ;"
789 - "adcx %%r14, %%r15 ;"
790 -
791 - "movq 48(%1), %%rdx; " /* C[2] */
792 - "xorl %%r10d, %%r10d ;"
793 - "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */
794 - "adox %%rax, %%r8 ;"
795 - "movq %%r8, 80(%0);"
796 - "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
797 - "adox %%r10, %%r9 ;"
798 - "adcx %%r9, %%rbx ;"
799 - "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */
800 - "adox %%r8, %%r11 ;"
801 - "adcx %%r11, %%rcx ;"
802 - "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
803 - "adox %%r10, %%r13 ;"
804 - "adcx %%r13, %%r15 ;"
805 - /******************************************/
806 - "adox %%r14, %%rax ;"
807 - "adcx %%r14, %%rax ;"
808 -
809 - "movq 56(%1), %%rdx; " /* C[3] */
810 - "xorl %%r10d, %%r10d ;"
811 - "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */
812 - "adox %%rbx, %%r8 ;"
813 - "movq %%r8, 88(%0);"
814 - "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
815 - "adox %%r10, %%r9 ;"
816 - "adcx %%r9, %%rcx ;"
817 - "movq %%rcx, 96(%0) ;"
818 - "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */
819 - "adox %%r8, %%r11 ;"
820 - "adcx %%r11, %%r15 ;"
821 - "movq %%r15, 104(%0) ;"
822 - "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
823 - "adox %%r10, %%r13 ;"
824 - "adcx %%r13, %%rax ;"
825 - "movq %%rax, 112(%0) ;"
826 - /******************************************/
827 - "adox %%r14, %%rbx ;"
828 - "adcx %%r14, %%rbx ;"
829 - "movq %%rbx, 120(%0) ;"
830 - :
831 - : "r"(c), "r"(a), "r"(b)
832 - : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
833 - "%r10", "%r11", "%r13", "%r14", "%r15");
834 -}
835 -
836 -static void mul2_256x256_integer_bmi2(u64 *const c, const u64 *const a,
837 - const u64 *const b)
838 +static __always_inline u64 eq_mask(u64 a, u64 b)
839 {
840 - asm volatile(
841 - "movq (%1), %%rdx; " /* A[0] */
842 - "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
843 - "movq %%r8, (%0) ;"
844 - "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
845 - "addq %%r10, %%r15 ;"
846 - "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
847 - "adcq %%r8, %%rax ;"
848 - "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
849 - "adcq %%r10, %%rbx ;"
850 - /******************************************/
851 - "adcq $0, %%rcx ;"
852 -
853 - "movq 8(%1), %%rdx; " /* A[1] */
854 - "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
855 - "addq %%r15, %%r8 ;"
856 - "movq %%r8, 8(%0) ;"
857 - "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
858 - "adcq %%r10, %%r9 ;"
859 - "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
860 - "adcq %%r8, %%r11 ;"
861 - "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
862 - "adcq %%r10, %%r13 ;"
863 - /******************************************/
864 - "adcq $0, %%r15 ;"
865 -
866 - "addq %%r9, %%rax ;"
867 - "adcq %%r11, %%rbx ;"
868 - "adcq %%r13, %%rcx ;"
869 - "adcq $0, %%r15 ;"
870 -
871 - "movq 16(%1), %%rdx; " /* A[2] */
872 - "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
873 - "addq %%rax, %%r8 ;"
874 - "movq %%r8, 16(%0) ;"
875 - "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
876 - "adcq %%r10, %%r9 ;"
877 - "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
878 - "adcq %%r8, %%r11 ;"
879 - "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
880 - "adcq %%r10, %%r13 ;"
881 - /******************************************/
882 - "adcq $0, %%rax ;"
883 -
884 - "addq %%r9, %%rbx ;"
885 - "adcq %%r11, %%rcx ;"
886 - "adcq %%r13, %%r15 ;"
887 - "adcq $0, %%rax ;"
888 -
889 - "movq 24(%1), %%rdx; " /* A[3] */
890 - "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
891 - "addq %%rbx, %%r8 ;"
892 - "movq %%r8, 24(%0) ;"
893 - "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
894 - "adcq %%r10, %%r9 ;"
895 - "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
896 - "adcq %%r8, %%r11 ;"
897 - "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
898 - "adcq %%r10, %%r13 ;"
899 - /******************************************/
900 - "adcq $0, %%rbx ;"
901 -
902 - "addq %%r9, %%rcx ;"
903 - "movq %%rcx, 32(%0) ;"
904 - "adcq %%r11, %%r15 ;"
905 - "movq %%r15, 40(%0) ;"
906 - "adcq %%r13, %%rax ;"
907 - "movq %%rax, 48(%0) ;"
908 - "adcq $0, %%rbx ;"
909 - "movq %%rbx, 56(%0) ;"
910 -
911 - "movq 32(%1), %%rdx; " /* C[0] */
912 - "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */
913 - "movq %%r8, 64(%0) ;"
914 - "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
915 - "addq %%r10, %%r15 ;"
916 - "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */
917 - "adcq %%r8, %%rax ;"
918 - "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
919 - "adcq %%r10, %%rbx ;"
920 - /******************************************/
921 - "adcq $0, %%rcx ;"
922 -
923 - "movq 40(%1), %%rdx; " /* C[1] */
924 - "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */
925 - "addq %%r15, %%r8 ;"
926 - "movq %%r8, 72(%0) ;"
927 - "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
928 - "adcq %%r10, %%r9 ;"
929 - "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */
930 - "adcq %%r8, %%r11 ;"
931 - "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
932 - "adcq %%r10, %%r13 ;"
933 - /******************************************/
934 - "adcq $0, %%r15 ;"
935 -
936 - "addq %%r9, %%rax ;"
937 - "adcq %%r11, %%rbx ;"
938 - "adcq %%r13, %%rcx ;"
939 - "adcq $0, %%r15 ;"
940 -
941 - "movq 48(%1), %%rdx; " /* C[2] */
942 - "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */
943 - "addq %%rax, %%r8 ;"
944 - "movq %%r8, 80(%0) ;"
945 - "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
946 - "adcq %%r10, %%r9 ;"
947 - "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */
948 - "adcq %%r8, %%r11 ;"
949 - "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
950 - "adcq %%r10, %%r13 ;"
951 - /******************************************/
952 - "adcq $0, %%rax ;"
953 -
954 - "addq %%r9, %%rbx ;"
955 - "adcq %%r11, %%rcx ;"
956 - "adcq %%r13, %%r15 ;"
957 - "adcq $0, %%rax ;"
958 -
959 - "movq 56(%1), %%rdx; " /* C[3] */
960 - "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */
961 - "addq %%rbx, %%r8 ;"
962 - "movq %%r8, 88(%0) ;"
963 - "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
964 - "adcq %%r10, %%r9 ;"
965 - "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */
966 - "adcq %%r8, %%r11 ;"
967 - "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
968 - "adcq %%r10, %%r13 ;"
969 - /******************************************/
970 - "adcq $0, %%rbx ;"
971 -
972 - "addq %%r9, %%rcx ;"
973 - "movq %%rcx, 96(%0) ;"
974 - "adcq %%r11, %%r15 ;"
975 - "movq %%r15, 104(%0) ;"
976 - "adcq %%r13, %%rax ;"
977 - "movq %%rax, 112(%0) ;"
978 - "adcq $0, %%rbx ;"
979 - "movq %%rbx, 120(%0) ;"
980 - :
981 - : "r"(c), "r"(a), "r"(b)
982 - : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
983 - "%r10", "%r11", "%r13", "%r15");
984 + u64 x = a ^ b;
985 + u64 minus_x = ~x + (u64)1U;
986 + u64 x_or_minus_x = x | minus_x;
987 + u64 xnx = x_or_minus_x >> (u32)63U;
988 + return xnx - (u64)1U;
989 }
990
991 -static void sqr2_256x256_integer_adx(u64 *const c, const u64 *const a)
992 +static __always_inline u64 gte_mask(u64 a, u64 b)
993 {
994 - asm volatile(
995 - "movq (%1), %%rdx ;" /* A[0] */
996 - "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */
997 - "xorl %%r15d, %%r15d;"
998 - "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */
999 - "adcx %%r14, %%r9 ;"
1000 - "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
1001 - "adcx %%rax, %%r10 ;"
1002 - "movq 24(%1), %%rdx ;" /* A[3] */
1003 - "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
1004 - "adcx %%rcx, %%r11 ;"
1005 - "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
1006 - "adcx %%rax, %%rbx ;"
1007 - "movq 8(%1), %%rdx ;" /* A[1] */
1008 - "adcx %%r15, %%r13 ;"
1009 - "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
1010 - "movq $0, %%r14 ;"
1011 - /******************************************/
1012 - "adcx %%r15, %%r14 ;"
1013 -
1014 - "xorl %%r15d, %%r15d;"
1015 - "adox %%rax, %%r10 ;"
1016 - "adcx %%r8, %%r8 ;"
1017 - "adox %%rcx, %%r11 ;"
1018 - "adcx %%r9, %%r9 ;"
1019 - "adox %%r15, %%rbx ;"
1020 - "adcx %%r10, %%r10 ;"
1021 - "adox %%r15, %%r13 ;"
1022 - "adcx %%r11, %%r11 ;"
1023 - "adox %%r15, %%r14 ;"
1024 - "adcx %%rbx, %%rbx ;"
1025 - "adcx %%r13, %%r13 ;"
1026 - "adcx %%r14, %%r14 ;"
1027 -
1028 - "movq (%1), %%rdx ;"
1029 - "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
1030 - /*******************/
1031 - "movq %%rax, 0(%0) ;"
1032 - "addq %%rcx, %%r8 ;"
1033 - "movq %%r8, 8(%0) ;"
1034 - "movq 8(%1), %%rdx ;"
1035 - "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
1036 - "adcq %%rax, %%r9 ;"
1037 - "movq %%r9, 16(%0) ;"
1038 - "adcq %%rcx, %%r10 ;"
1039 - "movq %%r10, 24(%0) ;"
1040 - "movq 16(%1), %%rdx ;"
1041 - "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
1042 - "adcq %%rax, %%r11 ;"
1043 - "movq %%r11, 32(%0) ;"
1044 - "adcq %%rcx, %%rbx ;"
1045 - "movq %%rbx, 40(%0) ;"
1046 - "movq 24(%1), %%rdx ;"
1047 - "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
1048 - "adcq %%rax, %%r13 ;"
1049 - "movq %%r13, 48(%0) ;"
1050 - "adcq %%rcx, %%r14 ;"
1051 - "movq %%r14, 56(%0) ;"
1052 -
1053 -
1054 - "movq 32(%1), %%rdx ;" /* B[0] */
1055 - "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */
1056 - "xorl %%r15d, %%r15d;"
1057 - "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */
1058 - "adcx %%r14, %%r9 ;"
1059 - "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */
1060 - "adcx %%rax, %%r10 ;"
1061 - "movq 56(%1), %%rdx ;" /* B[3] */
1062 - "mulx 40(%1), %%r11, %%rbx ;" /* B[1]*B[3] */
1063 - "adcx %%rcx, %%r11 ;"
1064 - "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */
1065 - "adcx %%rax, %%rbx ;"
1066 - "movq 40(%1), %%rdx ;" /* B[1] */
1067 - "adcx %%r15, %%r13 ;"
1068 - "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */
1069 - "movq $0, %%r14 ;"
1070 - /******************************************/
1071 - "adcx %%r15, %%r14 ;"
1072 -
1073 - "xorl %%r15d, %%r15d;"
1074 - "adox %%rax, %%r10 ;"
1075 - "adcx %%r8, %%r8 ;"
1076 - "adox %%rcx, %%r11 ;"
1077 - "adcx %%r9, %%r9 ;"
1078 - "adox %%r15, %%rbx ;"
1079 - "adcx %%r10, %%r10 ;"
1080 - "adox %%r15, %%r13 ;"
1081 - "adcx %%r11, %%r11 ;"
1082 - "adox %%r15, %%r14 ;"
1083 - "adcx %%rbx, %%rbx ;"
1084 - "adcx %%r13, %%r13 ;"
1085 - "adcx %%r14, %%r14 ;"
1086 -
1087 - "movq 32(%1), %%rdx ;"
1088 - "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
1089 - /*******************/
1090 - "movq %%rax, 64(%0) ;"
1091 - "addq %%rcx, %%r8 ;"
1092 - "movq %%r8, 72(%0) ;"
1093 - "movq 40(%1), %%rdx ;"
1094 - "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
1095 - "adcq %%rax, %%r9 ;"
1096 - "movq %%r9, 80(%0) ;"
1097 - "adcq %%rcx, %%r10 ;"
1098 - "movq %%r10, 88(%0) ;"
1099 - "movq 48(%1), %%rdx ;"
1100 - "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
1101 - "adcq %%rax, %%r11 ;"
1102 - "movq %%r11, 96(%0) ;"
1103 - "adcq %%rcx, %%rbx ;"
1104 - "movq %%rbx, 104(%0) ;"
1105 - "movq 56(%1), %%rdx ;"
1106 - "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
1107 - "adcq %%rax, %%r13 ;"
1108 - "movq %%r13, 112(%0) ;"
1109 - "adcq %%rcx, %%r14 ;"
1110 - "movq %%r14, 120(%0) ;"
1111 - :
1112 - : "r"(c), "r"(a)
1113 - : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1114 - "%r10", "%r11", "%r13", "%r14", "%r15");
1115 + u64 x = a;
1116 + u64 y = b;
1117 + u64 x_xor_y = x ^ y;
1118 + u64 x_sub_y = x - y;
1119 + u64 x_sub_y_xor_y = x_sub_y ^ y;
1120 + u64 q = x_xor_y | x_sub_y_xor_y;
1121 + u64 x_xor_q = x ^ q;
1122 + u64 x_xor_q_ = x_xor_q >> (u32)63U;
1123 + return x_xor_q_ - (u64)1U;
1124 }
1125
1126 -static void sqr2_256x256_integer_bmi2(u64 *const c, const u64 *const a)
1127 +/* Computes the addition of four-element f1 with value in f2
1128 + * and returns the carry (if any) */
1129 +static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
1130 {
1131 - asm volatile(
1132 - "movq 8(%1), %%rdx ;" /* A[1] */
1133 - "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
1134 - "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
1135 - "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
1136 -
1137 - "movq 16(%1), %%rdx ;" /* A[2] */
1138 - "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
1139 - "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
1140 -
1141 - "addq %%rax, %%r9 ;"
1142 - "adcq %%rdx, %%r10 ;"
1143 - "adcq %%rcx, %%r11 ;"
1144 - "adcq %%r14, %%r15 ;"
1145 - "adcq $0, %%r13 ;"
1146 - "movq $0, %%r14 ;"
1147 - "adcq $0, %%r14 ;"
1148 -
1149 - "movq (%1), %%rdx ;" /* A[0] */
1150 - "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
1151 -
1152 - "addq %%rax, %%r10 ;"
1153 - "adcq %%rcx, %%r11 ;"
1154 - "adcq $0, %%r15 ;"
1155 - "adcq $0, %%r13 ;"
1156 - "adcq $0, %%r14 ;"
1157 -
1158 - "shldq $1, %%r13, %%r14 ;"
1159 - "shldq $1, %%r15, %%r13 ;"
1160 - "shldq $1, %%r11, %%r15 ;"
1161 - "shldq $1, %%r10, %%r11 ;"
1162 - "shldq $1, %%r9, %%r10 ;"
1163 - "shldq $1, %%r8, %%r9 ;"
1164 - "shlq $1, %%r8 ;"
1165 -
1166 - /*******************/
1167 - "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */
1168 - /*******************/
1169 - "movq %%rax, 0(%0) ;"
1170 - "addq %%rcx, %%r8 ;"
1171 - "movq %%r8, 8(%0) ;"
1172 - "movq 8(%1), %%rdx ;"
1173 - "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */
1174 - "adcq %%rax, %%r9 ;"
1175 - "movq %%r9, 16(%0) ;"
1176 - "adcq %%rcx, %%r10 ;"
1177 - "movq %%r10, 24(%0) ;"
1178 - "movq 16(%1), %%rdx ;"
1179 - "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */
1180 - "adcq %%rax, %%r11 ;"
1181 - "movq %%r11, 32(%0) ;"
1182 - "adcq %%rcx, %%r15 ;"
1183 - "movq %%r15, 40(%0) ;"
1184 - "movq 24(%1), %%rdx ;"
1185 - "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */
1186 - "adcq %%rax, %%r13 ;"
1187 - "movq %%r13, 48(%0) ;"
1188 - "adcq %%rcx, %%r14 ;"
1189 - "movq %%r14, 56(%0) ;"
1190 -
1191 - "movq 40(%1), %%rdx ;" /* B[1] */
1192 - "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */
1193 - "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */
1194 - "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */
1195 -
1196 - "movq 48(%1), %%rdx ;" /* B[2] */
1197 - "mulx 56(%1), %%r15, %%r13 ;" /* B[3]*B[2] */
1198 - "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */
1199 -
1200 - "addq %%rax, %%r9 ;"
1201 - "adcq %%rdx, %%r10 ;"
1202 - "adcq %%rcx, %%r11 ;"
1203 - "adcq %%r14, %%r15 ;"
1204 - "adcq $0, %%r13 ;"
1205 - "movq $0, %%r14 ;"
1206 - "adcq $0, %%r14 ;"
1207 -
1208 - "movq 32(%1), %%rdx ;" /* B[0] */
1209 - "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */
1210 -
1211 - "addq %%rax, %%r10 ;"
1212 - "adcq %%rcx, %%r11 ;"
1213 - "adcq $0, %%r15 ;"
1214 - "adcq $0, %%r13 ;"
1215 - "adcq $0, %%r14 ;"
1216 -
1217 - "shldq $1, %%r13, %%r14 ;"
1218 - "shldq $1, %%r15, %%r13 ;"
1219 - "shldq $1, %%r11, %%r15 ;"
1220 - "shldq $1, %%r10, %%r11 ;"
1221 - "shldq $1, %%r9, %%r10 ;"
1222 - "shldq $1, %%r8, %%r9 ;"
1223 - "shlq $1, %%r8 ;"
1224 -
1225 - /*******************/
1226 - "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */
1227 - /*******************/
1228 - "movq %%rax, 64(%0) ;"
1229 - "addq %%rcx, %%r8 ;"
1230 - "movq %%r8, 72(%0) ;"
1231 - "movq 40(%1), %%rdx ;"
1232 - "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */
1233 - "adcq %%rax, %%r9 ;"
1234 - "movq %%r9, 80(%0) ;"
1235 - "adcq %%rcx, %%r10 ;"
1236 - "movq %%r10, 88(%0) ;"
1237 - "movq 48(%1), %%rdx ;"
1238 - "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */
1239 - "adcq %%rax, %%r11 ;"
1240 - "movq %%r11, 96(%0) ;"
1241 - "adcq %%rcx, %%r15 ;"
1242 - "movq %%r15, 104(%0) ;"
1243 - "movq 56(%1), %%rdx ;"
1244 - "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */
1245 - "adcq %%rax, %%r13 ;"
1246 - "movq %%r13, 112(%0) ;"
1247 - "adcq %%rcx, %%r14 ;"
1248 - "movq %%r14, 120(%0) ;"
1249 - :
1250 - : "r"(c), "r"(a)
1251 - : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1252 - "%r11", "%r13", "%r14", "%r15");
1253 -}
1254 + u64 carry_r;
1255
1256 -static void red_eltfp25519_2w_adx(u64 *const c, const u64 *const a)
1257 -{
1258 asm volatile(
1259 - "movl $38, %%edx; " /* 2*c = 38 = 2^256 */
1260 - "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */
1261 - "xorl %%ebx, %%ebx ;"
1262 - "adox (%1), %%r8 ;"
1263 - "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */
1264 - "adcx %%r10, %%r9 ;"
1265 - "adox 8(%1), %%r9 ;"
1266 - "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */
1267 - "adcx %%r11, %%r10 ;"
1268 - "adox 16(%1), %%r10 ;"
1269 - "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */
1270 - "adcx %%rax, %%r11 ;"
1271 - "adox 24(%1), %%r11 ;"
1272 - /***************************************/
1273 - "adcx %%rbx, %%rcx ;"
1274 - "adox %%rbx, %%rcx ;"
1275 - "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
1276 - "adcx %%rcx, %%r8 ;"
1277 - "adcx %%rbx, %%r9 ;"
1278 - "movq %%r9, 8(%0) ;"
1279 - "adcx %%rbx, %%r10 ;"
1280 - "movq %%r10, 16(%0) ;"
1281 - "adcx %%rbx, %%r11 ;"
1282 - "movq %%r11, 24(%0) ;"
1283 - "mov $0, %%ecx ;"
1284 - "cmovc %%edx, %%ecx ;"
1285 - "addq %%rcx, %%r8 ;"
1286 - "movq %%r8, (%0) ;"
1287 -
1288 - "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */
1289 - "xorl %%ebx, %%ebx ;"
1290 - "adox 64(%1), %%r8 ;"
1291 - "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */
1292 - "adcx %%r10, %%r9 ;"
1293 - "adox 72(%1), %%r9 ;"
1294 - "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */
1295 - "adcx %%r11, %%r10 ;"
1296 - "adox 80(%1), %%r10 ;"
1297 - "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */
1298 - "adcx %%rax, %%r11 ;"
1299 - "adox 88(%1), %%r11 ;"
1300 - /****************************************/
1301 - "adcx %%rbx, %%rcx ;"
1302 - "adox %%rbx, %%rcx ;"
1303 - "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
1304 - "adcx %%rcx, %%r8 ;"
1305 - "adcx %%rbx, %%r9 ;"
1306 - "movq %%r9, 40(%0) ;"
1307 - "adcx %%rbx, %%r10 ;"
1308 - "movq %%r10, 48(%0) ;"
1309 - "adcx %%rbx, %%r11 ;"
1310 - "movq %%r11, 56(%0) ;"
1311 - "mov $0, %%ecx ;"
1312 - "cmovc %%edx, %%ecx ;"
1313 - "addq %%rcx, %%r8 ;"
1314 - "movq %%r8, 32(%0) ;"
1315 - :
1316 - : "r"(c), "r"(a)
1317 - : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1318 - "%r10", "%r11");
1319 -}
1320 + /* Clear registers to propagate the carry bit */
1321 + " xor %%r8, %%r8;"
1322 + " xor %%r9, %%r9;"
1323 + " xor %%r10, %%r10;"
1324 + " xor %%r11, %%r11;"
1325 + " xor %1, %1;"
1326 +
1327 + /* Begin addition chain */
1328 + " addq 0(%3), %0;"
1329 + " movq %0, 0(%2);"
1330 + " adcxq 8(%3), %%r8;"
1331 + " movq %%r8, 8(%2);"
1332 + " adcxq 16(%3), %%r9;"
1333 + " movq %%r9, 16(%2);"
1334 + " adcxq 24(%3), %%r10;"
1335 + " movq %%r10, 24(%2);"
1336 +
1337 + /* Return the carry bit in a register */
1338 + " adcx %%r11, %1;"
1339 + : "+&r" (f2), "=&r" (carry_r)
1340 + : "r" (out), "r" (f1)
1341 + : "%r8", "%r9", "%r10", "%r11", "memory", "cc"
1342 + );
1343
1344 -static void red_eltfp25519_2w_bmi2(u64 *const c, const u64 *const a)
1345 -{
1346 - asm volatile(
1347 - "movl $38, %%edx ; " /* 2*c = 38 = 2^256 */
1348 - "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
1349 - "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
1350 - "addq %%r10, %%r9 ;"
1351 - "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
1352 - "adcq %%r11, %%r10 ;"
1353 - "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
1354 - "adcq %%rax, %%r11 ;"
1355 - /***************************************/
1356 - "adcq $0, %%rcx ;"
1357 - "addq (%1), %%r8 ;"
1358 - "adcq 8(%1), %%r9 ;"
1359 - "adcq 16(%1), %%r10 ;"
1360 - "adcq 24(%1), %%r11 ;"
1361 - "adcq $0, %%rcx ;"
1362 - "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
1363 - "addq %%rcx, %%r8 ;"
1364 - "adcq $0, %%r9 ;"
1365 - "movq %%r9, 8(%0) ;"
1366 - "adcq $0, %%r10 ;"
1367 - "movq %%r10, 16(%0) ;"
1368 - "adcq $0, %%r11 ;"
1369 - "movq %%r11, 24(%0) ;"
1370 - "mov $0, %%ecx ;"
1371 - "cmovc %%edx, %%ecx ;"
1372 - "addq %%rcx, %%r8 ;"
1373 - "movq %%r8, (%0) ;"
1374 -
1375 - "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */
1376 - "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */
1377 - "addq %%r10, %%r9 ;"
1378 - "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */
1379 - "adcq %%r11, %%r10 ;"
1380 - "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */
1381 - "adcq %%rax, %%r11 ;"
1382 - /****************************************/
1383 - "adcq $0, %%rcx ;"
1384 - "addq 64(%1), %%r8 ;"
1385 - "adcq 72(%1), %%r9 ;"
1386 - "adcq 80(%1), %%r10 ;"
1387 - "adcq 88(%1), %%r11 ;"
1388 - "adcq $0, %%rcx ;"
1389 - "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
1390 - "addq %%rcx, %%r8 ;"
1391 - "adcq $0, %%r9 ;"
1392 - "movq %%r9, 40(%0) ;"
1393 - "adcq $0, %%r10 ;"
1394 - "movq %%r10, 48(%0) ;"
1395 - "adcq $0, %%r11 ;"
1396 - "movq %%r11, 56(%0) ;"
1397 - "mov $0, %%ecx ;"
1398 - "cmovc %%edx, %%ecx ;"
1399 - "addq %%rcx, %%r8 ;"
1400 - "movq %%r8, 32(%0) ;"
1401 - :
1402 - : "r"(c), "r"(a)
1403 - : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1404 - "%r11");
1405 + return carry_r;
1406 }
1407
1408 -static void mul_256x256_integer_adx(u64 *const c, const u64 *const a,
1409 - const u64 *const b)
1410 +/* Computes the field addition of two field elements */
1411 +static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
1412 {
1413 asm volatile(
1414 - "movq (%1), %%rdx; " /* A[0] */
1415 - "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */
1416 - "xorl %%r10d, %%r10d ;"
1417 - "movq %%r8, (%0) ;"
1418 - "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */
1419 - "adox %%r9, %%r10 ;"
1420 - "movq %%r10, 8(%0) ;"
1421 - "mulx 16(%2), %%r15, %%r13; " /* A[0]*B[2] */
1422 - "adox %%r11, %%r15 ;"
1423 - "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */
1424 - "adox %%r13, %%r14 ;"
1425 - "movq $0, %%rax ;"
1426 - /******************************************/
1427 - "adox %%rdx, %%rax ;"
1428 -
1429 - "movq 8(%1), %%rdx; " /* A[1] */
1430 - "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
1431 - "xorl %%r10d, %%r10d ;"
1432 - "adcx 8(%0), %%r8 ;"
1433 - "movq %%r8, 8(%0) ;"
1434 - "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
1435 - "adox %%r9, %%r10 ;"
1436 - "adcx %%r15, %%r10 ;"
1437 - "movq %%r10, 16(%0) ;"
1438 - "mulx 16(%2), %%r15, %%r13; " /* A[1]*B[2] */
1439 - "adox %%r11, %%r15 ;"
1440 - "adcx %%r14, %%r15 ;"
1441 - "movq $0, %%r8 ;"
1442 - "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */
1443 - "adox %%r13, %%r14 ;"
1444 - "adcx %%rax, %%r14 ;"
1445 - "movq $0, %%rax ;"
1446 - /******************************************/
1447 - "adox %%rdx, %%rax ;"
1448 - "adcx %%r8, %%rax ;"
1449 -
1450 - "movq 16(%1), %%rdx; " /* A[2] */
1451 - "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
1452 - "xorl %%r10d, %%r10d ;"
1453 - "adcx 16(%0), %%r8 ;"
1454 - "movq %%r8, 16(%0) ;"
1455 - "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
1456 - "adox %%r9, %%r10 ;"
1457 - "adcx %%r15, %%r10 ;"
1458 - "movq %%r10, 24(%0) ;"
1459 - "mulx 16(%2), %%r15, %%r13; " /* A[2]*B[2] */
1460 - "adox %%r11, %%r15 ;"
1461 - "adcx %%r14, %%r15 ;"
1462 - "movq $0, %%r8 ;"
1463 - "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */
1464 - "adox %%r13, %%r14 ;"
1465 - "adcx %%rax, %%r14 ;"
1466 - "movq $0, %%rax ;"
1467 - /******************************************/
1468 - "adox %%rdx, %%rax ;"
1469 - "adcx %%r8, %%rax ;"
1470 -
1471 - "movq 24(%1), %%rdx; " /* A[3] */
1472 - "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
1473 - "xorl %%r10d, %%r10d ;"
1474 - "adcx 24(%0), %%r8 ;"
1475 - "movq %%r8, 24(%0) ;"
1476 - "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
1477 - "adox %%r9, %%r10 ;"
1478 - "adcx %%r15, %%r10 ;"
1479 - "movq %%r10, 32(%0) ;"
1480 - "mulx 16(%2), %%r15, %%r13; " /* A[3]*B[2] */
1481 - "adox %%r11, %%r15 ;"
1482 - "adcx %%r14, %%r15 ;"
1483 - "movq %%r15, 40(%0) ;"
1484 - "movq $0, %%r8 ;"
1485 - "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */
1486 - "adox %%r13, %%r14 ;"
1487 - "adcx %%rax, %%r14 ;"
1488 - "movq %%r14, 48(%0) ;"
1489 - "movq $0, %%rax ;"
1490 - /******************************************/
1491 - "adox %%rdx, %%rax ;"
1492 - "adcx %%r8, %%rax ;"
1493 - "movq %%rax, 56(%0) ;"
1494 - :
1495 - : "r"(c), "r"(a), "r"(b)
1496 - : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11",
1497 - "%r13", "%r14", "%r15");
1498 + /* Compute the raw addition of f1 + f2 */
1499 + " movq 0(%0), %%r8;"
1500 + " addq 0(%2), %%r8;"
1501 + " movq 8(%0), %%r9;"
1502 + " adcxq 8(%2), %%r9;"
1503 + " movq 16(%0), %%r10;"
1504 + " adcxq 16(%2), %%r10;"
1505 + " movq 24(%0), %%r11;"
1506 + " adcxq 24(%2), %%r11;"
1507 +
1508 + /* Wrap the result back into the field */
1509 +
1510 + /* Step 1: Compute carry*38 */
1511 + " mov $0, %%rax;"
1512 + " mov $38, %0;"
1513 + " cmovc %0, %%rax;"
1514 +
1515 + /* Step 2: Add carry*38 to the original sum */
1516 + " xor %%rcx, %%rcx;"
1517 + " add %%rax, %%r8;"
1518 + " adcx %%rcx, %%r9;"
1519 + " movq %%r9, 8(%1);"
1520 + " adcx %%rcx, %%r10;"
1521 + " movq %%r10, 16(%1);"
1522 + " adcx %%rcx, %%r11;"
1523 + " movq %%r11, 24(%1);"
1524 +
1525 + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
1526 + " mov $0, %%rax;"
1527 + " cmovc %0, %%rax;"
1528 + " add %%rax, %%r8;"
1529 + " movq %%r8, 0(%1);"
1530 + : "+&r" (f2)
1531 + : "r" (out), "r" (f1)
1532 + : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
1533 + );
1534 }
1535
1536 -static void mul_256x256_integer_bmi2(u64 *const c, const u64 *const a,
1537 - const u64 *const b)
1538 +/* Computes the field subtraction of two field elements */
1539 +static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
1540 {
1541 asm volatile(
1542 - "movq (%1), %%rdx; " /* A[0] */
1543 - "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
1544 - "movq %%r8, (%0) ;"
1545 - "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
1546 - "addq %%r10, %%r15 ;"
1547 - "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
1548 - "adcq %%r8, %%rax ;"
1549 - "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
1550 - "adcq %%r10, %%rbx ;"
1551 - /******************************************/
1552 - "adcq $0, %%rcx ;"
1553 -
1554 - "movq 8(%1), %%rdx; " /* A[1] */
1555 - "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
1556 - "addq %%r15, %%r8 ;"
1557 - "movq %%r8, 8(%0) ;"
1558 - "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
1559 - "adcq %%r10, %%r9 ;"
1560 - "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
1561 - "adcq %%r8, %%r11 ;"
1562 - "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
1563 - "adcq %%r10, %%r13 ;"
1564 - /******************************************/
1565 - "adcq $0, %%r15 ;"
1566 -
1567 - "addq %%r9, %%rax ;"
1568 - "adcq %%r11, %%rbx ;"
1569 - "adcq %%r13, %%rcx ;"
1570 - "adcq $0, %%r15 ;"
1571 -
1572 - "movq 16(%1), %%rdx; " /* A[2] */
1573 - "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
1574 - "addq %%rax, %%r8 ;"
1575 - "movq %%r8, 16(%0) ;"
1576 - "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
1577 - "adcq %%r10, %%r9 ;"
1578 - "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
1579 - "adcq %%r8, %%r11 ;"
1580 - "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
1581 - "adcq %%r10, %%r13 ;"
1582 - /******************************************/
1583 - "adcq $0, %%rax ;"
1584 -
1585 - "addq %%r9, %%rbx ;"
1586 - "adcq %%r11, %%rcx ;"
1587 - "adcq %%r13, %%r15 ;"
1588 - "adcq $0, %%rax ;"
1589 -
1590 - "movq 24(%1), %%rdx; " /* A[3] */
1591 - "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
1592 - "addq %%rbx, %%r8 ;"
1593 - "movq %%r8, 24(%0) ;"
1594 - "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
1595 - "adcq %%r10, %%r9 ;"
1596 - "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
1597 - "adcq %%r8, %%r11 ;"
1598 - "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
1599 - "adcq %%r10, %%r13 ;"
1600 - /******************************************/
1601 - "adcq $0, %%rbx ;"
1602 -
1603 - "addq %%r9, %%rcx ;"
1604 - "movq %%rcx, 32(%0) ;"
1605 - "adcq %%r11, %%r15 ;"
1606 - "movq %%r15, 40(%0) ;"
1607 - "adcq %%r13, %%rax ;"
1608 - "movq %%rax, 48(%0) ;"
1609 - "adcq $0, %%rbx ;"
1610 - "movq %%rbx, 56(%0) ;"
1611 - :
1612 - : "r"(c), "r"(a), "r"(b)
1613 - : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1614 - "%r10", "%r11", "%r13", "%r15");
1615 + /* Compute the raw subtraction of f1-f2 */
1616 + " movq 0(%1), %%r8;"
1617 + " subq 0(%2), %%r8;"
1618 + " movq 8(%1), %%r9;"
1619 + " sbbq 8(%2), %%r9;"
1620 + " movq 16(%1), %%r10;"
1621 + " sbbq 16(%2), %%r10;"
1622 + " movq 24(%1), %%r11;"
1623 + " sbbq 24(%2), %%r11;"
1624 +
1625 + /* Wrap the result back into the field */
1626 +
1627 + /* Step 1: Compute carry*38 */
1628 + " mov $0, %%rax;"
1629 + " mov $38, %%rcx;"
1630 + " cmovc %%rcx, %%rax;"
1631 +
1632 + /* Step 2: Subtract carry*38 from the original difference */
1633 + " sub %%rax, %%r8;"
1634 + " sbb $0, %%r9;"
1635 + " sbb $0, %%r10;"
1636 + " sbb $0, %%r11;"
1637 +
1638 + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
1639 + " mov $0, %%rax;"
1640 + " cmovc %%rcx, %%rax;"
1641 + " sub %%rax, %%r8;"
1642 +
1643 + /* Store the result */
1644 + " movq %%r8, 0(%0);"
1645 + " movq %%r9, 8(%0);"
1646 + " movq %%r10, 16(%0);"
1647 + " movq %%r11, 24(%0);"
1648 + :
1649 + : "r" (out), "r" (f1), "r" (f2)
1650 + : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
1651 + );
1652 }
1653
1654 -static void sqr_256x256_integer_adx(u64 *const c, const u64 *const a)
1655 +/* Computes a field multiplication: out <- f1 * f2
1656 + * Uses the 8-element buffer tmp for intermediate results */
1657 +static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
1658 {
1659 asm volatile(
1660 - "movq (%1), %%rdx ;" /* A[0] */
1661 - "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */
1662 - "xorl %%r15d, %%r15d;"
1663 - "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */
1664 - "adcx %%r14, %%r9 ;"
1665 - "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
1666 - "adcx %%rax, %%r10 ;"
1667 - "movq 24(%1), %%rdx ;" /* A[3] */
1668 - "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
1669 - "adcx %%rcx, %%r11 ;"
1670 - "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
1671 - "adcx %%rax, %%rbx ;"
1672 - "movq 8(%1), %%rdx ;" /* A[1] */
1673 - "adcx %%r15, %%r13 ;"
1674 - "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
1675 - "movq $0, %%r14 ;"
1676 - /******************************************/
1677 - "adcx %%r15, %%r14 ;"
1678 -
1679 - "xorl %%r15d, %%r15d;"
1680 - "adox %%rax, %%r10 ;"
1681 - "adcx %%r8, %%r8 ;"
1682 - "adox %%rcx, %%r11 ;"
1683 - "adcx %%r9, %%r9 ;"
1684 - "adox %%r15, %%rbx ;"
1685 - "adcx %%r10, %%r10 ;"
1686 - "adox %%r15, %%r13 ;"
1687 - "adcx %%r11, %%r11 ;"
1688 - "adox %%r15, %%r14 ;"
1689 - "adcx %%rbx, %%rbx ;"
1690 - "adcx %%r13, %%r13 ;"
1691 - "adcx %%r14, %%r14 ;"
1692 -
1693 - "movq (%1), %%rdx ;"
1694 - "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
1695 - /*******************/
1696 - "movq %%rax, 0(%0) ;"
1697 - "addq %%rcx, %%r8 ;"
1698 - "movq %%r8, 8(%0) ;"
1699 - "movq 8(%1), %%rdx ;"
1700 - "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
1701 - "adcq %%rax, %%r9 ;"
1702 - "movq %%r9, 16(%0) ;"
1703 - "adcq %%rcx, %%r10 ;"
1704 - "movq %%r10, 24(%0) ;"
1705 - "movq 16(%1), %%rdx ;"
1706 - "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
1707 - "adcq %%rax, %%r11 ;"
1708 - "movq %%r11, 32(%0) ;"
1709 - "adcq %%rcx, %%rbx ;"
1710 - "movq %%rbx, 40(%0) ;"
1711 - "movq 24(%1), %%rdx ;"
1712 - "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
1713 - "adcq %%rax, %%r13 ;"
1714 - "movq %%r13, 48(%0) ;"
1715 - "adcq %%rcx, %%r14 ;"
1716 - "movq %%r14, 56(%0) ;"
1717 - :
1718 - : "r"(c), "r"(a)
1719 - : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1720 - "%r10", "%r11", "%r13", "%r14", "%r15");
1721 -}
1722 + /* Compute the raw multiplication: tmp <- src1 * src2 */
1723
1724 -static void sqr_256x256_integer_bmi2(u64 *const c, const u64 *const a)
1725 -{
1726 - asm volatile(
1727 - "movq 8(%1), %%rdx ;" /* A[1] */
1728 - "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
1729 - "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
1730 - "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
1731 -
1732 - "movq 16(%1), %%rdx ;" /* A[2] */
1733 - "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
1734 - "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
1735 -
1736 - "addq %%rax, %%r9 ;"
1737 - "adcq %%rdx, %%r10 ;"
1738 - "adcq %%rcx, %%r11 ;"
1739 - "adcq %%r14, %%r15 ;"
1740 - "adcq $0, %%r13 ;"
1741 - "movq $0, %%r14 ;"
1742 - "adcq $0, %%r14 ;"
1743 -
1744 - "movq (%1), %%rdx ;" /* A[0] */
1745 - "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
1746 -
1747 - "addq %%rax, %%r10 ;"
1748 - "adcq %%rcx, %%r11 ;"
1749 - "adcq $0, %%r15 ;"
1750 - "adcq $0, %%r13 ;"
1751 - "adcq $0, %%r14 ;"
1752 -
1753 - "shldq $1, %%r13, %%r14 ;"
1754 - "shldq $1, %%r15, %%r13 ;"
1755 - "shldq $1, %%r11, %%r15 ;"
1756 - "shldq $1, %%r10, %%r11 ;"
1757 - "shldq $1, %%r9, %%r10 ;"
1758 - "shldq $1, %%r8, %%r9 ;"
1759 - "shlq $1, %%r8 ;"
1760 -
1761 - /*******************/
1762 - "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
1763 - /*******************/
1764 - "movq %%rax, 0(%0) ;"
1765 - "addq %%rcx, %%r8 ;"
1766 - "movq %%r8, 8(%0) ;"
1767 - "movq 8(%1), %%rdx ;"
1768 - "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
1769 - "adcq %%rax, %%r9 ;"
1770 - "movq %%r9, 16(%0) ;"
1771 - "adcq %%rcx, %%r10 ;"
1772 - "movq %%r10, 24(%0) ;"
1773 - "movq 16(%1), %%rdx ;"
1774 - "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
1775 - "adcq %%rax, %%r11 ;"
1776 - "movq %%r11, 32(%0) ;"
1777 - "adcq %%rcx, %%r15 ;"
1778 - "movq %%r15, 40(%0) ;"
1779 - "movq 24(%1), %%rdx ;"
1780 - "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
1781 - "adcq %%rax, %%r13 ;"
1782 - "movq %%r13, 48(%0) ;"
1783 - "adcq %%rcx, %%r14 ;"
1784 - "movq %%r14, 56(%0) ;"
1785 - :
1786 - : "r"(c), "r"(a)
1787 - : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1788 - "%r11", "%r13", "%r14", "%r15");
1789 + /* Compute src1[0] * src2 */
1790 + " movq 0(%1), %%rdx;"
1791 + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
1792 + " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
1793 + " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
1794 + " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
1795 + " adox %%rdx, %%rax;"
1796 + /* Compute src1[1] * src2 */
1797 + " movq 8(%1), %%rdx;"
1798 + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
1799 + " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 16(%0);"
1800 + " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
1801 + " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
1802 + " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
1803 + /* Compute src1[2] * src2 */
1804 + " movq 16(%1), %%rdx;"
1805 + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
1806 + " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 24(%0);"
1807 + " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
1808 + " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
1809 + " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
1810 + /* Compute src1[3] * src2 */
1811 + " movq 24(%1), %%rdx;"
1812 + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
1813 + " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 32(%0);"
1814 + " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 40(%0);" " mov $0, %%r8;"
1815 + " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
1816 + " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
1817 + /* Line up pointers */
1818 + " mov %0, %1;"
1819 + " mov %2, %0;"
1820 +
1821 + /* Wrap the result back into the field */
1822 +
1823 + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
1824 + " mov $38, %%rdx;"
1825 + " mulxq 32(%1), %%r8, %%r13;"
1826 + " xor %3, %3;"
1827 + " adoxq 0(%1), %%r8;"
1828 + " mulxq 40(%1), %%r9, %%r12;"
1829 + " adcx %%r13, %%r9;"
1830 + " adoxq 8(%1), %%r9;"
1831 + " mulxq 48(%1), %%r10, %%r13;"
1832 + " adcx %%r12, %%r10;"
1833 + " adoxq 16(%1), %%r10;"
1834 + " mulxq 56(%1), %%r11, %%rax;"
1835 + " adcx %%r13, %%r11;"
1836 + " adoxq 24(%1), %%r11;"
1837 + " adcx %3, %%rax;"
1838 + " adox %3, %%rax;"
1839 + " imul %%rdx, %%rax;"
1840 +
1841 + /* Step 2: Fold the carry back into dst */
1842 + " add %%rax, %%r8;"
1843 + " adcx %3, %%r9;"
1844 + " movq %%r9, 8(%0);"
1845 + " adcx %3, %%r10;"
1846 + " movq %%r10, 16(%0);"
1847 + " adcx %3, %%r11;"
1848 + " movq %%r11, 24(%0);"
1849 +
1850 + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
1851 + " mov $0, %%rax;"
1852 + " cmovc %%rdx, %%rax;"
1853 + " add %%rax, %%r8;"
1854 + " movq %%r8, 0(%0);"
1855 + : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
1856 + :
1857 + : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc"
1858 + );
1859 }
1860
1861 -static void red_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
1862 +/* Computes two field multiplications:
1863 + * out[0] <- f1[0] * f2[0]
1864 + * out[1] <- f1[1] * f2[1]
1865 + * Uses the 16-element buffer tmp for intermediate results. */
1866 +static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
1867 {
1868 asm volatile(
1869 - "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
1870 - "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
1871 - "xorl %%ebx, %%ebx ;"
1872 - "adox (%1), %%r8 ;"
1873 - "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
1874 - "adcx %%r10, %%r9 ;"
1875 - "adox 8(%1), %%r9 ;"
1876 - "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
1877 - "adcx %%r11, %%r10 ;"
1878 - "adox 16(%1), %%r10 ;"
1879 - "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
1880 - "adcx %%rax, %%r11 ;"
1881 - "adox 24(%1), %%r11 ;"
1882 - /***************************************/
1883 - "adcx %%rbx, %%rcx ;"
1884 - "adox %%rbx, %%rcx ;"
1885 - "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
1886 - "adcx %%rcx, %%r8 ;"
1887 - "adcx %%rbx, %%r9 ;"
1888 - "movq %%r9, 8(%0) ;"
1889 - "adcx %%rbx, %%r10 ;"
1890 - "movq %%r10, 16(%0) ;"
1891 - "adcx %%rbx, %%r11 ;"
1892 - "movq %%r11, 24(%0) ;"
1893 - "mov $0, %%ecx ;"
1894 - "cmovc %%edx, %%ecx ;"
1895 - "addq %%rcx, %%r8 ;"
1896 - "movq %%r8, (%0) ;"
1897 - :
1898 - : "r"(c), "r"(a)
1899 - : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1900 - "%r10", "%r11");
1901 -}
1902 + /* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */
1903
1904 -static void red_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
1905 -{
1906 - asm volatile(
1907 - "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
1908 - "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
1909 - "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
1910 - "addq %%r10, %%r9 ;"
1911 - "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
1912 - "adcq %%r11, %%r10 ;"
1913 - "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
1914 - "adcq %%rax, %%r11 ;"
1915 - /***************************************/
1916 - "adcq $0, %%rcx ;"
1917 - "addq (%1), %%r8 ;"
1918 - "adcq 8(%1), %%r9 ;"
1919 - "adcq 16(%1), %%r10 ;"
1920 - "adcq 24(%1), %%r11 ;"
1921 - "adcq $0, %%rcx ;"
1922 - "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
1923 - "addq %%rcx, %%r8 ;"
1924 - "adcq $0, %%r9 ;"
1925 - "movq %%r9, 8(%0) ;"
1926 - "adcq $0, %%r10 ;"
1927 - "movq %%r10, 16(%0) ;"
1928 - "adcq $0, %%r11 ;"
1929 - "movq %%r11, 24(%0) ;"
1930 - "mov $0, %%ecx ;"
1931 - "cmovc %%edx, %%ecx ;"
1932 - "addq %%rcx, %%r8 ;"
1933 - "movq %%r8, (%0) ;"
1934 - :
1935 - : "r"(c), "r"(a)
1936 - : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1937 - "%r11");
1938 + /* Compute src1[0] * src2 */
1939 + " movq 0(%1), %%rdx;"
1940 + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
1941 + " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
1942 + " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
1943 + " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
1944 + " adox %%rdx, %%rax;"
1945 + /* Compute src1[1] * src2 */
1946 + " movq 8(%1), %%rdx;"
1947 + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
1948 + " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 16(%0);"
1949 + " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
1950 + " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
1951 + " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
1952 + /* Compute src1[2] * src2 */
1953 + " movq 16(%1), %%rdx;"
1954 + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
1955 + " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 24(%0);"
1956 + " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
1957 + " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
1958 + " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
1959 + /* Compute src1[3] * src2 */
1960 + " movq 24(%1), %%rdx;"
1961 + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
1962 + " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 32(%0);"
1963 + " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 40(%0);" " mov $0, %%r8;"
1964 + " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
1965 + " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
1966 +
1967 + /* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */
1968 +
1969 + /* Compute src1[0] * src2 */
1970 + " movq 32(%1), %%rdx;"
1971 + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 64(%0);"
1972 + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
1973 + " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
1974 + " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
1975 + " adox %%rdx, %%rax;"
1976 + /* Compute src1[1] * src2 */
1977 + " movq 40(%1), %%rdx;"
1978 + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
1979 + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 80(%0);"
1980 + " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
1981 + " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
1982 + " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
1983 + /* Compute src1[2] * src2 */
1984 + " movq 48(%1), %%rdx;"
1985 + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
1986 + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 88(%0);"
1987 + " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
1988 + " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
1989 + " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
1990 + /* Compute src1[3] * src2 */
1991 + " movq 56(%1), %%rdx;"
1992 + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
1993 + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 96(%0);"
1994 + " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 104(%0);" " mov $0, %%r8;"
1995 + " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
1996 + " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
1997 + /* Line up pointers */
1998 + " mov %0, %1;"
1999 + " mov %2, %0;"
2000 +
2001 + /* Wrap the results back into the field */
2002 +
2003 + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
2004 + " mov $38, %%rdx;"
2005 + " mulxq 32(%1), %%r8, %%r13;"
2006 + " xor %3, %3;"
2007 + " adoxq 0(%1), %%r8;"
2008 + " mulxq 40(%1), %%r9, %%r12;"
2009 + " adcx %%r13, %%r9;"
2010 + " adoxq 8(%1), %%r9;"
2011 + " mulxq 48(%1), %%r10, %%r13;"
2012 + " adcx %%r12, %%r10;"
2013 + " adoxq 16(%1), %%r10;"
2014 + " mulxq 56(%1), %%r11, %%rax;"
2015 + " adcx %%r13, %%r11;"
2016 + " adoxq 24(%1), %%r11;"
2017 + " adcx %3, %%rax;"
2018 + " adox %3, %%rax;"
2019 + " imul %%rdx, %%rax;"
2020 +
2021 + /* Step 2: Fold the carry back into dst */
2022 + " add %%rax, %%r8;"
2023 + " adcx %3, %%r9;"
2024 + " movq %%r9, 8(%0);"
2025 + " adcx %3, %%r10;"
2026 + " movq %%r10, 16(%0);"
2027 + " adcx %3, %%r11;"
2028 + " movq %%r11, 24(%0);"
2029 +
2030 + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
2031 + " mov $0, %%rax;"
2032 + " cmovc %%rdx, %%rax;"
2033 + " add %%rax, %%r8;"
2034 + " movq %%r8, 0(%0);"
2035 +
2036 + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
2037 + " mov $38, %%rdx;"
2038 + " mulxq 96(%1), %%r8, %%r13;"
2039 + " xor %3, %3;"
2040 + " adoxq 64(%1), %%r8;"
2041 + " mulxq 104(%1), %%r9, %%r12;"
2042 + " adcx %%r13, %%r9;"
2043 + " adoxq 72(%1), %%r9;"
2044 + " mulxq 112(%1), %%r10, %%r13;"
2045 + " adcx %%r12, %%r10;"
2046 + " adoxq 80(%1), %%r10;"
2047 + " mulxq 120(%1), %%r11, %%rax;"
2048 + " adcx %%r13, %%r11;"
2049 + " adoxq 88(%1), %%r11;"
2050 + " adcx %3, %%rax;"
2051 + " adox %3, %%rax;"
2052 + " imul %%rdx, %%rax;"
2053 +
2054 + /* Step 2: Fold the carry back into dst */
2055 + " add %%rax, %%r8;"
2056 + " adcx %3, %%r9;"
2057 + " movq %%r9, 40(%0);"
2058 + " adcx %3, %%r10;"
2059 + " movq %%r10, 48(%0);"
2060 + " adcx %3, %%r11;"
2061 + " movq %%r11, 56(%0);"
2062 +
2063 + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
2064 + " mov $0, %%rax;"
2065 + " cmovc %%rdx, %%rax;"
2066 + " add %%rax, %%r8;"
2067 + " movq %%r8, 32(%0);"
2068 + : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
2069 + :
2070 + : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc"
2071 + );
2072 }
2073
2074 -static __always_inline void
2075 -add_eltfp25519_1w_adx(u64 *const c, const u64 *const a, const u64 *const b)
2076 +/* Computes the field multiplication of four-element f1 with value in f2 */
2077 +static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
2078 {
2079 - asm volatile(
2080 - "mov $38, %%eax ;"
2081 - "xorl %%ecx, %%ecx ;"
2082 - "movq (%2), %%r8 ;"
2083 - "adcx (%1), %%r8 ;"
2084 - "movq 8(%2), %%r9 ;"
2085 - "adcx 8(%1), %%r9 ;"
2086 - "movq 16(%2), %%r10 ;"
2087 - "adcx 16(%1), %%r10 ;"
2088 - "movq 24(%2), %%r11 ;"
2089 - "adcx 24(%1), %%r11 ;"
2090 - "cmovc %%eax, %%ecx ;"
2091 - "xorl %%eax, %%eax ;"
2092 - "adcx %%rcx, %%r8 ;"
2093 - "adcx %%rax, %%r9 ;"
2094 - "movq %%r9, 8(%0) ;"
2095 - "adcx %%rax, %%r10 ;"
2096 - "movq %%r10, 16(%0) ;"
2097 - "adcx %%rax, %%r11 ;"
2098 - "movq %%r11, 24(%0) ;"
2099 - "mov $38, %%ecx ;"
2100 - "cmovc %%ecx, %%eax ;"
2101 - "addq %%rax, %%r8 ;"
2102 - "movq %%r8, (%0) ;"
2103 - :
2104 - : "r"(c), "r"(a), "r"(b)
2105 - : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
2106 -}
2107 + register u64 f2_r asm("rdx") = f2;
2108
2109 -static __always_inline void
2110 -add_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a, const u64 *const b)
2111 -{
2112 asm volatile(
2113 - "mov $38, %%eax ;"
2114 - "movq (%2), %%r8 ;"
2115 - "addq (%1), %%r8 ;"
2116 - "movq 8(%2), %%r9 ;"
2117 - "adcq 8(%1), %%r9 ;"
2118 - "movq 16(%2), %%r10 ;"
2119 - "adcq 16(%1), %%r10 ;"
2120 - "movq 24(%2), %%r11 ;"
2121 - "adcq 24(%1), %%r11 ;"
2122 - "mov $0, %%ecx ;"
2123 - "cmovc %%eax, %%ecx ;"
2124 - "addq %%rcx, %%r8 ;"
2125 - "adcq $0, %%r9 ;"
2126 - "movq %%r9, 8(%0) ;"
2127 - "adcq $0, %%r10 ;"
2128 - "movq %%r10, 16(%0) ;"
2129 - "adcq $0, %%r11 ;"
2130 - "movq %%r11, 24(%0) ;"
2131 - "mov $0, %%ecx ;"
2132 - "cmovc %%eax, %%ecx ;"
2133 - "addq %%rcx, %%r8 ;"
2134 - "movq %%r8, (%0) ;"
2135 - :
2136 - : "r"(c), "r"(a), "r"(b)
2137 - : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
2138 + /* Compute the raw multiplication of f1*f2 */
2139 + " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
2140 + " mulxq 8(%2), %%r9, %%r12;" /* f1[1]*f2 */
2141 + " add %%rcx, %%r9;"
2142 + " mov $0, %%rcx;"
2143 + " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
2144 + " adcx %%r12, %%r10;"
2145 + " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
2146 + " adcx %%r13, %%r11;"
2147 + " adcx %%rcx, %%rax;"
2148 +
2149 + /* Wrap the result back into the field */
2150 +
2151 + /* Step 1: Compute carry*38 */
2152 + " mov $38, %%rdx;"
2153 + " imul %%rdx, %%rax;"
2154 +
2155 + /* Step 2: Fold the carry back into dst */
2156 + " add %%rax, %%r8;"
2157 + " adcx %%rcx, %%r9;"
2158 + " movq %%r9, 8(%1);"
2159 + " adcx %%rcx, %%r10;"
2160 + " movq %%r10, 16(%1);"
2161 + " adcx %%rcx, %%r11;"
2162 + " movq %%r11, 24(%1);"
2163 +
2164 + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
2165 + " mov $0, %%rax;"
2166 + " cmovc %%rdx, %%rax;"
2167 + " add %%rax, %%r8;"
2168 + " movq %%r8, 0(%1);"
2169 + : "+&r" (f2_r)
2170 + : "r" (out), "r" (f1)
2171 + : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "memory", "cc"
2172 + );
2173 }
2174
2175 -static __always_inline void
2176 -sub_eltfp25519_1w(u64 *const c, const u64 *const a, const u64 *const b)
2177 -{
2178 - asm volatile(
2179 - "mov $38, %%eax ;"
2180 - "movq (%1), %%r8 ;"
2181 - "subq (%2), %%r8 ;"
2182 - "movq 8(%1), %%r9 ;"
2183 - "sbbq 8(%2), %%r9 ;"
2184 - "movq 16(%1), %%r10 ;"
2185 - "sbbq 16(%2), %%r10 ;"
2186 - "movq 24(%1), %%r11 ;"
2187 - "sbbq 24(%2), %%r11 ;"
2188 - "mov $0, %%ecx ;"
2189 - "cmovc %%eax, %%ecx ;"
2190 - "subq %%rcx, %%r8 ;"
2191 - "sbbq $0, %%r9 ;"
2192 - "movq %%r9, 8(%0) ;"
2193 - "sbbq $0, %%r10 ;"
2194 - "movq %%r10, 16(%0) ;"
2195 - "sbbq $0, %%r11 ;"
2196 - "movq %%r11, 24(%0) ;"
2197 - "mov $0, %%ecx ;"
2198 - "cmovc %%eax, %%ecx ;"
2199 - "subq %%rcx, %%r8 ;"
2200 - "movq %%r8, (%0) ;"
2201 - :
2202 - : "r"(c), "r"(a), "r"(b)
2203 - : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
2204 -}
2205 -
2206 -/* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666 */
2207 -static __always_inline void
2208 -mul_a24_eltfp25519_1w(u64 *const c, const u64 *const a)
2209 +/* Conditionally swaps p1 and p2 (8 words each): swapped iff bit is non-zero. Branch-free (cmov), so constant time */
2210 +static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
2211 {
2212 - const u64 a24 = 121666;
2213 asm volatile(
2214 - "movq %2, %%rdx ;"
2215 - "mulx (%1), %%r8, %%r10 ;"
2216 - "mulx 8(%1), %%r9, %%r11 ;"
2217 - "addq %%r10, %%r9 ;"
2218 - "mulx 16(%1), %%r10, %%rax ;"
2219 - "adcq %%r11, %%r10 ;"
2220 - "mulx 24(%1), %%r11, %%rcx ;"
2221 - "adcq %%rax, %%r11 ;"
2222 - /**************************/
2223 - "adcq $0, %%rcx ;"
2224 - "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/
2225 - "imul %%rdx, %%rcx ;"
2226 - "addq %%rcx, %%r8 ;"
2227 - "adcq $0, %%r9 ;"
2228 - "movq %%r9, 8(%0) ;"
2229 - "adcq $0, %%r10 ;"
2230 - "movq %%r10, 16(%0) ;"
2231 - "adcq $0, %%r11 ;"
2232 - "movq %%r11, 24(%0) ;"
2233 - "mov $0, %%ecx ;"
2234 - "cmovc %%edx, %%ecx ;"
2235 - "addq %%rcx, %%r8 ;"
2236 - "movq %%r8, (%0) ;"
2237 - :
2238 - : "r"(c), "r"(a), "r"(a24)
2239 - : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
2240 - "%r11");
2241 -}
2242 -
2243 -static void inv_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
2244 -{
2245 - struct {
2246 - eltfp25519_1w_buffer buffer;
2247 - eltfp25519_1w x0, x1, x2;
2248 - } __aligned(32) m;
2249 - u64 *T[4];
2250 -
2251 - T[0] = m.x0;
2252 - T[1] = c; /* x^(-1) */
2253 - T[2] = m.x1;
2254 - T[3] = m.x2;
2255 -
2256 - copy_eltfp25519_1w(T[1], a);
2257 - sqrn_eltfp25519_1w_adx(T[1], 1);
2258 - copy_eltfp25519_1w(T[2], T[1]);
2259 - sqrn_eltfp25519_1w_adx(T[2], 2);
2260 - mul_eltfp25519_1w_adx(T[0], a, T[2]);
2261 - mul_eltfp25519_1w_adx(T[1], T[1], T[0]);
2262 - copy_eltfp25519_1w(T[2], T[1]);
2263 - sqrn_eltfp25519_1w_adx(T[2], 1);
2264 - mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
2265 - copy_eltfp25519_1w(T[2], T[0]);
2266 - sqrn_eltfp25519_1w_adx(T[2], 5);
2267 - mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
2268 - copy_eltfp25519_1w(T[2], T[0]);
2269 - sqrn_eltfp25519_1w_adx(T[2], 10);
2270 - mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
2271 - copy_eltfp25519_1w(T[3], T[2]);
2272 - sqrn_eltfp25519_1w_adx(T[3], 20);
2273 - mul_eltfp25519_1w_adx(T[3], T[3], T[2]);
2274 - sqrn_eltfp25519_1w_adx(T[3], 10);
2275 - mul_eltfp25519_1w_adx(T[3], T[3], T[0]);
2276 - copy_eltfp25519_1w(T[0], T[3]);
2277 - sqrn_eltfp25519_1w_adx(T[0], 50);
2278 - mul_eltfp25519_1w_adx(T[0], T[0], T[3]);
2279 - copy_eltfp25519_1w(T[2], T[0]);
2280 - sqrn_eltfp25519_1w_adx(T[2], 100);
2281 - mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
2282 - sqrn_eltfp25519_1w_adx(T[2], 50);
2283 - mul_eltfp25519_1w_adx(T[2], T[2], T[3]);
2284 - sqrn_eltfp25519_1w_adx(T[2], 5);
2285 - mul_eltfp25519_1w_adx(T[1], T[1], T[2]);
2286 -
2287 - memzero_explicit(&m, sizeof(m));
2288 -}
2289 -
2290 -static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
2291 -{
2292 - struct {
2293 - eltfp25519_1w_buffer buffer;
2294 - eltfp25519_1w x0, x1, x2;
2295 - } __aligned(32) m;
2296 - u64 *T[5];
2297 -
2298 - T[0] = m.x0;
2299 - T[1] = c; /* x^(-1) */
2300 - T[2] = m.x1;
2301 - T[3] = m.x2;
2302 -
2303 - copy_eltfp25519_1w(T[1], a);
2304 - sqrn_eltfp25519_1w_bmi2(T[1], 1);
2305 - copy_eltfp25519_1w(T[2], T[1]);
2306 - sqrn_eltfp25519_1w_bmi2(T[2], 2);
2307 - mul_eltfp25519_1w_bmi2(T[0], a, T[2]);
2308 - mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]);
2309 - copy_eltfp25519_1w(T[2], T[1]);
2310 - sqrn_eltfp25519_1w_bmi2(T[2], 1);
2311 - mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
2312 - copy_eltfp25519_1w(T[2], T[0]);
2313 - sqrn_eltfp25519_1w_bmi2(T[2], 5);
2314 - mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
2315 - copy_eltfp25519_1w(T[2], T[0]);
2316 - sqrn_eltfp25519_1w_bmi2(T[2], 10);
2317 - mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
2318 - copy_eltfp25519_1w(T[3], T[2]);
2319 - sqrn_eltfp25519_1w_bmi2(T[3], 20);
2320 - mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]);
2321 - sqrn_eltfp25519_1w_bmi2(T[3], 10);
2322 - mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]);
2323 - copy_eltfp25519_1w(T[0], T[3]);
2324 - sqrn_eltfp25519_1w_bmi2(T[0], 50);
2325 - mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]);
2326 - copy_eltfp25519_1w(T[2], T[0]);
2327 - sqrn_eltfp25519_1w_bmi2(T[2], 100);
2328 - mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
2329 - sqrn_eltfp25519_1w_bmi2(T[2], 50);
2330 - mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]);
2331 - sqrn_eltfp25519_1w_bmi2(T[2], 5);
2332 - mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]);
2333 + /* Add -1 to bit so that the carry flag is set iff bit is non-zero; the cmovc instructions below test it */
2334 + " add $18446744073709551615, %0;"
2335
2336 - memzero_explicit(&m, sizeof(m));
2337 + /* cswap p1[0], p2[0] */
2338 + " movq 0(%1), %%r8;"
2339 + " movq 0(%2), %%r9;"
2340 + " mov %%r8, %%r10;"
2341 + " cmovc %%r9, %%r8;"
2342 + " cmovc %%r10, %%r9;"
2343 + " movq %%r8, 0(%1);"
2344 + " movq %%r9, 0(%2);"
2345 +
2346 + /* cswap p1[1], p2[1] */
2347 + " movq 8(%1), %%r8;"
2348 + " movq 8(%2), %%r9;"
2349 + " mov %%r8, %%r10;"
2350 + " cmovc %%r9, %%r8;"
2351 + " cmovc %%r10, %%r9;"
2352 + " movq %%r8, 8(%1);"
2353 + " movq %%r9, 8(%2);"
2354 +
2355 + /* cswap p1[2], p2[2] */
2356 + " movq 16(%1), %%r8;"
2357 + " movq 16(%2), %%r9;"
2358 + " mov %%r8, %%r10;"
2359 + " cmovc %%r9, %%r8;"
2360 + " cmovc %%r10, %%r9;"
2361 + " movq %%r8, 16(%1);"
2362 + " movq %%r9, 16(%2);"
2363 +
2364 + /* cswap p1[3], p2[3] */
2365 + " movq 24(%1), %%r8;"
2366 + " movq 24(%2), %%r9;"
2367 + " mov %%r8, %%r10;"
2368 + " cmovc %%r9, %%r8;"
2369 + " cmovc %%r10, %%r9;"
2370 + " movq %%r8, 24(%1);"
2371 + " movq %%r9, 24(%2);"
2372 +
2373 + /* cswap p1[4], p2[4] */
2374 + " movq 32(%1), %%r8;"
2375 + " movq 32(%2), %%r9;"
2376 + " mov %%r8, %%r10;"
2377 + " cmovc %%r9, %%r8;"
2378 + " cmovc %%r10, %%r9;"
2379 + " movq %%r8, 32(%1);"
2380 + " movq %%r9, 32(%2);"
2381 +
2382 + /* cswap p1[5], p2[5] */
2383 + " movq 40(%1), %%r8;"
2384 + " movq 40(%2), %%r9;"
2385 + " mov %%r8, %%r10;"
2386 + " cmovc %%r9, %%r8;"
2387 + " cmovc %%r10, %%r9;"
2388 + " movq %%r8, 40(%1);"
2389 + " movq %%r9, 40(%2);"
2390 +
2391 + /* cswap p1[6], p2[6] */
2392 + " movq 48(%1), %%r8;"
2393 + " movq 48(%2), %%r9;"
2394 + " mov %%r8, %%r10;"
2395 + " cmovc %%r9, %%r8;"
2396 + " cmovc %%r10, %%r9;"
2397 + " movq %%r8, 48(%1);"
2398 + " movq %%r9, 48(%2);"
2399 +
2400 + /* cswap p1[7], p2[7] */
2401 + " movq 56(%1), %%r8;"
2402 + " movq 56(%2), %%r9;"
2403 + " mov %%r8, %%r10;"
2404 + " cmovc %%r9, %%r8;"
2405 + " cmovc %%r10, %%r9;"
2406 + " movq %%r8, 56(%1);"
2407 + " movq %%r9, 56(%2);"
2408 + : "+&r" (bit)
2409 + : "r" (p1), "r" (p2)
2410 + : "%r8", "%r9", "%r10", "memory", "cc"
2411 + );
2412 }
2413
2414 -/* Given c, a 256-bit number, fred_eltfp25519_1w updates c
2415 - * with a number such that 0 <= C < 2**255-19.
2416 - */
2417 -static __always_inline void fred_eltfp25519_1w(u64 *const c)
2418 +/* Computes the square of a field element: out <- f * f
2419 + * Uses the 8-element buffer tmp for intermediate results */
2420 +static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
2421 {
2422 - u64 tmp0 = 38, tmp1 = 19;
2423 asm volatile(
2424 - "btrq $63, %3 ;" /* Put bit 255 in carry flag and clear */
2425 - "cmovncl %k5, %k4 ;" /* c[255] ? 38 : 19 */
2426 -
2427 - /* Add either 19 or 38 to c */
2428 - "addq %4, %0 ;"
2429 - "adcq $0, %1 ;"
2430 - "adcq $0, %2 ;"
2431 - "adcq $0, %3 ;"
2432 -
2433 - /* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */
2434 - "movl $0, %k4 ;"
2435 - "cmovnsl %k5, %k4 ;" /* c[255] ? 0 : 19 */
2436 - "btrq $63, %3 ;" /* Clear bit 255 */
2437 -
2438 - /* Subtract 19 if necessary */
2439 - "subq %4, %0 ;"
2440 - "sbbq $0, %1 ;"
2441 - "sbbq $0, %2 ;"
2442 - "sbbq $0, %3 ;"
2443 -
2444 - : "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "+r"(tmp0),
2445 - "+r"(tmp1)
2446 - :
2447 - : "memory", "cc");
2448 -}
2449 + /* Compute the raw multiplication: tmp <- f * f */
2450
2451 -static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py)
2452 -{
2453 - u64 temp;
2454 - asm volatile(
2455 - "test %9, %9 ;"
2456 - "movq %0, %8 ;"
2457 - "cmovnzq %4, %0 ;"
2458 - "cmovnzq %8, %4 ;"
2459 - "movq %1, %8 ;"
2460 - "cmovnzq %5, %1 ;"
2461 - "cmovnzq %8, %5 ;"
2462 - "movq %2, %8 ;"
2463 - "cmovnzq %6, %2 ;"
2464 - "cmovnzq %8, %6 ;"
2465 - "movq %3, %8 ;"
2466 - "cmovnzq %7, %3 ;"
2467 - "cmovnzq %8, %7 ;"
2468 - : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]),
2469 - "+r"(py[0]), "+r"(py[1]), "+r"(py[2]), "+r"(py[3]),
2470 - "=r"(temp)
2471 - : "r"(bit)
2472 - : "cc"
2473 + /* Step 1: Compute all partial products */
2474 + " movq 0(%1), %%rdx;" /* f[0] */
2475 + " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
2476 + " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
2477 + " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
2478 + " movq 24(%1), %%rdx;" /* f[3] */
2479 + " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
2480 + " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
2481 + " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f[1] */
2482 + " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
2483 +
2484 + /* Step 2: Compute two parallel carry chains */
2485 + " xor %%r15, %%r15;"
2486 + " adox %%rax, %%r10;"
2487 + " adcx %%r8, %%r8;"
2488 + " adox %%rcx, %%r11;"
2489 + " adcx %%r9, %%r9;"
2490 + " adox %%r15, %%r12;"
2491 + " adcx %%r10, %%r10;"
2492 + " adox %%r15, %%r13;"
2493 + " adcx %%r11, %%r11;"
2494 + " adox %%r15, %%r14;"
2495 + " adcx %%r12, %%r12;"
2496 + " adcx %%r13, %%r13;"
2497 + " adcx %%r14, %%r14;"
2498 +
2499 + /* Step 3: Compute intermediate squares */
2500 + " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
2501 + " movq %%rax, 0(%0);"
2502 + " add %%rcx, %%r8;" " movq %%r8, 8(%0);"
2503 + " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
2504 + " adcx %%rax, %%r9;" " movq %%r9, 16(%0);"
2505 + " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
2506 + " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
2507 + " adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
2508 + " adcx %%rcx, %%r12;" " movq %%r12, 40(%0);"
2509 + " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
2510 + " adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
2511 + " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
2512 +
2513 + /* Line up pointers */
2514 + " mov %0, %1;"
2515 + " mov %2, %0;"
2516 +
2517 + /* Wrap the result back into the field */
2518 +
2519 + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
2520 + " mov $38, %%rdx;"
2521 + " mulxq 32(%1), %%r8, %%r13;"
2522 + " xor %%rcx, %%rcx;"
2523 + " adoxq 0(%1), %%r8;"
2524 + " mulxq 40(%1), %%r9, %%r12;"
2525 + " adcx %%r13, %%r9;"
2526 + " adoxq 8(%1), %%r9;"
2527 + " mulxq 48(%1), %%r10, %%r13;"
2528 + " adcx %%r12, %%r10;"
2529 + " adoxq 16(%1), %%r10;"
2530 + " mulxq 56(%1), %%r11, %%rax;"
2531 + " adcx %%r13, %%r11;"
2532 + " adoxq 24(%1), %%r11;"
2533 + " adcx %%rcx, %%rax;"
2534 + " adox %%rcx, %%rax;"
2535 + " imul %%rdx, %%rax;"
2536 +
2537 + /* Step 2: Fold the carry back into dst */
2538 + " add %%rax, %%r8;"
2539 + " adcx %%rcx, %%r9;"
2540 + " movq %%r9, 8(%0);"
2541 + " adcx %%rcx, %%r10;"
2542 + " movq %%r10, 16(%0);"
2543 + " adcx %%rcx, %%r11;"
2544 + " movq %%r11, 24(%0);"
2545 +
2546 + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
2547 + " mov $0, %%rax;"
2548 + " cmovc %%rdx, %%rax;"
2549 + " add %%rax, %%r8;"
2550 + " movq %%r8, 0(%0);"
2551 + : "+&r" (tmp), "+&r" (f), "+&r" (out)
2552 + :
2553 + : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
2554 );
2555 }
2556
2557 -static __always_inline void cselect(u8 bit, u64 *const px, const u64 *const py)
2558 +/* Computes two field squarings; out and f each hold two 4-word elements:
2559 + * out[0] <- f[0] * f[0] (u64 words 0..3)
2560 + * out[1] <- f[1] * f[1] (u64 words 4..7)
2561 + * Uses the 16-element buffer tmp for the two unreduced 8-word squares */
2562 +static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
2563 {
2564 asm volatile(
2565 - "test %4, %4 ;"
2566 - "cmovnzq %5, %0 ;"
2567 - "cmovnzq %6, %1 ;"
2568 - "cmovnzq %7, %2 ;"
2569 - "cmovnzq %8, %3 ;"
2570 - : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3])
2571 - : "r"(bit), "rm"(py[0]), "rm"(py[1]), "rm"(py[2]), "rm"(py[3])
2572 - : "cc"
2573 + /* Step 1: Compute all partial products */
2574 + " movq 0(%1), %%rdx;" /* f[0] */
2575 + " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
2576 + " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
2577 + " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
2578 + " movq 24(%1), %%rdx;" /* f[3] */
2579 + " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
2580 + " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
2581 + " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f[1] */
2582 + " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
2583 +
2584 + /* Step 2: Compute two parallel carry chains */
2585 + " xor %%r15, %%r15;"
2586 + " adox %%rax, %%r10;"
2587 + " adcx %%r8, %%r8;"
2588 + " adox %%rcx, %%r11;"
2589 + " adcx %%r9, %%r9;"
2590 + " adox %%r15, %%r12;"
2591 + " adcx %%r10, %%r10;"
2592 + " adox %%r15, %%r13;"
2593 + " adcx %%r11, %%r11;"
2594 + " adox %%r15, %%r14;"
2595 + " adcx %%r12, %%r12;"
2596 + " adcx %%r13, %%r13;"
2597 + " adcx %%r14, %%r14;"
2598 +
2599 + /* Step 3: Compute intermediate squares */
2600 + " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
2601 + " movq %%rax, 0(%0);"
2602 + " add %%rcx, %%r8;" " movq %%r8, 8(%0);"
2603 + " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
2604 + " adcx %%rax, %%r9;" " movq %%r9, 16(%0);"
2605 + " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
2606 + " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
2607 + " adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
2608 + " adcx %%rcx, %%r12;" " movq %%r12, 40(%0);"
2609 + " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
2610 + " adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
2611 + " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
2612 +
2613 + /* Second element, step 1: Compute all partial products (f[i] below is word i + 4 of f) */
2614 + " movq 32(%1), %%rdx;" /* f[0] */
2615 + " mulxq 40(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
2616 + " mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
2617 + " mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
2618 + " movq 56(%1), %%rdx;" /* f[3] */
2619 + " mulxq 40(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
2620 + " mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
2621 + " movq 40(%1), %%rdx;" " adcx %%r15, %%r13;" /* f[1] */
2622 + " mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
2623 +
2624 + /* Step 2: Compute two parallel carry chains */
2625 + " xor %%r15, %%r15;"
2626 + " adox %%rax, %%r10;"
2627 + " adcx %%r8, %%r8;"
2628 + " adox %%rcx, %%r11;"
2629 + " adcx %%r9, %%r9;"
2630 + " adox %%r15, %%r12;"
2631 + " adcx %%r10, %%r10;"
2632 + " adox %%r15, %%r13;"
2633 + " adcx %%r11, %%r11;"
2634 + " adox %%r15, %%r14;"
2635 + " adcx %%r12, %%r12;"
2636 + " adcx %%r13, %%r13;"
2637 + " adcx %%r14, %%r14;"
2638 +
2639 + /* Step 3: Compute intermediate squares */
2640 + " movq 32(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
2641 + " movq %%rax, 64(%0);"
2642 + " add %%rcx, %%r8;" " movq %%r8, 72(%0);"
2643 + " movq 40(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
2644 + " adcx %%rax, %%r9;" " movq %%r9, 80(%0);"
2645 + " adcx %%rcx, %%r10;" " movq %%r10, 88(%0);"
2646 + " movq 48(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
2647 + " adcx %%rax, %%r11;" " movq %%r11, 96(%0);"
2648 + " adcx %%rcx, %%r12;" " movq %%r12, 104(%0);"
2649 + " movq 56(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
2650 + " adcx %%rax, %%r13;" " movq %%r13, 112(%0);"
2651 + " adcx %%rcx, %%r14;" " movq %%r14, 120(%0);"
2652 +
2653 + /* Line up pointers */
2654 + " mov %0, %1;"
2655 + " mov %2, %0;"
2656 +
2657 + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
2658 + " mov $38, %%rdx;"
2659 + " mulxq 32(%1), %%r8, %%r13;"
2660 + " xor %%rcx, %%rcx;"
2661 + " adoxq 0(%1), %%r8;"
2662 + " mulxq 40(%1), %%r9, %%r12;"
2663 + " adcx %%r13, %%r9;"
2664 + " adoxq 8(%1), %%r9;"
2665 + " mulxq 48(%1), %%r10, %%r13;"
2666 + " adcx %%r12, %%r10;"
2667 + " adoxq 16(%1), %%r10;"
2668 + " mulxq 56(%1), %%r11, %%rax;"
2669 + " adcx %%r13, %%r11;"
2670 + " adoxq 24(%1), %%r11;"
2671 + " adcx %%rcx, %%rax;"
2672 + " adox %%rcx, %%rax;"
2673 + " imul %%rdx, %%rax;"
2674 +
2675 + /* Step 2: Fold the carry back into dst */
2676 + " add %%rax, %%r8;"
2677 + " adcx %%rcx, %%r9;"
2678 + " movq %%r9, 8(%0);"
2679 + " adcx %%rcx, %%r10;"
2680 + " movq %%r10, 16(%0);"
2681 + " adcx %%rcx, %%r11;"
2682 + " movq %%r11, 24(%0);"
2683 +
2684 + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
2685 + " mov $0, %%rax;"
2686 + " cmovc %%rdx, %%rax;"
2687 + " add %%rax, %%r8;"
2688 + " movq %%r8, 0(%0);"
2689 +
2690 + /* Second element, step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
2691 + " mov $38, %%rdx;"
2692 + " mulxq 96(%1), %%r8, %%r13;"
2693 + " xor %%rcx, %%rcx;"
2694 + " adoxq 64(%1), %%r8;"
2695 + " mulxq 104(%1), %%r9, %%r12;"
2696 + " adcx %%r13, %%r9;"
2697 + " adoxq 72(%1), %%r9;"
2698 + " mulxq 112(%1), %%r10, %%r13;"
2699 + " adcx %%r12, %%r10;"
2700 + " adoxq 80(%1), %%r10;"
2701 + " mulxq 120(%1), %%r11, %%rax;"
2702 + " adcx %%r13, %%r11;"
2703 + " adoxq 88(%1), %%r11;"
2704 + " adcx %%rcx, %%rax;"
2705 + " adox %%rcx, %%rax;"
2706 + " imul %%rdx, %%rax;"
2707 +
2708 + /* Step 2: Fold the carry back into dst */
2709 + " add %%rax, %%r8;"
2710 + " adcx %%rcx, %%r9;"
2711 + " movq %%r9, 40(%0);"
2712 + " adcx %%rcx, %%r10;"
2713 + " movq %%r10, 48(%0);"
2714 + " adcx %%rcx, %%r11;"
2715 + " movq %%r11, 56(%0);"
2716 +
2717 + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
2718 + " mov $0, %%rax;"
2719 + " cmovc %%rdx, %%rax;"
2720 + " add %%rax, %%r8;"
2721 + " movq %%r8, 32(%0);"
2722 + : "+&r" (tmp), "+&r" (f), "+&r" (out)
2723 + :
2724 + : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
2725 );
2726 }
2727
2728 -static void curve25519_adx(u8 shared[CURVE25519_KEY_SIZE],
2729 - const u8 private_key[CURVE25519_KEY_SIZE],
2730 - const u8 session_key[CURVE25519_KEY_SIZE])
2731 -{
2732 - struct {
2733 - u64 buffer[4 * NUM_WORDS_ELTFP25519];
2734 - u64 coordinates[4 * NUM_WORDS_ELTFP25519];
2735 - u64 workspace[6 * NUM_WORDS_ELTFP25519];
2736 - u8 session[CURVE25519_KEY_SIZE];
2737 - u8 private[CURVE25519_KEY_SIZE];
2738 - } __aligned(32) m;
2739 -
2740 - int i = 0, j = 0;
2741 - u64 prev = 0;
2742 - u64 *const X1 = (u64 *)m.session;
2743 - u64 *const key = (u64 *)m.private;
2744 - u64 *const Px = m.coordinates + 0;
2745 - u64 *const Pz = m.coordinates + 4;
2746 - u64 *const Qx = m.coordinates + 8;
2747 - u64 *const Qz = m.coordinates + 12;
2748 - u64 *const X2 = Qx;
2749 - u64 *const Z2 = Qz;
2750 - u64 *const X3 = Px;
2751 - u64 *const Z3 = Pz;
2752 - u64 *const X2Z2 = Qx;
2753 - u64 *const X3Z3 = Px;
2754 -
2755 - u64 *const A = m.workspace + 0;
2756 - u64 *const B = m.workspace + 4;
2757 - u64 *const D = m.workspace + 8;
2758 - u64 *const C = m.workspace + 12;
2759 - u64 *const DA = m.workspace + 16;
2760 - u64 *const CB = m.workspace + 20;
2761 - u64 *const AB = A;
2762 - u64 *const DC = D;
2763 - u64 *const DACB = DA;
2764 -
2765 - memcpy(m.private, private_key, sizeof(m.private));
2766 - memcpy(m.session, session_key, sizeof(m.session));
2767 -
2768 - curve25519_clamp_secret(m.private);
2769 -
2770 - /* As in the draft:
2771 - * When receiving such an array, implementations of curve25519
2772 - * MUST mask the most-significant bit in the final byte. This
2773 - * is done to preserve compatibility with point formats which
2774 - * reserve the sign bit for use in other protocols and to
2775 - * increase resistance to implementation fingerprinting
2776 - */
2777 - m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
2778 -
2779 - copy_eltfp25519_1w(Px, X1);
2780 - setzero_eltfp25519_1w(Pz);
2781 - setzero_eltfp25519_1w(Qx);
2782 - setzero_eltfp25519_1w(Qz);
2783 -
2784 - Pz[0] = 1;
2785 - Qx[0] = 1;
2786 -
2787 - /* main-loop */
2788 - prev = 0;
2789 - j = 62;
2790 - for (i = 3; i >= 0; --i) {
2791 - while (j >= 0) {
2792 - u64 bit = (key[i] >> j) & 0x1;
2793 - u64 swap = bit ^ prev;
2794 - prev = bit;
2795 -
2796 - add_eltfp25519_1w_adx(A, X2, Z2); /* A = (X2+Z2) */
2797 - sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */
2798 - add_eltfp25519_1w_adx(C, X3, Z3); /* C = (X3+Z3) */
2799 - sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */
2800 - mul_eltfp25519_2w_adx(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
2801 -
2802 - cselect(swap, A, C);
2803 - cselect(swap, B, D);
2804 -
2805 - sqr_eltfp25519_2w_adx(AB); /* [AA|BB] = [A^2|B^2] */
2806 - add_eltfp25519_1w_adx(X3, DA, CB); /* X3 = (DA+CB) */
2807 - sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */
2808 - sqr_eltfp25519_2w_adx(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
2809 -
2810 - copy_eltfp25519_1w(X2, B); /* X2 = B^2 */
2811 - sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */
2812 -
2813 - mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */
2814 - add_eltfp25519_1w_adx(B, B, X2); /* B = a24*E+B */
2815 - mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
2816 - mul_eltfp25519_1w_adx(Z3, Z3, X1); /* Z3 = Z3*X1 */
2817 - --j;
2818 - }
2819 - j = 63;
2820 - }
2821 -
2822 - inv_eltfp25519_1w_adx(A, Qz);
2823 - mul_eltfp25519_1w_adx((u64 *)shared, Qx, A);
2824 - fred_eltfp25519_1w((u64 *)shared);
2825 -
2826 - memzero_explicit(&m, sizeof(m));
2827 -}
2828 -
2829 -static void curve25519_adx_base(u8 session_key[CURVE25519_KEY_SIZE],
2830 - const u8 private_key[CURVE25519_KEY_SIZE])
2831 +static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
2832 {
2833 - struct {
2834 - u64 buffer[4 * NUM_WORDS_ELTFP25519];
2835 - u64 coordinates[4 * NUM_WORDS_ELTFP25519];
2836 - u64 workspace[4 * NUM_WORDS_ELTFP25519];
2837 - u8 private[CURVE25519_KEY_SIZE];
2838 - } __aligned(32) m;
2839 -
2840 - const int ite[4] = { 64, 64, 64, 63 };
2841 - const int q = 3;
2842 - u64 swap = 1;
2843 -
2844 - int i = 0, j = 0, k = 0;
2845 - u64 *const key = (u64 *)m.private;
2846 - u64 *const Ur1 = m.coordinates + 0;
2847 - u64 *const Zr1 = m.coordinates + 4;
2848 - u64 *const Ur2 = m.coordinates + 8;
2849 - u64 *const Zr2 = m.coordinates + 12;
2850 -
2851 - u64 *const UZr1 = m.coordinates + 0;
2852 - u64 *const ZUr2 = m.coordinates + 8;
2853 -
2854 - u64 *const A = m.workspace + 0;
2855 - u64 *const B = m.workspace + 4;
2856 - u64 *const C = m.workspace + 8;
2857 - u64 *const D = m.workspace + 12;
2858 -
2859 - u64 *const AB = m.workspace + 0;
2860 - u64 *const CD = m.workspace + 8;
2861 -
2862 - const u64 *const P = table_ladder_8k;
2863 -
2864 - memcpy(m.private, private_key, sizeof(m.private));
2865 -
2866 - curve25519_clamp_secret(m.private);
2867 -
2868 - setzero_eltfp25519_1w(Ur1);
2869 - setzero_eltfp25519_1w(Zr1);
2870 - setzero_eltfp25519_1w(Zr2);
2871 - Ur1[0] = 1;
2872 - Zr1[0] = 1;
2873 - Zr2[0] = 1;
2874 -
2875 - /* G-S */
2876 - Ur2[3] = 0x1eaecdeee27cab34UL;
2877 - Ur2[2] = 0xadc7a0b9235d48e2UL;
2878 - Ur2[1] = 0xbbf095ae14b2edf8UL;
2879 - Ur2[0] = 0x7e94e1fec82faabdUL;
2880 -
2881 - /* main-loop */
2882 - j = q;
2883 - for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
2884 - while (j < ite[i]) {
2885 - u64 bit = (key[i] >> j) & 0x1;
2886 - k = (64 * i + j - q);
2887 - swap = swap ^ bit;
2888 - cswap(swap, Ur1, Ur2);
2889 - cswap(swap, Zr1, Zr2);
2890 - swap = bit;
2891 - /* Addition */
2892 - sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
2893 - add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */
2894 - mul_eltfp25519_1w_adx(C, &P[4 * k], B); /* C = M0-B */
2895 - sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
2896 - add_eltfp25519_1w_adx(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
2897 - sqr_eltfp25519_2w_adx(AB); /* A = A^2 | B = B^2 */
2898 - mul_eltfp25519_2w_adx(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */
2899 - ++j;
2900 + u64 *nq = p01_tmp1;
2901 + u64 *nq_p1 = p01_tmp1 + (u32)8U;
2902 + u64 *tmp1 = p01_tmp1 + (u32)16U;
2903 + u64 *x1 = q;
2904 + u64 *x2 = nq;
2905 + u64 *z2 = nq + (u32)4U;
2906 + u64 *z3 = nq_p1 + (u32)4U;
2907 + u64 *a = tmp1;
2908 + u64 *b = tmp1 + (u32)4U;
2909 + u64 *ab = tmp1;
2910 + u64 *dc = tmp1 + (u32)8U;
2911 + u64 *x3;
2912 + u64 *z31;
2913 + u64 *d0;
2914 + u64 *c0;
2915 + u64 *a1;
2916 + u64 *b1;
2917 + u64 *d;
2918 + u64 *c;
2919 + u64 *ab1;
2920 + u64 *dc1;
2921 + fadd(a, x2, z2);
2922 + fsub(b, x2, z2);
2923 + x3 = nq_p1;
2924 + z31 = nq_p1 + (u32)4U;
2925 + d0 = dc;
2926 + c0 = dc + (u32)4U;
2927 + fadd(c0, x3, z31);
2928 + fsub(d0, x3, z31);
2929 + fmul2(dc, dc, ab, tmp2);
2930 + fadd(x3, d0, c0);
2931 + fsub(z31, d0, c0);
2932 + a1 = tmp1;
2933 + b1 = tmp1 + (u32)4U;
2934 + d = tmp1 + (u32)8U;
2935 + c = tmp1 + (u32)12U;
2936 + ab1 = tmp1;
2937 + dc1 = tmp1 + (u32)8U;
2938 + fsqr2(dc1, ab1, tmp2);
2939 + fsqr2(nq_p1, nq_p1, tmp2);
2940 + a1[0U] = c[0U];
2941 + a1[1U] = c[1U];
2942 + a1[2U] = c[2U];
2943 + a1[3U] = c[3U];
2944 + fsub(c, d, c);
2945 + fmul_scalar(b1, c, (u64)121665U);
2946 + fadd(b1, b1, d);
2947 + fmul2(nq, dc1, ab1, tmp2);
2948 + fmul(z3, z3, x1, tmp2);
2949 +}
2950 +
2951 +static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2)
2952 +{
2953 + u64 *x2 = nq;
2954 + u64 *z2 = nq + (u32)4U;
2955 + u64 *a = tmp1;
2956 + u64 *b = tmp1 + (u32)4U;
2957 + u64 *d = tmp1 + (u32)8U;
2958 + u64 *c = tmp1 + (u32)12U;
2959 + u64 *ab = tmp1;
2960 + u64 *dc = tmp1 + (u32)8U;
2961 + fadd(a, x2, z2);
2962 + fsub(b, x2, z2);
2963 + fsqr2(dc, ab, tmp2);
2964 + a[0U] = c[0U];
2965 + a[1U] = c[1U];
2966 + a[2U] = c[2U];
2967 + a[3U] = c[3U];
2968 + fsub(c, d, c);
2969 + fmul_scalar(b, c, (u64)121665U);
2970 + fadd(b, b, d);
2971 + fmul2(nq, dc, ab, tmp2);
2972 +}
2973 +
2974 +static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
2975 +{
2976 + u64 tmp2[16U] = { 0U };
2977 + u64 p01_tmp1_swap[33U] = { 0U };
2978 + u64 *p0 = p01_tmp1_swap;
2979 + u64 *p01 = p01_tmp1_swap;
2980 + u64 *p03 = p01;
2981 + u64 *p11 = p01 + (u32)8U;
2982 + u64 *x0;
2983 + u64 *z0;
2984 + u64 *p01_tmp1;
2985 + u64 *p01_tmp11;
2986 + u64 *nq10;
2987 + u64 *nq_p11;
2988 + u64 *swap1;
2989 + u64 sw0;
2990 + u64 *nq1;
2991 + u64 *tmp1;
2992 + memcpy(p11, init1, (u32)8U * sizeof(init1[0U]));
2993 + x0 = p03;
2994 + z0 = p03 + (u32)4U;
2995 + x0[0U] = (u64)1U;
2996 + x0[1U] = (u64)0U;
2997 + x0[2U] = (u64)0U;
2998 + x0[3U] = (u64)0U;
2999 + z0[0U] = (u64)0U;
3000 + z0[1U] = (u64)0U;
3001 + z0[2U] = (u64)0U;
3002 + z0[3U] = (u64)0U;
3003 + p01_tmp1 = p01_tmp1_swap;
3004 + p01_tmp11 = p01_tmp1_swap;
3005 + nq10 = p01_tmp1_swap;
3006 + nq_p11 = p01_tmp1_swap + (u32)8U;
3007 + swap1 = p01_tmp1_swap + (u32)32U;
3008 + cswap2((u64)1U, nq10, nq_p11);
3009 + point_add_and_double(init1, p01_tmp11, tmp2);
3010 + swap1[0U] = (u64)1U;
3011 + {
3012 + u32 i;
3013 + for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) {
3014 + u64 *p01_tmp12 = p01_tmp1_swap;
3015 + u64 *swap2 = p01_tmp1_swap + (u32)32U;
3016 + u64 *nq2 = p01_tmp12;
3017 + u64 *nq_p12 = p01_tmp12 + (u32)8U;
3018 + u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U);
3019 + u64 sw = swap2[0U] ^ bit;
3020 + cswap2(sw, nq2, nq_p12);
3021 + point_add_and_double(init1, p01_tmp12, tmp2);
3022 + swap2[0U] = bit;
3023 }
3024 - j = 0;
3025 }
3026 -
3027 - /* Doubling */
3028 - for (i = 0; i < q; ++i) {
3029 - add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */
3030 - sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
3031 - sqr_eltfp25519_2w_adx(AB); /* A = A**2 B = B**2 */
3032 - copy_eltfp25519_1w(C, B); /* C = B */
3033 - sub_eltfp25519_1w(B, A, B); /* B = A-B */
3034 - mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */
3035 - add_eltfp25519_1w_adx(D, D, C); /* D = D+C */
3036 - mul_eltfp25519_2w_adx(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
3037 - }
3038 -
3039 - /* Convert to affine coordinates */
3040 - inv_eltfp25519_1w_adx(A, Zr1);
3041 - mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A);
3042 - fred_eltfp25519_1w((u64 *)session_key);
3043 -
3044 - memzero_explicit(&m, sizeof(m));
3045 -}
3046 -
3047 -static void curve25519_bmi2(u8 shared[CURVE25519_KEY_SIZE],
3048 - const u8 private_key[CURVE25519_KEY_SIZE],
3049 - const u8 session_key[CURVE25519_KEY_SIZE])
3050 -{
3051 - struct {
3052 - u64 buffer[4 * NUM_WORDS_ELTFP25519];
3053 - u64 coordinates[4 * NUM_WORDS_ELTFP25519];
3054 - u64 workspace[6 * NUM_WORDS_ELTFP25519];
3055 - u8 session[CURVE25519_KEY_SIZE];
3056 - u8 private[CURVE25519_KEY_SIZE];
3057 - } __aligned(32) m;
3058 -
3059 - int i = 0, j = 0;
3060 - u64 prev = 0;
3061 - u64 *const X1 = (u64 *)m.session;
3062 - u64 *const key = (u64 *)m.private;
3063 - u64 *const Px = m.coordinates + 0;
3064 - u64 *const Pz = m.coordinates + 4;
3065 - u64 *const Qx = m.coordinates + 8;
3066 - u64 *const Qz = m.coordinates + 12;
3067 - u64 *const X2 = Qx;
3068 - u64 *const Z2 = Qz;
3069 - u64 *const X3 = Px;
3070 - u64 *const Z3 = Pz;
3071 - u64 *const X2Z2 = Qx;
3072 - u64 *const X3Z3 = Px;
3073 -
3074 - u64 *const A = m.workspace + 0;
3075 - u64 *const B = m.workspace + 4;
3076 - u64 *const D = m.workspace + 8;
3077 - u64 *const C = m.workspace + 12;
3078 - u64 *const DA = m.workspace + 16;
3079 - u64 *const CB = m.workspace + 20;
3080 - u64 *const AB = A;
3081 - u64 *const DC = D;
3082 - u64 *const DACB = DA;
3083 -
3084 - memcpy(m.private, private_key, sizeof(m.private));
3085 - memcpy(m.session, session_key, sizeof(m.session));
3086 -
3087 - curve25519_clamp_secret(m.private);
3088 -
3089 - /* As in the draft:
3090 - * When receiving such an array, implementations of curve25519
3091 - * MUST mask the most-significant bit in the final byte. This
3092 - * is done to preserve compatibility with point formats which
3093 - * reserve the sign bit for use in other protocols and to
3094 - * increase resistance to implementation fingerprinting
3095 - */
3096 - m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
3097 -
3098 - copy_eltfp25519_1w(Px, X1);
3099 - setzero_eltfp25519_1w(Pz);
3100 - setzero_eltfp25519_1w(Qx);
3101 - setzero_eltfp25519_1w(Qz);
3102 -
3103 - Pz[0] = 1;
3104 - Qx[0] = 1;
3105 -
3106 - /* main-loop */
3107 - prev = 0;
3108 - j = 62;
3109 - for (i = 3; i >= 0; --i) {
3110 - while (j >= 0) {
3111 - u64 bit = (key[i] >> j) & 0x1;
3112 - u64 swap = bit ^ prev;
3113 - prev = bit;
3114 -
3115 - add_eltfp25519_1w_bmi2(A, X2, Z2); /* A = (X2+Z2) */
3116 - sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */
3117 - add_eltfp25519_1w_bmi2(C, X3, Z3); /* C = (X3+Z3) */
3118 - sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */
3119 - mul_eltfp25519_2w_bmi2(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
3120 -
3121 - cselect(swap, A, C);
3122 - cselect(swap, B, D);
3123 -
3124 - sqr_eltfp25519_2w_bmi2(AB); /* [AA|BB] = [A^2|B^2] */
3125 - add_eltfp25519_1w_bmi2(X3, DA, CB); /* X3 = (DA+CB) */
3126 - sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */
3127 - sqr_eltfp25519_2w_bmi2(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
3128 -
3129 - copy_eltfp25519_1w(X2, B); /* X2 = B^2 */
3130 - sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */
3131 -
3132 - mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */
3133 - add_eltfp25519_1w_bmi2(B, B, X2); /* B = a24*E+B */
3134 - mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
3135 - mul_eltfp25519_1w_bmi2(Z3, Z3, X1); /* Z3 = Z3*X1 */
3136 - --j;
3137 + sw0 = swap1[0U];
3138 + cswap2(sw0, nq10, nq_p11);
3139 + nq1 = p01_tmp1;
3140 + tmp1 = p01_tmp1 + (u32)16U;
3141 + point_double(nq1, tmp1, tmp2);
3142 + point_double(nq1, tmp1, tmp2);
3143 + point_double(nq1, tmp1, tmp2);
3144 + memcpy(out, p0, (u32)8U * sizeof(p0[0U]));
3145 +
3146 + memzero_explicit(tmp2, sizeof(tmp2));
3147 + memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
3148 +}
3149 +
3150 +static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
3151 +{
3152 + u32 i;
3153 + fsqr(o, inp, tmp);
3154 + for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U)
3155 + fsqr(o, o, tmp);
3156 +}
3157 +
3158 +static void finv(u64 *o, const u64 *i, u64 *tmp)
3159 +{
3160 + u64 t1[16U] = { 0U };
3161 + u64 *a0 = t1;
3162 + u64 *b = t1 + (u32)4U;
3163 + u64 *c = t1 + (u32)8U;
3164 + u64 *t00 = t1 + (u32)12U;
3165 + u64 *tmp1 = tmp;
3166 + u64 *a;
3167 + u64 *t0;
3168 + fsquare_times(a0, i, tmp1, (u32)1U);
3169 + fsquare_times(t00, a0, tmp1, (u32)2U);
3170 + fmul(b, t00, i, tmp);
3171 + fmul(a0, b, a0, tmp);
3172 + fsquare_times(t00, a0, tmp1, (u32)1U);
3173 + fmul(b, t00, b, tmp);
3174 + fsquare_times(t00, b, tmp1, (u32)5U);
3175 + fmul(b, t00, b, tmp);
3176 + fsquare_times(t00, b, tmp1, (u32)10U);
3177 + fmul(c, t00, b, tmp);
3178 + fsquare_times(t00, c, tmp1, (u32)20U);
3179 + fmul(t00, t00, c, tmp);
3180 + fsquare_times(t00, t00, tmp1, (u32)10U);
3181 + fmul(b, t00, b, tmp);
3182 + fsquare_times(t00, b, tmp1, (u32)50U);
3183 + fmul(c, t00, b, tmp);
3184 + fsquare_times(t00, c, tmp1, (u32)100U);
3185 + fmul(t00, t00, c, tmp);
3186 + fsquare_times(t00, t00, tmp1, (u32)50U);
3187 + fmul(t00, t00, b, tmp);
3188 + fsquare_times(t00, t00, tmp1, (u32)5U);
3189 + a = t1;
3190 + t0 = t1 + (u32)12U;
3191 + fmul(o, t0, a, tmp);
3192 +}
3193 +
3194 +static void store_felem(u64 *b, u64 *f)
3195 +{
3196 + u64 f30 = f[3U];
3197 + u64 top_bit0 = f30 >> (u32)63U;
3198 + u64 carry0;
3199 + u64 f31;
3200 + u64 top_bit;
3201 + u64 carry;
3202 + u64 f0;
3203 + u64 f1;
3204 + u64 f2;
3205 + u64 f3;
3206 + u64 m0;
3207 + u64 m1;
3208 + u64 m2;
3209 + u64 m3;
3210 + u64 mask;
3211 + u64 f0_;
3212 + u64 f1_;
3213 + u64 f2_;
3214 + u64 f3_;
3215 + u64 o0;
3216 + u64 o1;
3217 + u64 o2;
3218 + u64 o3;
3219 + f[3U] = f30 & (u64)0x7fffffffffffffffU;
3220 + carry0 = add_scalar(f, f, (u64)19U * top_bit0);
3221 + f31 = f[3U];
3222 + top_bit = f31 >> (u32)63U;
3223 + f[3U] = f31 & (u64)0x7fffffffffffffffU;
3224 + carry = add_scalar(f, f, (u64)19U * top_bit);
3225 + f0 = f[0U];
3226 + f1 = f[1U];
3227 + f2 = f[2U];
3228 + f3 = f[3U];
3229 + m0 = gte_mask(f0, (u64)0xffffffffffffffedU);
3230 + m1 = eq_mask(f1, (u64)0xffffffffffffffffU);
3231 + m2 = eq_mask(f2, (u64)0xffffffffffffffffU);
3232 + m3 = eq_mask(f3, (u64)0x7fffffffffffffffU);
3233 + mask = ((m0 & m1) & m2) & m3;
3234 + f0_ = f0 - (mask & (u64)0xffffffffffffffedU);
3235 + f1_ = f1 - (mask & (u64)0xffffffffffffffffU);
3236 + f2_ = f2 - (mask & (u64)0xffffffffffffffffU);
3237 + f3_ = f3 - (mask & (u64)0x7fffffffffffffffU);
3238 + o0 = f0_;
3239 + o1 = f1_;
3240 + o2 = f2_;
3241 + o3 = f3_;
3242 + b[0U] = o0;
3243 + b[1U] = o1;
3244 + b[2U] = o2;
3245 + b[3U] = o3;
3246 +}
3247 +
3248 +static void encode_point(u8 *o, const u64 *i)
3249 +{
3250 + const u64 *x = i;
3251 + const u64 *z = i + (u32)4U;
3252 + u64 tmp[4U] = { 0U };
3253 + u64 tmp_w[16U] = { 0U };
3254 + finv(tmp, z, tmp_w);
3255 + fmul(tmp, tmp, x, tmp_w);
3256 + store_felem((u64 *)o, tmp);
3257 +}
3258 +
3259 +static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub)
3260 +{
3261 + u64 init1[8U] = { 0U };
3262 + u64 tmp[4U] = { 0U };
3263 + u64 tmp3;
3264 + u64 *x;
3265 + u64 *z;
3266 + {
3267 + u32 i;
3268 + for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) {
3269 + u64 *os = tmp;
3270 + const u8 *bj = pub + i * (u32)8U;
3271 + u64 u = *(u64 *)bj;
3272 + u64 r = u;
3273 + u64 x0 = r;
3274 + os[i] = x0;
3275 }
3276 - j = 63;
3277 }
3278 + tmp3 = tmp[3U];
3279 + tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU;
3280 + x = init1;
3281 + z = init1 + (u32)4U;
3282 + z[0U] = (u64)1U;
3283 + z[1U] = (u64)0U;
3284 + z[2U] = (u64)0U;
3285 + z[3U] = (u64)0U;
3286 + x[0U] = tmp[0U];
3287 + x[1U] = tmp[1U];
3288 + x[2U] = tmp[2U];
3289 + x[3U] = tmp[3U];
3290 + montgomery_ladder(init1, priv, init1);
3291 + encode_point(out, init1);
3292 +}
3293 +
3294 +/* The below constants were generated using this sage script:
3295 + *
3296 + * #!/usr/bin/env sage
3297 + * import sys
3298 + * from sage.all import *
3299 + * def limbs(n):
3300 + * n = int(n)
3301 + * l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64)
3302 + * return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l
3303 + * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0])
3304 + * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0]
3305 + * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s))
3306 + * print("static const u64 table_ladder[] = {")
3307 + * p = ec.lift_x(9)
3308 + * for i in range(252):
3309 + * l = (p[0] + p[2]) / (p[0] - p[2])
3310 + * print(("\t%s" + ("," if i != 251 else "")) % limbs(l))
3311 + * p = p * 2
3312 + * print("};")
3313 + *
3314 + */
3315
3316 - inv_eltfp25519_1w_bmi2(A, Qz);
3317 - mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A);
3318 - fred_eltfp25519_1w((u64 *)shared);
3319 +static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL };
3320
3321 - memzero_explicit(&m, sizeof(m));
3322 -}
3323 +static const u64 table_ladder[] = {
3324 + 0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL,
3325 + 0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL,
3326 + 0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL,
3327 + 0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL,
3328 + 0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL,
3329 + 0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL,
3330 + 0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL,
3331 + 0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL,
3332 + 0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL,
3333 + 0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL,
3334 + 0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL,
3335 + 0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL,
3336 + 0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL,
3337 + 0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL,
3338 + 0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL,
3339 + 0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL,
3340 + 0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL,
3341 + 0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL,
3342 + 0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL,
3343 + 0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL,
3344 + 0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL,
3345 + 0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL,
3346 + 0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL,
3347 + 0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL,
3348 + 0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL,
3349 + 0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL,
3350 + 0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL,
3351 + 0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL,
3352 + 0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL,
3353 + 0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL,
3354 + 0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL,
3355 + 0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL,
3356 + 0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL,
3357 + 0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL,
3358 + 0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL,
3359 + 0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL,
3360 + 0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL,
3361 + 0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL,
3362 + 0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL,
3363 + 0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL,
3364 + 0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL,
3365 + 0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL,
3366 + 0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL,
3367 + 0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL,
3368 + 0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL,
3369 + 0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL,
3370 + 0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL,
3371 + 0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL,
3372 + 0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL,
3373 + 0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL,
3374 + 0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL,
3375 + 0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL,
3376 + 0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL,
3377 + 0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL,
3378 + 0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL,
3379 + 0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL,
3380 + 0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL,
3381 + 0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL,
3382 + 0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL,
3383 + 0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL,
3384 + 0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL,
3385 + 0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL,
3386 + 0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL,
3387 + 0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL,
3388 + 0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL,
3389 + 0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL,
3390 + 0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL,
3391 + 0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL,
3392 + 0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL,
3393 + 0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL,
3394 + 0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL,
3395 + 0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL,
3396 + 0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL,
3397 + 0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL,
3398 + 0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL,
3399 + 0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL,
3400 + 0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL,
3401 + 0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL,
3402 + 0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL,
3403 + 0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL,
3404 + 0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL,
3405 + 0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL,
3406 + 0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL,
3407 + 0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL,
3408 + 0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL,
3409 + 0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL,
3410 + 0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL,
3411 + 0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL,
3412 + 0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL,
3413 + 0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL,
3414 + 0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL,
3415 + 0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL,
3416 + 0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL,
3417 + 0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL,
3418 + 0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL,
3419 + 0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL,
3420 + 0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL,
3421 + 0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL,
3422 + 0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL,
3423 + 0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL,
3424 + 0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL,
3425 + 0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL,
3426 + 0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL,
3427 + 0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL,
3428 + 0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL,
3429 + 0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL,
3430 + 0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL,
3431 + 0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL,
3432 + 0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL,
3433 + 0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL,
3434 + 0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL,
3435 + 0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL,
3436 + 0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL,
3437 + 0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL,
3438 + 0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL,
3439 + 0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL,
3440 + 0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL,
3441 + 0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL,
3442 + 0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL,
3443 + 0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL,
3444 + 0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL,
3445 + 0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL,
3446 + 0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL,
3447 + 0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL,
3448 + 0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL,
3449 + 0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL,
3450 + 0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL,
3451 + 0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL,
3452 + 0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL,
3453 + 0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL,
3454 + 0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL,
3455 + 0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL,
3456 + 0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL,
3457 + 0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL,
3458 + 0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL,
3459 + 0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL,
3460 + 0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL,
3461 + 0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL,
3462 + 0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL,
3463 + 0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL,
3464 + 0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL,
3465 + 0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL,
3466 + 0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL,
3467 + 0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL,
3468 + 0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL,
3469 + 0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL,
3470 + 0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL,
3471 + 0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL,
3472 + 0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL,
3473 + 0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL,
3474 + 0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL,
3475 + 0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL,
3476 + 0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL,
3477 + 0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL,
3478 + 0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL,
3479 + 0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL,
3480 + 0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL,
3481 + 0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL,
3482 + 0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL,
3483 + 0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL,
3484 + 0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL,
3485 + 0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL,
3486 + 0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL,
3487 + 0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL,
3488 + 0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL,
3489 + 0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL,
3490 + 0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL,
3491 + 0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL,
3492 + 0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL,
3493 + 0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL,
3494 + 0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL,
3495 + 0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL,
3496 + 0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL,
3497 + 0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL,
3498 + 0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL,
3499 + 0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL,
3500 + 0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL,
3501 + 0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL,
3502 + 0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL,
3503 + 0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL,
3504 + 0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL,
3505 + 0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL,
3506 + 0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL,
3507 + 0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL,
3508 + 0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL,
3509 + 0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL,
3510 + 0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL,
3511 + 0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL,
3512 + 0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL,
3513 + 0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL,
3514 + 0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL,
3515 + 0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL,
3516 + 0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL,
3517 + 0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL,
3518 + 0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL,
3519 + 0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL,
3520 + 0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL,
3521 + 0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL,
3522 + 0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL,
3523 + 0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL,
3524 + 0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL,
3525 + 0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL,
3526 + 0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL,
3527 + 0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL,
3528 + 0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL,
3529 + 0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL,
3530 + 0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL,
3531 + 0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL,
3532 + 0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL,
3533 + 0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL,
3534 + 0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL,
3535 + 0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL,
3536 + 0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL,
3537 + 0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL,
3538 + 0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL,
3539 + 0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL,
3540 + 0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL,
3541 + 0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL,
3542 + 0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL,
3543 + 0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL,
3544 + 0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL,
3545 + 0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL,
3546 + 0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL,
3547 + 0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL,
3548 + 0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL,
3549 + 0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL,
3550 + 0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL,
3551 + 0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL,
3552 + 0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL,
3553 + 0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL,
3554 + 0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL,
3555 + 0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL,
3556 + 0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL,
3557 + 0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL,
3558 + 0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL,
3559 + 0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL,
3560 + 0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL,
3561 + 0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL,
3562 + 0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL,
3563 + 0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL,
3564 + 0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL,
3565 + 0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL,
3566 + 0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL,
3567 + 0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL,
3568 + 0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL,
3569 + 0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL,
3570 + 0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL,
3571 + 0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL,
3572 + 0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL,
3573 + 0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL,
3574 + 0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL,
3575 + 0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL
3576 +};
3577
3578 -static void curve25519_bmi2_base(u8 session_key[CURVE25519_KEY_SIZE],
3579 - const u8 private_key[CURVE25519_KEY_SIZE])
3580 +static void curve25519_ever64_base(u8 *out, const u8 *priv)
3581 {
3582 - struct {
3583 - u64 buffer[4 * NUM_WORDS_ELTFP25519];
3584 - u64 coordinates[4 * NUM_WORDS_ELTFP25519];
3585 - u64 workspace[4 * NUM_WORDS_ELTFP25519];
3586 - u8 private[CURVE25519_KEY_SIZE];
3587 - } __aligned(32) m;
3588 -
3589 - const int ite[4] = { 64, 64, 64, 63 };
3590 - const int q = 3;
3591 u64 swap = 1;
3592 -
3593 - int i = 0, j = 0, k = 0;
3594 - u64 *const key = (u64 *)m.private;
3595 - u64 *const Ur1 = m.coordinates + 0;
3596 - u64 *const Zr1 = m.coordinates + 4;
3597 - u64 *const Ur2 = m.coordinates + 8;
3598 - u64 *const Zr2 = m.coordinates + 12;
3599 -
3600 - u64 *const UZr1 = m.coordinates + 0;
3601 - u64 *const ZUr2 = m.coordinates + 8;
3602 -
3603 - u64 *const A = m.workspace + 0;
3604 - u64 *const B = m.workspace + 4;
3605 - u64 *const C = m.workspace + 8;
3606 - u64 *const D = m.workspace + 12;
3607 -
3608 - u64 *const AB = m.workspace + 0;
3609 - u64 *const CD = m.workspace + 8;
3610 -
3611 - const u64 *const P = table_ladder_8k;
3612 -
3613 - memcpy(m.private, private_key, sizeof(m.private));
3614 -
3615 - curve25519_clamp_secret(m.private);
3616 -
3617 - setzero_eltfp25519_1w(Ur1);
3618 - setzero_eltfp25519_1w(Zr1);
3619 - setzero_eltfp25519_1w(Zr2);
3620 - Ur1[0] = 1;
3621 - Zr1[0] = 1;
3622 - Zr2[0] = 1;
3623 -
3624 - /* G-S */
3625 - Ur2[3] = 0x1eaecdeee27cab34UL;
3626 - Ur2[2] = 0xadc7a0b9235d48e2UL;
3627 - Ur2[1] = 0xbbf095ae14b2edf8UL;
3628 - Ur2[0] = 0x7e94e1fec82faabdUL;
3629 -
3630 - /* main-loop */
3631 - j = q;
3632 - for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
3633 - while (j < ite[i]) {
3634 - u64 bit = (key[i] >> j) & 0x1;
3635 - k = (64 * i + j - q);
3636 + int i, j, k;
3637 + u64 tmp[16 + 32 + 4];
3638 + u64 *x1 = &tmp[0];
3639 + u64 *z1 = &tmp[4];
3640 + u64 *x2 = &tmp[8];
3641 + u64 *z2 = &tmp[12];
3642 + u64 *xz1 = &tmp[0];
3643 + u64 *xz2 = &tmp[8];
3644 + u64 *a = &tmp[0 + 16];
3645 + u64 *b = &tmp[4 + 16];
3646 + u64 *c = &tmp[8 + 16];
3647 + u64 *ab = &tmp[0 + 16];
3648 + u64 *abcd = &tmp[0 + 16];
3649 + u64 *ef = &tmp[16 + 16];
3650 + u64 *efgh = &tmp[16 + 16];
3651 + u64 *key = &tmp[0 + 16 + 32];
3652 +
3653 + memcpy(key, priv, 32);
3654 + ((u8 *)key)[0] &= 248;
3655 + ((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64;
3656 +
3657 + x1[0] = 1, x1[1] = x1[2] = x1[3] = 0;
3658 + z1[0] = 1, z1[1] = z1[2] = z1[3] = 0;
3659 + z2[0] = 1, z2[1] = z2[2] = z2[3] = 0;
3660 + memcpy(x2, p_minus_s, sizeof(p_minus_s));
3661 +
3662 + j = 3;
3663 + for (i = 0; i < 4; ++i) {
3664 + while (j < (const int[]){ 64, 64, 64, 63 }[i]) {
3665 + u64 bit = (key[i] >> j) & 1;
3666 + k = (64 * i + j - 3);
3667 swap = swap ^ bit;
3668 - cswap(swap, Ur1, Ur2);
3669 - cswap(swap, Zr1, Zr2);
3670 + cswap2(swap, xz1, xz2);
3671 swap = bit;
3672 - /* Addition */
3673 - sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
3674 - add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */
3675 - mul_eltfp25519_1w_bmi2(C, &P[4 * k], B);/* C = M0-B */
3676 - sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
3677 - add_eltfp25519_1w_bmi2(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
3678 - sqr_eltfp25519_2w_bmi2(AB); /* A = A^2 | B = B^2 */
3679 - mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */
3680 + fsub(b, x1, z1);
3681 + fadd(a, x1, z1);
3682 + fmul(c, &table_ladder[4 * k], b, ef);
3683 + fsub(b, a, c);
3684 + fadd(a, a, c);
3685 + fsqr2(ab, ab, efgh);
3686 + fmul2(xz1, xz2, ab, efgh);
3687 ++j;
3688 }
3689 j = 0;
3690 }
3691
3692 - /* Doubling */
3693 - for (i = 0; i < q; ++i) {
3694 - add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */
3695 - sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
3696 - sqr_eltfp25519_2w_bmi2(AB); /* A = A**2 B = B**2 */
3697 - copy_eltfp25519_1w(C, B); /* C = B */
3698 - sub_eltfp25519_1w(B, A, B); /* B = A-B */
3699 - mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */
3700 - add_eltfp25519_1w_bmi2(D, D, C); /* D = D+C */
3701 - mul_eltfp25519_2w_bmi2(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
3702 - }
3703 -
3704 - /* Convert to affine coordinates */
3705 - inv_eltfp25519_1w_bmi2(A, Zr1);
3706 - mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A);
3707 - fred_eltfp25519_1w((u64 *)session_key);
3708 + point_double(xz1, abcd, efgh);
3709 + point_double(xz1, abcd, efgh);
3710 + point_double(xz1, abcd, efgh);
3711 + encode_point(out, xz1);
3712
3713 - memzero_explicit(&m, sizeof(m));
3714 + memzero_explicit(tmp, sizeof(tmp));
3715 }
3716
3717 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx);
3718 +
3719 void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
3720 const u8 secret[CURVE25519_KEY_SIZE],
3721 const u8 basepoint[CURVE25519_KEY_SIZE])
3722 {
3723 - if (static_branch_likely(&curve25519_use_adx))
3724 - curve25519_adx(mypublic, secret, basepoint);
3725 - else if (static_branch_likely(&curve25519_use_bmi2))
3726 - curve25519_bmi2(mypublic, secret, basepoint);
3727 + if (static_branch_likely(&curve25519_use_bmi2_adx))
3728 + curve25519_ever64(mypublic, secret, basepoint);
3729 else
3730 curve25519_generic(mypublic, secret, basepoint);
3731 }
3732 @@ -2355,10 +1395,8 @@ EXPORT_SYMBOL(curve25519_arch);
3733 void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
3734 const u8 secret[CURVE25519_KEY_SIZE])
3735 {
3736 - if (static_branch_likely(&curve25519_use_adx))
3737 - curve25519_adx_base(pub, secret);
3738 - else if (static_branch_likely(&curve25519_use_bmi2))
3739 - curve25519_bmi2_base(pub, secret);
3740 + if (static_branch_likely(&curve25519_use_bmi2_adx))
3741 + curve25519_ever64_base(pub, secret);
3742 else
3743 curve25519_generic(pub, secret, curve25519_base_point);
3744 }
3745 @@ -2449,12 +1487,11 @@ static struct kpp_alg curve25519_alg = {
3746 .max_size = curve25519_max_size,
3747 };
3748
3749 +
3750 static int __init curve25519_mod_init(void)
3751 {
3752 - if (boot_cpu_has(X86_FEATURE_BMI2))
3753 - static_branch_enable(&curve25519_use_bmi2);
3754 - else if (boot_cpu_has(X86_FEATURE_ADX))
3755 - static_branch_enable(&curve25519_use_adx);
3756 + if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
3757 + static_branch_enable(&curve25519_use_bmi2_adx);
3758 else
3759 return 0;
3760 return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
3761 @@ -2474,3 +1511,4 @@ module_exit(curve25519_mod_exit);
3762 MODULE_ALIAS_CRYPTO("curve25519");
3763 MODULE_ALIAS_CRYPTO("curve25519-x86");
3764 MODULE_LICENSE("GPL v2");
3765 +MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");