#include <stddef.h>
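+/*
+ * Nonzero when the CPU implements the ARMv8 Crypto Extensions AES
+ * instructions; defined outside this file. Used below to choose between
+ * the aese/aesmc based AES round and the plain NEON fallback.
+ */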
+extern int aegis128_have_aes_insn;
+
void *memcpy(void *dest, const void *src, size_t n);
void *memset(void *s, int c, size_t n);

struct aegis128_state {
	uint8x16_t v[5];
};
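+/*
+ * The generic 256-byte AES S-box, viewed as four uint8x16x4_t quarters of
+ * 64 bytes each, so that a single tbl/tbx instruction can cover one quarter.
+ */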
+extern const uint8x16x4_t crypto_aes_sbox[];
+
static struct aegis128_state aegis128_load_state_neon(const void *state)
{
return (struct aegis128_state){ {
static uint8x16_t aegis_aes_round(uint8x16_t w)
{
	uint8x16_t z = {};
+#ifdef CONFIG_ARM64
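+	/*
+	 * Plain NEON fallback, taken when the CPU lacks the AES instructions:
+	 * perform the ShiftRows, SubBytes and MixColumns steps of an AES
+	 * round using byte permutes, table lookups and vector arithmetic.
+	 * shift_rows is the ShiftRows byte permutation; ror32by8 rotates each
+	 * 32-bit column right by one byte for the MixColumns step.
+	 */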
+	if (!__builtin_expect(aegis128_have_aes_insn, 1)) {
+		static const uint8x16_t shift_rows = {
+			0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+			0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
+		};
+		static const uint8x16_t ror32by8 = {
+			0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+			0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+		};
+		uint8x16_t v;
+
+		// shift rows
+		w = vqtbl1q_u8(w, shift_rows);
+
+		// sub bytes
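+		/*
+		 * Each lookup covers one 64-byte quarter of the S-box:
+		 * out-of-range indices produce zero for tbl and leave the
+		 * destination untouched for tbx, so the four lookups combined
+		 * substitute all 256 possible byte values. With GCC, the
+		 * S-box is preloaded into v16-v31 by preload_sbox() and the
+		 * tbl/tbx instructions are emitted as inline asm on those
+		 * fixed registers instead of using the intrinsics.
+		 */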
+		if (!IS_ENABLED(CONFIG_CC_IS_GCC)) {
+			v = vqtbl4q_u8(crypto_aes_sbox[0], w);
+			v = vqtbx4q_u8(v, crypto_aes_sbox[1], w - 0x40);
+			v = vqtbx4q_u8(v, crypto_aes_sbox[2], w - 0x80);
+			v = vqtbx4q_u8(v, crypto_aes_sbox[3], w - 0xc0);
+		} else {
+			asm("tbl %0.16b, {v16.16b-v19.16b}, %1.16b" : "=w"(v) : "w"(w));
+			w -= 0x40;
+			asm("tbx %0.16b, {v20.16b-v23.16b}, %1.16b" : "+w"(v) : "w"(w));
+			w -= 0x40;
+			asm("tbx %0.16b, {v24.16b-v27.16b}, %1.16b" : "+w"(v) : "w"(w));
+			w -= 0x40;
+			asm("tbx %0.16b, {v28.16b-v31.16b}, %1.16b" : "+w"(v) : "w"(w));
+		}
+
+		// mix columns
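+		/*
+		 * The first line doubles every byte in GF(2^8): (v << 1),
+		 * reduced by 0x1b wherever the top bit was set. The following
+		 * xors fold in the column rotated by 16 bits (vrev32q_u16)
+		 * and by 8 bits (ror32by8).
+		 */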
+		w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b);
+		w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v);
+		w ^= vqtbl1q_u8(v ^ w, ror32by8);
+
+		return w;
+	}
+#endif
+
/*
* We use inline asm here instead of the vaeseq_u8/vaesmcq_u8 intrinsics
* to force the compiler to issue the aese/aesmc instructions in pairs.
return st;
}
+static inline __attribute__((always_inline))
+void preload_sbox(void)
+{
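+	/*
+	 * Only needed for the GCC + no-AES-instructions case: load the 256
+	 * byte S-box into v16-v31 (64 bytes per ld1) so that the inline asm
+	 * tbl/tbx sequence in aegis_aes_round() can refer to fixed registers.
+	 * In every other configuration this is a no-op.
+	 */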
+	if (!IS_ENABLED(CONFIG_ARM64) ||
+	    !IS_ENABLED(CONFIG_CC_IS_GCC) ||
+	    __builtin_expect(aegis128_have_aes_insn, 1))
+		return;
+
+	asm("ld1 {v16.16b-v19.16b}, [%0], #64 \n\t"
+	    "ld1 {v20.16b-v23.16b}, [%0], #64 \n\t"
+	    "ld1 {v24.16b-v27.16b}, [%0], #64 \n\t"
+	    "ld1 {v28.16b-v31.16b}, [%0] \n\t"
+	    :: "r"(crypto_aes_sbox));
+}
+
void crypto_aegis128_update_neon(void *state, const void *msg)
{
struct aegis128_state st = aegis128_load_state_neon(state);
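+	/*
+	 * Make the S-box resident in v16-v31 before any AES rounds run; this
+	 * is a no-op unless building with GCC for a CPU without the AES
+	 * instructions.
+	 */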
+	preload_sbox();
+
st = aegis128_update_neon(st, vld1q_u8(msg));
	aegis128_save_state_neon(st, state);
}

void crypto_aegis128_encrypt_neon(void *state, void *dst, const void *src,
				  unsigned int size)
{
	struct aegis128_state st = aegis128_load_state_neon(state);
uint8x16_t msg;
+	preload_sbox();
+
while (size >= AEGIS_BLOCK_SIZE) {
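+		/* AEGIS-128 keystream block: S1 ^ S4 ^ (S2 & S3) */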
uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
void crypto_aegis128_decrypt_neon(void *state, void *dst, const void *src,
				  unsigned int size)
{
	struct aegis128_state st = aegis128_load_state_neon(state);
uint8x16_t msg;
+	preload_sbox();
+
while (size >= AEGIS_BLOCK_SIZE) {
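+		/* the keystream xor recovers the plaintext, which feeds the update */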
msg = vld1q_u8(src) ^ st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
st = aegis128_update_neon(st, msg);