1 files changed, 202 insertions, 1 deletions
diff --git a/sha3.c b/sha3.c
index 21b2c78..b12e20d 100644
--- a/sha3.c
+++ b/sha3.c
@@ -31,6 +31,7 @@
 #define BACKEND_AVX512 2      // AVX-512 backend
 #define BACKEND_NEON 3        // Neon backend (experimental)
 #define BACKEND_DIET_NEON 4   // Neon backend which uses fewer registers
+#define BACKEND_HYBRID_NEON 5 // Hybrid neon backend
 
 // if SHA3_BACKEND is defined and set to 0 (the default), then unset it
 // and auto-detect the appropriate backend
@@ -1057,16 +1058,180 @@ static inline void permute_n_diet_neon(uint64_t a[static 25], const size_t num_r
   }
 
   // store column 4 of r4
-  vst1_u64(a + 24, row_get_tail(r4));
+  vst1_u64(a + 24, row_last(r4));
 }
 #endif /* SHA3_BACKEND == BACKEND_DIET_NEON */
 
+#if (SHA3_BACKEND == BACKEND_HYBRID_NEON)
+#include <arm_neon.h>
+
+/**
+ * @brief Scalar Keccak permutation.
+ *
+ * Apply `num_rounds` of Keccak permutation.  This function is only
+ * called by:
+ *
+ * - `permute_scalar()`: 24 rounds
+ * - `permute12_scalar()`: 12 rounds.  Used by TurboSHAKE and KangarooTwelve.
+ *
+ * @param[in,out] a Keccak state (array of 25 64-bit integers).
+ * @param[in] num_rounds Number of rounds (12 or 24).
+ */
+static inline void permute_n_hybrid_neon(uint64_t a[static 25], const size_t num_rounds) {
+  uint64_t tmp[25] = { 0 };
+  for (size_t i = SHA3_NUM_ROUNDS - num_rounds; i < SHA3_NUM_ROUNDS; i++) {
+    // theta
+    const uint64_t c[5] = {
+      a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20],
+      a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21],
+      a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22],
+      a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23],
+      a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24],
+    };
+
+    const uint64_t d[5] = {
+      c[4] ^ ROL(c[1], 1),
+      c[0] ^ ROL(c[2], 1),
+      c[1] ^ ROL(c[3], 1),
+      c[2] ^ ROL(c[4], 1),
+      c[3] ^ ROL(c[0], 1),
+    };
+
+    const uint64x2_t d0 = { d[0], d[1] },
+                     d1 = { d[2], d[3] };
+
+    uint64x2_t r0a = { a[ 0], a[ 1] }, r0b = { a[ 2], a[ 3] },
+               r1a = { a[ 5], a[ 6] }, r1b = { a[ 7], a[ 8] },
+               r2a = { a[10], a[11] }, r2b = { a[12], a[13] },
+               r3a = { a[15], a[16] }, r3b = { a[17], a[18] },
+               r4a = { a[20], a[21] }, r4b = { a[22], a[23] };
+
+    r0a ^= d0; r0b ^= d1; a[ 4] ^= d[4];
+    r1a ^= d0; r1b ^= d1; a[ 9] ^= d[4];
+    r2a ^= d0; r2b ^= d1; a[14] ^= d[4];
+    r3a ^= d0; r3b ^= d1; a[19] ^= d[4];
+    r4a ^= d0; r4b ^= d1; a[24] ^= d[4];
+
+
+    // rho
+    {
+      static const uint64x2_t r0a_ids = {  0,  1 },
+                              r0b_ids = { 62, 28 },
+                              r1a_ids = { 36, 44 },
+                              r1b_ids = {  6, 55 },
+                              r2a_ids = {  3, 10 },
+                              r2b_ids = { 43, 25 },
+                              r3a_ids = { 41, 45 },
+                              r3b_ids = { 15, 21 },
+                              r4a_ids = { 18,  2 },
+                              r4b_ids = { 61, 56 };
+
+      r0a = (r0a << r0a_ids) | (r0a >> (64 - r0a_ids));
+      r0b = (r0b << r0b_ids) | (r0b >> (64 - r0b_ids));
+      a[ 4] = ROL(a[ 4], 27); // 91 % 64 = 27
+
+      r1a = (r1a << r1a_ids) | (r1a >> (64 - r1a_ids));
+      r1b = (r1b << r1b_ids) | (r1b >> (64 - r1b_ids));
+      a[ 9] = ROL(a[ 9], 20); // 276 % 64 = 20
+
+      r2a = (r2a << r2a_ids) | (r2a >> (64 - r2a_ids));
+      r2b = (r2b << r2b_ids) | (r2b >> (64 - r2b_ids));
+      a[14] = ROL(a[14], 39); // 231 % 64 = 39
+
+      r3a = (r3a << r3a_ids) | (r3a >> (64 - r3a_ids));
+      r3b = (r3b << r3b_ids) | (r3b >> (64 - r3b_ids));
+      a[19] = ROL(a[19],  8); // 136 % 64 = 8
+
+      r4a = (r4a << r4a_ids) | (r4a >> (64 - r4a_ids));
+      r4b = (r4b << r4b_ids) | (r4b >> (64 - r4b_ids));
+      a[24] = ROL(a[24], 14); // 78 % 64 = 14
+
+      vst1q_u64(a +  0, r0a); vst1q_u64(a +  2, r0b);
+      vst1q_u64(a +  5, r1a); vst1q_u64(a +  7, r1b);
+      vst1q_u64(a + 10, r2a); vst1q_u64(a + 12, r2b);
+      vst1q_u64(a + 15, r3a); vst1q_u64(a + 17, r3b);
+      vst1q_u64(a + 20, r4a); vst1q_u64(a + 22, r4b);
+    }
+
+    // pi
+    {
+      tmp[ 0] = a[ 0];
+      tmp[ 1] = a[ 6];
+      tmp[ 2] = a[12];
+      tmp[ 3] = a[18];
+      tmp[ 4] = a[24];
+
+      tmp[ 5] = a[ 3];
+      tmp[ 6] = a[ 9];
+      tmp[ 7] = a[10];
+      tmp[ 8] = a[16];
+      tmp[ 9] = a[22];
+
+      tmp[10] = a[ 1];
+      tmp[11] = a[ 7];
+      tmp[12] = a[13];
+      tmp[13] = a[19];
+      tmp[14] = a[20];
+
+      tmp[15] = a[ 4];
+      tmp[16] = a[ 5];
+      tmp[17] = a[11];
+      tmp[18] = a[17];
+      tmp[19] = a[23];
+
+      tmp[20] = a[ 2];
+      tmp[21] = a[ 8];
+      tmp[22] = a[14];
+      tmp[23] = a[15];
+      tmp[24] = a[21];
+    }
+
+    // chi
+    {
+      a[ 0] = tmp[ 0] ^ (~tmp[ 1] & tmp[ 2]);
+      a[ 1] = tmp[ 1] ^ (~tmp[ 2] & tmp[ 3]);
+      a[ 2] = tmp[ 2] ^ (~tmp[ 3] & tmp[ 4]);
+      a[ 3] = tmp[ 3] ^ (~tmp[ 4] & tmp[ 0]);
+      a[ 4] = tmp[ 4] ^ (~tmp[ 0] & tmp[ 1]);
+
+      a[ 5] = tmp[ 5] ^ (~tmp[ 6] & tmp[ 7]);
+      a[ 6] = tmp[ 6] ^ (~tmp[ 7] & tmp[ 8]);
+      a[ 7] = tmp[ 7] ^ (~tmp[ 8] & tmp[ 9]);
+      a[ 8] = tmp[ 8] ^ (~tmp[ 9] & tmp[ 5]);
+      a[ 9] = tmp[ 9] ^ (~tmp[ 5] & tmp[ 6]);
+
+      a[10] = tmp[10] ^ (~tmp[11] & tmp[12]);
+      a[11] = tmp[11] ^ (~tmp[12] & tmp[13]);
+      a[12] = tmp[12] ^ (~tmp[13] & tmp[14]);
+      a[13] = tmp[13] ^ (~tmp[14] & tmp[10]);
+      a[14] = tmp[14] ^ (~tmp[10] & tmp[11]);
+
+      a[15] = tmp[15] ^ (~tmp[16] & tmp[17]);
+      a[16] = tmp[16] ^ (~tmp[17] & tmp[18]);
+      a[17] = tmp[17] ^ (~tmp[18] & tmp[19]);
+      a[18] = tmp[18] ^ (~tmp[19] & tmp[15]);
+      a[19] = tmp[19] ^ (~tmp[15] & tmp[16]);
+
+      a[20] = tmp[20] ^ (~tmp[21] & tmp[22]);
+      a[21] = tmp[21] ^ (~tmp[22] & tmp[23]);
+      a[22] = tmp[22] ^ (~tmp[23] & tmp[24]);
+      a[23] = tmp[23] ^ (~tmp[24] & tmp[20]);
+      a[24] = tmp[24] ^ (~tmp[20] & tmp[21]);
+    }
+
+    a[0] ^= RCS[i];
+  }
+}
+#endif /* (SHA3_BACKEND == BACKEND_HYBRID_NEON) */
+
 #if SHA3_BACKEND == BACKEND_AVX512
 #define permute_n permute_n_avx512 // use avx512 backend
 #elif SHA3_BACKEND == BACKEND_NEON
 #define permute_n permute_n_neon // use neon backend
 #elif SHA3_BACKEND == BACKEND_DIET_NEON
 #define permute_n permute_n_diet_neon // use diet-neon backend
+#elif SHA3_BACKEND == BACKEND_HYBRID_NEON
+#define permute_n permute_n_hybrid_neon // use hybrid-neon backend
 #elif SHA3_BACKEND == BACKEND_SCALAR
 #define permute_n permute_n_scalar // use scalar backend
 #else
@@ -2564,6 +2729,8 @@ const char *sha3_backend(void) {
   return "neon";
 #elif SHA3_BACKEND == BACKEND_DIET_NEON
   return "diet-neon";
+#elif SHA3_BACKEND == BACKEND_HYBRID_NEON
+  return "hybrid-neon";
 #elif SHA3_BACKEND == BACKEND_SCALAR
   return "scalar";
 #endif /* SHA3_BACKEND */
@@ -2873,6 +3040,22 @@ static void test_permute_diet_neon(void) {
 #endif /* SHA3_BACKEND == BACKEND_DIET_NEON */
 }
 
+static void test_permute_hybrid_neon(void) {
+#if SHA3_BACKEND == BACKEND_HYBRID_NEON
+  for (size_t i = 0; i < sizeof(PERMUTE_TESTS) / sizeof(PERMUTE_TESTS[0]); i++) {
+    const size_t exp_len = PERMUTE_TESTS[i].exp_len;
+
+    uint64_t got[25] = { 0 };
+    memcpy(got, PERMUTE_TESTS[i].a, sizeof(got));
+    permute_n_hybrid_neon(got, 24); // call permute_n_diet_neon() directly
+
+    if (memcmp(got, PERMUTE_TESTS[i].exp, exp_len)) {
+      fail_test(__func__, "", (uint8_t*) got, exp_len, (uint8_t*) PERMUTE_TESTS[i].exp, exp_len);
+    }
+  }
+#endif /* SHA3_BACKEND == BACKEND_HYBRID_NEON */
+}
+
 static const struct {
   uint64_t a[25]; // input state
   const uint64_t exp[25]; // expected value
@@ -2945,6 +3128,22 @@ static void test_permute12_diet_neon(void) {
 #endif /* SHA3_BACKEND == BACKEND_DIET_NEON */
 }
 
+static void test_permute12_hybrid_neon(void) {
+#if SHA3_BACKEND == BACKEND_HYBRID_NEON
+  for (size_t i = 0; i < sizeof(PERMUTE12_TESTS) / sizeof(PERMUTE12_TESTS[0]); i++) {
+    const size_t exp_len = PERMUTE12_TESTS[i].exp_len;
+
+    uint64_t got[25] = { 0 };
+    memcpy(got, PERMUTE12_TESTS[i].a, sizeof(got));
+    permute_n_hybrid_neon(got, 12); // call permute_n() directly
+
+    if (memcmp(got, PERMUTE12_TESTS[i].exp, exp_len)) {
+      fail_test(__func__, "", (uint8_t*) got, exp_len, (uint8_t*) PERMUTE12_TESTS[i].exp, exp_len);
+    }
+  }
+#endif /* SHA3_BACKEND == BACKEND_HYBRID_NEON */
+}
+
 static void test_sha3_224(void) {
   static const struct {
     const char *name; // test name
@@ -7101,10 +7300,12 @@ int main(void) {
   test_permute_avx512();
   test_permute_neon();
   test_permute_diet_neon();
+  test_permute_hybrid_neon();
   test_permute12_scalar();
   test_permute12_avx512();
   test_permute12_neon();
   test_permute12_diet_neon();
+  test_permute12_hybrid_neon();
   test_sha3_224();
   test_sha3_256();
   test_sha3_384();