From cc37060f29d872df8372fce4712a00cdb0afa0ba Mon Sep 17 00:00:00 2001 From: Paul Duncan Date: Sun, 26 May 2024 16:01:09 -0400 Subject: sha3.c: add initial avx2 backend (slow) bench results (oof) ------------------- scalar (gcc): > make clean all BACKEND=1 && ./bench ... info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000 info: backend=scalar num_trials=2000 src_lens=64,256,1024,4096,16384 dst_lens=32 function,dst_len,64,256,1024,4096,16384 sha3_224,28,19.5,10.0,10.0,8.9,8.8 sha3_256,32,19.5,10.0,9.9,9.5,9.3 sha3_384,48,19.5,14.7,12.3,12.2,12.0 sha3_512,64,19.5,19.4,18.2,17.2,17.1 shake128,32,19.7,9.9,8.7,7.8,7.6 shake256,32,20.2,10.1,10.0,9.6,9.3 scalar (clang): > make clean all BACKEND=1 CC=clang && ./bench info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000 info: backend=scalar num_trials=2000 src_lens=64,256,1024,4096,16384 dst_lens=32 function,dst_len,64,256,1024,4096,16384 sha3_224,28,19.7,9.9,9.8,8.8,8.7 sha3_256,32,19.7,9.9,9.8,9.4,9.2 sha3_384,48,19.7,14.6,12.1,12.0,11.8 sha3_512,64,19.7,19.3,17.8,16.9,16.9 shake128,32,19.7,9.8,8.6,7.7,7.5 shake256,32,19.7,9.9,9.8,9.4,9.2 avx2 (gcc): > make clean all BACKEND=6 && ./bench info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000 info: backend=avx2 num_trials=2000 src_lens=64,256,1024,4096,16384 dst_lens=32 function,dst_len,64,256,1024,4096,16384 sha3_224,28,80.0,40.5,40.9,37.2,36.6 sha3_256,32,79.8,40.5,40.9,39.8,38.9 sha3_384,48,80.1,61.0,51.2,51.3,50.7 sha3_512,64,79.9,81.5,76.7,73.0,73.1 shake128,32,86.0,43.6,38.6,34.5,33.9 shake256,32,86.0,43.6,44.1,42.8,41.9 avx2 (clang): > make clean all BACKEND=6 CC=clang && ./bench info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000 info: backend=avx2 num_trials=2000 src_lens=64,256,1024,4096,16384 dst_lens=32 function,dst_len,64,256,1024,4096,16384 sha3_224,28,62.0,31.6,31.9,29.0,28.5 sha3_256,32,61.9,31.6,32.0,31.1,30.3 sha3_384,48,62.6,47.7,40.1,40.2,39.8 sha3_512,64,62.0,63.8,60.2,57.2,57.3 shake128,32,62.5,31.7,27.9,24.9,24.5 shake256,32,62.0,31.5,31.9,31.0,30.3 the culprit is using gather in the pi step for permutes, so those will need to be rewritten as permutes: > perf annotate -Mintel --stdio --quiet permute_n_avx2 | grep gatherqq 24.09 : 207e: vpgatherqq ymm0,QWORD PTR [rdi+ymm3*8],ymm6 14.07 : 20a0: vpgatherqq ymm4,QWORD PTR [rdi+ymm3*8],ymm6 13.36 : 20aa: vpgatherqq ymm8,QWORD PTR [rdi+ymm7*8],ymm6 13.50 : 20c4: vpgatherqq ymm3,QWORD PTR [rdi+ymm7*8],ymm6 13.61 : 20ce: vpgatherqq ymm7,QWORD PTR [rdi+ymm1*8],ymm6 --- sha3.c | 249 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) (limited to 'sha3.c') diff --git a/sha3.c b/sha3.c index 9b213d0..7979e72 100644 --- a/sha3.c +++ b/sha3.c @@ -68,6 +68,7 @@ #define BACKEND_NEON 3 // Neon backend. Slower than scalar. #define BACKEND_DIET_NEON 4 // Neon backend, fewer registers. Slower than scalar. #define BACKEND_HYBRID 5 // Hybrid scalar/neon backend. Slower than scalar. 
+#define BACKEND_AVX2 6 // AVX2 backend
 
 // if BACKEND is defined and set to 0 (the default), then unset it
 // and auto-detect the appropriate backend below
@@ -79,6 +80,8 @@
 #ifndef BACKEND
 #if defined(__AVX512F__)
 #define BACKEND BACKEND_AVX512
+#elif defined(__AVX2__)
+#define BACKEND BACKEND_AVX2
 #elif 0 && defined(__ARM_NEON)
 #define BACKEND BACKEND_NEON
 #else
@@ -520,6 +523,248 @@ static inline void permute_n_avx512(uint64_t s[static 25], const size_t num_roun
 }
 #endif /* BACKEND == BACKEND_AVX512 */
 
+#if BACKEND == BACKEND_AVX2
+#include <immintrin.h>
+
+static const __m256i M0 = { ~0, 0, 0, 0 }, // mask, first lane only
+                     K64 = { 64, 64, 64, 64 }; // 64, all lanes (used by ROLV)
+
+// load state array to avx2 registers
+// FIXME: remove macro, not needed
+#define AVX2_LOAD(s) __m256i \
+  r0_lo = _mm256_loadu_epi64(s + 0),  /* row 0, cols 0-3 */ \
+  r1_lo = _mm256_loadu_epi64(s + 5),  /* row 1, cols 0-3 */ \
+  r2_lo = _mm256_loadu_epi64(s + 10), /* row 2, cols 0-3 */ \
+  r3_lo = _mm256_loadu_epi64(s + 15), /* row 3, cols 0-3 */ \
+  r4_lo = _mm256_loadu_epi64(s + 20), /* row 4, cols 0-3 */ \
+  r0_hi = { s[ 4] }, /* row 0, col 4 */ \
+  r1_hi = { s[ 9] }, /* row 1, col 4 */ \
+  r2_hi = { s[14] }, /* row 2, col 4 */ \
+  r3_hi = { s[19] }, /* row 3, col 4 */ \
+  r4_hi = { s[24] }; /* row 4, col 4 */
+
+// store avx2 registers to state array
+#define AVX2_STORE(s) do { \
+  union { long long int *i64; uint64_t *u64; } p = { .u64 = s }; \
+  \
+  /* store rows */ \
+  _mm256_storeu_epi64(p.i64 + 0, r0_lo);  /* row 0, cols 0-3 */ \
+  _mm256_storeu_epi64(p.i64 + 5, r1_lo);  /* row 1, cols 0-3 */ \
+  _mm256_storeu_epi64(p.i64 + 10, r2_lo); /* row 2, cols 0-3 */ \
+  _mm256_storeu_epi64(p.i64 + 15, r3_lo); /* row 3, cols 0-3 */ \
+  _mm256_storeu_epi64(p.i64 + 20, r4_lo); /* row 4, cols 0-3 */ \
+  _mm256_maskstore_epi64(p.i64 + 4, M0, r0_hi);  /* row 0, col 4 */ \
+  _mm256_maskstore_epi64(p.i64 + 9, M0, r1_hi);  /* row 1, col 4 */ \
+  _mm256_maskstore_epi64(p.i64 + 14, M0, r2_hi); /* row 2, col 4 */ \
+  _mm256_maskstore_epi64(p.i64 + 19, M0, r3_hi); /* row 3, col 4 */ \
+  _mm256_maskstore_epi64(p.i64 + 24, M0, r4_hi); /* row 4, col 4 */ \
+} while (0)
+
+// rotate left immediate
+#define AVX2_ROLI(v, n) (_mm256_slli_epi64((v), (n)) | _mm256_srli_epi64((v), (64-(n))))
+
+// rotate left by vector
+#define AVX2_ROLV(v, n) (_mm256_sllv_epi64((v), (n)) | _mm256_srlv_epi64((v), (K64-(n))))
+
+// pi permute IDs
+#define PI_I0_LO 0x90 // 0, 0, 1, 2 -> 0b10010000 -> 0x90
+#define PI_I0_HI 0x03 // 3, 0, 0, 0 -> 0b00000011 -> 0x03
+#define PI_I1_LO 0x39 // 1, 2, 3, 0 -> 0b00111001 -> 0x39
+#define PI_I1_HI 0x00 // 0, 0, 0, 0 -> 0b00000000 -> 0x00
+
+// chi permute IDs
+#define CHI_I0_LO 0x39 // 1, 2, 3, 0 -> 0b00111001 -> 0x39
+#define CHI_I1_LO 0x0e // 2, 3, 0, 0 -> 0b00001110 -> 0x0e
+#define CHI_I1_HI 0x01 // 1, 0, 0, 0 -> 0b00000001 -> 0x01
+
+/**
+ * @brief AVX2 Keccak permutation.
+ *
+ * @param[in,out] s Keccak state (array of 25 64-bit integers).
+ * @param[in] num_rounds Number of rounds (12 or 24).
+ *
+ * How it works:
+ *
+ * 1. The Keccak state for cells in columns 0-3 is loaded from `s` (an
+ *    array of 25 64-bit unsigned integers) into four 64-bit lanes of 5
+ *    256-bit registers r0_lo-r4_lo.
+ *
+ * 2. The Keccak state for cells in column 4 is loaded from `s` into
+ *    the first 64-bit lanes of 5 256-bit registers r0_hi-r4_hi.
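+ *
+ *    (Put differently: lane x of r<y>_lo holds s[5*y + x] for x < 4,
+ *    and lane 0 of r<y>_hi holds s[5*y + 4]; the upper three lanes of
+ *    each _hi register stay zero and are unused.)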
+ * + * When steps #1 and #2 are done, the registers look like this: + * + * ------------------------------------------------ + * | | 64-bit Lanes of 256-bit Registers | + * |----------|-----------------------------------| + * | Register | Lane 0 | Lane 1 | Lane 2 | Lane 3 | + * |----------|--------|--------|--------|--------| + * | r0_lo | s[ 0] | s[ 1] | s[ 2] | s[ 3] | + * | r1_lo | s[ 5] | s[ 6] | s[ 7] | s[ 8] | + * | r2_lo | s[10] | s[11] | s[12] | s[13] | + * | r3_lo | s[15] | s[16] | s[17] | s[18] | + * | r4_lo | s[20] | s[21] | s[22] | s[23] | + * | r0_hi | s[ 4] | n/a | n/a | n/a | + * | r1_hi | s[ 9] | n/a | n/a | n/a | + * | r2_hi | s[14] | n/a | n/a | n/a | + * | r3_hi | s[19] | n/a | n/a | n/a | + * | r4_hi | s[24] | n/a | n/a | n/a | + * ------------------------------------------------ + * + * 3. The Keccak permutation is applied `num_rounds` times, where + * `num_rounds` is either 12 for TurboSHAKE and KangarooTwelve or 24 + * otherwise. + * + * (Note: for the Pi step the registers are stored back to the state + * array and then gathered to permute the state. This is different than + * the AVX-512 implementation because of register pressure). + * + * 4. The permuted Keccak state is copied back to `s`. + */ +static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds) { + // load state + AVX2_LOAD(s); + + // loop over rounds + for (size_t i = (SHA3_NUM_ROUNDS - num_rounds); __builtin_expect(i < SHA3_NUM_ROUNDS, 1); i++) { + // theta + { + // c = xor(r0, r1, r2, r3, r4) + const __m256i c_lo = r0_lo ^ r1_lo ^ r2_lo ^ r3_lo ^ r4_lo, + c_hi = r0_hi ^ r1_hi ^ r2_hi ^ r3_hi ^ r4_hi; + + // avx512 permute ids (for reference) + // static const __m512i I0 = { 4, 0, 1, 2, 3 }, + // I1 = { 1, 2, 3, 4, 0 }; + + // masks + static const __m256i M0 = { ~0, 0, 0, 0 }, // { 1, 0, 0, 0 } + M1 = { ~0, ~0, ~0, 0 }; // { 1, 1, 1, 0 } + + // d = xor(permute(i0, c), permute(i1, rol(c, 1))) + const __m256i d0_lo = (_mm256_permute4x64_epi64(c_lo, PI_I0_LO) & ~M0) | (c_hi & M0), + d0_hi = _mm256_permute4x64_epi64(c_lo, PI_I0_HI) & M0, + d1_lo = (_mm256_permute4x64_epi64(c_lo, PI_I1_LO) & M1) | (_mm256_permute4x64_epi64(c_hi, PI_I1_HI) & ~M1), + d1_hi = (c_lo & M0), + d_lo = d0_lo ^ AVX2_ROLI(d1_lo, 1), + d_hi = d0_hi ^ AVX2_ROLI(d1_hi, 1); + + // row = xor(row, d) + r0_lo ^= d_lo; r1_lo ^= d_lo; r2_lo ^= d_lo; r3_lo ^= d_lo; r4_lo ^= d_lo; + r0_hi ^= d_hi; r1_hi ^= d_hi; r2_hi ^= d_hi; r3_hi ^= d_hi; r4_hi ^= d_hi; + } + + // rho + { + // rotate values + static const __m256i V0_LO = { 0, 1, 62, 28 }, V0_HI = { 27 }, + V1_LO = { 36, 44, 6, 55 }, V1_HI = { 20 }, + V2_LO = { 3, 10, 43, 25 }, V2_HI = { 39 }, + V3_LO = { 41, 45, 15, 21 }, V3_HI = { 8 }, + V4_LO = { 18, 2, 61, 56 }, V4_HI = { 14 }; + + // rotate rows + // FIXME: could reduce rotates by permuting + r0_lo = AVX2_ROLV(r0_lo, V0_LO); r0_hi = AVX2_ROLV(r0_hi, V0_HI); + r1_lo = AVX2_ROLV(r1_lo, V1_LO); r1_hi = AVX2_ROLV(r1_hi, V1_HI); + r2_lo = AVX2_ROLV(r2_lo, V2_LO); r2_hi = AVX2_ROLV(r2_hi, V2_HI); + r3_lo = AVX2_ROLV(r3_lo, V3_LO); r3_hi = AVX2_ROLV(r3_hi, V3_HI); + r4_lo = AVX2_ROLV(r4_lo, V4_LO); r4_hi = AVX2_ROLV(r4_hi, V4_HI); + } + + // pi + // + // store state array, then gather to permute the state. note: with + // some work we could probably do in-register permutes, but + // benchmark first to see if this is worth the trouble. 
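+    //
+    // (untested sketch of the in-register alternative: row 0 needs no
+    // gather at all, because its post-pi values { s[0], s[6], s[12],
+    // s[18] } already sit in lane 0 of r0_lo, lane 1 of r1_lo, lane 2
+    // of r2_lo and lane 3 of r3_lo, so three vpblendd ops rebuild
+    // r0_lo and r4_hi already carries s[24]; the other rows also need
+    // vpermq lane rotates. see the standalone sketch after the patch.)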
+ { + static const __m256i V0_LO = { 0, 6, 12, 18 }, + V1_LO = { 3, 9, 10, 16 }, + V2_LO = { 1, 7, 13, 19 }, + V3_LO = { 4, 5, 11, 17 }, + V4_LO = { 2, 8, 14, 15 }; + static const size_t V0_HI = 24, V1_HI = 22, V2_HI = 20, V3_HI = 23, V4_HI = 21; + + // store rows to state, then gather to permute + AVX2_STORE(s); + + // re-load using gather to permute + union { long long int *i64; uint64_t *u64; } p = { .u64 = s }; + r0_lo = _mm256_i64gather_epi64(p.i64, V0_LO, 8); r0_hi = ((__m256i) { s[V0_HI] }); + r1_lo = _mm256_i64gather_epi64(p.i64, V1_LO, 8); r1_hi = ((__m256i) { s[V1_HI] }); + r2_lo = _mm256_i64gather_epi64(p.i64, V2_LO, 8); r2_hi = ((__m256i) { s[V2_HI] }); + r3_lo = _mm256_i64gather_epi64(p.i64, V3_LO, 8); r3_hi = ((__m256i) { s[V3_HI] }); + r4_lo = _mm256_i64gather_epi64(p.i64, V4_LO, 8); r4_hi = ((__m256i) { s[V4_HI] }); + } + + // chi + { + // masks + static const __m256i M0 = { ~0, 0, 0, 0 }, // { 1, 0, 0, 0 } + M1 = { ~0, ~0, ~0, 0 }, // { 1, 1, 1, 0 } + M2 = { ~0, ~0, 0, ~0 }; // { 1, 1, 0, 1 } + + // r0 + { + const __m256i a_lo = (_mm256_permute4x64_epi64(r0_lo, CHI_I0_LO) & M1) | (_mm256_permute4x64_epi64(r0_hi, CHI_I0_LO) & ~M1), + a_hi = r0_lo & M0, + b_lo = (_mm256_permute4x64_epi64(r0_lo, CHI_I1_LO) & M2) | (_mm256_permute4x64_epi64(r0_hi, CHI_I1_LO) & ~M0), + b_hi = _mm256_permute4x64_epi64(r0_lo, CHI_I1_HI) & M0; + + r0_lo ^= ~a_lo & b_lo; r0_hi ^= ~a_hi & b_hi; // r0 ^= ~a & b + } + + // r1 + { + const __m256i a_lo = (_mm256_permute4x64_epi64(r1_lo, CHI_I0_LO) & M1) | (_mm256_permute4x64_epi64(r1_hi, CHI_I0_LO) & ~M1), + a_hi = r1_lo & M0, + b_lo = (_mm256_permute4x64_epi64(r1_lo, CHI_I1_LO) & M2) | (_mm256_permute4x64_epi64(r1_hi, CHI_I1_LO) & ~M0), + b_hi = _mm256_permute4x64_epi64(r1_lo, CHI_I1_HI) & M0; + + r1_lo ^= ~a_lo & b_lo; r1_hi ^= ~a_hi & b_hi; // r1 ^= ~a & b + } + + // r2 + { + const __m256i a_lo = (_mm256_permute4x64_epi64(r2_lo, CHI_I0_LO) & M1) | (_mm256_permute4x64_epi64(r2_hi, CHI_I0_LO) & ~M1), + a_hi = r2_lo & M0, + b_lo = (_mm256_permute4x64_epi64(r2_lo, CHI_I1_LO) & M2) | (_mm256_permute4x64_epi64(r2_hi, CHI_I1_LO) & ~M0), + b_hi = _mm256_permute4x64_epi64(r2_lo, CHI_I1_HI) & M0; + + r2_lo ^= ~a_lo & b_lo; r2_hi ^= ~a_hi & b_hi; // r2 ^= ~a & b + } + + // r3 + { + const __m256i a_lo = (_mm256_permute4x64_epi64(r3_lo, CHI_I0_LO) & M1) | (_mm256_permute4x64_epi64(r3_hi, CHI_I0_LO) & ~M1), + a_hi = r3_lo & M0, + b_lo = (_mm256_permute4x64_epi64(r3_lo, CHI_I1_LO) & M2) | (_mm256_permute4x64_epi64(r3_hi, CHI_I1_LO) & ~M0), + b_hi = _mm256_permute4x64_epi64(r3_lo, CHI_I1_HI) & M0; + + r3_lo ^= ~a_lo & b_lo; r3_hi ^= ~a_hi & b_hi; // r3 ^= ~a & b + } + + // r4 + { + const __m256i a_lo = (_mm256_permute4x64_epi64(r4_lo, CHI_I0_LO) & M1) | (_mm256_permute4x64_epi64(r4_hi, CHI_I0_LO) & ~M1), + a_hi = r4_lo & M0, + b_lo = (_mm256_permute4x64_epi64(r4_lo, CHI_I1_LO) & M2) | (_mm256_permute4x64_epi64(r4_hi, CHI_I1_LO) & ~M0), + b_hi = _mm256_permute4x64_epi64(r4_lo, CHI_I1_HI) & M0; + + r4_lo ^= ~a_lo & b_lo; r4_hi ^= ~a_hi & b_hi; // r4 ^= ~a & b + } + } + + // iota + const __m256i rc = { RCS[i], 0, 0, 0 }; + r0_lo ^= rc; + } + + // store rows to state + AVX2_STORE(s); +} +#endif /* BACKEND == BACKEND_AVX2 */ + #if BACKEND == BACKEND_NEON #include @@ -1310,6 +1555,8 @@ static inline void permute_n_hybrid(uint64_t a[static 25], const size_t num_roun // map permute_n() to active backend #if BACKEND == BACKEND_AVX512 #define permute_n permute_n_avx512 // use avx512 backend +#elif BACKEND == BACKEND_AVX2 +#define permute_n permute_n_avx2 // use AVX2 backend 
#elif BACKEND == BACKEND_NEON #define permute_n permute_n_neon // use neon backend #elif BACKEND == BACKEND_DIET_NEON @@ -2968,6 +3215,8 @@ void k12_once(const uint8_t *src, const size_t src_len, uint8_t *dst, const size const char *sha3_backend(void) { #if BACKEND == BACKEND_AVX512 return "avx512"; +#elif BACKEND == BACKEND_AVX2 + return "avx2"; #elif BACKEND == BACKEND_NEON return "neon"; #elif BACKEND == BACKEND_DIET_NEON -- cgit v1.2.3
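A minimal, untested sketch of the gather-free pi step suggested in the commit
message, assuming the r*_lo/r*_hi register layout documented above. The helper
name pi_rows_0_and_2() is hypothetical and only rows 0 and 2 are shown; the
remaining rows follow the same blend/permute pattern with different immediates:

  // untested sketch: gather-free pi for rows 0 and 2, using the register
  // layout from permute_n_avx2(). not part of the patch.
  #include <immintrin.h>

  static inline void pi_rows_0_and_2(__m256i r0_lo, __m256i r1_lo,
                                     __m256i r2_lo, __m256i r3_lo,
                                     __m256i r3_hi, __m256i r4_lo, __m256i r4_hi,
                                     __m256i *n0_lo, __m256i *n0_hi,
                                     __m256i *n2_lo, __m256i *n2_hi) {
    // row 0: { s[0], s[6], s[12], s[18] } = lane 0 of r0_lo, lane 1 of r1_lo,
    // lane 2 of r2_lo, lane 3 of r3_lo -> three vpblendd ops, no permute.
    // (vpblendd immediates select 32-bit elements, so each 64-bit lane is 2 bits.)
    const __m256i t0 = _mm256_blend_epi32(r0_lo, r1_lo, 0x0c), // lane 1 <- r1_lo
                  t1 = _mm256_blend_epi32(r2_lo, r3_lo, 0xc0); // lane 3 <- r3_lo
    *n0_lo = _mm256_blend_epi32(t0, t1, 0xf0);                 // lanes 2-3 <- t1
    *n0_hi = r4_hi;                                            // s[24] is lane 0 of r4_hi

    // row 2: { s[1], s[7], s[13], s[19] } = lane 1 of r0_lo, lane 2 of r1_lo,
    // lane 3 of r2_lo, lane 0 of r3_hi -> blends plus a cross-lane rotate.
    const __m256i t2 = _mm256_blend_epi32(_mm256_blend_epi32(r0_lo, r1_lo, 0x30),
                                          r2_lo, 0xc0),        // { r0[0], r0[1], r1[2], r2[3] }
                  t3 = _mm256_permute4x64_epi64(t2, 0x39);     // { r0[1], r1[2], r2[3], r0[0] }
    *n2_lo = _mm256_blend_epi32(t3, _mm256_permute4x64_epi64(r3_hi, 0x00), 0xc0); // lane 3 <- s[19]

    // keep s[20] in lane 0 and zero the rest, since the theta/chi code
    // above assumes the upper lanes of the _hi registers are zero.
    *n2_hi = _mm256_blend_epi32(r4_lo, _mm256_setzero_si256(), 0xfc);
  }

The point is that each new row register can be rebuilt from the old ones with
vpblendd/vpermq only, replacing the store plus five vpgatherqq round trip;
whether that actually closes the gap to the scalar backend would need the same
bench run as above.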