From 9aea6905a8a21b42a7501599fc8e3f8a58859a87 Mon Sep 17 00:00:00 2001
From: Paul Duncan <pabs@pablotron.org>
Date: Mon, 27 May 2024 03:11:59 -0400
Subject: sha3.c: permute_n_avx2(): remove LOAD/STORE macros, minor comment
 fixes

---
 sha3.c | 59 +++++++++++++++++++++++------------------------------------
 1 file changed, 23 insertions(+), 36 deletions(-)

diff --git a/sha3.c b/sha3.c
index a2060f5..ba6a02a 100644
--- a/sha3.c
+++ b/sha3.c
@@ -532,35 +532,6 @@ static const __m256i LM0 = { ~0,  0,  0,  0 }, // only lane 0
                      LM2 = {  0,  0, ~0,  0 }, // only lane 2
                      LM3 = {  0,  0,  0, ~0 }; // only lane 3
 
-// load state array to avx2 registers
-// FIXME: remove macro, not needed
-#define AVX2_LOAD(s) __m256i \
-  r0_lo = _mm256_loadu_si256((__m256i*) (s +  0)), /* row 0, cols 0-3 */ \
-  r1_lo = _mm256_loadu_si256((__m256i*) (s +  5)), /* row 1, cols 0-3 */ \
-  r2_lo = _mm256_loadu_si256((__m256i*) (s + 10)), /* row 2, cols 0-3 */ \
-  r3_lo = _mm256_loadu_si256((__m256i*) (s + 15)), /* row 3, cols 0-3 */ \
-  r4_lo = _mm256_loadu_si256((__m256i*) (s + 20)), /* row 4, cols 0-3 */ \
-  r0_hi = { s[ 4] }, /* row 0, col 4 */ \
-  r1_hi = { s[ 9] }, /* row 1, col 4 */ \
-  r2_hi = { s[14] }, /* row 2, col 4 */ \
-  r3_hi = { s[19] }, /* row 3, col 4 */ \
-  r4_hi = { s[24] }; /* row 4, col 4 */
-
-// store avx2 registers to state array
-#define AVX2_STORE(s) do { \
-  union { long long int *i64; uint64_t *u64; } p = { .u64 = s }; \
-  _mm256_storeu_si256((__m256i*) (p.i64 +  0), r0_lo); /* row 0, cols 0-3 */ \
-  _mm256_storeu_si256((__m256i*) (p.i64 +  5), r1_lo); /* row 1, cols 0-3 */ \
-  _mm256_storeu_si256((__m256i*) (p.i64 + 10), r2_lo); /* row 2, cols 0-3 */ \
-  _mm256_storeu_si256((__m256i*) (p.i64 + 15), r3_lo); /* row 3, cols 0-3 */ \
-  _mm256_storeu_si256((__m256i*) (p.i64 + 20), r4_lo); /* row 4, cols 0-3 */ \
-  _mm256_maskstore_epi64(p.i64 +  4, LM0, r0_hi); /* row 0, col 4 */ \
-  _mm256_maskstore_epi64(p.i64 +  9, LM0, r1_hi); /* row 1, col 4 */ \
-  _mm256_maskstore_epi64(p.i64 + 14, LM0, r2_hi); /* row 2, col 4 */ \
-  _mm256_maskstore_epi64(p.i64 + 19, LM0, r3_hi); /* row 3, col 4 */ \
-  _mm256_maskstore_epi64(p.i64 + 24, LM0, r4_hi); /* row 4, col 4 */ \
-} while (0)
-
 // rotate left immediate
 #define AVX2_ROLI(v, n) (_mm256_slli_epi64((v), (n)) | _mm256_srli_epi64((v), (64-(n))))
 
@@ -640,8 +611,17 @@ static const __m256i LM0 = { ~0,  0,  0,  0 }, // only lane 0
  * 4. The permuted Keccak state is copied back to `s`.
  */
 static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds) {
-  // load state
-  AVX2_LOAD(s);
+  // load state array into avx2 registers
+  __m256i r0_lo = _mm256_loadu_si256((__m256i*) (s +  0)), /* row 0, cols 0-3 */
+          r1_lo = _mm256_loadu_si256((__m256i*) (s +  5)), /* row 1, cols 0-3 */
+          r2_lo = _mm256_loadu_si256((__m256i*) (s + 10)), /* row 2, cols 0-3 */
+          r3_lo = _mm256_loadu_si256((__m256i*) (s + 15)), /* row 3, cols 0-3 */
+          r4_lo = _mm256_loadu_si256((__m256i*) (s + 20)), /* row 4, cols 0-3 */
+          r0_hi = { s[ 4] }, /* row 0, col 4 */
+          r1_hi = { s[ 9] }, /* row 1, col 4 */
+          r2_hi = { s[14] }, /* row 2, col 4 */
+          r3_hi = { s[19] }, /* row 3, col 4 */
+          r4_hi = { s[24] }; /* row 4, col 4 */
 
   // loop over rounds
   for (size_t i = (SHA3_NUM_ROUNDS - num_rounds); __builtin_expect(i < SHA3_NUM_ROUNDS, 1); i++) {
@@ -651,10 +631,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
       const __m256i c_lo = r0_lo ^ r1_lo ^ r2_lo ^ r3_lo ^ r4_lo,
                     c_hi = r0_hi ^ r1_hi ^ r2_hi ^ r3_hi ^ r4_hi;
 
-      // avx512 permute ids (for reference)
-      // static const __m512i I0 = { 4, 0, 1, 2, 3 },
-      //                      I1 = { 1, 2, 3, 4, 0 };
-
+      // i0 = { 4, 0, 1, 2, 3 }, i1 = { 1, 2, 3, 4, 0 }
       // d = xor(permute(i0, c), permute(i1, rol(c, 1)))
       const __m256i d0_lo = (_mm256_permute4x64_epi64(c_lo, THETA_I0_LO) & ~LM0) | (c_hi & LM0),
                     d0_hi = _mm256_permute4x64_epi64(c_lo, THETA_I0_HI) & LM0,
@@ -735,7 +712,17 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
   }
 
   // store rows to state
-  AVX2_STORE(s);
+  union { long long int *i64; uint64_t *u64; } p = { .u64 = s };
+  _mm256_storeu_si256((__m256i*) (p.i64 +  0), r0_lo); /* row 0, cols 0-3 */
+  _mm256_storeu_si256((__m256i*) (p.i64 +  5), r1_lo); /* row 1, cols 0-3 */
+  _mm256_storeu_si256((__m256i*) (p.i64 + 10), r2_lo); /* row 2, cols 0-3 */
+  _mm256_storeu_si256((__m256i*) (p.i64 + 15), r3_lo); /* row 3, cols 0-3 */
+  _mm256_storeu_si256((__m256i*) (p.i64 + 20), r4_lo); /* row 4, cols 0-3 */
+  _mm256_maskstore_epi64(p.i64 +  4, LM0, r0_hi); /* row 0, col 4 */
+  _mm256_maskstore_epi64(p.i64 +  9, LM0, r1_hi); /* row 1, col 4 */
+  _mm256_maskstore_epi64(p.i64 + 14, LM0, r2_hi); /* row 2, col 4 */
+  _mm256_maskstore_epi64(p.i64 + 19, LM0, r3_hi); /* row 3, col 4 */
+  _mm256_maskstore_epi64(p.i64 + 24, LM0, r4_hi); /* row 4, col 4 */
 }
 #endif /* BACKEND == BACKEND_AVX2 */
 
-- 
cgit v1.2.3