From 9aea6905a8a21b42a7501599fc8e3f8a58859a87 Mon Sep 17 00:00:00 2001 From: Paul Duncan Date: Mon, 27 May 2024 03:11:59 -0400 Subject: sha3.c: permute_n_avx2(): remove LOAD/STORE macros, minor comment fixes --- sha3.c | 59 +++++++++++++++++++++++------------------------------------ 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/sha3.c b/sha3.c index a2060f5..ba6a02a 100644 --- a/sha3.c +++ b/sha3.c @@ -532,35 +532,6 @@ static const __m256i LM0 = { ~0, 0, 0, 0 }, // only lane 0 LM2 = { 0, 0, ~0, 0 }, // only lane 2 LM3 = { 0, 0, 0, ~0 }; // only lane 3 -// load state array to avx2 registers -// FIXME: remove macro, not needed -#define AVX2_LOAD(s) __m256i \ - r0_lo = _mm256_loadu_si256((__m256i*) (s + 0)), /* row 0, cols 0-3 */ \ - r1_lo = _mm256_loadu_si256((__m256i*) (s + 5)), /* row 1, cols 0-3 */ \ - r2_lo = _mm256_loadu_si256((__m256i*) (s + 10)), /* row 2, cols 0-3 */ \ - r3_lo = _mm256_loadu_si256((__m256i*) (s + 15)), /* row 3, cols 0-3 */ \ - r4_lo = _mm256_loadu_si256((__m256i*) (s + 20)), /* row 4, cols 0-3 */ \ - r0_hi = { s[ 4] }, /* row 0, col 4 */ \ - r1_hi = { s[ 9] }, /* row 1, col 4 */ \ - r2_hi = { s[14] }, /* row 2, col 4 */ \ - r3_hi = { s[19] }, /* row 3, col 4 */ \ - r4_hi = { s[24] }; /* row 4, col 4 */ - -// store avx2 registers to state array -#define AVX2_STORE(s) do { \ - union { long long int *i64; uint64_t *u64; } p = { .u64 = s }; \ - _mm256_storeu_si256((__m256i*) (p.i64 + 0), r0_lo); /* row 0, cols 0-3 */ \ - _mm256_storeu_si256((__m256i*) (p.i64 + 5), r1_lo); /* row 1, cols 0-3 */ \ - _mm256_storeu_si256((__m256i*) (p.i64 + 10), r2_lo); /* row 2, cols 0-3 */ \ - _mm256_storeu_si256((__m256i*) (p.i64 + 15), r3_lo); /* row 3, cols 0-3 */ \ - _mm256_storeu_si256((__m256i*) (p.i64 + 20), r4_lo); /* row 4, cols 0-3 */ \ - _mm256_maskstore_epi64(p.i64 + 4, LM0, r0_hi); /* row 0, col 4 */ \ - _mm256_maskstore_epi64(p.i64 + 9, LM0, r1_hi); /* row 1, col 4 */ \ - _mm256_maskstore_epi64(p.i64 + 14, LM0, r2_hi); /* row 2, col 4 */ \ - _mm256_maskstore_epi64(p.i64 + 19, LM0, r3_hi); /* row 3, col 4 */ \ - _mm256_maskstore_epi64(p.i64 + 24, LM0, r4_hi); /* row 4, col 4 */ \ -} while (0) - // rotate left immediate #define AVX2_ROLI(v, n) (_mm256_slli_epi64((v), (n)) | _mm256_srli_epi64((v), (64-(n)))) @@ -640,8 +611,17 @@ static const __m256i LM0 = { ~0, 0, 0, 0 }, // only lane 0 * 4. The permuted Keccak state is copied back to `s`. */ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds) { - // load state - AVX2_LOAD(s); + // load state array into avx2 registers + __m256i r0_lo = _mm256_loadu_si256((__m256i*) (s + 0)), /* row 0, cols 0-3 */ + r1_lo = _mm256_loadu_si256((__m256i*) (s + 5)), /* row 1, cols 0-3 */ + r2_lo = _mm256_loadu_si256((__m256i*) (s + 10)), /* row 2, cols 0-3 */ + r3_lo = _mm256_loadu_si256((__m256i*) (s + 15)), /* row 3, cols 0-3 */ + r4_lo = _mm256_loadu_si256((__m256i*) (s + 20)), /* row 4, cols 0-3 */ + r0_hi = { s[ 4] }, /* row 0, col 4 */ + r1_hi = { s[ 9] }, /* row 1, col 4 */ + r2_hi = { s[14] }, /* row 2, col 4 */ + r3_hi = { s[19] }, /* row 3, col 4 */ + r4_hi = { s[24] }; /* row 4, col 4 */ // loop over rounds for (size_t i = (SHA3_NUM_ROUNDS - num_rounds); __builtin_expect(i < SHA3_NUM_ROUNDS, 1); i++) { @@ -651,10 +631,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds const __m256i c_lo = r0_lo ^ r1_lo ^ r2_lo ^ r3_lo ^ r4_lo, c_hi = r0_hi ^ r1_hi ^ r2_hi ^ r3_hi ^ r4_hi; - // avx512 permute ids (for reference) - // static const __m512i I0 = { 4, 0, 1, 2, 3 }, - // I1 = { 1, 2, 3, 4, 0 }; - + // i0 = { 4, 0, 1, 2, 3 }, i1 = { 1, 2, 3, 4, 0 } // d = xor(permute(i0, c), permute(i1, rol(c, 1))) const __m256i d0_lo = (_mm256_permute4x64_epi64(c_lo, THETA_I0_LO) & ~LM0) | (c_hi & LM0), d0_hi = _mm256_permute4x64_epi64(c_lo, THETA_I0_HI) & LM0, @@ -735,7 +712,17 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds } // store rows to state - AVX2_STORE(s); + union { long long int *i64; uint64_t *u64; } p = { .u64 = s }; + _mm256_storeu_si256((__m256i*) (p.i64 + 0), r0_lo); /* row 0, cols 0-3 */ + _mm256_storeu_si256((__m256i*) (p.i64 + 5), r1_lo); /* row 1, cols 0-3 */ + _mm256_storeu_si256((__m256i*) (p.i64 + 10), r2_lo); /* row 2, cols 0-3 */ + _mm256_storeu_si256((__m256i*) (p.i64 + 15), r3_lo); /* row 3, cols 0-3 */ + _mm256_storeu_si256((__m256i*) (p.i64 + 20), r4_lo); /* row 4, cols 0-3 */ + _mm256_maskstore_epi64(p.i64 + 4, LM0, r0_hi); /* row 0, col 4 */ + _mm256_maskstore_epi64(p.i64 + 9, LM0, r1_hi); /* row 1, col 4 */ + _mm256_maskstore_epi64(p.i64 + 14, LM0, r2_hi); /* row 2, col 4 */ + _mm256_maskstore_epi64(p.i64 + 19, LM0, r3_hi); /* row 3, col 4 */ + _mm256_maskstore_epi64(p.i64 + 24, LM0, r4_hi); /* row 4, col 4 */ } #endif /* BACKEND == BACKEND_AVX2 */ -- cgit v1.2.3