author    Paul Duncan <pabs@pablotron.org>  2024-05-27 03:11:59 -0400
committer Paul Duncan <pabs@pablotron.org>  2024-05-27 03:11:59 -0400
commit    9aea6905a8a21b42a7501599fc8e3f8a58859a87 (patch)
tree      de5ee95d279df86bae0e20b585a01fbe6b9509f2
parent    47c89748dd940bf3500fa70994e9653cbdeb5d14 (diff)
download  sha3-9aea6905a8a21b42a7501599fc8e3f8a58859a87.tar.bz2
          sha3-9aea6905a8a21b42a7501599fc8e3f8a58859a87.zip
sha3.c: permute_n_avx2(): remove LOAD/STORE macros, minor comment fixes
-rw-r--r--  sha3.c  59
1 file changed, 23 insertions(+), 36 deletions(-)
diff --git a/sha3.c b/sha3.c
index a2060f5..ba6a02a 100644
--- a/sha3.c
+++ b/sha3.c
@@ -532,35 +532,6 @@ static const __m256i LM0 = { ~0, 0, 0, 0 }, // only lane 0
LM2 = { 0, 0, ~0, 0 }, // only lane 2
LM3 = { 0, 0, 0, ~0 }; // only lane 3
-// load state array to avx2 registers
-// FIXME: remove macro, not needed
-#define AVX2_LOAD(s) __m256i \
- r0_lo = _mm256_loadu_si256((__m256i*) (s + 0)), /* row 0, cols 0-3 */ \
- r1_lo = _mm256_loadu_si256((__m256i*) (s + 5)), /* row 1, cols 0-3 */ \
- r2_lo = _mm256_loadu_si256((__m256i*) (s + 10)), /* row 2, cols 0-3 */ \
- r3_lo = _mm256_loadu_si256((__m256i*) (s + 15)), /* row 3, cols 0-3 */ \
- r4_lo = _mm256_loadu_si256((__m256i*) (s + 20)), /* row 4, cols 0-3 */ \
- r0_hi = { s[ 4] }, /* row 0, col 4 */ \
- r1_hi = { s[ 9] }, /* row 1, col 4 */ \
- r2_hi = { s[14] }, /* row 2, col 4 */ \
- r3_hi = { s[19] }, /* row 3, col 4 */ \
- r4_hi = { s[24] }; /* row 4, col 4 */
-
-// store avx2 registers to state array
-#define AVX2_STORE(s) do { \
- union { long long int *i64; uint64_t *u64; } p = { .u64 = s }; \
- _mm256_storeu_si256((__m256i*) (p.i64 + 0), r0_lo); /* row 0, cols 0-3 */ \
- _mm256_storeu_si256((__m256i*) (p.i64 + 5), r1_lo); /* row 1, cols 0-3 */ \
- _mm256_storeu_si256((__m256i*) (p.i64 + 10), r2_lo); /* row 2, cols 0-3 */ \
- _mm256_storeu_si256((__m256i*) (p.i64 + 15), r3_lo); /* row 3, cols 0-3 */ \
- _mm256_storeu_si256((__m256i*) (p.i64 + 20), r4_lo); /* row 4, cols 0-3 */ \
- _mm256_maskstore_epi64(p.i64 + 4, LM0, r0_hi); /* row 0, col 4 */ \
- _mm256_maskstore_epi64(p.i64 + 9, LM0, r1_hi); /* row 1, col 4 */ \
- _mm256_maskstore_epi64(p.i64 + 14, LM0, r2_hi); /* row 2, col 4 */ \
- _mm256_maskstore_epi64(p.i64 + 19, LM0, r3_hi); /* row 3, col 4 */ \
- _mm256_maskstore_epi64(p.i64 + 24, LM0, r4_hi); /* row 4, col 4 */ \
-} while (0)
-
// rotate left immediate
#define AVX2_ROLI(v, n) (_mm256_slli_epi64((v), (n)) | _mm256_srli_epi64((v), (64-(n))))
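For reference, AVX2_ROLI rotates every 64-bit lane of a vector left by n bits using the usual shift-or identity. A scalar sketch of the per-lane operation (rol64 is a hypothetical helper for illustration, not part of sha3.c; n must stay in [1, 63] so neither shift count reaches 64):

#include <stdint.h>

// rotate a 64-bit word left by n bits (valid for n in [1, 63])
static inline uint64_t rol64(const uint64_t v, const int n) {
  return (v << n) | (v >> (64 - n));
}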
@@ -640,8 +611,17 @@ static const __m256i LM0 = { ~0, 0, 0, 0 }, // only lane 0
* 4. The permuted Keccak state is copied back to `s`.
*/
static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds) {
- // load state
- AVX2_LOAD(s);
+ // load state array into avx2 registers
+ __m256i r0_lo = _mm256_loadu_si256((__m256i*) (s + 0)), /* row 0, cols 0-3 */
+ r1_lo = _mm256_loadu_si256((__m256i*) (s + 5)), /* row 1, cols 0-3 */
+ r2_lo = _mm256_loadu_si256((__m256i*) (s + 10)), /* row 2, cols 0-3 */
+ r3_lo = _mm256_loadu_si256((__m256i*) (s + 15)), /* row 3, cols 0-3 */
+ r4_lo = _mm256_loadu_si256((__m256i*) (s + 20)), /* row 4, cols 0-3 */
+ r0_hi = { s[ 4] }, /* row 0, col 4 */
+ r1_hi = { s[ 9] }, /* row 1, col 4 */
+ r2_hi = { s[14] }, /* row 2, col 4 */
+ r3_hi = { s[19] }, /* row 3, col 4 */
+ r4_hi = { s[24] }; /* row 4, col 4 */
// loop over rounds
for (size_t i = (SHA3_NUM_ROUNDS - num_rounds); __builtin_expect(i < SHA3_NUM_ROUNDS, 1); i++) {
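The loads above assume the usual row-major Keccak state layout, where lane (row, col) lives at s[5*row + col]: cols 0-3 of each row go into an r*_lo register and col 4 into lane 0 of the matching r*_hi register. A minimal sketch of one row under that assumption (load_row1 is a hypothetical helper, not part of sha3.c; rows after the first start at offsets that are not 32-byte aligned, hence the unaligned loadu):

#include <immintrin.h>
#include <stdint.h>

// load row 1 of the state: cols 0-3 into *lo, col 4 into lane 0 of *hi
static inline void load_row1(const uint64_t s[static 25], __m256i *lo, __m256i *hi) {
  *lo = _mm256_loadu_si256((const __m256i*) (s + 5)); // s[5..8], cols 0-3
  *hi = _mm256_set_epi64x(0, 0, 0, (long long) s[9]); // s[9] in lane 0, col 4
}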
@@ -651,10 +631,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
const __m256i c_lo = r0_lo ^ r1_lo ^ r2_lo ^ r3_lo ^ r4_lo,
c_hi = r0_hi ^ r1_hi ^ r2_hi ^ r3_hi ^ r4_hi;
- // avx512 permute ids (for reference)
- // static const __m512i I0 = { 4, 0, 1, 2, 3 },
- // I1 = { 1, 2, 3, 4, 0 };
-
+ // i0 = { 4, 0, 1, 2, 3 }, i1 = { 1, 2, 3, 4, 0 }
// d = xor(permute(i0, c), permute(i1, rol(c, 1)))
const __m256i d0_lo = (_mm256_permute4x64_epi64(c_lo, THETA_I0_LO) & ~LM0) | (c_hi & LM0),
d0_hi = _mm256_permute4x64_epi64(c_lo, THETA_I0_HI) & LM0,
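With i0 = { 4, 0, 1, 2, 3 } and i1 = { 1, 2, 3, 4, 0 }, lane x of d works out to c[(x + 4) % 5] ^ rol(c[(x + 1) % 5], 1), which is the standard Keccak theta step. A scalar reference sketch (theta_d is a hypothetical helper, not part of sha3.c):

#include <stddef.h>
#include <stdint.h>

// scalar theta: d[x] = c[(x + 4) % 5] ^ rol(c[(x + 1) % 5], 1)
static void theta_d(const uint64_t c[static 5], uint64_t d[static 5]) {
  for (size_t x = 0; x < 5; x++) {
    const uint64_t t = c[(x + 1) % 5]; // column parity to be rotated
    d[x] = c[(x + 4) % 5] ^ ((t << 1) | (t >> 63));
  }
}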
@@ -735,7 +712,17 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
}
// store rows to state
- AVX2_STORE(s);
+ union { long long int *i64; uint64_t *u64; } p = { .u64 = s };
+ _mm256_storeu_si256((__m256i*) (p.i64 + 0), r0_lo); /* row 0, cols 0-3 */
+ _mm256_storeu_si256((__m256i*) (p.i64 + 5), r1_lo); /* row 1, cols 0-3 */
+ _mm256_storeu_si256((__m256i*) (p.i64 + 10), r2_lo); /* row 2, cols 0-3 */
+ _mm256_storeu_si256((__m256i*) (p.i64 + 15), r3_lo); /* row 3, cols 0-3 */
+ _mm256_storeu_si256((__m256i*) (p.i64 + 20), r4_lo); /* row 4, cols 0-3 */
+ _mm256_maskstore_epi64(p.i64 + 4, LM0, r0_hi); /* row 0, col 4 */
+ _mm256_maskstore_epi64(p.i64 + 9, LM0, r1_hi); /* row 1, col 4 */
+ _mm256_maskstore_epi64(p.i64 + 14, LM0, r2_hi); /* row 2, col 4 */
+ _mm256_maskstore_epi64(p.i64 + 19, LM0, r3_hi); /* row 3, col 4 */
+ _mm256_maskstore_epi64(p.i64 + 24, LM0, r4_hi); /* row 4, col 4 */
}
#endif /* BACKEND == BACKEND_AVX2 */
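Since LM0 = { ~0, 0, 0, 0 } selects only lane 0, each _mm256_maskstore_epi64 above writes a single 64-bit word (col 4 of one row) back to the state without clobbering the first lanes of the following row. A minimal sketch of the row 0 pattern (store_col4_row0 is a hypothetical helper, not part of sha3.c, which instead goes through the union above rather than a pointer cast):

#include <immintrin.h>
#include <stdint.h>

// store lane 0 of hi to s[4] only; s[5..7] are left untouched
static inline void store_col4_row0(uint64_t s[static 25], const __m256i hi) {
  const __m256i lm0 = _mm256_set_epi64x(0, 0, 0, -1); // mask: lane 0 only
  _mm256_maskstore_epi64((long long*) (s + 4), lm0, hi);
}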