aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Paul Duncan <pabs@pablotron.org> 2024-05-27 01:09:02 -0400
committer Paul Duncan <pabs@pablotron.org> 2024-05-27 01:09:02 -0400
commit 77d301f602167df1df05a86d7da1c961ff2f4f1a (patch)
tree 6cd05ed9558829be1b6002c4889cc905a4aa4b5a
parent 3371742d975779ed342468831606b52047c0dfc6 (diff)
download sha3-77d301f602167df1df05a86d7da1c961ff2f4f1a.tar.bz2
sha3-77d301f602167df1df05a86d7da1c961ff2f4f1a.zip
sha3.c: permute_n_avx2(): replace two instances of permute4x64_epi64() with shuffle_epi32()
-rw-r--r-- sha3.c 6
1 files changed, 2 insertions, 4 deletions
diff --git a/sha3.c b/sha3.c
index b582cc2..cedd670 100644
--- a/sha3.c
+++ b/sha3.c
@@ -581,7 +581,6 @@ static const __m256i LM0 = { ~0, 0, 0, 0 }, // only lane 0
#define PI_T3_LO 0x90 // 0b10010000 -> 0x90
#define PI_T3_HI 0x03
#define PI_T4_LO 0x0e // 0b00001110 -> 0x0e
-#define PI_T4_HI 0x01
// chi blend mask
#define CHI_MASK 0xc0 // 0b11000000
@@ -589,14 +588,13 @@ static const __m256i LM0 = { ~0, 0, 0, 0 }, // only lane 0
// chi permute IDs
#define CHI_I0_LO 0x39 // 1, 2, 3, 0 -> 0b00111001 -> 0x39
#define CHI_I1_LO 0x0e // 2, 3, 0, 0 -> 0b00001110 -> 0x0e
-#define CHI_I1_HI 0x01 // 1, 0, 0, 0 -> 0b00000001 -> 0x01
// chi step
#define CHI(LO, HI) do { \
const __m256i a_lo = _mm256_blend_epi32(_mm256_permute4x64_epi64(LO, CHI_I0_LO), _mm256_permute4x64_epi64(HI, CHI_I0_LO), CHI_MASK), \
a_hi = LO, \
b_lo = (_mm256_permute4x64_epi64(LO, CHI_I1_LO) & ~LM2) | (_mm256_permute4x64_epi64(HI, CHI_I1_LO) & ~LM0), \
- b_hi = _mm256_permute4x64_epi64(LO, CHI_I1_HI); \
+ b_hi = _mm256_shuffle_epi32(LO, 0x0e); \
\
LO ^= _mm256_andnot_si256(a_lo, b_lo); HI ^= _mm256_andnot_si256(a_hi, b_hi); \
} while (0)
@@ -713,7 +711,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
(_mm256_permute4x64_epi64(r1_lo, PI_T4_LO) & LM1) |
(_mm256_permute4x64_epi64(r2_hi, PI_T4_LO) & LM2) |
(_mm256_permute4x64_epi64(r3_lo, PI_T4_LO) & LM3),
- t4_hi = _mm256_permute4x64_epi64(r4_lo, PI_T4_HI);
+ t4_hi = _mm256_shuffle_epi32(r4_lo, 0x0e);
r0_lo = t0_lo; r0_hi = t0_hi & LM0;
r1_lo = t1_lo; r1_hi = t1_hi & LM0;