about summary refs log tree commit diff
diff options
context:
space:
mode:
author Paul Duncan <pabs@pablotron.org> 2024-05-26 17:00:20 -0400
committer Paul Duncan <pabs@pablotron.org> 2024-05-26 17:00:20 -0400
commit 8222f3107dbd24cc1f9afcbbab60a0cfc379a64d (patch)
tree 7236ac1bb0f0f6146d8c5d4696f6a2e3ec7c576d
parent cc37060f29d872df8372fce4712a00cdb0afa0ba (diff)
download sha3-8222f3107dbd24cc1f9afcbbab60a0cfc379a64d.tar.bz2
sha3-8222f3107dbd24cc1f9afcbbab60a0cfc379a64d.zip
tests/avx2: use permutes for pi
-rw-r--r-- tests/avx2/main.c 63
1 files changed, 50 insertions, 13 deletions
diff --git a/tests/avx2/main.c b/tests/avx2/main.c
index 38e5153..79afe3e 100644
--- a/tests/avx2/main.c
+++ b/tests/avx2/main.c
@@ -320,19 +320,56 @@ static void rho_avx2(uint64_t s[static 25]) {
// pi step of avx2 keccak permutation.
static inline void pi_avx2(uint64_t s[static 25]) {
- static const __m256i V0_LO = { 0, 6, 12, 18 },
- V1_LO = { 3, 9, 10, 16 },
- V2_LO = { 1, 7, 13, 19 },
- V3_LO = { 4, 5, 11, 17 },
- V4_LO = { 2, 8, 14, 15 };
- static const size_t V0_HI = 24, V1_HI = 22, V2_HI = 20, V3_HI = 23, V4_HI = 21;
- union { long long int *i64; uint64_t *u64; } p = { .u64 = s }; \
-
- __m256i r0_lo = _mm256_i64gather_epi64(p.i64, V0_LO, 8), r0_hi = { s[V0_HI] },
- r1_lo = _mm256_i64gather_epi64(p.i64, V1_LO, 8), r1_hi = { s[V1_HI] },
- r2_lo = _mm256_i64gather_epi64(p.i64, V2_LO, 8), r2_hi = { s[V2_HI] },
- r3_lo = _mm256_i64gather_epi64(p.i64, V3_LO, 8), r3_hi = { s[V3_HI] },
- r4_lo = _mm256_i64gather_epi64(p.i64, V4_LO, 8), r4_hi = { s[V4_HI] };
+ static const __m256i LM0 = { ~0, 0, 0, 0 },
+ LM1 = { 0, ~0, 0, 0 },
+ LM2 = { 0, 0, ~0, 0 },
+ LM3 = { 0, 0, 0, ~0 };
+ static const uint8_t T0_LO = 0xe4, T0_HI = 0x00, // 0b11100100 -> 0xe4
+ T1_LO = 0x43, T1_HI = 0x02, // 0b01000011 -> 0x43
+ T2_LO = 0x39, T2_HI = 0x00, // 0b00111001 -> 0x39
+ T3_LO = 0x90, T3_HI = 0x03, // 0b10010000 -> 0x90
+ T4_LO = 0x0e, T4_HI = 0x01; // 0b00001110 -> 0x0e
+ LOAD(s);
+/*
+ * static const __m256i V0_LO = { 0, 6, 12, 18 },
+ * V1_LO = { 3, 9, 10, 16 },
+ * V2_LO = { 1, 7, 13, 19 },
+ * V3_LO = { 4, 5, 11, 17 },
+ * V4_LO = { 2, 8, 14, 15 };
+ * static const size_t V0_HI = 24, V1_HI = 22, V2_HI = 20, V3_HI = 23, V4_HI = 21;
+ */
+
+ const __m256i t0_lo = (_mm256_permute4x64_epi64(r0_lo, T0_LO) & LM0) |
+ (_mm256_permute4x64_epi64(r1_lo, T0_LO) & LM1) |
+ (_mm256_permute4x64_epi64(r2_lo, T0_LO) & LM2) |
+ (_mm256_permute4x64_epi64(r3_lo, T0_LO) & LM3),
+ t0_hi = (_mm256_permute4x64_epi64(r4_hi, T0_HI) & LM0),
+ t1_lo = (_mm256_permute4x64_epi64(r0_lo, T1_LO) & LM0) |
+ (_mm256_permute4x64_epi64(r1_hi, T1_LO) & LM1) |
+ (_mm256_permute4x64_epi64(r2_lo, T1_LO) & LM2) |
+ (_mm256_permute4x64_epi64(r3_lo, T1_LO) & LM3),
+ t1_hi = (_mm256_permute4x64_epi64(r4_lo, T1_HI) & LM0),
+ t2_lo = (_mm256_permute4x64_epi64(r0_lo, T2_LO) & LM0) |
+ (_mm256_permute4x64_epi64(r1_lo, T2_LO) & LM1) |
+ (_mm256_permute4x64_epi64(r2_lo, T2_LO) & LM2) |
+ (_mm256_permute4x64_epi64(r3_hi, T2_LO) & LM3),
+ t2_hi = (_mm256_permute4x64_epi64(r4_lo, T2_HI) & LM0),
+ t3_lo = (_mm256_permute4x64_epi64(r0_hi, T3_LO) & LM0) |
+ (_mm256_permute4x64_epi64(r1_lo, T3_LO) & LM1) |
+ (_mm256_permute4x64_epi64(r2_lo, T3_LO) & LM2) |
+ (_mm256_permute4x64_epi64(r3_lo, T3_LO) & LM3),
+ t3_hi = (_mm256_permute4x64_epi64(r4_lo, T3_HI) & LM0),
+ t4_lo = (_mm256_permute4x64_epi64(r0_lo, T4_LO) & LM0) |
+ (_mm256_permute4x64_epi64(r1_lo, T4_LO) & LM1) |
+ (_mm256_permute4x64_epi64(r2_hi, T4_LO) & LM2) |
+ (_mm256_permute4x64_epi64(r3_lo, T4_LO) & LM3),
+ t4_hi = (_mm256_permute4x64_epi64(r4_lo, T4_HI) & LM0);
+
+ r0_lo = t0_lo; r0_hi = t0_hi;
+ r1_lo = t1_lo; r1_hi = t1_hi;
+ r2_lo = t2_lo; r2_hi = t2_hi;
+ r3_lo = t3_lo; r3_hi = t3_hi;
+ r4_lo = t4_lo; r4_hi = t4_hi;
STORE(s);
}