aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Paul Duncan <pabs@pablotron.org> 2024-05-26 18:42:10 -0400
committer Paul Duncan <pabs@pablotron.org> 2024-05-26 18:42:10 -0400
commit d710dae94359c5a1c014531162b75981d384f28d (patch)
tree 0d5eae620116f7135099216fc9afbb41d9340eac
parent 689c96bff01673816832e565b9b4c5588d548b88 (diff)
download sha3-d710dae94359c5a1c014531162b75981d384f28d.tar.bz2
sha3-d710dae94359c5a1c014531162b75981d384f28d.zip
sha3.c: permute_n_avx2(): chi: explicit andnot
bench results
-------------
before (gcc, avx2):

  > make clean all BACKEND=6 CC=gcc && ./bench
  info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
  info: backend=avx2 num_trials=2000 src_lens=64,256,1024,4096,16384 dst_lens=32
  function,dst_len,64,256,1024,4096,16384
  sha3_224,28,30.8,15.3,15.4,14.0,13.8
  sha3_256,32,30.7,15.3,15.4,15.0,14.6
  sha3_384,48,30.6,23.0,19.2,19.2,19.0
  sha3_512,64,30.2,30.7,28.8,27.4,27.4
  shake128,32,30.6,15.3,13.5,12.1,11.9
  shake256,32,30.8,15.4,15.4,15.0,14.6

after (gcc, avx2):

  > make clean all BACKEND=6 CC=gcc && ./bench
  info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
  info: backend=avx2 num_trials=2000 src_lens=64,256,1024,4096,16384 dst_lens=32
  function,dst_len,64,256,1024,4096,16384
  sha3_224,28,29.4,14.8,14.8,13.5,13.2
  sha3_256,32,29.4,14.8,14.8,14.4,14.0
  sha3_384,48,29.3,22.1,18.5,18.5,18.3
  sha3_512,64,29.4,29.4,27.6,26.3,26.3
  shake128,32,29.4,14.8,13.0,11.6,11.4
  shake256,32,29.5,14.8,14.9,14.4,14.1
-rw-r--r-- sha3.c 10
1 file changed, 5 insertions, 5 deletions
diff --git a/sha3.c b/sha3.c
index 43b60aa..6bc1dbf 100644
--- a/sha3.c
+++ b/sha3.c
@@ -727,7 +727,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
b_lo = (_mm256_permute4x64_epi64(r0_lo, CHI_I1_LO) & ~LM2) | (_mm256_permute4x64_epi64(r0_hi, CHI_I1_LO) & ~LM0),
b_hi = _mm256_permute4x64_epi64(r0_lo, CHI_I1_HI);
- r0_lo ^= ~a_lo & b_lo; r0_hi ^= ~a_hi & b_hi; // r0 ^= ~a & b
+ r0_lo ^= _mm256_andnot_si256(a_lo, b_lo); r0_hi ^= _mm256_andnot_si256(a_hi, b_hi); // r0 ^= ~a & b
}
// r1
@@ -737,7 +737,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
b_lo = (_mm256_permute4x64_epi64(r1_lo, CHI_I1_LO) & ~LM2) | (_mm256_permute4x64_epi64(r1_hi, CHI_I1_LO) & ~LM0),
b_hi = _mm256_permute4x64_epi64(r1_lo, CHI_I1_HI);
- r1_lo ^= ~a_lo & b_lo; r1_hi ^= ~a_hi & b_hi; // r1 ^= ~a & b
+ r1_lo ^= _mm256_andnot_si256(a_lo, b_lo); r1_hi ^= _mm256_andnot_si256(a_hi, b_hi); // r1 ^= ~a & b
}
// r2
@@ -747,7 +747,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
b_lo = (_mm256_permute4x64_epi64(r2_lo, CHI_I1_LO) & ~LM2) | (_mm256_permute4x64_epi64(r2_hi, CHI_I1_LO) & ~LM0),
b_hi = _mm256_permute4x64_epi64(r2_lo, CHI_I1_HI);
- r2_lo ^= ~a_lo & b_lo; r2_hi ^= ~a_hi & b_hi; // r2 ^= ~a & b
+ r2_lo ^= _mm256_andnot_si256(a_lo, b_lo); r2_hi ^= _mm256_andnot_si256(a_hi, b_hi); // r2 ^= ~a & b
}
// r3
@@ -757,7 +757,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
b_lo = (_mm256_permute4x64_epi64(r3_lo, CHI_I1_LO) & ~LM2) | (_mm256_permute4x64_epi64(r3_hi, CHI_I1_LO) & ~LM0),
b_hi = _mm256_permute4x64_epi64(r3_lo, CHI_I1_HI);
- r3_lo ^= ~a_lo & b_lo; r3_hi ^= ~a_hi & b_hi; // r3 ^= ~a & b
+ r3_lo ^= _mm256_andnot_si256(a_lo, b_lo); r3_hi ^= _mm256_andnot_si256(a_hi, b_hi); // r3 ^= ~a & b
}
// r4
@@ -767,7 +767,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
b_lo = (_mm256_permute4x64_epi64(r4_lo, CHI_I1_LO) & ~LM2) | (_mm256_permute4x64_epi64(r4_hi, CHI_I1_LO) & ~LM0),
b_hi = _mm256_permute4x64_epi64(r4_lo, CHI_I1_HI);
- r4_lo ^= ~a_lo & b_lo; r4_hi ^= ~a_hi & b_hi; // r4 ^= ~a & b
+ r4_lo ^= _mm256_andnot_si256(a_lo, b_lo); r4_hi ^= _mm256_andnot_si256(a_hi, b_hi); // r4 ^= ~a & b
}
}