diff options
author | Paul Duncan <pabs@pablotron.org> | 2024-05-26 18:42:10 -0400 |
---|---|---|
committer | Paul Duncan <pabs@pablotron.org> | 2024-05-26 18:42:10 -0400 |
commit | d710dae94359c5a1c014531162b75981d384f28d (patch) | |
tree | 0d5eae620116f7135099216fc9afbb41d9340eac | |
parent | 689c96bff01673816832e565b9b4c5588d548b88 (diff) | |
download | sha3-d710dae94359c5a1c014531162b75981d384f28d.tar.bz2 sha3-d710dae94359c5a1c014531162b75981d384f28d.zip |
sha3.c: permute_n_avx2(): chi: explicit andnot
bench results
-------------
before (gcc, avx2):
> make clean all BACKEND=6 CC=gcc && ./bench
info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
info: backend=avx2 num_trials=2000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,30.8,15.3,15.4,14.0,13.8
sha3_256,32,30.7,15.3,15.4,15.0,14.6
sha3_384,48,30.6,23.0,19.2,19.2,19.0
sha3_512,64,30.2,30.7,28.8,27.4,27.4
shake128,32,30.6,15.3,13.5,12.1,11.9
shake256,32,30.8,15.4,15.4,15.0,14.6
after (gcc, avx2):
> make clean all BACKEND=6 CC=gcc && ./bench
info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
info: backend=avx2 num_trials=2000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,29.4,14.8,14.8,13.5,13.2
sha3_256,32,29.4,14.8,14.8,14.4,14.0
sha3_384,48,29.3,22.1,18.5,18.5,18.3
sha3_512,64,29.4,29.4,27.6,26.3,26.3
shake128,32,29.4,14.8,13.0,11.6,11.4
shake256,32,29.5,14.8,14.9,14.4,14.1
-rw-r--r-- | sha3.c | 10 |
1 file changed, 5 insertions, 5 deletions
```diff
@@ -727,7 +727,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
                       b_lo = (_mm256_permute4x64_epi64(r0_lo, CHI_I1_LO) & ~LM2) | (_mm256_permute4x64_epi64(r0_hi, CHI_I1_LO) & ~LM0),
                       b_hi = _mm256_permute4x64_epi64(r0_lo, CHI_I1_HI);

-        r0_lo ^= ~a_lo & b_lo; r0_hi ^= ~a_hi & b_hi; // r0 ^= ~a & b
+        r0_lo ^= _mm256_andnot_si256(a_lo, b_lo); r0_hi ^= _mm256_andnot_si256(a_hi, b_hi); // r0 ^= ~a & b
       }

       // r1
@@ -737,7 +737,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
                       b_lo = (_mm256_permute4x64_epi64(r1_lo, CHI_I1_LO) & ~LM2) | (_mm256_permute4x64_epi64(r1_hi, CHI_I1_LO) & ~LM0),
                       b_hi = _mm256_permute4x64_epi64(r1_lo, CHI_I1_HI);

-        r1_lo ^= ~a_lo & b_lo; r1_hi ^= ~a_hi & b_hi; // r1 ^= ~a & b
+        r1_lo ^= _mm256_andnot_si256(a_lo, b_lo); r1_hi ^= _mm256_andnot_si256(a_hi, b_hi); // r1 ^= ~a & b
       }

       // r2
@@ -747,7 +747,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
                       b_lo = (_mm256_permute4x64_epi64(r2_lo, CHI_I1_LO) & ~LM2) | (_mm256_permute4x64_epi64(r2_hi, CHI_I1_LO) & ~LM0),
                       b_hi = _mm256_permute4x64_epi64(r2_lo, CHI_I1_HI);

-        r2_lo ^= ~a_lo & b_lo; r2_hi ^= ~a_hi & b_hi; // r2 ^= ~a & b
+        r2_lo ^= _mm256_andnot_si256(a_lo, b_lo); r2_hi ^= _mm256_andnot_si256(a_hi, b_hi); // r2 ^= ~a & b
       }

       // r3
@@ -757,7 +757,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
                       b_lo = (_mm256_permute4x64_epi64(r3_lo, CHI_I1_LO) & ~LM2) | (_mm256_permute4x64_epi64(r3_hi, CHI_I1_LO) & ~LM0),
                       b_hi = _mm256_permute4x64_epi64(r3_lo, CHI_I1_HI);

-        r3_lo ^= ~a_lo & b_lo; r3_hi ^= ~a_hi & b_hi; // r3 ^= ~a & b
+        r3_lo ^= _mm256_andnot_si256(a_lo, b_lo); r3_hi ^= _mm256_andnot_si256(a_hi, b_hi); // r3 ^= ~a & b
       }

       // r4
@@ -767,7 +767,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
                       b_lo = (_mm256_permute4x64_epi64(r4_lo, CHI_I1_LO) & ~LM2) | (_mm256_permute4x64_epi64(r4_hi, CHI_I1_LO) & ~LM0),
                       b_hi = _mm256_permute4x64_epi64(r4_lo, CHI_I1_HI);

-        r4_lo ^= ~a_lo & b_lo; r4_hi ^= ~a_hi & b_hi; // r4 ^= ~a & b
+        r4_lo ^= _mm256_andnot_si256(a_lo, b_lo); r4_hi ^= _mm256_andnot_si256(a_hi, b_hi); // r4 ^= ~a & b
       }
     }
```