From d710dae94359c5a1c014531162b75981d384f28d Mon Sep 17 00:00:00 2001
From: Paul Duncan
Date: Sun, 26 May 2024 18:42:10 -0400
Subject: sha3.c: permute_n_avx2(): chi: explicit andnot

bench results
-------------

before (gcc, avx2):

> make clean all BACKEND=6 CC=gcc && ./bench
info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
info: backend=avx2 num_trials=2000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,30.8,15.3,15.4,14.0,13.8
sha3_256,32,30.7,15.3,15.4,15.0,14.6
sha3_384,48,30.6,23.0,19.2,19.2,19.0
sha3_512,64,30.2,30.7,28.8,27.4,27.4
shake128,32,30.6,15.3,13.5,12.1,11.9
shake256,32,30.8,15.4,15.4,15.0,14.6

after (gcc, avx2):

> make clean all BACKEND=6 CC=gcc && ./bench
info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
info: backend=avx2 num_trials=2000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,29.4,14.8,14.8,13.5,13.2
sha3_256,32,29.4,14.8,14.8,14.4,14.0
sha3_384,48,29.3,22.1,18.5,18.5,18.3
sha3_512,64,29.4,29.4,27.6,26.3,26.3
shake128,32,29.4,14.8,13.0,11.6,11.4
shake256,32,29.5,14.8,14.9,14.4,14.1
---
 sha3.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'sha3.c')

diff --git a/sha3.c b/sha3.c
index 43b60aa..6bc1dbf 100644
--- a/sha3.c
+++ b/sha3.c
@@ -727,7 +727,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
 b_lo = (_mm256_permute4x64_epi64(r0_lo, CHI_I1_LO) & ~LM2) | (_mm256_permute4x64_epi64(r0_hi, CHI_I1_LO) & ~LM0),
 b_hi = _mm256_permute4x64_epi64(r0_lo, CHI_I1_HI);
 
- r0_lo ^= ~a_lo & b_lo; r0_hi ^= ~a_hi & b_hi; // r0 ^= ~a & b
+ r0_lo ^= _mm256_andnot_si256(a_lo, b_lo); r0_hi ^= _mm256_andnot_si256(a_hi, b_hi); // r0 ^= ~a & b
 }
 
 // r1
@@ -737,7 +737,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
 b_lo = (_mm256_permute4x64_epi64(r1_lo, CHI_I1_LO) & ~LM2) | (_mm256_permute4x64_epi64(r1_hi, CHI_I1_LO) & ~LM0),
 b_hi = _mm256_permute4x64_epi64(r1_lo, CHI_I1_HI);
 
- r1_lo ^= ~a_lo & b_lo; r1_hi ^= ~a_hi & b_hi; // r1 ^= ~a & b
+ r1_lo ^= _mm256_andnot_si256(a_lo, b_lo); r1_hi ^= _mm256_andnot_si256(a_hi, b_hi); // r1 ^= ~a & b
 }
 
 // r2
@@ -747,7 +747,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
 b_lo = (_mm256_permute4x64_epi64(r2_lo, CHI_I1_LO) & ~LM2) | (_mm256_permute4x64_epi64(r2_hi, CHI_I1_LO) & ~LM0),
 b_hi = _mm256_permute4x64_epi64(r2_lo, CHI_I1_HI);
 
- r2_lo ^= ~a_lo & b_lo; r2_hi ^= ~a_hi & b_hi; // r2 ^= ~a & b
+ r2_lo ^= _mm256_andnot_si256(a_lo, b_lo); r2_hi ^= _mm256_andnot_si256(a_hi, b_hi); // r2 ^= ~a & b
 }
 
 // r3
@@ -757,7 +757,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
 b_lo = (_mm256_permute4x64_epi64(r3_lo, CHI_I1_LO) & ~LM2) | (_mm256_permute4x64_epi64(r3_hi, CHI_I1_LO) & ~LM0),
 b_hi = _mm256_permute4x64_epi64(r3_lo, CHI_I1_HI);
 
- r3_lo ^= ~a_lo & b_lo; r3_hi ^= ~a_hi & b_hi; // r3 ^= ~a & b
+ r3_lo ^= _mm256_andnot_si256(a_lo, b_lo); r3_hi ^= _mm256_andnot_si256(a_hi, b_hi); // r3 ^= ~a & b
 }
 
 // r4
@@ -767,7 +767,7 @@ static inline void permute_n_avx2(uint64_t s[static 25], const size_t num_rounds
 b_lo = (_mm256_permute4x64_epi64(r4_lo, CHI_I1_LO) & ~LM2) | (_mm256_permute4x64_epi64(r4_hi, CHI_I1_LO) & ~LM0),
 b_hi = _mm256_permute4x64_epi64(r4_lo, CHI_I1_HI);
 
- r4_lo ^= ~a_lo & b_lo; r4_hi ^= ~a_hi & b_hi; // r4 ^= ~a & b
+ r4_lo ^= _mm256_andnot_si256(a_lo, b_lo); r4_hi ^= _mm256_andnot_si256(a_hi, b_hi); // r4 ^= ~a & b
 }
 }
 
--
cgit v1.2.3