From 031cb5390c050aa7fc4d84ec78a78316cc0989e2 Mon Sep 17 00:00:00 2001
From: Paul Duncan
Date: Fri, 17 May 2024 01:50:47 -0400
Subject: sha3.c: permute_n_avx512(): simplify num_rounds logic

bench results (samish across the board)
---------------------------------------

x86-64-avx512-gcc before:
> make clean all BACKEND=2 && ./bench 10000
...
info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
info: backend=avx512 num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,15.4,7.8,7.8,7.1,7.0
sha3_256,32,15.4,7.8,7.8,7.6,7.4
sha3_384,48,15.4,11.6,9.8,9.8,9.7
sha3_512,64,15.5,15.5,14.5,13.8,13.8
shake128,32,15.6,7.8,6.9,6.2,6.1
shake256,32,15.6,7.8,7.9,7.6,7.4

x86-64-avx512-clang before:
> make clean all CC=clang BACKEND=2 && ./bench 10000
...
info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
info: backend=avx512 num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,21.6,10.8,10.7,9.6,9.5
sha3_256,32,21.6,10.8,10.7,10.3,10.1
sha3_384,48,21.5,16.0,13.3,13.2,13.1
sha3_512,64,21.5,21.2,19.8,18.8,18.7
shake128,32,21.8,10.8,9.4,8.4,8.2
shake256,32,21.6,10.8,10.7,10.3,10.1

x86-64-avx512-gcc after:
> make clean all BACKEND=2 && ./bench 10000
...
info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
info: backend=avx512 num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,15.4,7.8,7.8,7.1,7.0
sha3_256,32,15.4,7.8,7.8,7.6,7.4
sha3_384,48,15.4,11.6,9.8,9.8,9.7
sha3_512,64,15.5,15.5,14.6,13.8,13.8
shake128,32,15.5,7.8,6.9,6.2,6.1
shake256,32,15.5,7.8,7.9,7.6,7.4

x86-64-avx512-clang after:
> make clean all CC=clang BACKEND=2 && ./bench 10000
...
info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
info: backend=avx512 num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,21.5,10.8,10.7,9.6,9.5
sha3_256,32,21.5,10.8,10.7,10.3,10.0
sha3_384,48,21.6,16.0,13.3,13.2,13.1
sha3_512,64,21.5,21.2,19.8,18.8,18.7
shake128,32,21.7,10.8,9.3,8.3,8.2
shake256,32,21.7,10.8,10.7,10.3,10.1
---
 sha3.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/sha3.c b/sha3.c
index b846719..9b213d0 100644
--- a/sha3.c
+++ b/sha3.c
@@ -343,7 +343,7 @@ static inline void permute_n_avx512(uint64_t s[static 25], const size_t num_roun
   r4 = _mm512_maskz_loadu_epi64(0x1f, s + 20); // row 4
 
   // loop over rounds
-  for (size_t i = 0; i < num_rounds; i++) {
+  for (size_t i = (SHA3_NUM_ROUNDS - num_rounds); __builtin_expect(i < SHA3_NUM_ROUNDS, 1); i++) {
     // theta
     {
       // permute ids
@@ -506,11 +506,8 @@ static inline void permute_n_avx512(uint64_t s[static 25], const size_t num_roun
 
     // iota
     {
-      // calculate RCS offset
-      const size_t ofs = SHA3_NUM_ROUNDS - num_rounds + i;
-
       // xor round constant to first cell
-      r0 = _mm512_mask_xor_epi64(r0, 1, r0, _mm512_maskz_loadu_epi64(1, RCS + ofs));
+      r0 = _mm512_mask_xor_epi64(r0, 1, r0, _mm512_maskz_loadu_epi64(1, RCS + i));
     }
   }
 
-- 
cgit v1.2.3