diff options
author | Paul Duncan <pabs@pablotron.org> | 2024-05-17 01:50:47 -0400 |
---|---|---|
committer | Paul Duncan <pabs@pablotron.org> | 2024-05-17 01:50:47 -0400 |
commit | 031cb5390c050aa7fc4d84ec78a78316cc0989e2 (patch) | |
tree | d789375858651e11f4d2e0f50676ce50ec11832f | |
parent | 22c3a095a29a860d1672a542919181ea31faf9e6 (diff) | |
download | sha3-031cb5390c050aa7fc4d84ec78a78316cc0989e2.tar.bz2 sha3-031cb5390c050aa7fc4d84ec78a78316cc0989e2.zip |
sha3.c: permute_n_avx512(): simplify num_rounds logic
bench results (roughly the same across the board)
-------------------------------------------------
x86-64-avx512-gcc before:
> make clean all BACKEND=2 && ./bench 10000
...
info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
info: backend=avx512 num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,15.4,7.8,7.8,7.1,7.0
sha3_256,32,15.4,7.8,7.8,7.6,7.4
sha3_384,48,15.4,11.6,9.8,9.8,9.7
sha3_512,64,15.5,15.5,14.5,13.8,13.8
shake128,32,15.6,7.8,6.9,6.2,6.1
shake256,32,15.6,7.8,7.9,7.6,7.4
x86-64-avx512-clang before:
> make clean all CC=clang BACKEND=2 && ./bench 10000
...
info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
info: backend=avx512 num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,21.6,10.8,10.7,9.6,9.5
sha3_256,32,21.6,10.8,10.7,10.3,10.1
sha3_384,48,21.5,16.0,13.3,13.2,13.1
sha3_512,64,21.5,21.2,19.8,18.8,18.7
shake128,32,21.8,10.8,9.4,8.4,8.2
shake256,32,21.6,10.8,10.7,10.3,10.1
x86-64-avx512-gcc after:
> make clean all BACKEND=2 && ./bench 10000
...
info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
info: backend=avx512 num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,15.4,7.8,7.8,7.1,7.0
sha3_256,32,15.4,7.8,7.8,7.6,7.4
sha3_384,48,15.4,11.6,9.8,9.8,9.7
sha3_512,64,15.5,15.5,14.6,13.8,13.8
shake128,32,15.5,7.8,6.9,6.2,6.1
shake256,32,15.5,7.8,7.9,7.6,7.4
x86-64-avx512-clang after:
> make clean all CC=clang BACKEND=2 && ./bench 10000
...
info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
info: backend=avx512 num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,21.5,10.8,10.7,9.6,9.5
sha3_256,32,21.5,10.8,10.7,10.3,10.0
sha3_384,48,21.6,16.0,13.3,13.2,13.1
sha3_512,64,21.5,21.2,19.8,18.8,18.7
shake128,32,21.7,10.8,9.3,8.3,8.2
shake256,32,21.7,10.8,10.7,10.3,10.1
-rw-r--r-- | sha3.c | 7 |
1 file changed, 2 insertions, 5 deletions
```diff
@@ -343,7 +343,7 @@ static inline void permute_n_avx512(uint64_t s[static 25], const size_t num_roun
   r4 = _mm512_maskz_loadu_epi64(0x1f, s + 20); // row 4
 
   // loop over rounds
-  for (size_t i = 0; i < num_rounds; i++) {
+  for (size_t i = (SHA3_NUM_ROUNDS - num_rounds); __builtin_expect(i < SHA3_NUM_ROUNDS, 1); i++) {
     // theta
     {
       // permute ids
@@ -506,11 +506,8 @@ static inline void permute_n_avx512(uint64_t s[static 25], const size_t num_roun
 
     // iota
     {
-      // calculate RCS offset
-      const size_t ofs = SHA3_NUM_ROUNDS - num_rounds + i;
-
       // xor round constant to first cell
-      r0 = _mm512_mask_xor_epi64(r0, 1, r0, _mm512_maskz_loadu_epi64(1, RCS + ofs));
+      r0 = _mm512_mask_xor_epi64(r0, 1, r0, _mm512_maskz_loadu_epi64(1, RCS + i));
     }
   }
 
```