aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Duncan <pabs@pablotron.org>2024-05-17 01:50:47 -0400
committerPaul Duncan <pabs@pablotron.org>2024-05-17 01:50:47 -0400
commit031cb5390c050aa7fc4d84ec78a78316cc0989e2 (patch)
treed789375858651e11f4d2e0f50676ce50ec11832f
parent22c3a095a29a860d1672a542919181ea31faf9e6 (diff)
downloadsha3-031cb5390c050aa7fc4d84ec78a78316cc0989e2.tar.bz2
sha3-031cb5390c050aa7fc4d84ec78a78316cc0989e2.zip
sha3.c: permute_n_avx512(): simplify num_rounds logic
bench results (samish across the board) --------------------------------------- x86-64-avx512-gcc before: > make clean all BACKEND=2 && ./bench 10000 ... info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000 info: backend=avx512 num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32 function,dst_len,64,256,1024,4096,16384 sha3_224,28,15.4,7.8,7.8,7.1,7.0 sha3_256,32,15.4,7.8,7.8,7.6,7.4 sha3_384,48,15.4,11.6,9.8,9.8,9.7 sha3_512,64,15.5,15.5,14.5,13.8,13.8 shake128,32,15.6,7.8,6.9,6.2,6.1 shake256,32,15.6,7.8,7.9,7.6,7.4 x86-64-avx512-clang before: > make clean all CC=clang BACKEND=2 && ./bench 10000 ... info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000 info: backend=avx512 num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32 function,dst_len,64,256,1024,4096,16384 sha3_224,28,21.6,10.8,10.7,9.6,9.5 sha3_256,32,21.6,10.8,10.7,10.3,10.1 sha3_384,48,21.5,16.0,13.3,13.2,13.1 sha3_512,64,21.5,21.2,19.8,18.8,18.7 shake128,32,21.8,10.8,9.4,8.4,8.2 shake256,32,21.6,10.8,10.7,10.3,10.1 x86-64-avx512-gcc after: > make clean all BACKEND=2 && ./bench 10000 ... info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000 info: backend=avx512 num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32 function,dst_len,64,256,1024,4096,16384 sha3_224,28,15.4,7.8,7.8,7.1,7.0 sha3_256,32,15.4,7.8,7.8,7.6,7.4 sha3_384,48,15.4,11.6,9.8,9.8,9.7 sha3_512,64,15.5,15.5,14.6,13.8,13.8 shake128,32,15.5,7.8,6.9,6.2,6.1 shake256,32,15.5,7.8,7.9,7.6,7.4 x86-64-avx512-clang after: > make clean all CC=clang BACKEND=2 && ./bench 10000 ... info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000 info: backend=avx512 num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32 function,dst_len,64,256,1024,4096,16384 sha3_224,28,21.5,10.8,10.7,9.6,9.5 sha3_256,32,21.5,10.8,10.7,10.3,10.0 sha3_384,48,21.6,16.0,13.3,13.2,13.1 sha3_512,64,21.5,21.2,19.8,18.8,18.7 shake128,32,21.7,10.8,9.3,8.3,8.2 shake256,32,21.7,10.8,10.7,10.3,10.1
-rw-r--r--sha3.c7
1 files changed, 2 insertions, 5 deletions
diff --git a/sha3.c b/sha3.c
index b846719..9b213d0 100644
--- a/sha3.c
+++ b/sha3.c
@@ -343,7 +343,7 @@ static inline void permute_n_avx512(uint64_t s[static 25], const size_t num_roun
r4 = _mm512_maskz_loadu_epi64(0x1f, s + 20); // row 4
// loop over rounds
- for (size_t i = 0; i < num_rounds; i++) {
+ for (size_t i = (SHA3_NUM_ROUNDS - num_rounds); __builtin_expect(i < SHA3_NUM_ROUNDS, 1); i++) {
// theta
{
// permute ids
@@ -506,11 +506,8 @@ static inline void permute_n_avx512(uint64_t s[static 25], const size_t num_roun
// iota
{
- // calculate RCS offset
- const size_t ofs = SHA3_NUM_ROUNDS - num_rounds + i;
-
// xor round constant to first cell
- r0 = _mm512_mask_xor_epi64(r0, 1, r0, _mm512_maskz_loadu_epi64(1, RCS + ofs));
+ r0 = _mm512_mask_xor_epi64(r0, 1, r0, _mm512_maskz_loadu_epi64(1, RCS + i));
}
}