diff options
author | Paul Duncan <pabs@pablotron.org> | 2024-05-17 01:44:41 -0400 |
---|---|---|
committer | Paul Duncan <pabs@pablotron.org> | 2024-05-17 01:44:41 -0400 |
commit | 22c3a095a29a860d1672a542919181ea31faf9e6 (patch) | |
tree | 1d0b9ede6a8ee99afe12d45c7f880b24e19b0358 | |
parent | 061dc7ec46e7e2dba7b8c3ef6f09d7d383ad692c (diff) | |
download | sha3-22c3a095a29a860d1672a542919181ea31faf9e6.tar.bz2 sha3-22c3a095a29a860d1672a542919181ea31faf9e6.zip |
sha3.c: permute_n_scalar(): simplify num_rounds logic
bench results (same-ish across the board)
-------------
x86-64-scalar-gcc before:
> make clean all BACKEND=1 && perf record ./bench 10000
...
info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
info: backend=scalar num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,19.5,10.0,10.0,9.0,8.8
sha3_256,32,19.5,10.0,9.9,9.5,9.3
sha3_384,48,20.1,14.7,12.3,12.2,12.0
sha3_512,64,19.5,19.5,18.2,17.2,17.1
shake128,32,19.6,10.0,8.7,7.8,7.6
shake256,32,19.6,10.1,10.0,9.6,9.3
x86-64-scalar-clang before:
> make clean all CC=clang BACKEND=1 && perf record ./bench 10000
...
info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
info: backend=scalar num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,19.7,9.9,9.7,8.8,8.6
sha3_256,32,19.7,9.8,9.7,9.4,9.1
sha3_384,48,19.7,14.5,12.0,11.9,11.8
sha3_512,64,19.7,19.1,17.7,16.8,16.8
shake128,32,20.4,10.0,8.7,7.7,7.5
shake256,32,20.3,10.0,9.8,9.4,9.1
x86-64-scalar-gcc after:
> make clean all BACKEND=1 && perf record ./bench 10000
...
info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
info: backend=scalar num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,19.5,10.0,9.9,9.0,8.8
sha3_256,32,19.5,10.0,9.9,9.5,9.3
sha3_384,48,19.5,14.7,12.3,12.2,12.0
sha3_512,64,19.7,19.5,18.2,17.2,17.1
shake128,32,19.6,10.0,8.7,7.8,7.6
shake256,32,19.6,10.1,10.0,9.6,9.3
x86-64-scalar-clang after:
> make clean all CC=clang BACKEND=1 && perf record ./bench 10000
...
info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
info: backend=scalar num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,19.7,9.9,9.8,8.8,8.7
sha3_256,32,19.6,9.9,9.8,9.4,9.2
sha3_384,48,19.7,14.6,12.1,12.0,11.9
sha3_512,64,19.7,19.3,17.9,16.9,16.9
shake128,32,19.7,9.9,8.7,7.7,7.5
shake256,32,19.7,9.9,9.8,9.5,9.2
a76-scalar-gcc before:
> make clean all BACKEND=1 && ./bench 5000
info: cpucycles: version=20240318 implementation=arm64-vct persecond=2400000000
info: backend=scalar num_trials=5000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,20.2,10.1,10.2,9.3,9.2
sha3_256,32,20.2,10.3,10.3,10.0,9.7
sha3_384,48,20.9,15.3,12.8,12.7,12.5
sha3_512,64,20.9,20.3,18.9,18.0,17.9
shake128,32,20.2,10.3,9.0,8.1,7.9
shake256,32,20.2,10.1,10.3,9.9,9.7
a76-scalar-clang before:
> make clean all CC=clang BACKEND=1 && ./bench 5000
info: cpucycles: version=20240318 implementation=arm64-vct persecond=2400000000
info: backend=scalar num_trials=5000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,18.8,9.6,9.5,8.7,8.5
sha3_256,32,18.8,9.6,9.5,9.2,9.0
sha3_384,48,18.8,14.1,11.8,11.8,11.6
sha3_512,64,18.8,18.6,17.5,16.6,16.6
shake128,32,18.8,9.6,8.4,7.5,7.4
shake256,32,18.8,9.6,9.6,9.2,9.0
a76-scalar-gcc after:
> make clean all BACKEND=1 && ./bench 5000
info: cpucycles: version=20240318 implementation=arm64-vct persecond=2400000000
info: backend=scalar num_trials=5000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,20.2,10.1,10.3,9.3,9.2
sha3_256,32,20.2,10.3,10.3,10.0,9.7
sha3_384,48,20.9,15.3,12.8,12.7,12.5
sha3_512,64,20.9,20.3,18.9,18.0,17.9
shake128,32,20.2,10.3,9.0,8.1,7.9
shake256,32,20.2,10.3,10.3,9.9,9.7
a76-scalar-clang after:
> make clean all CC=clang BACKEND=1 && ./bench 5000
info: cpucycles: version=20240318 implementation=arm64-vct persecond=2400000000
info: backend=scalar num_trials=5000 src_lens=64,256,1024,4096,16384 dst_lens=32
function,dst_len,64,256,1024,4096,16384
sha3_224,28,18.8,9.4,9.3,8.5,8.3
sha3_256,32,18.8,9.4,9.3,9.0,8.8
sha3_384,48,18.8,13.9,11.6,11.6,11.4
sha3_512,64,18.8,18.3,17.1,16.3,16.3
shake128,32,18.8,9.4,8.3,7.4,7.2
shake256,32,18.8,9.4,9.4,9.1,8.9
-rw-r--r-- | sha3.c | 4 |
1 files changed, 2 insertions, 2 deletions
@@ -290,12 +290,12 @@ static inline void iota(uint64_t a[static 25], const int i) {
  */
 static inline void permute_n_scalar(uint64_t a[static 25], const size_t num_rounds) {
   uint64_t tmp[25] = { 0 };
-  for (size_t i = 0; i < num_rounds; i++) {
+  for (size_t i = (SHA3_NUM_ROUNDS - num_rounds); __builtin_expect(i < SHA3_NUM_ROUNDS, 1); i++) {
     theta(a);
     rho(a);
     pi(tmp, a);
     chi(a, tmp);
-    iota(a, (SHA3_NUM_ROUNDS - num_rounds + i));
+    iota(a, i);
   }
 }
 #endif /* (BACKEND == BACKEND_SCALAR) || defined(TEST_SHA3) */