aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Duncan <pabs@pablotron.org>2024-05-17 01:44:41 -0400
committerPaul Duncan <pabs@pablotron.org>2024-05-17 01:44:41 -0400
commit22c3a095a29a860d1672a542919181ea31faf9e6 (patch)
tree1d0b9ede6a8ee99afe12d45c7f880b24e19b0358
parent061dc7ec46e7e2dba7b8c3ef6f09d7d383ad692c (diff)
downloadsha3-22c3a095a29a860d1672a542919181ea31faf9e6.tar.bz2
sha3-22c3a095a29a860d1672a542919181ea31faf9e6.zip
sha3.c: permute_n_scalar(): simplify num_rounds logic
bench results (samish across the board) ------------- x86-64-scalar-gcc before: > make clean all BACKEND=1 && perf record ./bench 10000 ... info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000 info: backend=scalar num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32 function,dst_len,64,256,1024,4096,16384 sha3_224,28,19.5,10.0,10.0,9.0,8.8 sha3_256,32,19.5,10.0,9.9,9.5,9.3 sha3_384,48,20.1,14.7,12.3,12.2,12.0 sha3_512,64,19.5,19.5,18.2,17.2,17.1 shake128,32,19.6,10.0,8.7,7.8,7.6 shake256,32,19.6,10.1,10.0,9.6,9.3 x86-64-scalar-clang before: > make clean all CC=clang BACKEND=1 && perf record ./bench 10000 ... info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000 info: backend=scalar num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32 function,dst_len,64,256,1024,4096,16384 sha3_224,28,19.7,9.9,9.7,8.8,8.6 sha3_256,32,19.7,9.8,9.7,9.4,9.1 sha3_384,48,19.7,14.5,12.0,11.9,11.8 sha3_512,64,19.7,19.1,17.7,16.8,16.8 shake128,32,20.4,10.0,8.7,7.7,7.5 shake256,32,20.3,10.0,9.8,9.4,9.1 x86-64-scalar-gcc after: > make clean all BACKEND=1 && perf record ./bench 10000 ... info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000 info: backend=scalar num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32 function,dst_len,64,256,1024,4096,16384 sha3_224,28,19.5,10.0,9.9,9.0,8.8 sha3_256,32,19.5,10.0,9.9,9.5,9.3 sha3_384,48,19.5,14.7,12.3,12.2,12.0 sha3_512,64,19.7,19.5,18.2,17.2,17.1 shake128,32,19.6,10.0,8.7,7.8,7.6 shake256,32,19.6,10.1,10.0,9.6,9.3 x86-64-scalar-clang after: > make clean all CC=clang BACKEND=1 && perf record ./bench 10000 ... 
info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000 info: backend=scalar num_trials=10000 src_lens=64,256,1024,4096,16384 dst_lens=32 function,dst_len,64,256,1024,4096,16384 sha3_224,28,19.7,9.9,9.8,8.8,8.7 sha3_256,32,19.6,9.9,9.8,9.4,9.2 sha3_384,48,19.7,14.6,12.1,12.0,11.9 sha3_512,64,19.7,19.3,17.9,16.9,16.9 shake128,32,19.7,9.9,8.7,7.7,7.5 shake256,32,19.7,9.9,9.8,9.5,9.2 a76-scalar-gcc before: > make clean all BACKEND=1 && ./bench 5000 info: cpucycles: version=20240318 implementation=arm64-vct persecond=2400000000 info: backend=scalar num_trials=5000 src_lens=64,256,1024,4096,16384 dst_lens=32 function,dst_len,64,256,1024,4096,16384 sha3_224,28,20.2,10.1,10.2,9.3,9.2 sha3_256,32,20.2,10.3,10.3,10.0,9.7 sha3_384,48,20.9,15.3,12.8,12.7,12.5 sha3_512,64,20.9,20.3,18.9,18.0,17.9 shake128,32,20.2,10.3,9.0,8.1,7.9 shake256,32,20.2,10.1,10.3,9.9,9.7 a76-scalar-clang before: > make clean all CC=clang BACKEND=1 && ./bench 5000 info: cpucycles: version=20240318 implementation=arm64-vct persecond=2400000000 info: backend=scalar num_trials=5000 src_lens=64,256,1024,4096,16384 dst_lens=32 function,dst_len,64,256,1024,4096,16384 sha3_224,28,18.8,9.6,9.5,8.7,8.5 sha3_256,32,18.8,9.6,9.5,9.2,9.0 sha3_384,48,18.8,14.1,11.8,11.8,11.6 sha3_512,64,18.8,18.6,17.5,16.6,16.6 shake128,32,18.8,9.6,8.4,7.5,7.4 shake256,32,18.8,9.6,9.6,9.2,9.0 a76-scalar-gcc after: > make clean all BACKEND=1 && ./bench 5000 info: cpucycles: version=20240318 implementation=arm64-vct persecond=2400000000 info: backend=scalar num_trials=5000 src_lens=64,256,1024,4096,16384 dst_lens=32 function,dst_len,64,256,1024,4096,16384 sha3_224,28,20.2,10.1,10.3,9.3,9.2 sha3_256,32,20.2,10.3,10.3,10.0,9.7 sha3_384,48,20.9,15.3,12.8,12.7,12.5 sha3_512,64,20.9,20.3,18.9,18.0,17.9 shake128,32,20.2,10.3,9.0,8.1,7.9 shake256,32,20.2,10.3,10.3,9.9,9.7 a76-scalar-clang after: > make clean all CC=clang BACKEND=1 && ./bench 5000 info: cpucycles: version=20240318 implementation=arm64-vct 
persecond=2400000000 info: backend=scalar num_trials=5000 src_lens=64,256,1024,4096,16384 dst_lens=32 function,dst_len,64,256,1024,4096,16384 sha3_224,28,18.8,9.4,9.3,8.5,8.3 sha3_256,32,18.8,9.4,9.3,9.0,8.8 sha3_384,48,18.8,13.9,11.6,11.6,11.4 sha3_512,64,18.8,18.3,17.1,16.3,16.3 shake128,32,18.8,9.4,8.3,7.4,7.2 shake256,32,18.8,9.4,9.4,9.1,8.9
-rw-r--r--sha3.c4
1 files changed, 2 insertions, 2 deletions
diff --git a/sha3.c b/sha3.c
index 14d518e..b846719 100644
--- a/sha3.c
+++ b/sha3.c
@@ -290,12 +290,12 @@ static inline void iota(uint64_t a[static 25], const int i) {
*/
static inline void permute_n_scalar(uint64_t a[static 25], const size_t num_rounds) {
uint64_t tmp[25] = { 0 };
- for (size_t i = 0; i < num_rounds; i++) {
+ for (size_t i = (SHA3_NUM_ROUNDS - num_rounds); __builtin_expect(i < SHA3_NUM_ROUNDS, 1); i++) {
theta(a);
rho(a);
pi(tmp, a);
chi(a, tmp);
- iota(a, (SHA3_NUM_ROUNDS - num_rounds + i));
+ iota(a, i);
}
}
#endif /* (BACKEND == BACKEND_SCALAR) || defined(TEST_SHA3) */