about | summary | refs | log | tree | commit | diff
path: root/tests/neon/Makefile
diff options
context:
space:
mode:
author: Paul Duncan <pabs@pablotron.org> 2024-05-26 16:01:09 -0400
committer: Paul Duncan <pabs@pablotron.org> 2024-05-26 16:01:09 -0400
commit: cc37060f29d872df8372fce4712a00cdb0afa0ba (patch)
tree: 7f99a0f1ec7f81a677192ac5c2bdccb3382781c3 /tests/neon/Makefile
parent: 1544f849c5fd17a9a405987a2ebc7381a3c74758 (diff)
download: sha3-cc37060f29d872df8372fce4712a00cdb0afa0ba.tar.bz2
download: sha3-cc37060f29d872df8372fce4712a00cdb0afa0ba.zip
sha3.c: add initial avx2 backend (slow)
bench results (oof)
-------------------

scalar (gcc):

  > make clean all BACKEND=1 && ./bench
  ...
  info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
  info: backend=scalar num_trials=2000 src_lens=64,256,1024,4096,16384 dst_lens=32
  function,dst_len,64,256,1024,4096,16384
  sha3_224,28,19.5,10.0,10.0,8.9,8.8
  sha3_256,32,19.5,10.0,9.9,9.5,9.3
  sha3_384,48,19.5,14.7,12.3,12.2,12.0
  sha3_512,64,19.5,19.4,18.2,17.2,17.1
  shake128,32,19.7,9.9,8.7,7.8,7.6
  shake256,32,20.2,10.1,10.0,9.6,9.3

scalar (clang):

  > make clean all BACKEND=1 CC=clang && ./bench
  info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
  info: backend=scalar num_trials=2000 src_lens=64,256,1024,4096,16384 dst_lens=32
  function,dst_len,64,256,1024,4096,16384
  sha3_224,28,19.7,9.9,9.8,8.8,8.7
  sha3_256,32,19.7,9.9,9.8,9.4,9.2
  sha3_384,48,19.7,14.6,12.1,12.0,11.8
  sha3_512,64,19.7,19.3,17.8,16.9,16.9
  shake128,32,19.7,9.8,8.6,7.7,7.5
  shake256,32,19.7,9.9,9.8,9.4,9.2

avx2 (gcc):

  > make clean all BACKEND=6 && ./bench
  info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
  info: backend=avx2 num_trials=2000 src_lens=64,256,1024,4096,16384 dst_lens=32
  function,dst_len,64,256,1024,4096,16384
  sha3_224,28,80.0,40.5,40.9,37.2,36.6
  sha3_256,32,79.8,40.5,40.9,39.8,38.9
  sha3_384,48,80.1,61.0,51.2,51.3,50.7
  sha3_512,64,79.9,81.5,76.7,73.0,73.1
  shake128,32,86.0,43.6,38.6,34.5,33.9
  shake256,32,86.0,43.6,44.1,42.8,41.9

avx2 (clang):

  > make clean all BACKEND=6 CC=clang && ./bench
  info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000
  info: backend=avx2 num_trials=2000 src_lens=64,256,1024,4096,16384 dst_lens=32
  function,dst_len,64,256,1024,4096,16384
  sha3_224,28,62.0,31.6,31.9,29.0,28.5
  sha3_256,32,61.9,31.6,32.0,31.1,30.3
  sha3_384,48,62.6,47.7,40.1,40.2,39.8
  sha3_512,64,62.0,63.8,60.2,57.2,57.3
  shake128,32,62.5,31.7,27.9,24.9,24.5
  shake256,32,62.0,31.5,31.9,31.0,30.3

the culprit is the gather instructions (vpgatherqq) used for the permutes in
the pi step, so those gathers will need to be rewritten as permute
instructions:

  > perf annotate -Mintel --stdio --quiet permute_n_avx2 | grep gatherqq
  24.09 : 207e: vpgatherqq ymm0,QWORD PTR [rdi+ymm3*8],ymm6
  14.07 : 20a0: vpgatherqq ymm4,QWORD PTR [rdi+ymm3*8],ymm6
  13.36 : 20aa: vpgatherqq ymm8,QWORD PTR [rdi+ymm7*8],ymm6
  13.50 : 20c4: vpgatherqq ymm3,QWORD PTR [rdi+ymm7*8],ymm6
  13.61 : 20ce: vpgatherqq ymm7,QWORD PTR [rdi+ymm1*8],ymm6
Diffstat (limited to 'tests/neon/Makefile')
0 files changed, 0 insertions, 0 deletions