From fc38ab69ff049f0a87d58d185f9931b8d0d6b5e4 Mon Sep 17 00:00:00 2001 From: Paul Duncan Date: Mon, 29 Apr 2024 16:49:42 -0400 Subject: tests/bench: refactor so bench prints a cpb table to stdout --- tests/bench/README.md | 41 +++++++----- tests/bench/bench.c | 181 ++++++++++++++++++++++++++++---------------------- 2 files changed, 129 insertions(+), 93 deletions(-) (limited to 'tests/bench') diff --git a/tests/bench/README.md b/tests/bench/README.md index 0d792e9..c2c3b73 100644 --- a/tests/bench/README.md +++ b/tests/bench/README.md @@ -1,21 +1,21 @@ # bench -Benchmark hash and XOF speed and then print summary statistics to -standard output in [CSV][] format and metadata to standard error. +Benchmark [hash][] functions and [XOFs][xof], then print metadata to +standard error and print a table of [median][] [cycles per byte (cpb)][] +for each function and input message length to standard output in [CSV][] +format. Requires [libcpucycles][]. The columns of the [CSV][] printed to standard output are as follows: * `function`: Function name. -* `dst`: Output length, in bytes. -* `src`: Input length, in bytes. -* `median_cpb`: [Median][] [cycles per byte][] (e.g. `median_cycles/src`). -* `median_cycles`: [Median][] of observed CPU cycles. -* `mean_cycles`: [Arithmetic mean][mean] of observed CPU cycles. -* `stddev_cycles`: [Standard deviation][stddev] of observed CPU cycles. -* `min_cycles`: Minimum observed CPU cycles. -* `max_cycles`: Maximum observed CPU cycles. +* `dst_len`: Output digest length, in bytes. +* `64`: [Median][] [cycles per byte (cpb)][] for a 64 byte input message. +* `256`: [Median][] [cycles per byte (cpb)][] for a 256 byte input message. +* `1024`: [Median][] [cycles per byte (cpb)][] for a 1024 byte input message. +* `4096`: [Median][] [cycles per byte (cpb)][] for a 4096 byte input message. +* `16384`: [Median][] [cycles per byte (cpb)][] for a 16384 byte input message. The metadata printed to standard error is as follows: @@ -23,6 +23,9 @@ The metadata printed to standard error is as follows: * `implementation`: [libcpucycles][] backend as reported by `cpucycles_implementation()` * `persecond`: CPU cycles per second, as reported by `cpucycles_persecond()` * `num_trials`: Number of trials. +* `src_lens`: Comma-delimited list of input messages lengths, in bytes. +* `dst_lens`: Comma-delimited list of output digest lengths, in bytes + (only used for [XOFs][]). ## Build @@ -62,11 +65,15 @@ gcc (Debian 12.2.0-14) 12.2.0 # benchmark with 100k trials > ./bench -TODO... - -# benchmark with 1M trials -> ./bench 1000000 -TODO... +info: cpucycles: version=20240318 implementation=amd64-pmc persecond=4800000000 +info: num_trials=100000 src_lens=64,256,1024,4096,16384 dst_lens=32 +function,dst_len,64,256,1024,4096,16384 +sha3_224,28,15.4,7.8,7.8,7.1,7.0 +sha3_256,32,15.4,7.8,7.8,7.6,7.4 +sha3_384,48,15.5,11.7,9.8,9.8,9.7 +sha3_512,64,15.4,15.5,14.6,13.9,13.9 +shake128,32,15.5,7.8,6.9,6.2,6.1 +shake256,32,15.6,7.8,7.9,7.6,7.4 ``` ### Odroid N2L (Cortex-A73) @@ -110,3 +117,7 @@ TODO... "AVX-512: 512-bit extensions to the Advanced Vector Extensions (AVX) instruction set." [cycles per byte]: https://en.wikipedia.org/wiki/Encryption_software#Performance "Observed CPU cycles divided by the number of input bytes." +[xof]: https://en.wikipedia.org/wiki/Extendable-output_function + "Extendable-Output Function (XOF)" +[hash]: https://en.wikipedia.org/wiki/Cryptographic_hash_function + "Cryptographic hash function" diff --git a/tests/bench/bench.c b/tests/bench/bench.c index b147ad5..cd03205 100644 --- a/tests/bench/bench.c +++ b/tests/bench/bench.c @@ -1,6 +1,8 @@ // -// Benchmark all three ML-KEM parameter sets and print summary -// statistics to standard output in CSV format. +// Benchmark hash functions and extendable output functions (XOFs), then +// print metadata to standard error and print a table of median cycles +// per byte (cpb) for each function and input message length to standard +// output in CSV format. // // Requires libcpucycles (https://cpucycles.cr.yp.to/). // @@ -22,19 +24,32 @@ // default number of trials #define NUM_TRIALS 100000 -// Random data used for key generation and encapsulation. -typedef struct { - uint8_t keygen[64], // random data for keygen() - encaps[32]; // random data for encaps() -} seeds_t; +// input sizes (used for hashes and xofs) +static const size_t SRC_LENS[] = { 64, 256, 1024, 4096, 16384 }; +#define NUM_SRC_LENS (sizeof(SRC_LENS)/sizeof(SRC_LENS[0])) + +// output sizes (used for xofs) +static const size_t DST_LENS[] = { 32 }; +#define NUM_DST_LENS (sizeof(DST_LENS)/sizeof(DST_LENS[0])) + +// get maximum source length +static size_t get_max_src_len(void) { + size_t r = 0; + + for (size_t i = 0; i < NUM_SRC_LENS; i++) { + r = (SRC_LENS[i] > r) ? SRC_LENS[i] : r; + } + + return r; +} // Aggregate statistics for a series of tests. typedef struct { // min/max/median times long long lo, hi, median; - // mean/stddev - double mean, stddev; + // mean/stddev, median_cpb + double mean, stddev, median_cpb; } stats_t; static void *checked_calloc(const char *name, const size_t nmemb, const size_t size) { @@ -54,7 +69,7 @@ static int sort_asc_cb(const void *ap, const void *bp) { } // Get summary statistics of a series of test times. -static stats_t get_stats(long long * const vals, const size_t num_vals) { +static stats_t get_stats(long long * const vals, const size_t num_vals, const size_t len) { stats_t stats = { 0 }; // sort values in ascending order (used for min, max, and median) @@ -65,6 +80,9 @@ static stats_t get_stats(long long * const vals, const size_t num_vals) { stats.hi = vals[num_vals - 1]; stats.median = vals[num_vals / 2]; + // calculate median cpb + stats.median_cpb = 1.0 * stats.median / len; + // calculate mean for (size_t i = 0; i < num_vals; i++) { stats.mean += vals[i]; @@ -83,70 +101,72 @@ static stats_t get_stats(long long * const vals, const size_t num_vals) { // define xof benchmark function #define DEF_BENCH_XOF(FN) \ - static stats_t bench_ ## FN (const size_t num_trials, const size_t src_len, const size_t dst_len) { \ + static void bench_ ## FN (double * const cpbs, const size_t num_trials, const size_t dst_len) { \ /* allocate times, src, and dst buffers */ \ long long *times = checked_calloc(__func__, num_trials, sizeof(long long)); \ - uint8_t *src = checked_calloc(__func__, 1, src_len); \ + uint8_t *src = checked_calloc(__func__, 1, get_max_src_len()); \ uint8_t *dst = checked_calloc(__func__, num_trials, dst_len); \ \ - /* run trials */ \ - for (size_t i = 0; i < num_trials; i++) { \ - /* generate random source data */ \ - rand_bytes(src, src_len); \ + for (size_t i = 0; i < NUM_SRC_LENS; i++) { \ + const size_t src_len = SRC_LENS[i]; /* get source length */ \ \ - /* call function */ \ - const long long t0 = cpucycles(); \ - FN (src, src_len, dst + (i * dst_len), dst_len); \ - const long long t1 = cpucycles() - t0; \ + /* run trials */ \ + for (size_t j = 0; j < num_trials; j++) { \ + /* generate random source data */ \ + rand_bytes(src, src_len); \ + \ + /* call function */ \ + const long long t0 = cpucycles(); \ + FN (src, src_len, dst + (j * dst_len), dst_len); \ + const long long t1 = cpucycles() - t0; \ + \ + /* save time */ \ + times[j] = t1; \ + } \ \ - /* save time */ \ - times[i] = t1; \ + /* generate summary stats, save cpb */ \ + cpbs[i] = 1.0 * get_stats(times, num_trials, src_len).median_cpb; \ } \ \ - /* generate summary stats */ \ - const stats_t stats = get_stats(times, num_trials); \ - \ /* free buffers */ \ free(times); \ free(src); \ free(dst); \ - \ - /* return summary stats */ \ - return stats; \ } // define hash benchmark function #define DEF_BENCH_HASH(FN, OUT_LEN) \ - static stats_t bench_ ## FN (const size_t num_trials, const size_t src_len) { \ + static void bench_ ## FN (double * const cpbs, const size_t num_trials) { \ /* allocate times, src, and dst buffers */ \ long long *times = checked_calloc(__func__, num_trials, sizeof(long long)); \ - uint8_t *src = checked_calloc(__func__, 1, src_len); \ + uint8_t *src = checked_calloc(__func__, 1, get_max_src_len()); \ uint8_t *dst = checked_calloc(__func__, num_trials, OUT_LEN); \ \ - /* run trials */ \ - for (size_t i = 0; i < num_trials; i++) { \ - /* generate random source data */ \ - rand_bytes(src, src_len); \ + for (size_t i = 0; i < NUM_SRC_LENS; i++) { \ + const size_t src_len = SRC_LENS[i]; /* get source length */ \ \ - /* call function */ \ - const long long t0 = cpucycles(); \ - FN (src, src_len, dst + (i * OUT_LEN)); \ - const long long t1 = cpucycles() - t0; \ + /* run trials */ \ + for (size_t j = 0; j < num_trials; j++) { \ + /* generate random source data */ \ + rand_bytes(src, src_len); \ + \ + /* call function */ \ + const long long t0 = cpucycles(); \ + FN (src, src_len, dst + (j * OUT_LEN)); \ + const long long t1 = cpucycles() - t0; \ + \ + /* save time */ \ + times[j] = t1; \ + } \ \ - /* save time */ \ - times[i] = t1; \ + /* generate summary stats, save cpb */ \ + cpbs[i] = 1.0 * get_stats(times, num_trials, src_len).median_cpb; \ } \ \ - /* generate summary stats */ \ - const stats_t stats = get_stats(times, num_trials); \ - \ /* free buffers */ \ free(times); \ free(src); \ free(dst); \ - \ - /* return summary stats */ \ - return stats; \ } // define xof benchmarks @@ -160,20 +180,17 @@ DEF_BENCH_HASH(sha3_384, 48) DEF_BENCH_HASH(sha3_512, 64) // print function stats to standard output as CSV row. -static void print_row(const char *name, const size_t src_len, const size_t dst_len, stats_t fs) { - const double median_cpb = 1.0 * fs.median / src_len; - printf("%s,%zu,%zu,%.1f,%lld,%.0f,%.0f,%lld,%lld\n", name, dst_len, src_len, median_cpb, fs.median, fs.mean, fs.stddev, fs.lo, fs.hi); +static void print_row(const char *name, const size_t dst_len, double * const cpbs) { + printf("%s,%zu", name, dst_len); + for (size_t i = 0; i < NUM_SRC_LENS; i++) { + printf(",%.1f", cpbs[i]); + } + fputs("\n", stdout); } -// input sizes (used for hashes and xofs) -#define MIN_SRC_LEN (1<<6) // minimum source length (inclusive) -#define MAX_SRC_LEN (1<<14) // maximum source length (exclusive) - -// output sizes (used for xofs) -#define MIN_DST_LEN (1<<5) // minimum source length (inclusive) -#define MAX_DST_LEN (1<<7) // maximum source length (exclusive) - int main(int argc, char *argv[]) { + double cpbs[NUM_SRC_LENS]; + // get number of trials from first command-line argument, or fall back // to default if no argument was provided const size_t num_trials = (argc > 1) ? atoi(argv[1]) : NUM_TRIALS; @@ -183,42 +200,50 @@ int main(int argc, char *argv[]) { } // print metadata to stderr - fprintf(stderr,"info: cpucycles: version=%s implementation=%s persecond=%lld\ninfo: num_trials=%zu\n", cpucycles_version(), cpucycles_implementation(), cpucycles_persecond(), num_trials); + fprintf(stderr,"info: cpucycles: version=%s implementation=%s persecond=%lld\ninfo: num_trials=%zu src_lens", cpucycles_version(), cpucycles_implementation(), cpucycles_persecond(), num_trials); + for (size_t i = 0; i < NUM_SRC_LENS; i++) { + fprintf(stderr, "%s%zu", (i > 0) ? "," : "=", SRC_LENS[i]); + } + fputs(" dst_lens", stderr); + for (size_t i = 0; i < NUM_DST_LENS; i++) { + fprintf(stderr, "%s%zu", (i > 0) ? "," : "=", DST_LENS[i]); + } + fputs("\n", stderr); // print column headers to stdout - printf("function,dst,src,median_cpb,median_cycles,mean_cycles,stddev_cycles,min_cycles,max_cycles\n"); + fputs("function,dst_len", stdout); + for (size_t i = 0; i < NUM_SRC_LENS; i++) { + printf(",%zu", SRC_LENS[i]); + } + fputs("\n", stdout); // sha3-224 - for (size_t src_len = MIN_SRC_LEN; src_len < MAX_SRC_LEN; src_len <<= 1) { - print_row("sha3_224", src_len, 28, bench_sha3_224(num_trials, src_len)); - } + bench_sha3_224(cpbs, num_trials); + print_row("sha3_224", 28, cpbs); // sha3-256 - for (size_t src_len = MIN_SRC_LEN; src_len < MAX_SRC_LEN; src_len <<= 1) { - print_row("sha3_256", src_len, 32, bench_sha3_256(num_trials, src_len)); - } + bench_sha3_256(cpbs, num_trials); + print_row("sha3_256", 32, cpbs); // sha3-384 - for (size_t src_len = MIN_SRC_LEN; src_len < MAX_SRC_LEN; src_len <<= 1) { - print_row("sha3_384", src_len, 48, bench_sha3_384(num_trials, src_len)); - } + bench_sha3_384(cpbs, num_trials); + print_row("sha3_384", 48, cpbs); // sha3-512 - for (size_t src_len = MIN_SRC_LEN; src_len < MAX_SRC_LEN; src_len <<= 1) { - print_row("sha3_512", src_len, 64, bench_sha3_512(num_trials, src_len)); - } + bench_sha3_512(cpbs, num_trials); + print_row("sha3_512", 64, cpbs); // test xofs - for (size_t dst_len = MIN_DST_LEN; dst_len < MAX_DST_LEN; dst_len <<= 1) { + for (size_t i = 0; i < NUM_DST_LENS; i++) { + const size_t dst_len = DST_LENS[i]; + // shake128 - for (size_t src_len = MIN_SRC_LEN; src_len < MAX_SRC_LEN; src_len <<= 1) { - print_row("shake128", src_len, dst_len, bench_shake128(num_trials, src_len, dst_len)); - } + bench_shake128(cpbs, num_trials, dst_len); + print_row("shake128", dst_len, cpbs); // shake256 - for (size_t src_len = MIN_SRC_LEN; src_len < MAX_SRC_LEN; src_len <<= 1) { - print_row("shake256", src_len, dst_len, bench_shake256(num_trials, src_len, dst_len)); - } + bench_shake256(cpbs, num_trials, dst_len); + print_row("shake256", dst_len, cpbs); } // return success -- cgit v1.2.3