From fc456ac824b9ef682aaa5b9050c99304ef108c29 Mon Sep 17 00:00:00 2001
From: Paul Duncan <pabs@pablotron.org>
Date: Mon, 29 Apr 2024 12:03:56 -0400
Subject: add tests/bench

---
 tests/bench/Makefile     |  16 ++++
 tests/bench/README.md    | 111 ++++++++++++++++++++++++
 tests/bench/bench.c      | 222 +++++++++++++++++++++++++++++++++++++++++++++++
 tests/bench/rand-bytes.h |   1 +
 tests/bench/sha3.c       |   1 +
 tests/bench/sha3.h       |   1 +
 6 files changed, 352 insertions(+)
 create mode 100644 tests/bench/Makefile
 create mode 100644 tests/bench/README.md
 create mode 100644 tests/bench/bench.c
 create mode 120000 tests/bench/rand-bytes.h
 create mode 120000 tests/bench/sha3.c
 create mode 120000 tests/bench/sha3.h

(limited to 'tests')

diff --git a/tests/bench/Makefile b/tests/bench/Makefile
new file mode 100644
index 0000000..3e08426
--- /dev/null
+++ b/tests/bench/Makefile
@@ -0,0 +1,16 @@
+CFLAGS=-std=c11 -W -Wall -Wextra -Wpedantic -Werror -g -O3 -march=native -mtune=native
+APP=./bench
+OBJS=sha3.o bench.o
+
+.PHONY=all test clean
+
+all: $(APP)
+
+$(APP): $(OBJS)
+	$(CC) -o $(APP) $(CFLAGS) $(OBJS) -lcpucycles -lm
+
+%.o: %.c
+	$(CC) -c $(CFLAGS) $<
+
+clean:
+	$(RM) -f $(APP) $(OBJS)
diff --git a/tests/bench/README.md b/tests/bench/README.md
new file mode 100644
index 0000000..1758c04
--- /dev/null
+++ b/tests/bench/README.md
@@ -0,0 +1,111 @@
+# bench
+
+Benchmark hash and XOF speed and then print summary statistics to
+standard output in [CSV][] format and metadata to standard error.
+
+Requires [libcpucycles][].
+
+The columns of the [CSV][] printed to standard output are as follows:
+
+* `function`: Function name.
+* `dst`: Output size, in bytes.
+* `src`: Input size, in bytes.
+* `median_cpb`: [Median][] of observed CPU cycles divided by the input size.
+* `mean_cpb`: [Arithmetic mean][mean] of observed CPU cycles divided by the input size.
+* `median_cycles`: [Median][] of observed CPU cycles.
+* `mean_cycles`: [Arithmetic mean][mean] of observed CPU cycles.
+* `stddev_cycles`: [Standard deviation][stddev] of observed CPU cycles.
+* `min_cycles`: Minimum observed CPU cycles.
+* `max_cycles`: Maximum observed CPU cycles.
+
+The metadata printed to standard error is as follows:
+
+* `version`: version of [libcpucycles][] as reported by `cpucycles_version()`
+* `implementation`: [libcpucycles][] backend as reported by `cpucycles_implementation()`
+* `persecond`: CPU cycles per second, as reported by `cpucycles_persecond()`
+* `num_trials`: Number of trials.
+
+## Build
+
+1. Install [libcpucycles][].
+2. Type `make`.  Creates an executable named `./bench` in the current
+   directory.
+
+## Run
+
+Type `./bench` to run benchmarks with the default number of trials
+(100,000), or `./bench NUM` to run benchmarks with a custom number of
+trials.
+
+**Note:** You may need to adjust your system configuration or run
+`bench` as root to grant [libcpucycles][] access to the high-resolution
+cycle counter.
+
+See [the libcpucycles security page][libcpucycles-security] for details.
+
+## Examples
+
+Below are example runs of `bench` on a ThinkPad X1 Carbon ([x86-64][],
+[AVX-512][] backend) and on an [Odroid N2L][] ([ARM64][], scalar
+backend).
+
+### Lenovo ThinkPad X1 Carbon, 6th Gen (i7-1185G7)
+
+```
+# enable user-level RDPMC access (run as root)
+root> echo 2 > /proc/sys/kernel/perf_event_paranoid
+
+# print cpu and compiler info
+> lscpu | grep -i '^model name:' | sed 's/.*: *//'
+11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz
+> gcc --version | head -1
+gcc (Debian 12.2.0-14) 12.2.0
+
+# benchmark with 100k trials
+> ./bench
+TODO...
+
+# benchmark with 1M trials
+> ./bench 1000000
+TODO...
+```
+
+### Odroid N2L (Cortex-A73)
+
+```
+# enable user-level perf_even access (run as root)
+root> echo 2 > /proc/sys/kernel/perf_event_paranoid
+
+# print cpu and compiler info
+> lscpu | grep -i '^model name' | sed 's/.*: *//'
+Cortex-A73
+> gcc --version | head -1
+gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
+
+# benchmark with 100k trials
+> ./bench
+info: cpucycles: version=20240318 implementation=arm64-vct persecond=1800000000
+info: num_trials=100000
+TODO...
+```
+
+[csv]: https://en.wikipedia.org/wiki/Comma-separated_values
+  "Comma-Separated Value (CSV)"
+[libcpucycles]: https://cpucycles.cr.yp.to/
+  "Microlibrary for counting CPU cycles."
+[libcpucycles-security]: https://cpucycles.cr.yp.to/security.html
+  "libcpucycles security documentation"
+[median]: https://en.wikipedia.org/wiki/Median
+  "Median"
+[mean]: https://en.wikipedia.org/wiki/Arithmetic_mean
+  "Arithmetic mean"
+[stddev]: https://en.wikipedia.org/wiki/Standard_deviation
+  "Standard deviation"
+[odroid n2l]: https://en.odroid.se/products/odroid-n2l-4gb
+  "Odroid N2L"
+[x86-64]: https://en.wikipedia.org/wiki/X86-64
+  "64-bit x86 instruction set."
+[arm64]: https://en.wikipedia.org/wiki/AArch64
+  "64-bit extension to the ARM instruction set."
+[avx-512]: https://en.wikipedia.org/wiki/AVX-512
+  "AVX-512: 512-bit extensions to the Advanced Vector Extensions (AVX) instruction set."
diff --git a/tests/bench/bench.c b/tests/bench/bench.c
new file mode 100644
index 0000000..9144e91
--- /dev/null
+++ b/tests/bench/bench.c
@@ -0,0 +1,222 @@
+//
+// Benchmark all three ML-KEM parameter sets and print summary
+// statistics to standard output in CSV format.
+//
+// Requires libcpucycles (https://cpucycles.cr.yp.to/).
+//
+// Note: You may need to adjust your system configuration or run `bench`
+// as root to grant libcpucycles access to the high-resolution cycle
+// counter.  See the following URL for details:
+//
+// https://cpucycles.cr.yp.to/security.html
+//
+
+#include <stdlib.h> // exit(), qsort()
+#include <stdio.h> // printf()
+#include <string.h> // memcmp()
+#include <math.h> // sqrt(), pow()
+#include <cpucycles.h> // cpucycles()
+#include "sha3.h" // sha3_*(), shake*()
+#include "rand-bytes.h" // rand_bytes()
+
+// default number of trials
+#define NUM_TRIALS 100000
+
+// Random data used for key generation and encapsulation.
+typedef struct {
+  uint8_t keygen[64], // random data for keygen()
+          encaps[32]; // random data for encaps()
+} seeds_t;
+
+// Aggregate statistics for a series of tests.
+typedef struct {
+  // min/max/median times
+  long long lo, hi, median;
+
+  // mean/stddev
+  double mean, stddev;
+} stats_t;
+
+static void *checked_calloc(const char *name, const size_t nmemb, const size_t size) {
+  // alloc keygen times
+  void *mem = calloc(nmemb, size);
+  if (!mem) {
+    fprintf(stderr, "%s: calloc() failed\n", name);
+    exit(-1);
+  }
+  return mem;
+}
+
+// Callback for `qsort()` to sort observed times in ascending order.
+static int sort_asc_cb(const void *ap, const void *bp) {
+  const long long *a = ap, *b = bp;
+  return *a - *b;
+}
+
+// Get summary statistics of a series of test times.
+static stats_t get_stats(long long * const vals, const size_t num_vals) {
+  stats_t stats = { 0 };
+
+  // sort values in ascending order (used for min, max, and median)
+  qsort(vals, num_vals, sizeof(long long), sort_asc_cb);
+
+  // get low, high, and median
+  stats.lo = vals[0];
+  stats.hi = vals[num_vals - 1];
+  stats.median = vals[num_vals / 2];
+
+  // calculate mean
+  for (size_t i = 0; i < num_vals; i++) {
+    stats.mean += vals[i];
+  }
+  stats.mean /= num_vals;
+
+  // calculate standard deviation
+  for (size_t i = 0; i < num_vals; i++) {
+    stats.stddev += pow(stats.mean - vals[i], 2);
+  }
+  stats.stddev = sqrt(stats.stddev / num_vals);
+
+  // return stats
+  return stats;
+}
+
+// define xof benchmark function
+#define DEF_BENCH_XOF(FN) \
+  static stats_t bench_ ## FN (const size_t num_trials, const size_t src_len, const size_t dst_len) { \
+    /* allocate times, src, and dst buffers */ \
+    long long *times = checked_calloc(__func__, num_trials, sizeof(long long)); \
+    uint8_t *src = checked_calloc(__func__, num_trials, src_len); \
+    uint8_t *dst = checked_calloc(__func__, num_trials, dst_len); \
+    \
+    /* generate random source data */ \
+    rand_bytes(src, num_trials * src_len); \
+    \
+    /* run trials */ \
+    for (size_t i = 0; i < num_trials; i++) { \
+      /* call function */ \
+      const long long t0 = cpucycles(); \
+      FN (src + (i * src_len), src_len, dst + (i * dst_len), dst_len); \
+      const long long t1 = cpucycles() - t0; \
+      \
+      /* save time */ \
+      times[i] = t1; \
+    } \
+    \
+    /* generate summary stats */ \
+    const stats_t stats = get_stats(times, num_trials); \
+    \
+    /* free buffers */ \
+    free(src); \
+    free(times); \
+    \
+    /* return summary stats */ \
+    return stats; \
+  }
+
+// define hash benchmark function
+#define DEF_BENCH_HASH(FN, OUT_LEN) \
+  static stats_t bench_ ## FN (const size_t num_trials, const size_t src_len) { \
+    /* allocate times and src buffers */ \
+    long long *times = checked_calloc(__func__, num_trials, sizeof(long long)); \
+    uint8_t *src = checked_calloc(__func__, src_len, 1); \
+    \
+    /* run trials */ \
+    for (size_t i = 0; i < num_trials; i++) { \
+      /* generate random source data */ \
+      rand_bytes(src, src_len); \
+      \
+      /* call function */ \
+      uint8_t dst[OUT_LEN] = { 0 }; \
+      const long long t0 = cpucycles(); \
+      FN (src, src_len, dst); \
+      const long long t1 = cpucycles() - t0; \
+      \
+      /* save time */ \
+      times[i] = t1; \
+    } \
+    \
+    /* generate summary stats */ \
+    const stats_t stats = get_stats(times, num_trials); \
+    \
+    /* free buffers */ \
+    free(src); \
+    free(times); \
+    \
+    /* return summary stats */ \
+    return stats; \
+  }
+
+// define xof benchmarks *()
+DEF_BENCH_XOF(shake128)
+DEF_BENCH_XOF(shake256)
+
+// define hash benchmarks
+DEF_BENCH_HASH(sha3_224, 28)
+DEF_BENCH_HASH(sha3_256, 32)
+DEF_BENCH_HASH(sha3_384, 48)
+DEF_BENCH_HASH(sha3_512, 64)
+
+// print function stats to standard output as CSV row.
+static void print_row(const char *name, const size_t src_len, const size_t dst_len, stats_t fs) {
+    const double median_cpb = 1.0 * fs.median / src_len,
+                 mean_cpb = 1.0 * fs.mean / src_len;
+  printf("%s,%zu,%zu,%.0f,%.0f,%lld,%.0f,%.0f,%lld,%lld\n", name, dst_len, src_len, median_cpb, mean_cpb, fs.median, fs.mean, fs.stddev, fs.lo, fs.hi);
+}
+
+#define MIN_SRC_LEN 64
+#define MAX_SRC_LEN 2048
+
+#define MIN_DST_LEN 32
+#define MAX_DST_LEN 128
+
+int main(int argc, char *argv[]) {
+  // get number of trials from first command-line argument, or fall back
+  // to default if no argument was provided
+  const size_t num_trials = (argc > 1) ? atoi(argv[1]) : NUM_TRIALS;
+  if (num_trials < 2) {
+    fprintf(stderr, "num_trials must be greater than 1\n");
+    return -1;
+  }
+
+  // print metadata to stderr
+  fprintf(stderr,"info: cpucycles: version=%s implementation=%s persecond=%lld\ninfo: num_trials=%zu\n", cpucycles_version(), cpucycles_implementation(), cpucycles_persecond(), num_trials);
+
+  // print column headers to stdout
+  printf("function,dst,src,median_cpb,mean_cpb,median_cycles,mean_cycles,stddev_cycles,min_cycles,max_cycles\n");
+
+  // sha3-224
+  for (size_t src_len = MIN_SRC_LEN; src_len < MAX_SRC_LEN; src_len <<= 1) {
+    print_row("sha3_224", src_len, 28, bench_sha3_224(num_trials, src_len));
+  }
+
+  // sha3-256
+  for (size_t src_len = MIN_SRC_LEN; src_len < MAX_SRC_LEN; src_len <<= 1) {
+    print_row("sha3_256", src_len, 32, bench_sha3_256(num_trials, src_len));
+  }
+
+  // sha3-384
+  for (size_t src_len = MIN_SRC_LEN; src_len < MAX_SRC_LEN; src_len <<= 1) {
+    print_row("sha3_384", src_len, 48, bench_sha3_384(num_trials, src_len));
+  }
+
+  // sha3-512
+  for (size_t src_len = MIN_SRC_LEN; src_len < MAX_SRC_LEN; src_len <<= 1) {
+    print_row("sha3_512", src_len, 64, bench_sha3_512(num_trials, src_len));
+  }
+
+  for (size_t dst_len = MIN_DST_LEN; dst_len < MAX_DST_LEN; dst_len <<= 1) {
+    // shake128
+    for (size_t src_len = MIN_SRC_LEN; src_len < MAX_SRC_LEN; src_len <<= 1) {
+      print_row("shake128", src_len, dst_len, bench_shake128(num_trials, src_len, dst_len));
+    }
+
+    // shake256
+    for (size_t src_len = MIN_SRC_LEN; src_len < MAX_SRC_LEN; src_len <<= 1) {
+      print_row("shake256", src_len, dst_len, bench_shake256(num_trials, src_len, dst_len));
+    }
+  }
+
+  // return success
+  return 0;
+}
diff --git a/tests/bench/rand-bytes.h b/tests/bench/rand-bytes.h
new file mode 120000
index 0000000..421eaa6
--- /dev/null
+++ b/tests/bench/rand-bytes.h
@@ -0,0 +1 @@
+../../rand-bytes.h
\ No newline at end of file
diff --git a/tests/bench/sha3.c b/tests/bench/sha3.c
new file mode 120000
index 0000000..4748193
--- /dev/null
+++ b/tests/bench/sha3.c
@@ -0,0 +1 @@
+../../sha3.c
\ No newline at end of file
diff --git a/tests/bench/sha3.h b/tests/bench/sha3.h
new file mode 120000
index 0000000..b7c53d4
--- /dev/null
+++ b/tests/bench/sha3.h
@@ -0,0 +1 @@
+../../sha3.h
\ No newline at end of file
-- 
cgit v1.2.3