aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--sha3.c212
1 files changed, 134 insertions, 78 deletions
diff --git a/sha3.c b/sha3.c
index 3925f25..6f936b9 100644
--- a/sha3.c
+++ b/sha3.c
@@ -26,15 +26,15 @@
/** @cond INTERNAL */
// available backends
-#define SHA3_BACKEND_AVX512 8
-#define SHA3_BACKEND_SCALAR 0
+#define BACKEND_AVX512 8 // avx512 backend
+#define BACKEND_SCALAR 0 // scalar (default) backend
-// auto-detect backend if unspecified
+// auto-detect backend
#ifndef SHA3_BACKEND
#ifdef __AVX512F__
-#define SHA3_BACKEND SHA3_BACKEND_AVX512
+#define SHA3_BACKEND BACKEND_AVX512
#else /* !__AVX512F__ */
-#define SHA3_BACKEND SHA3_BACKEND_SCALAR
+#define SHA3_BACKEND BACKEND_SCALAR
#endif /* __AVX512F__ */
#endif /* SHA3_BACKEND */
@@ -50,7 +50,7 @@
// align memory to N bytes
#define ALIGN(N) __attribute__((aligned(N)))
-// round constants (used by iota)
+// Iota round constants.
static const uint64_t RCS[] = {
0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, 0x8000000080008000ULL,
0x000000000000808bULL, 0x0000000080000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL,
@@ -60,13 +60,16 @@ static const uint64_t RCS[] = {
0x8000000080008081ULL, 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL,
};
-#if (SHA3_BACKEND == SHA3_BACKEND_SCALAR) || defined(SHA3_TEST)
-// If AVX512 is supported and we are not building the test suite,
-// then do not compile the scalar step functions below.
+#if (SHA3_BACKEND == BACKEND_SCALAR) || defined(SHA3_TEST)
+// If AVX-512 is supported and we are not building the test suite,
+// then do not compile the scalar step functions.
//
-// (because they aren't used by the AVX512 implementation).
+// (because they aren't used by the AVX-512 implementation).
-// theta step of keccak permutation (scalar implementation)
+/**
+ * @brief Theta step of scalar Keccak permutation.
+ * @param[in,out] a Keccak state (array of 25 64-bit integers).
+ */
static inline void theta(uint64_t a[static 25]) {
const uint64_t c[5] = {
a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20],
@@ -91,7 +94,10 @@ static inline void theta(uint64_t a[static 25]) {
a[20] ^= d[0]; a[21] ^= d[1]; a[22] ^= d[2]; a[23] ^= d[3]; a[24] ^= d[4];
}
-// rho step of keccak permutation (scalar implementation)
+/**
+ * @brief Rho step of scalar Keccak permutation.
+ * @param[in,out] a Keccak state (array of 25 64-bit integers).
+ */
static inline void rho(uint64_t a[static 25]) {
a[ 1] = ROL(a[ 1], 1); // 1 % 64 = 1
a[ 2] = ROL(a[ 2], 62); // 190 % 64 = 62
@@ -123,7 +129,10 @@ static inline void rho(uint64_t a[static 25]) {
a[24] = ROL(a[24], 14); // 78 % 64 = 14
}
-// pi step of keccak permutation (scalar implementation)
+/**
+ * @brief Pi step of scalar Keccak permutation.
+ * @param[in,out] a Keccak state (array of 25 64-bit integers).
+ */
static inline void pi(uint64_t dst[static 25], const uint64_t src[static 25]) {
dst[ 0] = src[ 0];
dst[ 1] = src[ 6];
@@ -156,7 +165,10 @@ static inline void pi(uint64_t dst[static 25], const uint64_t src[static 25]) {
dst[24] = src[21];
}
-// chi step of keccak permutation (scalar implementation)
+/**
+ * @brief Chi step of scalar Keccak permutation.
+ * @param[in,out] a Keccak state (array of 25 64-bit integers).
+ */
static inline void chi(uint64_t dst[static 25], const uint64_t src[static 25]) {
dst[ 0] = src[ 0] ^ (~src[ 1] & src[ 2]);
dst[ 1] = src[ 1] ^ (~src[ 2] & src[ 3]);
@@ -189,12 +201,26 @@ static inline void chi(uint64_t dst[static 25], const uint64_t src[static 25]) {
dst[24] = src[24] ^ (~src[20] & src[21]);
}
-// iota step of keccak permutation (scalar implementation)
+/**
+ * @brief Iota step of scalar Keccak permutation.
+ * @param[in,out] a Keccak state (array of 25 64-bit integers).
+ */
static inline void iota(uint64_t a[static 25], const int i) {
a[0] ^= RCS[i];
}
-// keccak permutation (scalar implementation)
+/**
+ * @brief Scalar Keccak permutation.
+ *
+ * Apply `num_rounds` of Keccak permutation. This function is only
+ * called by:
+ *
+ * - `permute_scalar()`: 24 rounds
+ * - `permute12_scalar()`: 12 rounds. Used by TurboSHAKE and KangarooTwelve.
+ *
+ * @param[in,out] a Keccak state (array of 25 64-bit integers).
+ * @param[in] num_rounds Number of rounds (12 or 24).
+ */
static inline void permute_n_scalar(uint64_t a[static 25], const size_t num_rounds) {
uint64_t tmp[25] = { 0 };
for (size_t i = 0; i < num_rounds; i++) {
@@ -206,49 +232,64 @@ static inline void permute_n_scalar(uint64_t a[static 25], const size_t num_roun
}
}
-// 24 round keccak permutation (scalar implementation)
+/**
+ * @brief 24 round scalar Keccak permutation.
+ * @param[in,out] a Keccak state (array of 25 64-bit integers).
+ */
static inline void permute_scalar(uint64_t a[static 25]) {
permute_n_scalar(a, 24);
}
-// 12 round keccak permutation (scalar implementation)
-// (only used by turboshake)
+/**
+ * @brief 12 round scalar Keccak permutation.
+ * @note Only used by TurboSHAKE and KangarooTwelve.
+ * @param[in,out] a Keccak state (array of 25 64-bit integers).
+ */
static inline void permute12_scalar(uint64_t a[static 25]) {
permute_n_scalar(a, 12);
}
-#endif /* (SHA3_BACKEND == SHA3_BACKEND_SCALAR) || defined(SHA3_TEST) */
+#endif /* (SHA3_BACKEND == BACKEND_SCALAR) || defined(SHA3_TEST) */
-#if SHA3_BACKEND == SHA3_BACKEND_AVX512
+#if SHA3_BACKEND == BACKEND_AVX512
#include <immintrin.h>
-// keccak permutation (avx512 implementation).
-//
-// how it operates (roughly):
-//
-// 1. load rows from state `s` into the first 5 64-bit lanes of AVX-512
-// registers r0-r4, like so:
-//
-// -----------------------------------------------------------------
-// | | Lanes |
-// |-----|---------------------------------------------------------|
-// | Reg | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
-// |-----|-------|-------|-------|-------|-------|-----|-----|-----|
-// | r0 | s[ 0] | s[ 1] | s[ 2] | s[ 3] | s[ 4] | n/a | n/a | n/a |
-// | r0 | s[ 0] | s[ 1] | s[ 2] | s[ 3] | s[ 4] | n/a | n/a | n/a |
-// | r1 | s[ 5] | s[ 6] | s[ 7] | s[ 8] | s[ 9] | n/a | n/a | n/a |
-// | r2 | s[10] | s[11] | s[12] | s[13] | s[14] | n/a | n/a | n/a |
-// | r3 | s[15] | s[16] | s[17] | s[18] | s[19] | n/a | n/a | n/a |
-// | r4 | s[20] | s[21] | s[22] | s[23] | s[24] | n/a | n/a | n/a |
-// -----------------------------------------------------------------
-//
-// 2. For each round of 24 rounds:
-// a. Perform theta, rho, pi, and chi steps. pi, in particular, has
-// a large number of permutation registers (so it may spill).
-// b. Load round constant for current round and perform iota step.
-//
-// 3. store the rows first 5 64-bit lanes of registers r0-r4 back to the
-// state `s`.
-//
+/**
+ * @brief AVX-512 Keccak permutation.
+ *
+ * @param[in,out] s Keccak state (array of 25 64-bit integers).
+ * @param[in] num_rounds Number of rounds (12 or 24).
+ *
+ * Apply `num_rounds` of Keccak permutation. This function is only
+ * called by:
+ *
+ * - `permute_avx512()`: 24 rounds.
+ * - `permute12_avx512()`: 12 rounds. Used by TurboSHAKE and KangarooTwelve.
+ *
+ * How it works:
+ *
+ * 1. The Keccak state is loaded from `s` (an array of 25 64-bit
+ * unsigned integers) into the first 5 64-bit lanes of 5 AVX-512
+ * registers r0-r4, like this:
+ *
+ * -----------------------------------------------------------------
+ * | | 64-bit Lane |
+ * |-----|---------------------------------------------------------|
+ * | Reg | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ * |-----|-------|-------|-------|-------|-------|-----|-----|-----|
+ * | r0 | s[ 0] | s[ 1] | s[ 2] | s[ 3] | s[ 4] | n/a | n/a | n/a |
+ * | r1 | s[ 5] | s[ 6] | s[ 7] | s[ 8] | s[ 9] | n/a | n/a | n/a |
+ * | r2 | s[10] | s[11] | s[12] | s[13] | s[14] | n/a | n/a | n/a |
+ * | r3 | s[15] | s[16] | s[17] | s[18] | s[19] | n/a | n/a | n/a |
+ * | r4 | s[20] | s[21] | s[22] | s[23] | s[24] | n/a | n/a | n/a |
+ * -----------------------------------------------------------------
+ *
+ * 2. The Keccak permutation is applied `num_rounds` times, where
+ * `num_rounds` is either 12 for TurboSHAKE and KangarooTwelve or 24
+ * otherwise.
+ *
+ * 3. The permuted Keccak state is copied from the first 5 64-bit lanes
+ * of AVX-512 registers r0-r4 back to `s`.
+ */
static inline void permute_n_avx512(uint64_t s[static 25], const size_t num_rounds) {
// load rows (r0-r4)
__m512i r0 = _mm512_maskz_loadu_epi64(0x1f, s + 0), // row 0
@@ -257,7 +298,7 @@ static inline void permute_n_avx512(uint64_t s[static 25], const size_t num_roun
r3 = _mm512_maskz_loadu_epi64(0x1f, s + 15), // row 3
r4 = _mm512_maskz_loadu_epi64(0x1f, s + 20); // row 4
- // 24 rounds
+ // loop over rounds
for (size_t i = 0; i < num_rounds; i++) {
// theta
{
@@ -437,21 +478,30 @@ static inline void permute_n_avx512(uint64_t s[static 25], const size_t num_roun
_mm512_mask_storeu_epi64(s + 5 * 4, 0x1f, r4);
}
-// 24 round keccak permutation (avx512 implementation).
+/**
+ * @brief 24 round AVX-512 Keccak permutation.
+ * @param[in,out] a Keccak state (array of 25 64-bit integers).
+ */
static inline void permute_avx512(uint64_t s[static 25]) {
permute_n_avx512(s, 24);
}
-// 12 round keccak permutation (avx512 implementation).
+/**
+ * @brief 12 round AVX-512 Keccak permutation.
+ * @note Only used by TurboSHAKE and KangarooTwelve.
+ * @param[in,out] a Keccak state (array of 25 64-bit integers).
+ */
static inline void permute12_avx512(uint64_t s[static 25]) {
permute_n_avx512(s, 12);
}
-#endif /* SHA3_BACKEND == SHA3_BACKEND_AVX512 */
+#endif /* SHA3_BACKEND == BACKEND_AVX512 */
-#if SHA3_BACKEND == SHA3_BACKEND_AVX512
+#if SHA3_BACKEND == BACKEND_AVX512
+// use avx512 backend
#define permute permute_avx512
#define permute12 permute12_avx512
-#elif SHA3_BACKEND == SHA3_BACKEND_SCALAR
+#elif SHA3_BACKEND == BACKEND_SCALAR
+// use scalar backend
#define permute permute_scalar
#define permute12 permute12_scalar
#else
@@ -576,29 +626,35 @@ static inline size_t absorb12(sha3_state_t * const a, size_t num_bytes, const si
return num_bytes;
}
-// Get rate (number of bytes that can be absorbed before the internal
-// state is permuted).
+// Get the rate of a FIPS 202 hash function or extendable-output
+// function (XOF).
+//
+// The "rate" is the number of bytes that can be absorbed or squeezed
+// before the internal state is permuted. It is calculated by the total
+// state size (200 bytes) minus the capacity (FIPS 202, Section 5.2).
//
-// For hash functions, the capacity is always 2 times the output length
-// of the hash, and the rate is the total state size (200 bytes) minus
-// the capacity (FIPS 202, Section 5.2).
+// The capacity and rate is determined as follows:
//
-// XOFs do not have fixed-length output, but the capacity is 2 times the
-// named strength, and the rate is the total state size (200 bytes)
-// minus the capacity.
+// * Hash functions: The capacity is 2 times the length of the output
+// digest. For example, the length of a SHA3-256 digest is 32 bytes
+// (256 bits), so the capacity is 64 bytes and the rate is 136 bytes
+// (200-64 = 136).
+// * XOFs: The capacity is 2 times the named strength. For example, the
+// named strength of SHAKE128 is 16 bytes (128 bits), so the capacity
+// is 32 bytes and the rate is 168 bytes (200-32 = 168).
//
-// The table below shows the output size, capacity, and rate for each
+// The table below shows the output size, rate, and capacity for each
// FIPS 202 function. All values are in bytes.
//
// ---------------------------------------
-// | Function | Output | Capacity | Rate |
-// |----------|--------|----------|------|
-// | SHA3-224 | 28 | 56 | 144 |
-// | SHA3-256 | 32 | 64 | 136 |
-// | SHA3-384 | 48 | 96 | 104 |
-// | SHA3-512 | 64 | 136 | 72 |
-// | SHAKE128 | n/a | 32 | 168 |
-// | SHAKE256 | n/a | 64 | 136 |
+// | Function | Output | Rate | Capacity |
+// |----------|--------|------|----------|
+// | SHA3-224 | 28 | 144 | 56 |
+// | SHA3-256 | 32 | 136 | 64 |
+// | SHA3-384 | 48 | 104 | 96 |
+// | SHA3-512 | 64 | 72 | 128 |
+// | SHAKE128 | n/a | 168 | 32 |
+// | SHAKE256 | n/a | 136 | 64 |
// ---------------------------------------
//
#define RATE(len) (200 - 2 * (len))
@@ -1920,9 +1976,9 @@ void k12_once(const uint8_t *src, const size_t src_len, uint8_t *dst, const size
// Return backend name.
const char *sha3_backend(void) {
-#if SHA3_BACKEND == SHA3_BACKEND_AVX512
+#if SHA3_BACKEND == BACKEND_AVX512
return "avx512";
-#elif SHA3_BACKEND == SHA3_BACKEND_SCALAR
+#elif SHA3_BACKEND == BACKEND_SCALAR
return "scalar";
#endif /* SHA3_BACKEND */
}
@@ -2184,7 +2240,7 @@ static void test_permute_scalar(void) {
}
static void test_permute_avx512(void) {
-#if SHA3_BACKEND == SHA3_BACKEND_AVX512
+#if SHA3_BACKEND == BACKEND_AVX512
for (size_t i = 0; i < sizeof(PERMUTE_TESTS) / sizeof(PERMUTE_TESTS[0]); i++) {
const size_t exp_len = PERMUTE_TESTS[i].exp_len;
@@ -2196,7 +2252,7 @@ static void test_permute_avx512(void) {
fail_test(__func__, "", (uint8_t*) got, exp_len, (uint8_t*) PERMUTE_TESTS[i].exp, exp_len);
}
}
-#endif /* SHA3_BACKEND == SHA3_BACKEND_AVX512 */
+#endif /* SHA3_BACKEND == BACKEND_AVX512 */
}
static const struct {
@@ -2224,7 +2280,7 @@ static void test_permute12_scalar(void) {
}
static void test_permute12_avx512(void) {
-#if SHA3_BACKEND == SHA3_BACKEND_AVX512
+#if SHA3_BACKEND == BACKEND_AVX512
for (size_t i = 0; i < sizeof(PERMUTE12_TESTS) / sizeof(PERMUTE12_TESTS[0]); i++) {
const size_t exp_len = PERMUTE12_TESTS[i].exp_len;
@@ -2236,7 +2292,7 @@ static void test_permute12_avx512(void) {
fail_test(__func__, "", (uint8_t*) got, exp_len, (uint8_t*) PERMUTE12_TESTS[i].exp, exp_len);
}
}
-#endif /* SHA3_BACKEND == SHA3_BACKEND_AVX512 */
+#endif /* SHA3_BACKEND == BACKEND_AVX512 */
}
static void test_sha3_224(void) {