From 484ba99c833e053446e134026115f2860a2ba641 Mon Sep 17 00:00:00 2001 From: Paul Duncan Date: Wed, 1 May 2024 03:11:55 -0400 Subject: sha3.c: improve internal documentation, s/SHA3_BACKEND_/BACKEND_/ --- sha3.c | 212 +++++++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 134 insertions(+), 78 deletions(-) (limited to 'sha3.c') diff --git a/sha3.c b/sha3.c index 3925f25..6f936b9 100644 --- a/sha3.c +++ b/sha3.c @@ -26,15 +26,15 @@ /** @cond INTERNAL */ // available backends -#define SHA3_BACKEND_AVX512 8 -#define SHA3_BACKEND_SCALAR 0 +#define BACKEND_AVX512 8 // avx512 backend +#define BACKEND_SCALAR 0 // scalar (default) backend -// auto-detect backend if unspecified +// auto-detect backend #ifndef SHA3_BACKEND #ifdef __AVX512F__ -#define SHA3_BACKEND SHA3_BACKEND_AVX512 +#define SHA3_BACKEND BACKEND_AVX512 #else /* !__AVX512F__ */ -#define SHA3_BACKEND SHA3_BACKEND_SCALAR +#define SHA3_BACKEND BACKEND_SCALAR #endif /* __AVX512F__ */ #endif /* SHA3_BACKEND */ @@ -50,7 +50,7 @@ // align memory to N bytes #define ALIGN(N) __attribute__((aligned(N))) -// round constants (used by iota) +// Iota round constants. static const uint64_t RCS[] = { 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, 0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL, @@ -60,13 +60,16 @@ static const uint64_t RCS[] = { 0x8000000080008081ULL, 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL, }; -#if (SHA3_BACKEND == SHA3_BACKEND_SCALAR) || defined(SHA3_TEST) -// If AVX512 is supported and we are not building the test suite, -// then do not compile the scalar step functions below. +#if (SHA3_BACKEND == BACKEND_SCALAR) || defined(SHA3_TEST) +// If AVX-512 is supported and we are not building the test suite, +// then do not compile the scalar step functions. // -// (because they aren't used by the AVX512 implementation). +// (because they aren't used by the AVX-512 implementation). -// theta step of keccak permutation (scalar implementation) +/** + * @brief Theta step of scalar Keccak permutation. + * @param[in,out] a Keccak state (array of 25 64-bit integers). + */ static inline void theta(uint64_t a[static 25]) { const uint64_t c[5] = { a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20], @@ -91,7 +94,10 @@ static inline void theta(uint64_t a[static 25]) { a[20] ^= d[0]; a[21] ^= d[1]; a[22] ^= d[2]; a[23] ^= d[3]; a[24] ^= d[4]; } -// rho step of keccak permutation (scalar implementation) +/** + * @brief Rho step of scalar Keccak permutation. + * @param[in,out] a Keccak state (array of 25 64-bit integers). + */ static inline void rho(uint64_t a[static 25]) { a[ 1] = ROL(a[ 1], 1); // 1 % 64 = 1 a[ 2] = ROL(a[ 2], 62); // 190 % 64 = 62 @@ -123,7 +129,10 @@ static inline void rho(uint64_t a[static 25]) { a[24] = ROL(a[24], 14); // 78 % 64 = 14 } -// pi step of keccak permutation (scalar implementation) +/** + * @brief Pi step of scalar Keccak permutation. + * @param[in,out] a Keccak state (array of 25 64-bit integers). + */ static inline void pi(uint64_t dst[static 25], const uint64_t src[static 25]) { dst[ 0] = src[ 0]; dst[ 1] = src[ 6]; @@ -156,7 +165,10 @@ static inline void pi(uint64_t dst[static 25], const uint64_t src[static 25]) { dst[24] = src[21]; } -// chi step of keccak permutation (scalar implementation) +/** + * @brief Chi step of scalar Keccak permutation. + * @param[in,out] a Keccak state (array of 25 64-bit integers). + */ static inline void chi(uint64_t dst[static 25], const uint64_t src[static 25]) { dst[ 0] = src[ 0] ^ (~src[ 1] & src[ 2]); dst[ 1] = src[ 1] ^ (~src[ 2] & src[ 3]); @@ -189,12 +201,26 @@ static inline void chi(uint64_t dst[static 25], const uint64_t src[static 25]) { dst[24] = src[24] ^ (~src[20] & src[21]); } -// iota step of keccak permutation (scalar implementation) +/** + * @brief Iota step of scalar Keccak permutation. + * @param[in,out] a Keccak state (array of 25 64-bit integers). + */ static inline void iota(uint64_t a[static 25], const int i) { a[0] ^= RCS[i]; } -// keccak permutation (scalar implementation) +/** + * @brief Scalar Keccak permutation. + * + * Apply `num_rounds` of Keccak permutation. This function is only + * called by: + * + * - `permute_scalar()`: 24 rounds + * - `permute12_scalar()`: 12 rounds. Used by TurboSHAKE and KangarooTwelve. + * + * @param[in,out] a Keccak state (array of 25 64-bit integers). + * @param[in] num_rounds Number of rounds (12 or 24). + */ static inline void permute_n_scalar(uint64_t a[static 25], const size_t num_rounds) { uint64_t tmp[25] = { 0 }; for (size_t i = 0; i < num_rounds; i++) { @@ -206,49 +232,64 @@ static inline void permute_n_scalar(uint64_t a[static 25], const size_t num_roun } } -// 24 round keccak permutation (scalar implementation) +/** + * @brief 24 round scalar Keccak permutation. + * @param[in,out] a Keccak state (array of 25 64-bit integers). + */ static inline void permute_scalar(uint64_t a[static 25]) { permute_n_scalar(a, 24); } -// 12 round keccak permutation (scalar implementation) -// (only used by turboshake) +/** + * @brief 12 round scalar Keccak permutation. + * @note Only used by TurboSHAKE and KangarooTwelve. + * @param[in,out] a Keccak state (array of 25 64-bit integers). + */ static inline void permute12_scalar(uint64_t a[static 25]) { permute_n_scalar(a, 12); } -#endif /* (SHA3_BACKEND == SHA3_BACKEND_SCALAR) || defined(SHA3_TEST) */ +#endif /* (SHA3_BACKEND == BACKEND_SCALAR) || defined(SHA3_TEST) */ -#if SHA3_BACKEND == SHA3_BACKEND_AVX512 +#if SHA3_BACKEND == BACKEND_AVX512 #include -// keccak permutation (avx512 implementation). -// -// how it operates (roughly): -// -// 1. load rows from state `s` into the first 5 64-bit lanes of AVX-512 -// registers r0-r4, like so: -// -// ----------------------------------------------------------------- -// | | Lanes | -// |-----|---------------------------------------------------------| -// | Reg | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | -// |-----|-------|-------|-------|-------|-------|-----|-----|-----| -// | r0 | s[ 0] | s[ 1] | s[ 2] | s[ 3] | s[ 4] | n/a | n/a | n/a | -// | r0 | s[ 0] | s[ 1] | s[ 2] | s[ 3] | s[ 4] | n/a | n/a | n/a | -// | r1 | s[ 5] | s[ 6] | s[ 7] | s[ 8] | s[ 9] | n/a | n/a | n/a | -// | r2 | s[10] | s[11] | s[12] | s[13] | s[14] | n/a | n/a | n/a | -// | r3 | s[15] | s[16] | s[17] | s[18] | s[19] | n/a | n/a | n/a | -// | r4 | s[20] | s[21] | s[22] | s[23] | s[24] | n/a | n/a | n/a | -// ----------------------------------------------------------------- -// -// 2. For each round of 24 rounds: -// a. Perform theta, rho, pi, and chi steps. pi, in particular, has -// a large number of permutation registers (so it may spill). -// b. Load round constant for current round and perform iota step. -// -// 3. store the rows first 5 64-bit lanes of registers r0-r4 back to the -// state `s`. -// +/** + * @brief AVX-512 Keccak permutation. + * + * @param[in,out] s Keccak state (array of 25 64-bit integers). + * @param[in] num_rounds Number of rounds (12 or 24). + * + * Apply `num_rounds` of Keccak permutation. This function is only + * called by: + * + * - `permute_avx512()`: 24 rounds. + * - `permute12_avx512()`: 12 rounds. Used by TurboSHAKE and KangarooTwelve. + * + * How it works: + * + * 1. The Keccak state is loaded from `s` (an array of 25 64-bit + * unsigned integers) into the first 5 64-bit lanes of 5 AVX-512 + * registers r0-r4, like this: + * + * ----------------------------------------------------------------- + * | | 64-bit Lane | + * |-----|---------------------------------------------------------| + * | Reg | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + * |-----|-------|-------|-------|-------|-------|-----|-----|-----| + * | r0 | s[ 0] | s[ 1] | s[ 2] | s[ 3] | s[ 4] | n/a | n/a | n/a | + * | r1 | s[ 5] | s[ 6] | s[ 7] | s[ 8] | s[ 9] | n/a | n/a | n/a | + * | r2 | s[10] | s[11] | s[12] | s[13] | s[14] | n/a | n/a | n/a | + * | r3 | s[15] | s[16] | s[17] | s[18] | s[19] | n/a | n/a | n/a | + * | r4 | s[20] | s[21] | s[22] | s[23] | s[24] | n/a | n/a | n/a | + * ----------------------------------------------------------------- + * + * 2. The Keccak permutation is applied `num_rounds` times, where + * `num_rounds` is either 12 for TurboSHAKE and KangarooTwelve or 24 + * otherwise. + * + * 3. The permuted Keccak state is copied from the first 5 64-bit lanes + * of AVX-512 registers r0-r4 back to `s`. + */ static inline void permute_n_avx512(uint64_t s[static 25], const size_t num_rounds) { // load rows (r0-r4) __m512i r0 = _mm512_maskz_loadu_epi64(0x1f, s + 0), // row 0 @@ -257,7 +298,7 @@ static inline void permute_n_avx512(uint64_t s[static 25], const size_t num_roun r3 = _mm512_maskz_loadu_epi64(0x1f, s + 15), // row 3 r4 = _mm512_maskz_loadu_epi64(0x1f, s + 20); // row 4 - // 24 rounds + // loop over rounds for (size_t i = 0; i < num_rounds; i++) { // theta { @@ -437,21 +478,30 @@ static inline void permute_n_avx512(uint64_t s[static 25], const size_t num_roun _mm512_mask_storeu_epi64(s + 5 * 4, 0x1f, r4); } -// 24 round keccak permutation (avx512 implementation). +/** + * @brief 24 round AVX-512 Keccak permutation. + * @param[in,out] a Keccak state (array of 25 64-bit integers). + */ static inline void permute_avx512(uint64_t s[static 25]) { permute_n_avx512(s, 24); } -// 12 round keccak permutation (avx512 implementation). +/** + * @brief 12 round AVX-512 Keccak permutation. + * @note Only used by TurboSHAKE and KangarooTwelve. + * @param[in,out] a Keccak state (array of 25 64-bit integers). + */ static inline void permute12_avx512(uint64_t s[static 25]) { permute_n_avx512(s, 12); } -#endif /* SHA3_BACKEND == SHA3_BACKEND_AVX512 */ +#endif /* SHA3_BACKEND == BACKEND_AVX512 */ -#if SHA3_BACKEND == SHA3_BACKEND_AVX512 +#if SHA3_BACKEND == BACKEND_AVX512 +// use avx512 backend #define permute permute_avx512 #define permute12 permute12_avx512 -#elif SHA3_BACKEND == SHA3_BACKEND_SCALAR +#elif SHA3_BACKEND == BACKEND_SCALAR +// use scalar backend #define permute permute_scalar #define permute12 permute12_scalar #else @@ -576,29 +626,35 @@ static inline size_t absorb12(sha3_state_t * const a, size_t num_bytes, const si return num_bytes; } -// Get rate (number of bytes that can be absorbed before the internal -// state is permuted). +// Get the rate of a FIPS 202 hash function or extendable-output +// function (XOF). +// +// The "rate" is the number of bytes that can be absorbed or squeezed +// before the internal state is permuted. It is calculated by the total +// state size (200 bytes) minus the capacity (FIPS 202, Section 5.2). // -// For hash functions, the capacity is always 2 times the output length -// of the hash, and the rate is the total state size (200 bytes) minus -// the capacity (FIPS 202, Section 5.2). +// The capacity and rate is determined as follows: // -// XOFs do not have fixed-length output, but the capacity is 2 times the -// named strength, and the rate is the total state size (200 bytes) -// minus the capacity. +// * Hash functions: The capacity is 2 times the length of the output +// digest. For example, the length of a SHA3-256 digest is 32 bytes +// (256 bits), so the capacity is 64 bytes and the rate is 136 bytes +// (200-64 = 136). +// * XOFs: The capacity is 2 times the named strength. For example, the +// named strength of SHAKE128 is 16 bytes (128 bits), so the capacity +// is 32 bytes and the rate is 168 bytes (200-32 = 168). // -// The table below shows the output size, capacity, and rate for each +// The table below shows the output size, rate, and capacity for each // FIPS 202 function. All values are in bytes. // // --------------------------------------- -// | Function | Output | Capacity | Rate | -// |----------|--------|----------|------| -// | SHA3-224 | 28 | 56 | 144 | -// | SHA3-256 | 32 | 64 | 136 | -// | SHA3-384 | 48 | 96 | 104 | -// | SHA3-512 | 64 | 136 | 72 | -// | SHAKE128 | n/a | 32 | 168 | -// | SHAKE256 | n/a | 64 | 136 | +// | Function | Output | Rate | Capacity | +// |----------|--------|------|----------| +// | SHA3-224 | 28 | 144 | 56 | +// | SHA3-256 | 32 | 136 | 64 | +// | SHA3-384 | 48 | 104 | 96 | +// | SHA3-512 | 64 | 72 | 128 | +// | SHAKE128 | n/a | 168 | 32 | +// | SHAKE256 | n/a | 136 | 64 | // --------------------------------------- // #define RATE(len) (200 - 2 * (len)) @@ -1920,9 +1976,9 @@ void k12_once(const uint8_t *src, const size_t src_len, uint8_t *dst, const size // Return backend name. const char *sha3_backend(void) { -#if SHA3_BACKEND == SHA3_BACKEND_AVX512 +#if SHA3_BACKEND == BACKEND_AVX512 return "avx512"; -#elif SHA3_BACKEND == SHA3_BACKEND_SCALAR +#elif SHA3_BACKEND == BACKEND_SCALAR return "scalar"; #endif /* SHA3_BACKEND */ } @@ -2184,7 +2240,7 @@ static void test_permute_scalar(void) { } static void test_permute_avx512(void) { -#if SHA3_BACKEND == SHA3_BACKEND_AVX512 +#if SHA3_BACKEND == BACKEND_AVX512 for (size_t i = 0; i < sizeof(PERMUTE_TESTS) / sizeof(PERMUTE_TESTS[0]); i++) { const size_t exp_len = PERMUTE_TESTS[i].exp_len; @@ -2196,7 +2252,7 @@ static void test_permute_avx512(void) { fail_test(__func__, "", (uint8_t*) got, exp_len, (uint8_t*) PERMUTE_TESTS[i].exp, exp_len); } } -#endif /* SHA3_BACKEND == SHA3_BACKEND_AVX512 */ +#endif /* SHA3_BACKEND == BACKEND_AVX512 */ } static const struct { @@ -2224,7 +2280,7 @@ static void test_permute12_scalar(void) { } static void test_permute12_avx512(void) { -#if SHA3_BACKEND == SHA3_BACKEND_AVX512 +#if SHA3_BACKEND == BACKEND_AVX512 for (size_t i = 0; i < sizeof(PERMUTE12_TESTS) / sizeof(PERMUTE12_TESTS[0]); i++) { const size_t exp_len = PERMUTE12_TESTS[i].exp_len; @@ -2236,7 +2292,7 @@ static void test_permute12_avx512(void) { fail_test(__func__, "", (uint8_t*) got, exp_len, (uint8_t*) PERMUTE12_TESTS[i].exp, exp_len); } } -#endif /* SHA3_BACKEND == SHA3_BACKEND_AVX512 */ +#endif /* SHA3_BACKEND == BACKEND_AVX512 */ } static void test_sha3_224(void) { -- cgit v1.2.3