author     Paul Duncan <pabs@pablotron.org>    2024-04-29 08:51:50 -0400
committer  Paul Duncan <pabs@pablotron.org>    2024-04-29 08:51:50 -0400
commit     bd3e6fdb474903d0acdb9b9e01add0fa2462273f (patch)
tree       d7ada8dfa7caf8f4fb73eb45882e2e5ef6aa700a
parent     bfdddfb2953da843074f73560eaedf14cb024a21 (diff)
sha3.c: use shared RCS, rename scalar and avx512 permute() to permute_{scalar,avx512}(), hard-code num_rounds to 24 in permute_{scalar,avx512}(), add permute12_{scalar,avx512}(), absorb12(), and xof12_{init,absorb_raw,absorb,absorb_done,squeeze_raw,squeeze,once}(), update turboshake to use xof12_*(), move permute tests to PERMUTE_TESTS static array, rename test_permute() to test_permute_scalar(), add test_permute_avx512(), add PERMUTE12_TESTS and test_permute12_{scalar,avx512}()
-rw-r--r--  sha3.c | 625
1 file changed, 547 insertions(+), 78 deletions(-)
@@ -34,6 +34,16 @@
 // number of rounds for permute()
 #define SHA3_NUM_ROUNDS 24

+// round constants (used by iota)
+static const uint64_t RCS[] = {
+  0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, 0x8000000080008000ULL,
+  0x000000000000808bULL, 0x0000000080000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL,
+  0x000000000000008aULL, 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
+  0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL, 0x8000000000008003ULL,
+  0x8000000000008002ULL, 0x8000000000000080ULL, 0x000000000000800aULL, 0x800000008000000aULL,
+  0x8000000080008081ULL, 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL,
+};
+
 #if !defined(__AVX512F__) || defined(SHA3_TEST)
 // If AVX512 is supported and we are not building the test suite,
 // then do not compile the scalar step functions below.
@@ -153,35 +163,33 @@ static inline void chi(uint64_t dst[static 25], const uint64_t src[static 25]) {
 // iota step of keccak permutation (scalar implementation)
 static inline void iota(uint64_t a[static 25], const int i) {
-  // round constants (ambiguous in spec)
-  static const uint64_t RCS[] = {
-    0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, 0x8000000080008000ULL,
-    0x000000000000808bULL, 0x0000000080000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL,
-    0x000000000000008aULL, 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
-    0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL, 0x8000000000008003ULL,
-    0x8000000000008002ULL, 0x8000000000000080ULL, 0x000000000000800aULL, 0x800000008000000aULL,
-    0x8000000080008081ULL, 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL,
-  };
-
   a[0] ^= RCS[i];
 }
 #endif /* !defined(__AVX512F__) || defined(SHA3_TEST) */

 #ifndef __AVX512F__
-// keccak permutation (scalar implementation)
-//
-// note: clang is better about inlining this than gcc with a
-// configurable number of rounds. the configurable number of rounds is
-// only used by turboshake, so it might be worth creating a specialized
-// `permute12()` to handle turboshake.
-static inline void permute(uint64_t a[static 25], const size_t num_rounds) {
+// 24-round keccak permutation (scalar implementation)
+static inline void permute_scalar(uint64_t a[static 25]) {
   uint64_t tmp[25] = { 0 };
-  for (int i = 0; i < (int) num_rounds; i++) {
+  for (size_t i = 0; i < SHA3_NUM_ROUNDS; i++) {
     theta(a);
     rho(a);
     pi(tmp, a);
     chi(a, tmp);
-    iota(a, 24 - num_rounds + i);
+    iota(a, i);
+  }
+}
+
+// 12 round keccak permutation (scalar implementation)
+// (only used by turboshake)
+static inline void permute12_scalar(uint64_t a[static 25]) {
+  uint64_t tmp[25] = { 0 };
+  for (size_t i = 0; i < 12; i++) {
+    theta(a);
+    rho(a);
+    pi(tmp, a);
+    chi(a, tmp);
+    iota(a, 12 + i);
   }
 }
 #endif /* !__AVX512F__ */
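The RCS table added above is not arbitrary: it is the output of the rc(t) LFSR defined in FIPS 202, section 3.2.5. A minimal generator sketch that reproduces all 24 constants (illustrative only, not part of this commit):

    #include <stdint.h>

    // rc(t): bit t of the LFSR sequence over x^8 + x^6 + x^5 + x^4 + 1
    static uint64_t rc(int t) {
      uint8_t r = 0x01;
      for (int i = 0; i < t % 255; i++) {
        // shift left, reduce by 0x71 (x^6 + x^5 + x^4 + 1) on overflow
        r = (uint8_t) ((r << 1) ^ ((r >> 7) * 0x71));
      }
      return r & 1;
    }

    // round constant for round i: bit rc(j + 7i) lands at position 2^j - 1
    static uint64_t round_constant(int i) {
      uint64_t v = 0;
      for (int j = 0; j < 7; j++) {
        v |= rc(j + 7 * i) << ((1 << j) - 1);
      }
      return v; // round_constant(0) == 0x0000000000000001ULL, etc.
    }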
@@ -189,7 +197,7 @@ static inline void permute(uint64_t a[static 25], const size_t num_rounds) {
 #ifdef __AVX512F__
 #include <immintrin.h>

-// keccak permutation (avx512 implementation).
+// 24 round keccak permutation (avx512 implementation).
 //
 // copied from `permute_avx512_fast()` in `tests/permute/permute.c`. all
 // steps are inlined as blocks. ~3x faster than scalar implementation,
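For reference, the per-row rotate values in the vs0..vs4 tables of the next hunk follow the triangular-number schedule of FIPS 202, section 3.2.2: lane (0, 0) is never rotated, and the other 24 lanes get offsets (t+1)(t+2)/2 mod 64 along the (x, y) -> (y, 2x+3y) walk. A sketch that regenerates them (illustrative only):

    // fill r[row][col] with the rho rotate amounts used below
    static void rho_offsets(unsigned r[5][5]) {
      r[0][0] = 0; // lane (0, 0) is not rotated
      unsigned x = 1, y = 0;
      for (unsigned t = 0; t < 24; t++) {
        r[y][x] = ((t + 1) * (t + 2) / 2) % 64;
        const unsigned nx = y, ny = (2 * x + 3 * y) % 5;
        x = nx;
        y = ny;
      }
    }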
@@ -220,26 +228,240 @@ static inline void permute(uint64_t s[static 25], const size_t num_rounds) {
 // as noted above, this is not the most efficient avx512 implementation;
 // the row registers have three empty slots and there are a lot of loads
 // that could be removed with a little more work.
-static inline void permute(uint64_t s[static 25], const size_t num_rounds) {
+static inline void permute_avx512(uint64_t s[static 25]) {
   // unaligned load mask and permutation indices
   uint8_t mask = 0x1f,
           m0b = 0x01;
   const __mmask8 m = _load_mask8(&mask),
                  m0 = _load_mask8(&m0b);

-  // round constants (used in iota)
-  static const uint64_t RCS[] = {
-    0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, 0x8000000080008000ULL,
-    0x000000000000808bULL, 0x0000000080000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL,
-    0x000000000000008aULL, 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
-    0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL, 0x8000000000008003ULL,
-    0x8000000000008002ULL, 0x8000000000000080ULL, 0x000000000000800aULL, 0x800000008000000aULL,
-    0x8000000080008081ULL, 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL,
-  };
+  // load round constant
+  // (note: the remaining 16 constants are loaded 8 at a time in iota below)
+  __m512i rc = _mm512_loadu_epi64((void*) RCS);
+
+  // load rc permutation
+  static const uint64_t rc_ps[8] = { 1, 2, 3, 4, 5, 6, 7, 0 };
+  const __m512i rc_p = _mm512_loadu_epi64((void*) rc_ps);
+
+  // load rows
+  __m512i r0 = _mm512_maskz_loadu_epi64(m, (void*) (s)),
+          r1 = _mm512_maskz_loadu_epi64(m, (void*) (s + 5)),
+          r2 = _mm512_maskz_loadu_epi64(m, (void*) (s + 10)),
+          r3 = _mm512_maskz_loadu_epi64(m, (void*) (s + 15)),
+          r4 = _mm512_maskz_loadu_epi64(m, (void*) (s + 20));
+
+  for (int i = 0; i < SHA3_NUM_ROUNDS; i++) {
+    // theta
+    {
+      const __m512i i0 = _mm512_setr_epi64(4, 0, 1, 2, 3, 5, 6, 7),
+                    i1 = _mm512_setr_epi64(1, 2, 3, 4, 0, 5, 6, 7);
+
+      // c = xor(r0, r1, r2, r3, r4)
+      const __m512i r01 = _mm512_xor_epi64(r0, r1),
+                    r23 = _mm512_xor_epi64(r2, r3),
+                    r03 = _mm512_xor_epi64(r01, r23),
+                    c = _mm512_xor_epi64(r03, r4);
+
+      // d = xor(permute(i0, c), permute(i1, rol(c, 1)))
+      const __m512i d0 = _mm512_permutexvar_epi64(i0, c),
+                    d1 = _mm512_permutexvar_epi64(i1, _mm512_rol_epi64(c, 1)),
+                    d = _mm512_xor_epi64(d0, d1);
+
+      // row = xor(row, d)
+      r0 = _mm512_xor_epi64(r0, d);
+      r1 = _mm512_xor_epi64(r1, d);
+      r2 = _mm512_xor_epi64(r2, d);
+      r3 = _mm512_xor_epi64(r3, d);
+      r4 = _mm512_xor_epi64(r4, d);
+    }
+
+    // rho
+    {
+      // unaligned load mask and rotate values
+      static const uint64_t vs0[8] = { 0, 1, 62, 28, 27, 0, 0, 0 },
+                            vs1[8] = { 36, 44, 6, 55, 20, 0, 0, 0 },
+                            vs2[8] = { 3, 10, 43, 25, 39, 0, 0, 0 },
+                            vs3[8] = { 41, 45, 15, 21, 8, 0, 0, 0 },
+                            vs4[8] = { 18, 2, 61, 56, 14, 0, 0, 0 };
+
+      // load rotate values
+      const __m512i v0 = _mm512_loadu_epi64((void*) vs0),
+                    v1 = _mm512_loadu_epi64((void*) vs1),
+                    v2 = _mm512_loadu_epi64((void*) vs2),
+                    v3 = _mm512_loadu_epi64((void*) vs3),
+                    v4 = _mm512_loadu_epi64((void*) vs4);
+
+      // rotate
+      r0 = _mm512_rolv_epi64(r0, v0);
+      r1 = _mm512_rolv_epi64(r1, v1);
+      r2 = _mm512_rolv_epi64(r2, v2);
+      r3 = _mm512_rolv_epi64(r3, v3);
+      r4 = _mm512_rolv_epi64(r4, v4);
+    }
+
+    // pi
+    {
+      // mask bytes
+      uint8_t m01b = 0x03,
+              m23b = 0x0c,
+              m4b = 0x10;
+
+      // load masks
+      const __mmask8 m01 = _load_mask8(&m01b),
+                     m23 = _load_mask8(&m23b),
+                     m4 = _load_mask8(&m4b);
+
+      // permutation indices
+      //
+      // (note: these are masked so only the relevant indices for
+      // _mm512_maskz_permutex2var_epi64() in each array are filled in)
+      static const uint64_t t0_pis_01[8] = { 0, 8 + 1, 0, 0, 0, 0, 0, 0 },
+                            t0_pis_23[8] = { 0, 0, 2, 8 + 3, 0, 0, 0, 0 },
+                            t0_pis_4[8] = { 0, 0, 0, 0, 4, 0, 0, 0 },
+
+                            t1_pis_01[8] = { 3, 8 + 4, 0, 0, 0, 0, 0, 0 },
+                            t1_pis_23[8] = { 0, 0, 0, 8 + 1, 0, 0, 0, 0 },
+                            t1_pis_4[8] = { 0, 0, 0, 0, 2, 0, 0, 0 },
+
+                            t2_pis_01[8] = { 1, 8 + 2, 0, 0, 0, 0, 0, 0 },
+                            t2_pis_23[8] = { 0, 0, 3, 8 + 4, 0, 0, 0, 0 },
+                            t2_pis_4[8] = { 0, 0, 0, 0, 0, 0, 0, 0 },
+
+                            t3_pis_01[8] = { 4, 8 + 0, 0, 0, 0, 0, 0, 0 },
+                            t3_pis_23[8] = { 0, 0, 1, 8 + 2, 0, 0, 0, 0 },
+                            t3_pis_4[8] = { 0, 0, 0, 0, 3, 0, 0, 0 },
+
+                            t4_pis_01[8] = { 2, 8 + 3, 0, 0, 0, 0, 0, 0 },
+                            t4_pis_23[8] = { 0, 0, 4, 8 + 0, 0, 0, 0, 0 },
+                            t4_pis_4[8] = { 0, 0, 0, 0, 1, 0, 0, 0 };
+
+      // load permutation indices
+      const __m512i t0_p01 = _mm512_maskz_loadu_epi64(m01, (void*) t0_pis_01),
+                    t0_p23 = _mm512_maskz_loadu_epi64(m23, (void*) t0_pis_23),
+                    t0_p4 = _mm512_maskz_loadu_epi64(m4, (void*) t0_pis_4),
+
+                    t1_p01 = _mm512_maskz_loadu_epi64(m01, (void*) t1_pis_01),
+                    t1_p23 = _mm512_maskz_loadu_epi64(m23, (void*) t1_pis_23),
+                    t1_p4 = _mm512_maskz_loadu_epi64(m4, (void*) t1_pis_4),
+
+                    t2_p01 = _mm512_maskz_loadu_epi64(m01, (void*) t2_pis_01),
+                    t2_p23 = _mm512_maskz_loadu_epi64(m23, (void*) t2_pis_23),
+                    t2_p4 = _mm512_maskz_loadu_epi64(m4, (void*) t2_pis_4),
+
+                    t3_p01 = _mm512_maskz_loadu_epi64(m01, (void*) t3_pis_01),
+                    t3_p23 = _mm512_maskz_loadu_epi64(m23, (void*) t3_pis_23),
+                    t3_p4 = _mm512_maskz_loadu_epi64(m4, (void*) t3_pis_4),
+
+                    t4_p01 = _mm512_maskz_loadu_epi64(m01, (void*) t4_pis_01),
+                    t4_p23 = _mm512_maskz_loadu_epi64(m23, (void*) t4_pis_23),
+                    t4_p4 = _mm512_maskz_loadu_epi64(m4, (void*) t4_pis_4);
+
+      // permute rows
+      const __m512i t0e0 = _mm512_maskz_permutex2var_epi64(m01, r0, t0_p01, r1),
+                    t0e2 = _mm512_maskz_permutex2var_epi64(m23, r2, t0_p23, r3),
+                    t0e4 = _mm512_maskz_permutex2var_epi64(m4, r4, t0_p4, r0),
+                    t0 = _mm512_or_epi64(_mm512_or_epi64(t0e0, t0e2), t0e4),
+
+                    t1e0 = _mm512_maskz_permutex2var_epi64(m01, r0, t1_p01, r1),
+                    t1e2 = _mm512_maskz_permutex2var_epi64(m23, r2, t1_p23, r3),
+                    t1e4 = _mm512_maskz_permutex2var_epi64(m4, r4, t1_p4, r0),
+                    t1 = _mm512_or_epi64(_mm512_or_epi64(t1e0, t1e2), t1e4),
+
+                    t2e0 = _mm512_maskz_permutex2var_epi64(m01, r0, t2_p01, r1),
+                    t2e2 = _mm512_maskz_permutex2var_epi64(m23, r2, t2_p23, r3),
+                    t2e4 = _mm512_maskz_permutex2var_epi64(m4, r4, t2_p4, r0),
+                    t2 = _mm512_or_epi64(_mm512_or_epi64(t2e0, t2e2), t2e4),
+
+                    t3e0 = _mm512_maskz_permutex2var_epi64(m01, r0, t3_p01, r1),
+                    t3e2 = _mm512_maskz_permutex2var_epi64(m23, r2, t3_p23, r3),
+                    t3e4 = _mm512_maskz_permutex2var_epi64(m4, r4, t3_p4, r0),
+                    t3 = _mm512_or_epi64(_mm512_or_epi64(t3e0, t3e2), t3e4),
+
+                    t4e0 = _mm512_maskz_permutex2var_epi64(m01, r0, t4_p01, r1),
+                    t4e2 = _mm512_maskz_permutex2var_epi64(m23, r2, t4_p23, r3),
+                    t4e4 = _mm512_maskz_permutex2var_epi64(m4, r4, t4_p4, r0),
+                    t4 = _mm512_or_epi64(_mm512_or_epi64(t4e0, t4e2), t4e4);
+
+      // store rows
+      r0 = t0;
+      r1 = t1;
+      r2 = t2;
+      r3 = t3;
+      r4 = t4;
+    }
+
+    // chi
+    {
+      // permutation indices
+      static const uint64_t pis0[8] = { 1, 2, 3, 4, 0, 0, 0, 0 },
+                            pis1[8] = { 2, 3, 4, 0, 1, 0, 0, 0 };
+
+      // load permutation indices
+      const __m512i p0 = _mm512_maskz_loadu_epi64(m, (void*) pis0),
+                    p1 = _mm512_maskz_loadu_epi64(m, (void*) pis1);
+
+      // permute rows
+      const __m512i t0_e0 = _mm512_maskz_permutexvar_epi64(m, p0, r0),
+                    t0_e1 = _mm512_maskz_permutexvar_epi64(m, p1, r0),
+                    t0 = _mm512_xor_epi64(r0, _mm512_andnot_epi64(t0_e0, t0_e1)),
+
+                    t1_e0 = _mm512_maskz_permutexvar_epi64(m, p0, r1),
+                    t1_e1 = _mm512_maskz_permutexvar_epi64(m, p1, r1),
+                    t1 = _mm512_xor_epi64(r1, _mm512_andnot_epi64(t1_e0, t1_e1)),
+
+                    t2_e0 = _mm512_maskz_permutexvar_epi64(m, p0, r2),
+                    t2_e1 = _mm512_maskz_permutexvar_epi64(m, p1, r2),
+                    t2 = _mm512_xor_epi64(r2, _mm512_andnot_epi64(t2_e0, t2_e1)),
+
+                    t3_e0 = _mm512_maskz_permutexvar_epi64(m, p0, r3),
+                    t3_e1 = _mm512_maskz_permutexvar_epi64(m, p1, r3),
+                    t3 = _mm512_xor_epi64(r3, _mm512_andnot_epi64(t3_e0, t3_e1)),
+
+                    t4_e0 = _mm512_maskz_permutexvar_epi64(m, p0, r4),
+                    t4_e1 = _mm512_maskz_permutexvar_epi64(m, p1, r4),
+                    t4 = _mm512_xor_epi64(r4, _mm512_andnot_epi64(t4_e0, t4_e1));
+
+      // store rows
+      r0 = t0;
+      r1 = t1;
+      r2 = t2;
+      r3 = t3;
+      r4 = t4;
+    }
+
+    // iota
+    {
+      // xor round constant, shuffle rc register
+      r0 = _mm512_mask_xor_epi64(r0, m0, r0, rc);
+      rc = _mm512_permutexvar_epi64(rc_p, rc);
+
+      if (((i + 1) % 8) == 0 && i != 23) {
+        // load next set of round constants
+        // (note: i + 1 is 8 or 16 here, so the load stays in bounds)
+        rc = _mm512_loadu_epi64((void*) (RCS + (i + 1)));
+      }
+    }
+  }
+
+  // store rows
+  _mm512_mask_storeu_epi64((void*) (s), m, r0),
+  _mm512_mask_storeu_epi64((void*) (s + 5), m, r1),
+  _mm512_mask_storeu_epi64((void*) (s + 10), m, r2),
+  _mm512_mask_storeu_epi64((void*) (s + 15), m, r3),
+  _mm512_mask_storeu_epi64((void*) (s + 20), m, r4);
+}
+
+// 12 round keccak permutation (avx512 implementation).
+static inline void permute12_avx512(uint64_t s[static 25]) {
+  // unaligned load mask and permutation indices
+  uint8_t mask = 0x1f,
+          m0b = 0x01;
+  const __mmask8 m = _load_mask8(&mask),
+                 m0 = _load_mask8(&m0b);

   // load round constant
   // note: this will bomb if num_rounds < 8 or num_rounds > 24.
-  __m512i rc = _mm512_loadu_epi64((void*) (RCS + 24 - num_rounds));
+  __m512i rc = _mm512_loadu_epi64((void*) (RCS + 12));

   // load rc permutation
   static const uint64_t rc_ps[8] = { 1, 2, 3, 4, 5, 6, 7, 0 };
   const __m512i rc_p = _mm512_loadu_epi64((void*) rc_ps);
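The masked permutes in the pi and chi blocks above implement the standard Keccak lane moves. As a cross-check, a scalar sketch of the same two steps, assuming the row-major lane order s[x + 5*y] implied by the r0..r4 row loads (illustrative only):

    // pi: row y, column x pulls from row x, column (x + 3y) % 5;
    // chi: per row, a[x] ^= ~a[x+1] & a[x+2] (column indices mod 5)
    static void pi_chi_sketch(uint64_t dst[25], const uint64_t src[25]) {
      uint64_t t[25];
      for (int y = 0; y < 5; y++) {
        for (int x = 0; x < 5; x++) {
          t[x + 5 * y] = src[((x + 3 * y) % 5) + 5 * x]; // pi
        }
      }
      for (int y = 0; y < 5; y++) {
        for (int x = 0; x < 5; x++) {
          dst[x + 5 * y] = t[x + 5 * y] ^
            (~t[((x + 1) % 5) + 5 * y] & t[((x + 2) % 5) + 5 * y]); // chi
        }
      }
    }

The t1_pis_01 = { 3, 8 + 4, ... } entries above, for example, are exactly this mapping: row 1 draws its first lane from r0[3] and its second from r1[4].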
@@ -252,7 +474,7 @@ static inline void permute(uint64_t s[static 25], const size_t num_rounds) {
           r3 = _mm512_maskz_loadu_epi64(m, (void*) (s + 15)),
           r4 = _mm512_maskz_loadu_epi64(m, (void*) (s + 20));

-  for (int i = 0; i < (int) num_rounds; i++) {
+  for (int i = 0; i < 12; i++) {
     // theta
     {
       const __m512i i0 = _mm512_setr_epi64(4, 0, 1, 2, 3, 5, 6, 7),
@@ -437,10 +659,10 @@ static inline void permute(uint64_t s[static 25], const size_t num_rounds) {
       r0 = _mm512_mask_xor_epi64(r0, m0, r0, rc);
       rc = _mm512_permutexvar_epi64(rc_p, rc);

-      if (((24 - num_rounds + i + 1) % 8) == 0 && i != 23) {
+      if (((12 + i + 1) % 8) == 0 && (12 + i) != 23) {
         // load next set of round constants
         // note: this will bomb if num_rounds < 8 or num_rounds > 24.
-        rc = _mm512_loadu_epi64((void*) (RCS + 24 - num_rounds + (i + 1)));
+        rc = _mm512_loadu_epi64((void*) (RCS + (12 + i + 1)));
       }
     }
   }
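In scalar terms, the rc register bookkeeping above (rotate every round, reload every eight) reduces to plain table indexing: the 12-round permutation starts at constant 12, mirroring iota(a, 12 + i) in permute12_scalar(). A one-line sketch:

    // scalar equivalent of the avx512 iota constant scheduling above
    static inline void iota12(uint64_t a[static 25], const int i) {
      a[0] ^= RCS[12 + i]; // i in [0, 12)
    }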
@@ -454,9 +676,76 @@ static inline void permute(uint64_t s[static 25], const size_t num_rounds) {
 }
 #endif /* __AVX512F__ */

+#ifdef __AVX512F__
+#define permute permute_avx512
+#define permute12 permute12_avx512
+#else /* !__AVX512F__ */
+#define permute permute_scalar
+#define permute12 permute12_scalar
+#endif /* __AVX512F__ */
+
 // absorb message into state, return updated byte count
 // used by `hash_absorb()`, `hash_once()`, and `xof_absorb_raw()`
-static inline size_t absorb(sha3_state_t * const a, size_t num_bytes, const size_t rate, const size_t num_rounds, const uint8_t *m, size_t m_len) {
+static inline size_t absorb(sha3_state_t * const a, size_t num_bytes, const size_t rate, const uint8_t *m, size_t m_len) {
+  // absorb aligned chunks
+  if ((num_bytes & 7) == 0 && (((uintptr_t) m) & 7) == 0) {
+    // absorb 32 byte chunks (4 x uint64)
+    while (m_len >= 32 && num_bytes <= rate - 32) {
+      // xor chunk into state
+      // (FIXME: does not vectorize for some reason, even when unrolled)
+      for (size_t i = 0; i < 4; i++) {
+        a->u64[num_bytes/8 + i] ^= ((uint64_t*) m)[i];
+      }
+
+      // update counters
+      num_bytes += 32;
+      m += 32;
+      m_len -= 32;
+
+      if (num_bytes == rate) {
+        // permute state
+        permute(a->u64);
+        num_bytes = 0;
+      }
+    }
+
+    // absorb 8 byte chunks (1 x uint64)
+    while (m_len >= 8 && num_bytes <= rate - 8) {
+      // xor chunk into state
+      a->u64[num_bytes/8] ^= *((uint64_t*) m);
+
+      // update counters
+      num_bytes += 8;
+      m += 8;
+      m_len -= 8;
+
+      if (num_bytes == rate) {
+        // permute state
+        permute(a->u64);
+        num_bytes = 0;
+      }
+    }
+  }
+
+  // absorb remaining bytes
+  for (size_t i = 0; i < m_len; i++) {
+    // xor byte into state
+    a->u8[num_bytes++] ^= m[i];
+
+    if (num_bytes == rate) {
+      // permute state
+      permute(a->u64);
+      num_bytes = 0;
+    }
+  }
+
+  // return byte count
+  return num_bytes;
+}
+
+// absorb message into xof12 state, return updated byte count
+// used by `xof12_absorb_raw()`
+static inline size_t absorb12(sha3_state_t * const a, size_t num_bytes, const size_t rate, const uint8_t *m, size_t m_len) {
   // absorb aligned chunks
   if ((num_bytes & 7) == 0 && (((uintptr_t) m) & 7) == 0) {
     // absorb 32 byte chunks (4 x uint64)
@@ -474,7 +763,7 @@ static inline size_t absorb(sha3_state_t * const a, size_t num_bytes, const size

       if (num_bytes == rate) {
         // permute state
-        permute(a->u64, num_rounds);
+        permute12(a->u64);
         num_bytes = 0;
       }
     }
@@ -491,7 +780,7 @@ static inline size_t absorb(sha3_state_t * const a, size_t num_bytes, const size

       if (num_bytes == rate) {
         // permute state
-        permute(a->u64, num_rounds);
+        permute12(a->u64);
         num_bytes = 0;
       }
     }
@@ -504,7 +793,7 @@ static inline size_t absorb(sha3_state_t * const a, size_t num_bytes, const size

     if (num_bytes == rate) {
       // permute state
-      permute(a->u64, num_rounds);
+      permute12(a->u64);
       num_bytes = 0;
     }
   }
@@ -513,6 +802,7 @@ static inline size_t absorb(sha3_state_t * const a, size_t num_bytes, const size
   // return byte count
   return num_bytes;
 }
+
 // Get rate (number of bytes that can be absorbed before the internal
 // state is permuted).
 //
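For context on the rate mentioned above: per FIPS 202, a SHA-3 digest of len bytes implies a capacity of 2*len bytes, so the rate is whatever remains of the 200 byte (1600 bit) state. A hypothetical helper equivalent to the RATE() macro used below (the macro itself sits outside the hunks shown):

    // assumed shape of RATE(): rate = state size - capacity
    static inline size_t sha3_rate(const size_t len) {
      return 200 - 2 * len; // e.g. SHA3-256: 200 - 2*32 = 136 bytes
    }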
@@ -546,7 +836,7 @@ static inline void hash_once(const uint8_t *m, size_t m_len, uint8_t * const dst
   sha3_state_t a = { .u64 = { 0 } };

   // absorb message, get new internal length
-  const size_t len = absorb(&a, 0, RATE(dst_len), SHA3_NUM_ROUNDS, m, m_len);
+  const size_t len = absorb(&a, 0, RATE(dst_len), m, m_len);

   // append suffix and padding
   // (note: suffix and padding are ambiguous in spec)
@@ -554,7 +844,7 @@ static inline void hash_once(const uint8_t *m, size_t m_len, uint8_t * const dst
   a.u8[RATE(dst_len)-1] ^= 0x80;

   // final permutation
-  permute(a.u64, SHA3_NUM_ROUNDS);
+  permute(a.u64);

   // copy to destination
   memcpy(dst, a.u8, dst_len);
@@ -575,7 +865,7 @@ static inline bool hash_absorb(sha3_t * const hash, const size_t rate, const uin
   }

   // absorb bytes, return success
-  hash->num_bytes = absorb(&(hash->a), hash->num_bytes, rate, SHA3_NUM_ROUNDS, src, len);
+  hash->num_bytes = absorb(&(hash->a), hash->num_bytes, rate, src, len);
   return true;
 }

@@ -593,7 +883,7 @@ static inline void hash_final(sha3_t * const hash, const size_t rate, uint8_t *
     hash->a.u8[rate - 1] ^= 0x80;

     // permute
-    permute(hash->a.u64, SHA3_NUM_ROUNDS);
+    permute(hash->a.u64);
   }

   // copy to destination
@@ -637,14 +927,14 @@ static inline void xof_init(sha3_xof_t * const xof) {
 // context has already been squeezed.
 //
 // Called by `xof_absorb()` and `xof_once()`.
-static inline void xof_absorb_raw(sha3_xof_t * const xof, const size_t rate, const size_t num_rounds, const uint8_t *m, size_t m_len) {
-  xof->num_bytes = absorb(&(xof->a), xof->num_bytes, rate, num_rounds, m, m_len);
+static inline void xof_absorb_raw(sha3_xof_t * const xof, const size_t rate, const uint8_t *m, size_t m_len) {
+  xof->num_bytes = absorb(&(xof->a), xof->num_bytes, rate, m, m_len);
 }

 // Absorb data into XOF context.
 //
 // Returns `false` if this context has already been squeezed.
-static inline _Bool xof_absorb(sha3_xof_t * const xof, const size_t rate, const size_t num_rounds, const uint8_t * const m, size_t m_len) {
+static inline _Bool xof_absorb(sha3_xof_t * const xof, const size_t rate, const uint8_t * const m, size_t m_len) {
   // check context state
   if (xof->squeezing) {
     // xof has already been squeezed, return error
@@ -652,19 +942,19 @@ static inline _Bool xof_absorb(sha3_xof_t * const xof, const size_t rate, const
   }

   // absorb, return success
-  xof_absorb_raw(xof, rate, num_rounds, m, m_len);
+  xof_absorb_raw(xof, rate, m, m_len);
   return true;
 }

 // Finalize absorb, switch mode of XOF context to squeezing.
-static inline void xof_absorb_done(sha3_xof_t * const xof, const size_t rate, const size_t num_rounds, const uint8_t pad) {
+static inline void xof_absorb_done(sha3_xof_t * const xof, const size_t rate, const uint8_t pad) {
   // append suffix (s6.2) and padding
   // (note: suffix and padding are ambiguous in spec)
   xof->a.u8[xof->num_bytes] ^= pad;
   xof->a.u8[rate - 1] ^= 0x80;

   // permute
-  permute(xof->a.u64, num_rounds);
+  permute(xof->a.u64);

   // switch to squeeze mode
   xof->num_bytes = 0;
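The pad byte applied in xof_absorb_done() above carries the domain separation suffix with the first bit of pad10*1 already merged in. The values below follow FIPS 202 and SP 800-185; the macro names are illustrative (this file's SHAKE_PAD and CSHAKE_PAD are defined outside the hunks shown):

    #define PAD_SHA3   0x06 // suffix "01",   then pad bit -> 0b00000110
    #define PAD_SHAKE  0x1f // suffix "1111", then pad bit -> 0b00011111
    #define PAD_CSHAKE 0x04 // suffix "00",   then pad bit -> 0b00000100
    // turboshake instead accepts any separation byte D in [0x01, 0x7f]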
@@ -672,7 +962,7 @@ static inline void xof_absorb_done(sha3_xof_t * const xof, const size_t rate, co
 }

 // Squeeze data without checking mode (used by `xof_once()`).
-static inline void xof_squeeze_raw(sha3_xof_t * const xof, const size_t rate, const size_t num_rounds, uint8_t *dst, size_t dst_len) {
+static inline void xof_squeeze_raw(sha3_xof_t * const xof, const size_t rate, uint8_t *dst, size_t dst_len) {
   if (!xof->num_bytes) {
     // num_bytes is zero, so we are reading from the start of the
     // internal state buffer. while `dst_len` is greater than rate,
@@ -681,7 +971,7 @@ static inline void xof_squeeze_raw(sha3_xof_t * const xof, const size_t rate, co
     // rate-sized chunks to destination
     while (dst_len >= rate) {
       memcpy(dst, xof->a.u8, rate); // copy rate-sized chunk
-      permute(xof->a.u64, num_rounds); // permute state
+      permute(xof->a.u64); // permute state

       // update destination pointer and length
       dst += rate;
@@ -705,7 +995,7 @@ static inline void xof_squeeze_raw(sha3_xof_t * const xof, const size_t rate, co
       dst[i] = xof->a.u8[xof->num_bytes++]; // squeeze byte to destination

       if (xof->num_bytes == rate) {
-        permute(xof->a.u64, num_rounds); // permute state
+        permute(xof->a.u64); // permute state
         xof->num_bytes = 0; // clear read bytes count
       }
     }
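A usage sketch for the squeeze paths above, via the SHAKE128 wrappers generated by DEF_SHAKE further below (shake128_init is assumed here; it is not part of this diff). One-shot and streaming calls produce identical output:

    void shake128_example(void) {
      const uint8_t msg[5] = { 'h', 'e', 'l', 'l', 'o' };
      uint8_t a[32] = { 0 }, b[32] = { 0 };

      // one-shot: absorb and squeeze in a single call
      shake128(msg, sizeof(msg), a, sizeof(a));

      // streaming: absorb in two pieces, squeeze in two pieces
      sha3_xof_t xof;
      shake128_init(&xof);
      shake128_absorb(&xof, msg, 2);
      shake128_absorb(&xof, msg + 2, 3);
      shake128_squeeze(&xof, b, 16);
      shake128_squeeze(&xof, b + 16, 16);

      // a and b now hold the same 32 bytes
    }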
@@ -713,28 +1003,137 @@ static inline void xof_squeeze_raw(sha3_xof_t * const xof, const size_t rate, co
 }

 // squeeze data from xof
-static inline void xof_squeeze(sha3_xof_t * const xof, const size_t rate, const size_t num_rounds, const uint8_t pad, uint8_t * const dst, const size_t dst_len) {
+static inline void xof_squeeze(sha3_xof_t * const xof, const size_t rate, const uint8_t pad, uint8_t * const dst, const size_t dst_len) {
   // check state
   if (!xof->squeezing) {
     // finalize absorb
-    xof_absorb_done(xof, rate, num_rounds, pad);
+    xof_absorb_done(xof, rate, pad);
   }

-  xof_squeeze_raw(xof, rate, num_rounds, dst, dst_len);
+  xof_squeeze_raw(xof, rate, dst, dst_len);
 }

 // one-shot xof absorb and squeeze
-static inline void xof_once(const size_t rate, const size_t num_rounds, const uint8_t pad, const uint8_t * const src, const size_t src_len, uint8_t * const dst, const size_t dst_len) {
+static inline void xof_once(const size_t rate, const uint8_t pad, const uint8_t * const src, const size_t src_len, uint8_t * const dst, const size_t dst_len) {
   // init
   sha3_xof_t xof;
   xof_init(&xof);

   // absorb
-  xof_absorb_raw(&xof, rate, num_rounds, src, src_len);
-  xof_absorb_done(&xof, rate, num_rounds, pad);
+  xof_absorb_raw(&xof, rate, src, src_len);
+  xof_absorb_done(&xof, rate, pad);
+
+  // squeeze
+  xof_squeeze_raw(&xof, rate, dst, dst_len);
+}
+
+// initialize xof12 context
+static inline void xof12_init(sha3_xof_t * const xof) {
+  memset(xof, 0, sizeof(sha3_xof_t));
+}
+
+// Absorb data into XOF12 context without checking to see if the
+// context has already been squeezed.
+//
+// Called by `xof12_absorb()` and `xof12_once()`.
+static inline void xof12_absorb_raw(sha3_xof_t * const xof, const size_t rate, const uint8_t *m, size_t m_len) {
+  xof->num_bytes = absorb12(&(xof->a), xof->num_bytes, rate, m, m_len);
+}
+
+// Absorb data into XOF12 context.
+//
+// Returns `false` if this XOF12 context has already been squeezed.
+static inline _Bool xof12_absorb(sha3_xof_t * const xof, const size_t rate, const uint8_t * const m, size_t m_len) {
+  // check context state
+  if (xof->squeezing) {
+    // xof has already been squeezed, return error
+    return false;
+  }
+
+  // absorb, return success
+  xof12_absorb_raw(xof, rate, m, m_len);
+  return true;
+}
+
+// Finalize absorb, switch mode of XOF12 context to squeezing.
+static inline void xof12_absorb_done(sha3_xof_t * const xof, const size_t rate, const uint8_t pad) {
+  // append suffix (s6.2) and padding
+  // (note: suffix and padding are ambiguous in spec)
+  xof->a.u8[xof->num_bytes] ^= pad;
+  xof->a.u8[rate - 1] ^= 0x80;
+
+  // permute
+  permute12(xof->a.u64);
+
+  // switch to squeeze mode
+  xof->num_bytes = 0;
+  xof->squeezing = true;
+}
+
+// Squeeze data without checking mode (used by `xof12_once()`).
+static inline void xof12_squeeze_raw(sha3_xof_t * const xof, const size_t rate, uint8_t *dst, size_t dst_len) {
+  if (!xof->num_bytes) {
+    // num_bytes is zero, so we are reading from the start of the
+    // internal state buffer. while `dst_len` is greater than rate,
+    // copy `rate` sized chunks directly from the internal state buffer
+    // to the destination, then permute the internal state. squeeze
+    // rate-sized chunks to destination
+    while (dst_len >= rate) {
+      memcpy(dst, xof->a.u8, rate); // copy rate-sized chunk
+      permute12(xof->a.u64); // permute state
+
+      // update destination pointer and length
+      dst += rate;
+      dst_len -= rate;
+    }
+
+    if (dst_len > 0) {
+      // the remaining destination length is less than `rate`, so copy a
+      // `dst_len`-sized chunk from the internal state to the
+      // destination buffer, then update the read byte count.
+
+      // squeeze dst_len-sized block to destination
+      memcpy(dst, xof->a.u8, dst_len); // copy dst_len-sized chunk
+      xof->num_bytes = dst_len; // update read byte count
+    }
+  } else {
+    // fall back to squeezing one byte at a time
+
+    // squeeze bytes to destination
+    for (size_t i = 0; i < dst_len; i++) {
+      dst[i] = xof->a.u8[xof->num_bytes++]; // squeeze byte to destination
+
+      if (xof->num_bytes == rate) {
+        permute12(xof->a.u64); // permute state
+        xof->num_bytes = 0; // clear read bytes count
+      }
+    }
+  }
+}
+
+// squeeze data from xof12 context
+static inline void xof12_squeeze(sha3_xof_t * const xof, const size_t rate, const uint8_t pad, uint8_t * const dst, const size_t dst_len) {
+  // check state
+  if (!xof->squeezing) {
+    // finalize absorb
+    xof12_absorb_done(xof, rate, pad);
+  }
+
+  xof12_squeeze_raw(xof, rate, dst, dst_len);
+}
+
+// one-shot xof12 absorb and squeeze
+static inline void xof12_once(const size_t rate, const uint8_t pad, const uint8_t * const src, const size_t src_len, uint8_t * const dst, const size_t dst_len) {
+  // init
+  sha3_xof_t xof;
+  xof12_init(&xof);
+
+  // absorb
+  xof12_absorb_raw(&xof, rate, src, src_len);
+  xof12_absorb_done(&xof, rate, pad);

   // squeeze
-  xof_squeeze_raw(&xof, rate, num_rounds, dst, dst_len);
+  xof12_squeeze_raw(&xof, rate, dst, dst_len);
 }

 // define shake iterative context and one-shot functions
@@ -746,17 +1145,17 @@ static inline void xof_once(const size_t rate, const size_t num_rounds, const ui
 \
   /* absorb bytes into shake context */ \
   _Bool shake ## BITS ## _absorb(sha3_xof_t * const xof, const uint8_t * const m, const size_t len) { \
-    return xof_absorb(xof, SHAKE ## BITS ## _RATE, SHA3_NUM_ROUNDS, m, len); \
+    return xof_absorb(xof, SHAKE ## BITS ## _RATE, m, len); \
   } \
 \
   /* squeeze bytes from shake context */ \
   void shake ## BITS ## _squeeze(sha3_xof_t * const xof, uint8_t * const dst, const size_t dst_len) { \
-    xof_squeeze(xof, SHAKE ## BITS ## _RATE, SHA3_NUM_ROUNDS, SHAKE_PAD, dst, dst_len); \
+    xof_squeeze(xof, SHAKE ## BITS ## _RATE, SHAKE_PAD, dst, dst_len); \
   } \
 \
   /* one-shot shake absorb and squeeze */ \
   void shake ## BITS(const uint8_t * const src, const size_t src_len, uint8_t * const dst, const size_t dst_len) { \
-    xof_once(SHAKE ## BITS ## _RATE, SHA3_NUM_ROUNDS, SHAKE_PAD, src, src_len, dst, dst_len); \
+    xof_once(SHAKE ## BITS ## _RATE, SHAKE_PAD, src, src_len, dst, dst_len); \
   }

 // shake padding byte and rates
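The bytepad() helper referenced in the next hunk builds on left_encode() from SP 800-185, section 2.3.1. A self-contained sketch of the encoding (illustrative, not this file's implementation):

    // left_encode(x): minimal big-endian bytes of x, preceded by a
    // single byte holding that byte count; returns total length
    static size_t left_encode(uint8_t buf[static 9], uint64_t x) {
      size_t n = 1;
      while (n < 8 && (x >> (8 * n)) != 0) {
        n++; // number of bytes needed to encode x (at least one)
      }
      buf[0] = (uint8_t) n;
      for (size_t i = 0; i < n; i++) {
        buf[i + 1] = (uint8_t) (x >> (8 * (n - 1 - i)));
      }
      return n + 1; // e.g. left_encode(0) -> { 0x01, 0x00 }
    }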
@@ -1070,12 +1469,12 @@ static inline bytepad_t bytepad(const size_t data_len, const size_t width) {
 #define DEF_CSHAKE(BITS) \
   /* absorb data into cshake context */ \
   _Bool cshake ## BITS ## _xof_absorb(sha3_xof_t * const xof, const uint8_t * const msg, const size_t len) { \
-    return xof_absorb(xof, SHAKE ## BITS ## _RATE, SHA3_NUM_ROUNDS, msg, len); \
+    return xof_absorb(xof, SHAKE ## BITS ## _RATE, msg, len); \
   } \
 \
   /* squeeze data from cshake context */ \
   void cshake ## BITS ## _xof_squeeze(sha3_xof_t * const xof, uint8_t * const dst, const size_t len) { \
-    xof_squeeze(xof, SHAKE ## BITS ## _RATE, SHA3_NUM_ROUNDS, CSHAKE_PAD, dst, len); \
+    xof_squeeze(xof, SHAKE ## BITS ## _RATE, CSHAKE_PAD, dst, len); \
   } \
 \
   /* initialize cshake context */ \
@@ -1522,7 +1921,7 @@ static inline _Bool turboshake_init(turboshake_t * const ts, const uint8_t pad)
   }

   // init xof
-  xof_init(&(ts->xof));
+  xof12_init(&(ts->xof));
   ts->pad = pad;

   // return success
@@ -1544,22 +1943,22 @@ static inline _Bool turboshake_init(turboshake_t * const ts, const uint8_t pad)
 \
   /* absorb bytes into turboshake context. */ \
   _Bool turboshake ## BITS ## _absorb(turboshake_t * const ts, const uint8_t * const m, const size_t len) { \
-    return xof_absorb(&(ts->xof), SHAKE ## BITS ## _RATE, TURBOSHAKE_NUM_ROUNDS, m, len); \
+    return xof12_absorb(&(ts->xof), SHAKE ## BITS ## _RATE, m, len); \
   } \
 \
   /* squeeze bytes from turboshake context */ \
   void turboshake ## BITS ## _squeeze(turboshake_t * const ts, uint8_t * const dst, const size_t dst_len) { \
-    xof_squeeze(&(ts->xof), SHAKE ## BITS ## _RATE, TURBOSHAKE_NUM_ROUNDS, ts->pad, dst, dst_len); \
+    xof12_squeeze(&(ts->xof), SHAKE ## BITS ## _RATE, ts->pad, dst, dst_len); \
   } \
 \
   /* one-shot turboshake with default pad byte */ \
   void turboshake ## BITS (const uint8_t * const src, const size_t src_len, uint8_t * const dst, const size_t dst_len) { \
-    xof_once(SHAKE ## BITS ## _RATE, TURBOSHAKE_NUM_ROUNDS, TURBOSHAKE_PAD, src, src_len, dst, dst_len); \
+    xof12_once(SHAKE ## BITS ## _RATE, TURBOSHAKE_PAD, src, src_len, dst, dst_len); \
   } \
 \
   /* one-shot turboshake with custom pad byte */ \
   void turboshake ## BITS ## _custom(const uint8_t pad, const uint8_t * const src, const size_t src_len, uint8_t * const dst, const size_t dst_len) { \
-    xof_once(SHAKE ## BITS ## _RATE, TURBOSHAKE_NUM_ROUNDS, pad, src, src_len, dst, dst_len); \
+    xof12_once(SHAKE ## BITS ## _RATE, pad, src, src_len, dst, dst_len); \
   }

 // declare turboshake functions
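A usage sketch for the generated turboshake functions (both signatures come from the macro above; 0x0b is an arbitrary but valid domain separation byte):

    void turboshake_example(void) {
      const uint8_t msg[3] = { 'a', 'b', 'c' };
      uint8_t out[32] = { 0 };

      // one-shot with the default pad byte
      turboshake128(msg, sizeof(msg), out, sizeof(out));

      // one-shot with a custom domain separation byte in [0x01, 0x7f]
      turboshake128_custom(0x0b, msg, sizeof(msg), out, sizeof(out));
    }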
@@ -1978,17 +2377,84 @@ static void test_iota(void) {
   }
 }

-static void test_permute(void) {
-  uint64_t a[25] = { [0] = 0x00000001997b5853ULL, [16] = 0x8000000000000000ULL };
-  const uint64_t exp[] = {
-    0xE95A9E40EF2F24C8ULL, 0x24C64DAE57C8F1D1ULL,
-    0x8CAA629F80192BB9ULL, 0xD0B178A0541C4107ULL,
-  };
+static const struct {
+  uint64_t a[25]; // input state
+  const uint64_t exp[25]; // expected value
+  const size_t exp_len; // length of exp, in bytes
+} PERMUTE_TESTS[] = {{
+  .a = { [0] = 0x00000001997b5853ULL, [16] = 0x8000000000000000ULL },
+  .exp = { 0xE95A9E40EF2F24C8ULL, 0x24C64DAE57C8F1D1ULL, 0x8CAA629F80192BB9ULL, 0xD0B178A0541C4107ULL },
+  .exp_len = 32,
+}};
+
+static void test_permute_scalar(void) {
+  for (size_t i = 0; i < sizeof(PERMUTE_TESTS) / sizeof(PERMUTE_TESTS[0]); i++) {
+    const size_t exp_len = PERMUTE_TESTS[i].exp_len;
+
+    uint64_t got[25] = { 0 };
+    memcpy(got, PERMUTE_TESTS[i].a, sizeof(got));
+    permute_scalar(got);
+
+    if (memcmp(got, PERMUTE_TESTS[i].exp, exp_len)) {
+      fail_test(__func__, "", (uint8_t*) got, exp_len, (uint8_t*) PERMUTE_TESTS[i].exp, exp_len);
+    }
+  }
+}

-  permute(a, SHA3_NUM_ROUNDS);
-  if (memcmp(exp, a, sizeof(exp))) {
-    fail_test(__func__, "", (uint8_t*) a, 32, (uint8_t*) exp, 32);
+static void test_permute_avx512(void) {
+#ifdef __AVX512F__
+  for (size_t i = 0; i < sizeof(PERMUTE_TESTS) / sizeof(PERMUTE_TESTS[0]); i++) {
+    const size_t exp_len = PERMUTE_TESTS[i].exp_len;
+
+    uint64_t got[25] = { 0 };
+    memcpy(got, PERMUTE_TESTS[i].a, sizeof(got));
+    permute_avx512(got);
+
+    if (memcmp(got, PERMUTE_TESTS[i].exp, exp_len)) {
+      fail_test(__func__, "", (uint8_t*) got, exp_len, (uint8_t*) PERMUTE_TESTS[i].exp, exp_len);
+    }
+  }
+#endif /* __AVX512F__ */
+}
+
+static const struct {
+  uint64_t a[25]; // input state
+  const uint64_t exp[25]; // expected value
+  const size_t exp_len; // length of exp, in bytes
+} PERMUTE12_TESTS[] = {{
+  .a = { [0] = 0x00000001997b5853ULL, [16] = 0x8000000000000000ULL },
+  .exp = { 0x8B346BAFF5DA94C6ULL, 0xD7D37EC35E3B2EECULL, 0xBBF724EABFD84018ULL, 0x5E3C1AFA4EA7B3A1ULL },
+  .exp_len = 32,
+}};
+
+static void test_permute12_scalar(void) {
+  for (size_t i = 0; i < sizeof(PERMUTE12_TESTS) / sizeof(PERMUTE12_TESTS[0]); i++) {
+    const size_t exp_len = PERMUTE12_TESTS[i].exp_len;
+
+    uint64_t got[25] = { 0 };
+    memcpy(got, PERMUTE12_TESTS[i].a, sizeof(got));
+    permute12_scalar(got);
+
+    if (memcmp(got, PERMUTE12_TESTS[i].exp, exp_len)) {
+      fail_test(__func__, "", (uint8_t*) got, exp_len, (uint8_t*) PERMUTE12_TESTS[i].exp, exp_len);
+    }
+  }
+}
+
+static void test_permute12_avx512(void) {
+#ifdef __AVX512F__
+  for (size_t i = 0; i < sizeof(PERMUTE12_TESTS) / sizeof(PERMUTE12_TESTS[0]); i++) {
+    const size_t exp_len = PERMUTE12_TESTS[i].exp_len;
+
+    uint64_t got[25] = { 0 };
+    memcpy(got, PERMUTE12_TESTS[i].a, sizeof(got));
+    permute12_avx512(got);
+
+    if (memcmp(got, PERMUTE12_TESTS[i].exp, exp_len)) {
+      fail_test(__func__, "", (uint8_t*) got, exp_len, (uint8_t*) PERMUTE12_TESTS[i].exp, exp_len);
+    }
   }
+#endif /* __AVX512F__ */
 }

 static void test_sha3_224(void) {
@@ -6143,7 +6609,10 @@ int main(void) {
   test_pi();
   test_chi();
   test_iota();
-  test_permute();
+  test_permute_scalar();
+  test_permute_avx512();
+  test_permute12_scalar();
+  test_permute12_avx512();
   test_sha3_224();
   test_sha3_256();
   test_sha3_384();
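A possible extension of the tests above (sketch only): cross-check the scalar and avx512 permutations against each other on random states. Note that, as the guards stand, permute_scalar() is compiled out when __AVX512F__ is set, so this would also require widening the `#if !defined(__AVX512F__) || defined(SHA3_TEST)` guard to cover the scalar permutations; xorshift64 here is a stand-in PRNG:

    #ifdef __AVX512F__
    static uint64_t xorshift64(uint64_t *s) {
      uint64_t x = *s;
      x ^= x << 13;
      x ^= x >> 7;
      x ^= x << 17;
      return (*s = x);
    }

    static void test_permute_cross(void) {
      uint64_t seed = 0x243f6a8885a308d3ULL; // arbitrary nonzero seed
      for (size_t n = 0; n < 1000; n++) {
        uint64_t a[25], b[25];
        for (size_t i = 0; i < 25; i++) {
          a[i] = b[i] = xorshift64(&seed);
        }
        permute_scalar(a);
        permute_avx512(b);
        if (memcmp(a, b, sizeof(a))) {
          fail_test(__func__, "", (uint8_t*) b, sizeof(b), (uint8_t*) a, sizeof(a));
        }
      }
    }
    #endif /* __AVX512F__ */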