author     Paul Duncan <pabs@pablotron.org>  2024-05-08 06:00:16 -0400
committer  Paul Duncan <pabs@pablotron.org>  2024-05-08 06:00:16 -0400
commit     4d377bf007e346e086a0c6f925db3b6b7dfce731 (patch)
tree       45266d2181f907081e842170e531f8cc16611989
parent     509bd52d3232557c85f5d63bcc75e8ebbd7853be (diff)
download   sha3-4d377bf007e346e086a0c6f925db3b6b7dfce731.tar.bz2
           sha3-4d377bf007e346e086a0c6f925db3b6b7dfce731.zip
tests/neon/neon.c: port changes back from sha3.c
-rw-r--r--  tests/neon/neon.c  707
1 file changed, 405 insertions(+), 302 deletions(-)
diff --git a/tests/neon/neon.c b/tests/neon/neon.c
index 6720d7b..76cdd03 100644
--- a/tests/neon/neon.c
+++ b/tests/neon/neon.c
@@ -1,4 +1,6 @@
-// test neon implementation of keccak permutation
+// test the neon keccak permutation by validating each step against its
+// scalar counterpart
+
#include <stdio.h> // fprintf(), fputs(), printf()
#include <stdint.h> // uint64_t
#include <string.h> // memcmp(), memcpy()
@@ -176,25 +178,17 @@ void permute_scalar(uint64_t a[static 25]) {
// columns are stored in the low 5 64-bit lanes. this wastes one
// 64-bit lane per row at the expense of making many of the instructions
// simpler.
-typedef union {
- // uint64_t u64[6];
- uint64x2x3_t u64x2x3;
- // uint8_t u8[48];
- // uint8x16_t u8x16[3];
- uint8x16x3_t u8x16x3;
+typedef struct {
+ uint64x2_t p0, p1, p2;
} row_t;
-// FIXME:
-// add row_load_fast which reads 6 elems and does this
-// r2 = { .u64x2x3 = vld1q_u64_x3(a + 10) },
-
// load row from array
row_t row_load(const uint64_t p[static 5]) {
row_t a = { 0 };
- a.u64x2x3.val[0] = vld1q_u64(p + 0);
- a.u64x2x3.val[1] = vld1q_u64(p + 2);
- a.u64x2x3.val[2] = vdupq_n_u64(p[4]);
+ a.p0 = vld1q_u64(p + 0);
+ a.p1 = vld1q_u64(p + 2);
+ a.p2 = vdupq_n_u64(p[4]);
return a;
}
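For reference (not part of the patch), a plain-C sketch of the new row_t layout under the convention that index 0 is the low 64-bit lane of each register: p0 holds columns 0-1, p1 holds columns 2-3, and p2 broadcasts column 4 into both lanes. The model type and function names are hypothetical.

#include <stdint.h>

// hypothetical scalar model of row_t and row_load() for illustration
typedef struct { uint64_t p0[2], p1[2], p2[2]; } row_model_t;

static row_model_t row_load_model(const uint64_t p[5]) {
  return (row_model_t) {
    .p0 = { p[0], p[1] }, // vld1q_u64(p + 0)
    .p1 = { p[2], p[3] }, // vld1q_u64(p + 2)
    .p2 = { p[4], p[4] }, // vdupq_n_u64(p[4]) broadcasts column 4
  };
}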
@@ -202,10 +196,9 @@ row_t row_load(const uint64_t p[static 5]) {
// store row to array
void row_store(uint64_t p[static 5], const row_t a) {
// row_print(stderr, __func__, a);
- vst1q_u64(p + 0, a.u64x2x3.val[0]);
- vst1q_u64(p + 2, a.u64x2x3.val[1]);
- vst1_u64(p + 4, vdup_laneq_u64(a.u64x2x3.val[2], 0));
- // p[4] = vgetq_lane_u64(a.u64x2x3.val[2], 0);
+ vst1q_u64(p + 0, a.p0);
+ vst1q_u64(p + 2, a.p1);
+ vst1_u64(p + 4, vdup_laneq_u64(a.p2, 0));
}
// print row
@@ -222,6 +215,7 @@ void row_print(FILE *fh, const char *name, const row_t row) {
fputs(" }\n", fh); // suffix
}
+#if 0
// low lane ids for rol_rc{l,r}()
static const uint8x16_t ROW_RL_LO_IDS = {
8, 9, 10, 11, 12, 13, 14, 15, 99, 99, 99, 99, 99, 99, 99, 99,
@@ -236,6 +230,15 @@ static const uint8x16_t ROW_RL_HI_IDS = {
static const uint8x16_t ROW_RL_TAIL_IDS = {
0, 1, 2, 3, 4, 5, 6, 7, 99, 99, 99, 99, 99, 99, 99, 99,
};
+#endif
+
+// get Nth pair of u64s from row
+#define ROW_GET(A, N) ((A).p ## N)
+
+// set contents of row
+static inline row_t row_set(const uint64x2_t a, const uint64x2_t b, const uint64x2_t c) {
+ return (row_t) { a, b, c };
+}
// rotate row lanes left
//
@@ -248,14 +251,11 @@ static const uint8x16_t ROW_RL_TAIL_IDS = {
// --------------------------- ---------------------------
//
static inline row_t row_rll(const row_t a) {
- row_t b = { 0 };
- for (size_t i = 0; i < 3; i++) {
- const uint8x16_t lo_ids = i ? ROW_RL_LO_IDS : ROW_RL_TAIL_IDS,
- hi = vqtbl1q_u8(a.u8x16x3.val[i], ROW_RL_HI_IDS),
- lo = vqtbl1q_u8(a.u8x16x3.val[(i + 2) % 3], lo_ids);
- b.u8x16x3.val[i] = vorrq_u8(lo, hi);
- }
- return b;
+ return row_set(
+ vzip1q_u64(ROW_GET(a, 2), ROW_GET(a, 0)), // { a4, a0 }
+ vextq_u64(ROW_GET(a, 0), ROW_GET(a, 1), 1), // { a1, a2 }
+ vdupq_laneq_u64(ROW_GET(a, 1), 1) // { a3, n/a }
+ );
}
// rotate row lanes right
@@ -269,93 +269,95 @@ static inline row_t row_rll(const row_t a) {
// --------------------------- ---------------------------
//
static row_t row_rlr(const row_t a) {
- row_t b = { 0 };
- for (size_t i = 0; i < 2; i++) {
- const uint8x16_t lo = vqtbl1q_u8(a.u8x16x3.val[i], ROW_RL_LO_IDS),
- hi = vqtbl1q_u8(a.u8x16x3.val[(i + 1) % 3], ROW_RL_HI_IDS);
- b.u8x16x3.val[i] = vorrq_u8(lo, hi);
- }
- b.u8x16x3.val[2] = vqtbl1q_u8(a.u8x16x3.val[0], ROW_RL_TAIL_IDS);
- return b;
+ return row_set(
+ vextq_u64(ROW_GET(a, 0), ROW_GET(a, 1), 1), // { a1, a2 }
+ vextq_u64(ROW_GET(a, 1), ROW_GET(a, 2), 1), // { a3, a4 }
+ ROW_GET(a, 0) // { a0, n/a }
+ );
}
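For reference (not part of the patch), scalar sketches of the two lane rotations above, writing a row as five lanes { a0, a1, a2, a3, a4 }: row_rll() produces { a4, a0, a1, a2, a3 } and row_rlr() produces { a1, a2, a3, a4, a0 }, matching the per-register comments in the new code. Helper names are hypothetical.

// hypothetical scalar model of row_rll(): { a4, a0, a1, a2, a3 }
static void row_rll_model(uint64_t dst[5], const uint64_t src[5]) {
  for (int i = 0; i < 5; i++) {
    dst[i] = src[(i + 4) % 5];
  }
}

// hypothetical scalar model of row_rlr(): { a1, a2, a3, a4, a0 }
static void row_rlr_model(uint64_t dst[5], const uint64_t src[5]) {
  for (int i = 0; i < 5; i++) {
    dst[i] = src[(i + 1) % 5];
  }
}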
// c = a ^ b
-row_t row_eor(const row_t a, const row_t b) {
- row_t c = a;
- for (size_t i = 0; i < 3; i++) {
- c.u8x16x3.val[i] ^= b.u8x16x3.val[i];
- }
- return c;
+#define ROW_EOR(A, B) row_set( \
+ ROW_GET(A, 0) ^ ROW_GET(B, 0), \
+ ROW_GET(A, 1) ^ ROW_GET(B, 1), \
+ ROW_GET(A, 2) ^ ROW_GET(B, 2) \
+)
+
+// f = a ^ b ^ c ^ d ^ e
+// FIXME want: veor3q_u64(a, b, c); (needs the sha3 extension)
+static inline row_t row_eor5(const row_t a, const row_t b, const row_t c, const row_t d, const row_t e) {
+ const uint64x2_t p0 = ROW_GET(a, 0) ^ ROW_GET(b, 0) ^ ROW_GET(c, 0) ^ ROW_GET(d, 0) ^ ROW_GET(e, 0),
+ p1 = ROW_GET(a, 1) ^ ROW_GET(b, 1) ^ ROW_GET(c, 1) ^ ROW_GET(d, 1) ^ ROW_GET(e, 1),
+ p2 = ROW_GET(a, 2) ^ ROW_GET(b, 2) ^ ROW_GET(c, 2) ^ ROW_GET(d, 2) ^ ROW_GET(e, 2);
+
+ return row_set(p0, p1, p2);
}
+
// rotate bits in each lane left one bit
-row_t row_rol1_u64(const row_t a) {
- row_t b = { 0 };
- for (size_t i = 0; i < 3; i++) {
- b.u64x2x3.val[i] = VROLQ(a.u64x2x3.val[i], 1);
- }
- return b;
-}
+static inline row_t row_rol1_u64(const row_t a) {
+ const uint64x2_t p0 = ROW_GET(a, 0),
+ p1 = ROW_GET(a, 1),
+ p2 = ROW_GET(a, 2);
-// rotate bits in each lane left by amounts in vector
-static inline row_t row_rotn_u64(const row_t a, const int64_t v[static 5]) {
- row_t b = { 0 };
- static const int64x2_t k64 = { 64, 64 };
- for (size_t i = 0; i < 3; i++) {
- const int64x2_t hi_ids = (i < 2) ? vld1q_s64(v + 2 * i) : vdupq_n_s64(v[4]),
- lo_ids = vsubq_s64(hi_ids, k64);
- b.u64x2x3.val[i] = vorrq_u64(vshlq_u64(a.u64x2x3.val[i], hi_ids), vshlq_u64(a.u64x2x3.val[i], lo_ids));
- }
- return b;
+ return row_set(VROLQ(p0, 1), VROLQ(p1, 1), VROLQ(p2, 1));
}
-// row compare (not constant-time)
-_Bool row_eq(const row_t a, const row_t b) {
- uint64_t a_u64[5], b_u64[5];
- row_store(a_u64, a);
- row_store(b_u64, b);
- return !memcmp(a_u64, b_u64, sizeof(a_u64));
+// apply rho rotation to row
+static inline row_t row_rho(const row_t a, const int64x2_t v0, const int64x2_t v1, const int64x2_t v2) {
+ const uint64x2_t p0 = ROW_GET(a, 0),
+ p1 = ROW_GET(a, 1),
+ p2 = ROW_GET(a, 2);
+
+ return row_set(
+ vorrq_u64(vshlq_u64(p0, v0), vshlq_u64(p0, v0 - 64)),
+ vorrq_u64(vshlq_u64(p1, v1), vshlq_u64(p1, v1 - 64)),
+ vorrq_u64(vshlq_u64(p2, v2), vshlq_u64(p2, v2 - 64))
+ );
}
-// return logical NOT of row
-static row_t row_not(const row_t a) {
- row_t b;
- for (size_t i = 0; i < 3; i++) {
- b.u8x16x3.val[i] = vmvnq_u8(a.u8x16x3.val[i]);
- }
- return b;
-}
-
-// return logical OR NOT of rows
-static row_t row_orn(const row_t a, const row_t b) {
- row_t c;
- for (size_t i = 0; i < 3; i++) {
- c.u8x16x3.val[i] = vornq_u8(a.u8x16x3.val[i], b.u8x16x3.val[i]);
- }
- return c;
+// c = (~a & b)
+// note: was using ~(a | ~b) = (~a & b) (demorgan's laws), but changed
+// to BIC b, a instead (b & ~a)
+static inline row_t row_andn(const row_t a, const row_t b) {
+ return row_set(
+ vbicq_u64(ROW_GET(b, 0), ROW_GET(a, 0)),
+ vbicq_u64(ROW_GET(b, 1), ROW_GET(a, 1)),
+ vbicq_u64(ROW_GET(b, 2), ROW_GET(a, 2))
+ );
}
// apply chi permutation to entire row
// note: ~(a | ~b) = (~a & b) (demorgan's laws)
static inline row_t row_chi(const row_t a) {
- const row_t b = row_rlr(a),
- c = row_rlr(b); // fixme, permute would be faster
- return row_eor(a, row_not(row_orn(b, c)));
+ // a ^ (~rlr(a, 1) & rlr(a, 2)) (rlr = rotate lanes right)
+ return ROW_EOR(a, row_andn(row_rlr(a), row_set(
+ ROW_GET(a, 1), // { a2, a3 }
+ vtrn1q_u64(ROW_GET(a, 2), ROW_GET(a, 0)), // { a4, a0 }
+ vdupq_laneq_u64(ROW_GET(a, 0), 1) // { a1, n/a }
+ )));
}
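For reference (not part of the patch), the scalar chi step the code above computes for one row: out[x] = a[x] ^ (~a[x+1] & a[x+2]). row_rlr(a) supplies a[x+1], the inline row_set() builds the rotate-by-two row a[x+2], and row_andn() provides the BIC. The model name is hypothetical.

// hypothetical scalar model of the chi step for one row
static void row_chi_model(uint64_t dst[5], const uint64_t src[5]) {
  for (int x = 0; x < 5; x++) {
    dst[x] = src[x] ^ (~src[(x + 1) % 5] & src[(x + 2) % 5]);
  }
}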
+// row compare (not constant-time)
+_Bool row_eq(const row_t a, const row_t b) {
+ uint64_t a_u64[5], b_u64[5];
+ row_store(a_u64, a);
+ row_store(b_u64, b);
+ return !memcmp(a_u64, b_u64, sizeof(a_u64));
+}
// theta step of neon keccak permutation
void theta_neon(uint64_t a[static 25]) {
// ---------------------------------------------------------
- // | | Column / Register |
+ // | | Column / Register and 64-Bit Lane |
// |-------------------------------------------------------|
// | Row | 3 | 4 | 0 | 1 | 2 |
// |-----|---------|---------|---------|---------|---------|
- // | 2 | r2_23.1 | r2_4 | r2_01.0 | r2_01.1 | r2_23.0 |
- // | 1 | r1_23.1 | r1_4 | r1_01.0 | r1_01.1 | r1_23.0 |
- // | 0 | r0_23.1 | r0_4 | r0_01.0 | r0_01.1 | r0_23.0 |
- // | 4 | r4_23.1 | r4_4 | r4_01.0 | r4_01.1 | r4_23.0 |
- // | 3 | r3_23.1 | r3_4 | r3_01.0 | r3_01.1 | r3_23.0 |
+ // | 2 | r2.p1.1 | r2.p2.0 | r2.p0.0 | r2.p0.1 | r2.p1.0 |
+ // | 1 | r1.p1.1 | r1.p2.0 | r1.p0.0 | r1.p0.1 | r1.p1.0 |
+ // | 0 | r0.p1.1 | r0.p2.0 | r0.p0.0 | r0.p0.1 | r0.p1.0 |
+ // | 4 | r4.p1.1 | r4.p2.0 | r4.p0.0 | r4.p0.1 | r4.p1.0 |
+ // | 3 | r3.p1.1 | r3.p2.0 | r3.p0.0 | r3.p0.1 | r3.p1.0 |
// ---------------------------------------------------------
// load rows
@@ -365,30 +367,79 @@ void theta_neon(uint64_t a[static 25]) {
r3 = row_load(a + 15),
r4 = row_load(a + 20);
- // c = r0 ^ r1 ^ r2 ^ r3 ^ r4
- const row_t c = row_eor(row_eor(row_eor(r0, r1), row_eor(r2, r3)), r4);
-
- // calculate d...
- const row_t d = row_eor(row_rll(c), row_rol1_u64(row_rlr(c)));
+ {
+ /* c = r0 ^ r1 ^ r2 ^ r3 ^ r4, d = rll(c) ^ rol1(rlr(c)) */
+ const row_t c = row_eor5(r0, r1, r2, r3, r4),
+ d = ROW_EOR(row_rll(c), row_rol1_u64(row_rlr(c)));
+
+ r0 = ROW_EOR(r0, d); /* r0 ^= d */
+ r1 = ROW_EOR(r1, d); /* r1 ^= d */
+ r2 = ROW_EOR(r2, d); /* r2 ^= d */
+ r3 = ROW_EOR(r3, d); /* r3 ^= d */
+ r4 = ROW_EOR(r4, d); /* r4 ^= d */
+ }
// store rows
- row_store(a + 0, row_eor(r0, d));
- row_store(a + 5, row_eor(r1, d));
- row_store(a + 10, row_eor(r2, d));
- row_store(a + 15, row_eor(r3, d));
- row_store(a + 20, row_eor(r4, d));
-}
-
-static const int64_t RHO_IDS[25] = {
- 0, 1, 62, 28, 27,
- 36, 44, 6, 55, 20,
- 3, 10, 43, 25, 39,
- 41, 45, 15, 21, 8,
- 18, 2, 61, 56, 14,
-};
+ row_store(a + 0, r0);
+ row_store(a + 5, r1);
+ row_store(a + 10, r2);
+ row_store(a + 15, r3);
+ row_store(a + 20, r4);
+}
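For reference (not part of the patch), a scalar sketch of the theta step the vector code above implements: c[x] is the parity of column x, d[x] = c[x-1] ^ rotl(c[x+1], 1), and every lane in column x is XORed with d[x]. In the vector form, rll(c) supplies c[x-1], rlr(c) supplies c[x+1], and row_rol1_u64() is the one-bit rotate. The model name is hypothetical.

// hypothetical scalar model of the theta step for illustration
static void theta_model(uint64_t a[25]) {
  uint64_t c[5], d[5];

  // c[x] = parity of column x
  for (int x = 0; x < 5; x++) {
    c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
  }

  // d[x] = c[x - 1] ^ rotl(c[x + 1], 1)
  for (int x = 0; x < 5; x++) {
    const uint64_t t = c[(x + 1) % 5];
    d[x] = c[(x + 4) % 5] ^ ((t << 1) | (t >> 63));
  }

  // a[x, y] ^= d[x]
  for (int i = 0; i < 25; i++) {
    a[i] ^= d[i % 5];
  }
}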
// rho step of neon keccak permutation
void rho_neon(uint64_t a[static 25]) {
+ // encoded rho rotate values
+ //
+ // original values:
+ //
+ // static const int64x2_t
+ // r0_a = { 0, 1 }, r0_b = { 62, 28 }, r02_c = { 27, 39 },
+ // r1_a = { 36, 44 }, r1_b = { 6, 55 }, r13_c = { 20, 8 },
+ // r2_a = { 3, 10 }, r2_b = { 43, 25 },
+ // r3_a = { 41, 45 }, r3_b = { 15, 21 },
+ // r4_a = { 18, 2 }, r4_b = { 61, 56 }, r4_c = { 14, 0 };
+ //
+ // low element of r[0-4]_{a,b} packed into low lane of r_ab, like so:
+ //
+ // >> v = [0, 36, 3, 41, 18, 62, 6, 43, 15, 61].each_with_index.reduce(0) { |r, (c, i)| r + (64**i) * c }
+ // => 1103290028930644224
+ // >> '0x%016x' % v
+ // => "0x0f4fac6f92a43900"
+ //
+ // high element of r[0-4]_{a,b} packed into high lane of r_ab, like so:
+ //
+ // >> v = [1, 44, 10, 45, 2, 28, 55, 25, 21, 56].each_with_index.reduce(0) { |r, (c, i)| r + (64**i) * c }
+ // => 1014831051886078721
+ // >> '0x%016x' % v
+ // => "0x0e15677702b4ab01"
+ //
+ // low elements of r[0-4]_c packed into low lane of r_c, like so:
+ //
+ // >> v = [27, 20, 39, 8, 14].each_with_index.reduce(0) { |r, (c, i)| r+(64**i)*c }
+ // => 237139227
+ // >> '0x%016x' % v
+ // => "0x000000000e22751b"
+ //
+ // (there are no high elements of r[0-4]_c, all zero)
+ //
+ // to extract an element, right shift by 6*N (where N is the field
+ // index: the row number for the a and c values, or the row number
+ // plus 5 for the b values), then mask to the lower 6 bits (0x3f).
+ // for example, to extract the low element of r4_b (N = 9):
+ //
+ // >> (v >> 6*9) & 0x3f
+ // => 61
+ static const int64x2_t r_ab = { 0x0f4fac6f92a43900LL, 0x0e15677702b4ab01LL },
+ r_c = { 0x000000000e22751bLL, 0 };
+
// load rows
row_t r0 = row_load(a + 0),
r1 = row_load(a + 5),
@@ -396,12 +447,11 @@ void rho_neon(uint64_t a[static 25]) {
r3 = row_load(a + 15),
r4 = row_load(a + 20);
- // rotate rows
- r0 = row_rotn_u64(r0, RHO_IDS + 0);
- r1 = row_rotn_u64(r1, RHO_IDS + 5);
- r2 = row_rotn_u64(r2, RHO_IDS + 10);
- r3 = row_rotn_u64(r3, RHO_IDS + 15);
- r4 = row_rotn_u64(r4, RHO_IDS + 20);
+ r0 = row_rho(r0, r_ab & 0x3f, (r_ab >> 30) & 0x3f, r_c & 0x3f);
+ r1 = row_rho(r1, (r_ab >> 6) & 0x3f, (r_ab >> 36) & 0x3f, (r_c >> 6) & 0x3f);
+ r2 = row_rho(r2, (r_ab >> 12) & 0x3f, (r_ab >> 42) & 0x3f, (r_c >> 12) & 0x3f);
+ r3 = row_rho(r3, (r_ab >> 18) & 0x3f, (r_ab >> 48) & 0x3f, (r_c >> 18) & 0x3f);
+ r4 = row_rho(r4, (r_ab >> 24) & 0x3f, (r_ab >> 54) & 0x3f, (r_c >> 24) & 0x3f);
// store rows
row_store(a + 0, r0);
@@ -411,34 +461,12 @@ void rho_neon(uint64_t a[static 25]) {
row_store(a + 20, r4);
}
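For reference (not part of the patch), a standalone check of the packed rotate constants above against the canonical rho offsets (the same values the removed RHO_IDS table held). The extraction mirrors the shifts in the row_rho() calls; it is a hypothetical sketch, not code from this commit.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  static const uint64_t ab_lo = 0x0f4fac6f92a43900ULL, // low lane of r_ab
                        ab_hi = 0x0e15677702b4ab01ULL, // high lane of r_ab
                        c_lo  = 0x000000000e22751bULL; // low lane of r_c

  // canonical rho rotation offsets, row-major
  static const uint64_t RHO[25] = {
     0,  1, 62, 28, 27,
    36, 44,  6, 55, 20,
     3, 10, 43, 25, 39,
    41, 45, 15, 21,  8,
    18,  2, 61, 56, 14,
  };

  for (int row = 0; row < 5; row++) {
    const uint64_t got[5] = {
      (ab_lo >> (6 * row)) & 0x3f,       // column 0 (r_ab, low lane)
      (ab_hi >> (6 * row)) & 0x3f,       // column 1 (r_ab, high lane)
      (ab_lo >> (6 * (row + 5))) & 0x3f, // column 2 (r_ab, low lane)
      (ab_hi >> (6 * (row + 5))) & 0x3f, // column 3 (r_ab, high lane)
      (c_lo >> (6 * row)) & 0x3f,        // column 4 (r_c, low lane)
    };

    for (int col = 0; col < 5; col++) {
      if (got[col] != RHO[5 * row + col]) {
        fprintf(stderr, "mismatch at [%d][%d]: got %" PRIu64 ", exp %" PRIu64 "\n",
                row, col, got[col], RHO[5 * row + col]);
        return 1;
      }
    }
  }

  printf("ok\n");
  return 0;
}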
-// permute IDS to take low lane of first pair and low lane of second pair
-// TODO: replace with transpose or zip1q_u64?
-// a = [ a0, a1 ], b = [ b0, b1 ] => c = [ a0, b0 ]
-// static const uint8x16_t PI_LO_LO_IDS = {
-// 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
-// };
-
-// permute IDS to take low lane of first pair and hi lane of second pair
-// a = [ a0, a1 ], b = [ b0, b1 ] => c = [ a0, b1 ]
-static const uint8x16_t PI_LO_HI_IDS = {
- 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31,
-};
-
-// permute IDS to take high lane of first pair and low lane of second pair
-// a = [ a0, a1 ], b = [ b0, b1 ] => c = [ a1, b0 ]
-static const uint8x16_t PI_HI_LO_IDS = {
- 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
-};
-
-// permute IDS to take high lane of both pairs
-// a = [ a0, a1 ], b = [ b0, b1 ] => c = [ a1, b1 ]
-// static const uint8x16_t PI_HI_HI_IDS = {
-// 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31,
-// };
-
-static uint8x16_t pi_tbl(const uint8x16_t a, const uint8x16_t b, const uint8x16_t ids) {
- uint8x16x2_t quad = { .val = { a, b } };
- return vqtbl2q_u8(quad, ids);
+// return new vector with low lane of first argument and high lane of
+// second argument
+static inline uint64x2_t pi_lo_hi(const uint64x2_t a, const uint64x2_t b) {
+ // was using vqtbl2q_u8() with tables, but this is faster
+ const uint64x2_t c = vextq_u64(b, a, 1);
+ return vextq_u64(c, c, 1);
}
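For reference (not part of the patch), the net effect of the two vext steps above, assuming index 0 is the low lane: vextq_u64(b, a, 1) yields { b[1], a[0] }, and rotating that result by one more lane yields { a[0], b[1] }. The model name is hypothetical.

// hypothetical scalar model of pi_lo_hi() for illustration
static void pi_lo_hi_model(uint64_t c[2], const uint64_t a[2], const uint64_t b[2]) {
  c[0] = a[0]; // low lane of first argument
  c[1] = b[1]; // high lane of second argument
}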
// pi step of neon keccak permutation
@@ -451,72 +479,49 @@ void pi_neon(uint64_t a[static 25]) {
r4 = row_load(a + 20);
{
- row_t t0 = { 0 };
-
- // dst[ 0] = src[ 0]; dst[ 1] = src[ 6];
- t0.u8x16x3.val[0] = pi_tbl(r0.u8x16x3.val[0], r1.u8x16x3.val[0], PI_LO_HI_IDS);
- // dst[ 2] = src[12]; dst[ 3] = src[18];
- t0.u8x16x3.val[1] = pi_tbl(r2.u8x16x3.val[1], r3.u8x16x3.val[1], PI_LO_HI_IDS);
- // dst[ 4] = src[24];
- t0.u8x16x3.val[2] = r4.u8x16x3.val[2];
-
- row_store(a + 0, t0);
+ const row_t t0 = row_set(
+ pi_lo_hi(ROW_GET(r0, 0), ROW_GET(r1, 0)),
+ pi_lo_hi(ROW_GET(r2, 1), ROW_GET(r3, 1)),
+ ROW_GET(r4, 2)
+ );
+
+ const row_t t1 = row_set(
+ vextq_u64(ROW_GET(r0, 1), ROW_GET(r1, 2), 1),
+ pi_lo_hi(ROW_GET(r2, 0), ROW_GET(r3, 0)),
+ ROW_GET(r4, 1)
+ );
+
+ const row_t t2 = row_set(
+ vextq_u64(ROW_GET(r0, 0), ROW_GET(r1, 1), 1),
+ vextq_u64(ROW_GET(r2, 1), ROW_GET(r3, 2), 1),
+ ROW_GET(r4, 0)
+ );
+
+ const row_t t3 = row_set(
+ vtrn1q_u64(ROW_GET(r0, 2), ROW_GET(r1, 0)),
+ vextq_u64(ROW_GET(r2, 0), ROW_GET(r3, 1), 1),
+ vdupq_laneq_u64(ROW_GET(r4, 1), 1)
+ );
+
+ const row_t t4 = row_set(
+ pi_lo_hi(ROW_GET(r0, 1), ROW_GET(r1, 1)),
+ vtrn1q_u64(ROW_GET(r2, 2), ROW_GET(r3, 0)),
+ vdupq_laneq_u64(ROW_GET(r4, 0), 1)
+ );
+
+ /* store rows */
+ r0 = t0;
+ r1 = t1;
+ r2 = t2;
+ r3 = t3;
+ r4 = t4;
}
- {
- row_t t1 = { 0 };
-
- // dst[ 5] = src[ 3]; dst[ 6] = src[ 9];
- t1.u8x16x3.val[0] = pi_tbl(r0.u8x16x3.val[1], r1.u8x16x3.val[2], PI_HI_LO_IDS);
- // dst[ 7] = src[10]; dst[ 8] = src[16];
- t1.u8x16x3.val[1] = pi_tbl(r2.u8x16x3.val[0], r3.u8x16x3.val[0], PI_LO_HI_IDS);
- // dst[ 9] = src[22];
- t1.u8x16x3.val[2] = r4.u8x16x3.val[1];
-
- row_store(a + 5, t1);
- }
-
- {
- row_t t2 = { 0 };
-
- // dst[10] = src[ 1]; dst[11] = src[ 7];
- t2.u8x16x3.val[0] = pi_tbl(r0.u8x16x3.val[0], r1.u8x16x3.val[1], PI_HI_LO_IDS);
- // dst[12] = src[13]; dst[13] = src[19];
- t2.u8x16x3.val[1] = pi_tbl(r2.u8x16x3.val[1], r3.u8x16x3.val[2], PI_HI_LO_IDS);
- // dst[14] = src[20];
- t2.u8x16x3.val[2] = r4.u8x16x3.val[0];
-
- row_store(a + 10, t2);
- }
-
- {
- row_t t3 = { 0 };
-
- // dst[15] = src[ 4]; dst[16] = src[ 5];
- // t3.u8x16x3.val[0] = pi_tbl(r0.u8x16x3.val[2], r1.u8x16x3.val[0], PI_LO_LO_IDS);
- t3.u64x2x3.val[0] = vtrn1q_u64(r0.u64x2x3.val[2], r1.u64x2x3.val[0]);
- // dst[17] = src[11]; dst[18] = src[17];
- t3.u8x16x3.val[1] = pi_tbl(r2.u8x16x3.val[0], r3.u8x16x3.val[1], PI_HI_LO_IDS);
- // dst[19] = src[23];
- t3.u8x16x3.val[2] = pi_tbl(r4.u8x16x3.val[1], r4.u8x16x3.val[1], PI_HI_LO_IDS);
-
- row_store(a + 15, t3);
- }
-
- {
- row_t t4 = { 0 };
-
- // dst[20] = src[ 2]; dst[21] = src[ 8];
- t4.u8x16x3.val[0] = pi_tbl(r0.u8x16x3.val[1], r1.u8x16x3.val[1], PI_LO_HI_IDS);
- // dst[22] = src[14]; dst[23] = src[15];
- // t4.u8x16x3.val[1] = pi_tbl(r2.u8x16x3.val[2], r3.u8x16x3.val[0], PI_LO_LO_IDS);
- t4.u64x2x3.val[1] = vtrn1q_u64(r2.u64x2x3.val[2], r3.u64x2x3.val[0]);
-
- // dst[24] = src[21];
- t4.u8x16x3.val[2] = pi_tbl(r4.u8x16x3.val[0], r4.u8x16x3.val[0], PI_HI_LO_IDS);
-
- row_store(a + 20, t4);
- }
+ row_store(a + 0, r0);
+ row_store(a + 5, r1);
+ row_store(a + 10, r2);
+ row_store(a + 15, r3);
+ row_store(a + 20, r4);
}
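For reference (not part of the patch), the lane permutation the shuffles above implement, in scalar form; it reproduces the dst/src pairs listed in the removed comments (dst[0] = src[0], dst[1] = src[6], ..., dst[24] = src[21]). The model name is hypothetical.

#include <stdint.h>
#include <string.h>

// hypothetical scalar model of the pi step for illustration
static void pi_model(uint64_t a[25]) {
  uint64_t t[25];
  memcpy(t, a, sizeof(t));

  // dst[x + 5*y] = src[(x + 3*y) % 5 + 5*x]
  for (int y = 0; y < 5; y++) {
    for (int x = 0; x < 5; x++) {
      a[x + 5 * y] = t[(x + 3 * y) % 5 + 5 * x];
    }
  }
}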
// chi step of neon keccak permutation
@@ -530,7 +535,7 @@ void chi_neon(uint64_t a[static 25]) {
void iota_neon(uint64_t a[static 25], const size_t i) {
row_t r0 = row_load(a);
const uint64x2_t rc = { RCS[i], 0 };
- r0.u64x2x3.val[0] ^= rc;
+ r0.p0 ^= rc;
row_store(a, r0);
}
@@ -549,15 +554,15 @@ void permute_neon_slow(uint64_t a[static 25]) {
// 24-round neon keccak permutation with inlined steps
void permute_neon_inline(uint64_t a[static 25]) {
// ---------------------------------------------------------
- // | | Column / Register |
+ // | | Column / Register and 64-Bit Lane |
// |-------------------------------------------------------|
// | Row | 3 | 4 | 0 | 1 | 2 |
// |-----|---------|---------|---------|---------|---------|
- // | 2 | r2_23.1 | r2_4 | r2_01.0 | r2_01.1 | r2_23.0 |
- // | 1 | r1_23.1 | r1_4 | r1_01.0 | r1_01.1 | r1_23.0 |
- // | 0 | r0_23.1 | r0_4 | r0_01.0 | r0_01.1 | r0_23.0 |
- // | 4 | r4_23.1 | r4_4 | r4_01.0 | r4_01.1 | r4_23.0 |
- // | 3 | r3_23.1 | r3_4 | r3_01.0 | r3_01.1 | r3_23.0 |
+ // | 2 | r2.p1.1 | r2.p2.0 | r2.p0.0 | r2.p0.1 | r2.p1.0 |
+ // | 1 | r1.p1.1 | r1.p2.0 | r1.p0.0 | r1.p0.1 | r1.p1.0 |
+ // | 0 | r0.p1.1 | r0.p2.0 | r0.p0.0 | r0.p0.1 | r0.p1.0 |
+ // | 4 | r4.p1.1 | r4.p2.0 | r4.p0.0 | r4.p0.1 | r4.p1.0 |
+ // | 3 | r3.p1.1 | r3.p2.0 | r3.p0.0 | r3.p0.1 | r3.p1.0 |
// ---------------------------------------------------------
// load rows
@@ -570,83 +575,110 @@ void permute_neon_inline(uint64_t a[static 25]) {
for (size_t i = 0; i < 24; i++) {
// theta
{
- // c = r0 ^ r1 ^ r2 ^ r3 ^ r4
- const row_t c = row_eor(row_eor(row_eor(r0, r1), row_eor(r2, r3)), r4);
-
- // calculate d...
- const row_t d = row_eor(row_rll(c), row_rol1_u64(row_rlr(c)));
-
- r0 = row_eor(r0, d);
- r1 = row_eor(r1, d);
- r2 = row_eor(r2, d);
- r3 = row_eor(r3, d);
- r4 = row_eor(r4, d);
+ /* c = r0 ^ r1 ^ r2 ^ r3 ^ r4, d = rll(c) ^ rol1(rlr(c)) */
+ const row_t c = row_eor5(r0, r1, r2, r3, r4),
+ d = ROW_EOR(row_rll(c), row_rol1_u64(row_rlr(c)));
+
+ r0 = ROW_EOR(r0, d); /* r0 ^= d */
+ r1 = ROW_EOR(r1, d); /* r1 ^= d */
+ r2 = ROW_EOR(r2, d); /* r2 ^= d */
+ r3 = ROW_EOR(r3, d); /* r3 ^= d */
+ r4 = ROW_EOR(r4, d); /* r4 ^= d */
}
// rho
{
- r0 = row_rotn_u64(r0, RHO_IDS + 0);
- r1 = row_rotn_u64(r1, RHO_IDS + 5);
- r2 = row_rotn_u64(r2, RHO_IDS + 10);
- r3 = row_rotn_u64(r3, RHO_IDS + 15);
- r4 = row_rotn_u64(r4, RHO_IDS + 20);
+ // encoded rho rotate values
+ //
+ // original values:
+ //
+ // static const int64x2_t
+ // r0_a = { 0, 1 }, r0_b = { 62, 28 }, r02_c = { 27, 39 },
+ // r1_a = { 36, 44 }, r1_b = { 6, 55 }, r13_c = { 20, 8 },
+ // r2_a = { 3, 10 }, r2_b = { 43, 25 },
+ // r3_a = { 41, 45 }, r3_b = { 15, 21 },
+ // r4_a = { 18, 2 }, r4_b = { 61, 56 }, r4_c = { 14, 0 };
+ //
+ // low element of r[0-4]_{a,b} packed into low lane of r_ab, like so:
+ //
+ // >> v = [0, 36, 3, 41, 18, 62, 6, 43, 15, 61].each_with_index.reduce(0) { |r, (c, i)| r + (64**i) * c }
+ // => 1103290028930644224
+ // >> '0x%016x' % v
+ // => "0x0f4fac6f92a43900"
+ //
+ // high element of r[0-4]_{a,b} packed into high lane of r_ab, like so:
+ //
+ // >> v = [1, 44, 10, 45, 2, 28, 55, 25, 21, 56].each_with_index.reduce(0) { |r, (c, i)| r + (64**i) * c }
+ // => 1014831051886078721
+ // >> '0x%016x' % v
+ // => "0x0e15677702b4ab01"
+ //
+ // low elements of r[0-4]_c packed into low lane of r_c, like so:
+ //
+ // >> v = [27, 20, 39, 8, 14].each_with_index.reduce(0) { |r, (c, i)| r+(64**i)*c }
+ // => 237139227
+ // >> '0x%016x' % v
+ // => "0x000000000e22751b"
+ //
+ // (there are no high elements of r[0-4]_c, all zero)
+ //
+ // to extract an element, right shift by 6*N (where N is the field
+ // index: the row number for the a and c values, or the row number
+ // plus 5 for the b values), then mask to the lower 6 bits (0x3f).
+ // for example, to extract the low element of r4_b (N = 9):
+ //
+ // >> (v >> 6*9) & 0x3f
+ // => 61
+ static const int64x2_t r_ab = { 0x0f4fac6f92a43900LL, 0x0e15677702b4ab01LL },
+ r_c = { 0x000000000e22751bLL, 0 };
+
+ r0 = row_rho(r0, r_ab & 0x3f, (r_ab >> 30) & 0x3f, r_c & 0x3f);
+ r1 = row_rho(r1, (r_ab >> 6) & 0x3f, (r_ab >> 36) & 0x3f, (r_c >> 6) & 0x3f);
+ r2 = row_rho(r2, (r_ab >> 12) & 0x3f, (r_ab >> 42) & 0x3f, (r_c >> 12) & 0x3f);
+ r3 = row_rho(r3, (r_ab >> 18) & 0x3f, (r_ab >> 48) & 0x3f, (r_c >> 18) & 0x3f);
+ r4 = row_rho(r4, (r_ab >> 24) & 0x3f, (r_ab >> 54) & 0x3f, (r_c >> 24) & 0x3f);
}
// pi
{
- row_t t0 = { 0 };
- {
- // dst[ 0] = src[ 0]; dst[ 1] = src[ 6];
- t0.u8x16x3.val[0] = pi_tbl(r0.u8x16x3.val[0], r1.u8x16x3.val[0], PI_LO_HI_IDS);
- // dst[ 2] = src[12]; dst[ 3] = src[18];
- t0.u8x16x3.val[1] = pi_tbl(r2.u8x16x3.val[1], r3.u8x16x3.val[1], PI_LO_HI_IDS);
- // dst[ 4] = src[24];
- t0.u8x16x3.val[2] = r4.u8x16x3.val[2];
- }
-
- row_t t1 = { 0 };
- {
-
- // dst[ 5] = src[ 3]; dst[ 6] = src[ 9];
- t1.u8x16x3.val[0] = pi_tbl(r0.u8x16x3.val[1], r1.u8x16x3.val[2], PI_HI_LO_IDS);
- // dst[ 7] = src[10]; dst[ 8] = src[16];
- t1.u8x16x3.val[1] = pi_tbl(r2.u8x16x3.val[0], r3.u8x16x3.val[0], PI_LO_HI_IDS);
- // dst[ 9] = src[22];
- t1.u8x16x3.val[2] = r4.u8x16x3.val[1];
- }
-
- row_t t2 = { 0 };
- {
- // dst[10] = src[ 1]; dst[11] = src[ 7];
- t2.u8x16x3.val[0] = pi_tbl(r0.u8x16x3.val[0], r1.u8x16x3.val[1], PI_HI_LO_IDS);
- // dst[12] = src[13]; dst[13] = src[19];
- t2.u8x16x3.val[1] = pi_tbl(r2.u8x16x3.val[1], r3.u8x16x3.val[2], PI_HI_LO_IDS);
- // dst[14] = src[20];
- t2.u8x16x3.val[2] = r4.u8x16x3.val[0];
- }
-
- row_t t3 = { 0 };
- {
- // dst[15] = src[ 4]; dst[16] = src[ 5];
- // t3.u8x16x3.val[0] = pi_tbl(r0.u8x16x3.val[2], r1.u8x16x3.val[0], PI_LO_LO_IDS);
- t3.u64x2x3.val[0] = vtrn1q_u64(r0.u64x2x3.val[2], r1.u64x2x3.val[0]);
- // dst[17] = src[11]; dst[18] = src[17];
- t3.u8x16x3.val[1] = pi_tbl(r2.u8x16x3.val[0], r3.u8x16x3.val[1], PI_HI_LO_IDS);
- // dst[19] = src[23];
- t3.u8x16x3.val[2] = pi_tbl(r4.u8x16x3.val[1], r4.u8x16x3.val[1], PI_HI_LO_IDS);
- }
-
- row_t t4 = { 0 };
- {
- // dst[20] = src[ 2]; dst[21] = src[ 8];
- t4.u8x16x3.val[0] = pi_tbl(r0.u8x16x3.val[1], r1.u8x16x3.val[1], PI_LO_HI_IDS);
- // dst[22] = src[14]; dst[23] = src[15];
- // t4.u8x16x3.val[1] = pi_tbl(r2.u8x16x3.val[2], r3.u8x16x3.val[0], PI_LO_LO_IDS);
- t4.u64x2x3.val[1] = vtrn1q_u64(r2.u64x2x3.val[2], r3.u64x2x3.val[0]);
- // dst[24] = src[21];
- t4.u8x16x3.val[2] = pi_tbl(r4.u8x16x3.val[0], r4.u8x16x3.val[0], PI_HI_LO_IDS);
- }
-
+ const row_t t0 = row_set(
+ pi_lo_hi(ROW_GET(r0, 0), ROW_GET(r1, 0)),
+ pi_lo_hi(ROW_GET(r2, 1), ROW_GET(r3, 1)),
+ ROW_GET(r4, 2)
+ );
+
+ const row_t t1 = row_set(
+ vextq_u64(ROW_GET(r0, 1), ROW_GET(r1, 2), 1),
+ pi_lo_hi(ROW_GET(r2, 0), ROW_GET(r3, 0)),
+ ROW_GET(r4, 1)
+ );
+
+ const row_t t2 = row_set(
+ vextq_u64(ROW_GET(r0, 0), ROW_GET(r1, 1), 1),
+ vextq_u64(ROW_GET(r2, 1), ROW_GET(r3, 2), 1),
+ ROW_GET(r4, 0)
+ );
+
+ const row_t t3 = row_set(
+ vtrn1q_u64(ROW_GET(r0, 2), ROW_GET(r1, 0)),
+ vextq_u64(ROW_GET(r2, 0), ROW_GET(r3, 1), 1),
+ vdupq_laneq_u64(ROW_GET(r4, 1), 1)
+ );
+
+ const row_t t4 = row_set(
+ pi_lo_hi(ROW_GET(r0, 1), ROW_GET(r1, 1)),
+ vtrn1q_u64(ROW_GET(r2, 2), ROW_GET(r3, 0)),
+ vdupq_laneq_u64(ROW_GET(r4, 0), 1)
+ );
+
+ /* store rows */
r0 = t0;
r1 = t1;
r2 = t2;
@@ -664,7 +696,7 @@ void permute_neon_inline(uint64_t a[static 25]) {
// iota
{
const uint64x2_t rc = { RCS[i], 0 };
- r0.u64x2x3.val[0] ^= rc;
+ r0.p0 ^= rc;
}
}
@@ -676,10 +708,6 @@ void permute_neon_inline(uint64_t a[static 25]) {
row_store(a + 20, r4);
}
-void permute_neon(uint64_t a[static 25]) {
- permute_neon_inline(a);
-}
-
static void check_state(const char *func, const size_t test_id, const uint64_t got[static 25], const uint64_t exp[static 25]) {
if (!memcmp(got, exp, 25*sizeof(uint64_t))) {
return;
@@ -693,7 +721,6 @@ static void check_state(const char *func, const size_t test_id, const uint64_t g
}
}
-
static const struct {
uint64_t val[5], exp[5];
} ROW_RLL_TESTS[] = {{
@@ -788,19 +815,25 @@ void test_row_rol1_u64(void) {
}
}
+// vshlq_n_u64() tests
static const struct {
const uint64x2_t val; // value
- const int n; // shift amount
const uint64x2_t exp; // expected result
} VSHL_TESTS[] = {{
.val = { 1, 2 },
- .n = 2,
.exp = { 4, 8 },
}};
+// vshlq_n_u64() shift amount
+//
+// clang complains if the second parameter to vshlq_n_u64() is not a
+// constant value (gcc does not)
+#define TEST_VSHL_N 2
+
+// test vshlq_n_u64()
static void test_vshl(void) {
for (size_t i = 0; i < sizeof(VSHL_TESTS)/sizeof(VSHL_TESTS[0]); i++) {
- const uint64x2_t got = vshlq_n_u64(VSHL_TESTS[i].val, VSHL_TESTS[i].n),
+ const uint64x2_t got = vshlq_n_u64(VSHL_TESTS[i].val, TEST_VSHL_N),
exp = VSHL_TESTS[i].exp;
if (got[0] != exp[0] || got[1] != exp[1]) {
fprintf(stderr, "%s[%zu] failed: got = { 0x%016" PRIx64 ", 0x%016" PRIx64 " }, exp { 0x%016" PRIx64 ", 0x%016" PRIx64 " }\n", __func__, i, got[0], got[1], exp[0], exp[1]);
@@ -808,26 +841,45 @@ static void test_vshl(void) {
}
}
+// vshr, n=3 tests
static const struct {
const uint64x2_t val; // value
- const int n; // shift amount
const uint64x2_t exp; // expected result
-} VSHR_TESTS[] = {{
+} VSHR_N3_TESTS[] = {{
.val = { 128, 64 },
- .n = 3,
.exp = { 16, 8 },
-}, {
+}};
+
+// vshr, n=32 tests
+static const struct {
+ const uint64x2_t val; // value
+ const uint64x2_t exp; // expected result
+} VSHR_N32_TESTS[] = {{
.val = { 0xffffffffffffffffULL, 1 },
- .n = 32,
.exp = { 0x00000000ffffffffULL, 0x0000000000000000ULL },
}};
+// test vshrq_n_u64()
+//
+// note: tests are split so we can use a constant value for N; clang
+// complains if the second parameter to vshrq_n_u64() is not a constant
+// value (gcc does not)
static void test_vshr(void) {
- for (size_t i = 0; i < sizeof(VSHR_TESTS)/sizeof(VSHR_TESTS[0]); i++) {
- const uint64x2_t got = vshrq_n_u64(VSHR_TESTS[i].val, VSHR_TESTS[i].n),
- exp = VSHR_TESTS[i].exp;
+ // loop over n=3 tests
+ for (size_t i = 0; i < sizeof(VSHR_N3_TESTS)/sizeof(VSHR_N3_TESTS[0]); i++) {
+ const uint64x2_t got = vshrq_n_u64(VSHR_N3_TESTS[i].val, 3),
+ exp = VSHR_N3_TESTS[i].exp;
if (got[0] != exp[0] || got[1] != exp[1]) {
- fprintf(stderr, "%s[%zu] failed: got = { 0x%016" PRIx64 ", 0x%016" PRIx64 " }, exp { 0x%016" PRIx64 ", 0x%016" PRIx64 " }\n", __func__, i, got[0], got[1], exp[0], exp[1]);
+ fprintf(stderr, "%s[n=3, %zu] failed: got = { 0x%016" PRIx64 ", 0x%016" PRIx64 " }, exp { 0x%016" PRIx64 ", 0x%016" PRIx64 " }\n", __func__, i, got[0], got[1], exp[0], exp[1]);
+ }
+ }
+
+ // loop over n=32 tests
+ for (size_t i = 0; i < sizeof(VSHR_N32_TESTS)/sizeof(VSHR_N32_TESTS[0]); i++) {
+ const uint64x2_t got = vshrq_n_u64(VSHR_N32_TESTS[i].val, 32),
+ exp = VSHR_N32_TESTS[i].exp;
+ if (got[0] != exp[0] || got[1] != exp[1]) {
+ fprintf(stderr, "%s[n=32, %zu] failed: got = { 0x%016" PRIx64 ", 0x%016" PRIx64 " }, exp { 0x%016" PRIx64 ", 0x%016" PRIx64 " }\n", __func__, i, got[0], got[1], exp[0], exp[1]);
}
}
}
@@ -869,99 +921,150 @@ static const struct {
}};
void test_theta(void) {
+ // loop over tests
for (size_t i = 0; i < sizeof(STEP_TESTS)/sizeof(STEP_TESTS[0]); i++) {
+ // load test data
uint64_t got[25] = { 0 }, exp[25] = { 0 };
memcpy(got, STEP_TESTS[i].vals, sizeof(exp));
memcpy(exp, STEP_TESTS[i].vals, sizeof(exp));
+ // run scalar theta and neon theta
theta_scalar(exp);
theta_neon(got);
+ // compare scalar and neon results
check_state(__func__, i, got, exp);
}
}
void test_rho(void) {
+ // loop over tests
for (size_t i = 0; i < sizeof(STEP_TESTS)/sizeof(STEP_TESTS[0]); i++) {
+ // load test data
uint64_t got[25] = { 0 }, exp[25] = { 0 };
memcpy(got, STEP_TESTS[i].vals, sizeof(exp));
memcpy(exp, STEP_TESTS[i].vals, sizeof(exp));
+ // run scalar rho and neon rho
rho_scalar(exp);
rho_neon(got);
+ // compare scalar and neon results
check_state(__func__, i, got, exp);
}
}
void test_pi(void) {
+ // loop over tests
for (size_t i = 0; i < sizeof(STEP_TESTS)/sizeof(STEP_TESTS[0]); i++) {
+ // load test data
uint64_t got[25] = { 0 }, exp[25] = { 0 };
memcpy(got, STEP_TESTS[i].vals, sizeof(exp));
memcpy(exp, STEP_TESTS[i].vals, sizeof(exp));
+ // run scalar pi and neon pi
pi_scalar(exp);
pi_neon(got);
+ // compare scalar and neon results
check_state(__func__, i, got, exp);
}
}
void test_chi(void) {
+ // loop over tests
for (size_t i = 0; i < sizeof(STEP_TESTS)/sizeof(STEP_TESTS[0]); i++) {
+ // load test data
uint64_t got[25] = { 0 }, exp[25] = { 0 };
memcpy(got, STEP_TESTS[i].vals, sizeof(exp));
memcpy(exp, STEP_TESTS[i].vals, sizeof(exp));
+ // run scalar chi and neon chi
chi_scalar(exp);
chi_neon(got);
+ // compare scalar and neon results
check_state(__func__, i, got, exp);
}
}
void test_iota(void) {
+ // loop over tests
for (size_t i = 0; i < sizeof(STEP_TESTS)/sizeof(STEP_TESTS[0]); i++) {
+ // load test data
uint64_t got[25] = { 0 }, exp[25] = { 0 };
memcpy(got, STEP_TESTS[i].vals, sizeof(exp));
memcpy(exp, STEP_TESTS[i].vals, sizeof(exp));
for (size_t j = 0; j < 24; j++) {
+ // run scalar iota and neon iota
iota_scalar(exp, j);
iota_neon(got, j);
+ // compare scalar and neon results
check_state(__func__, i * 1000 + j, got, exp);
}
}
}
-void test_permute(void) {
+void test_permute_slow(void) {
+ // loop over tests
+ for (size_t z = 0; z < 100000; z++) {
+ for (size_t i = 0; i < sizeof(STEP_TESTS)/sizeof(STEP_TESTS[0]); i++) {
+ // load test data
+ uint64_t got[25] = { 0 }, exp[25] = { 0 };
+ memcpy(got, STEP_TESTS[i].vals, sizeof(exp));
+ memcpy(exp, STEP_TESTS[i].vals, sizeof(exp));
+
+ // run scalar permute and slow neon permute
+ permute_scalar(exp);
+ permute_neon_slow(got);
+
+ // compare scalar and neon results
+ check_state(__func__, i, got, exp);
+ }
+ }
+}
+
+void test_permute_inline(void) {
+ // loop over tests
for (size_t z = 0; z < 100000; z++) {
for (size_t i = 0; i < sizeof(STEP_TESTS)/sizeof(STEP_TESTS[0]); i++) {
+ // load test data
uint64_t got[25] = { 0 }, exp[25] = { 0 };
memcpy(got, STEP_TESTS[i].vals, sizeof(exp));
memcpy(exp, STEP_TESTS[i].vals, sizeof(exp));
+ // run scalar permute and inline neon permute
permute_scalar(exp);
- permute_neon(got);
+ permute_neon_inline(got);
+ // compare scalar and neon results
check_state(__func__, i, got, exp);
}
}
}
int main(void) {
+ // test primitives
test_row_rll();
test_row_rlr();
test_vshl();
test_vshr();
test_row_rol1_u64();
+
+ // test steps
test_theta();
test_rho();
test_pi();
test_chi();
test_iota();
- test_permute();
+ // test permute
+ test_permute_slow();
+ test_permute_inline();
+
+ // print/return success
+ printf("ok\n");
return 0;
}