From ae8484372cbbf0bad041f3bd9b11c18d9cf37be7 Mon Sep 17 00:00:00 2001
From: Paul Duncan <pabs@pablotron.org>
Date: Tue, 16 Jul 2019 20:24:47 -0400
Subject: make faster

---
 sha256.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 125 insertions(+), 44 deletions(-)

(limited to 'sha256.c')

diff --git a/sha256.c b/sha256.c
index 8249a0c..1391466 100644
--- a/sha256.c
+++ b/sha256.c
@@ -41,22 +41,50 @@ void sha256_init(sha256_t * const ctx) {
   memcpy(ctx->h, H, sizeof(H));
 }
 
-// decode buffer data as 32-bit words (used for the first 16 words)
-#define WI(ctx, i) ( \
-  (((uint32_t) (ctx)->buf[4 * (i) + 0]) << 24) | \
-  (((uint32_t) (ctx)->buf[4 * (i) + 1]) << 16) | \
-  (((uint32_t) (ctx)->buf[4 * (i) + 2]) << 8) | \
-  ((uint32_t) (ctx)->buf[4 * (i) + 3]) \
+// WI: decode buffer data as 32-bit words (used for the first 16 words)
+#define WI(i) ( \
+  (((uint32_t) ctx->buf[4 * (i) + 0]) << 24) | \
+  (((uint32_t) ctx->buf[4 * (i) + 1]) << 16) | \
+  (((uint32_t) ctx->buf[4 * (i) + 2]) << 8) | \
+  ((uint32_t) ctx->buf[4 * (i) + 3]) \
 )
 
+// WE: expand first 16 buffer words into remaining 48 words
+#define WE(i) do { \
+  const uint32_t w2 = w[(i) - 2], \
+                 w7 = w[(i) - 7], \
+                 w15 = w[(i) - 15], \
+                 w16 = w[(i) - 16], \
+                 s0 = rr(w15, 7) ^ rr(w15, 18) ^ (w15 >> 3), \
+                 s1 = rr(w2, 17) ^ rr(w2, 19) ^ (w2 >> 10); \
+  w[i] = w16 + s0 + w7 + s1; \
+} while (0)
+
+// WC: compress word
+#define WC(i) do { \
+  const uint32_t s1 = rr(hs[4], 6) ^ rr(hs[4], 11) ^ rr(hs[4], 25), \
+                 ch = (hs[4] & hs[5]) ^ ((~(hs[4])) & hs[6]), \
+                 t0 = hs[7] + s1 + ch + K[i] + w[i], \
+                 s0 = rr(hs[0], 2) ^ rr(hs[0], 13) ^ rr(hs[0], 22), \
+                 mj = (hs[0] & hs[1]) ^ (hs[0] & hs[2]) ^ (hs[1] & hs[2]), \
+                 t1 = s0 + mj; \
+\
+  hs[7] = hs[6]; \
+  hs[6] = hs[5]; \
+  hs[5] = hs[4]; \
+  hs[4] = hs[3] + t0; \
+  hs[3] = hs[2]; \
+  hs[2] = hs[1]; \
+  hs[1] = hs[0]; \
+  hs[0] = t0 + t1; \
+} while (0)
+
 static void
 sha256_block(sha256_t * const ctx) {
   // init first 16 words from buffer
   uint32_t w[64] = {
-    WI(ctx, 0), WI(ctx, 1), WI(ctx, 2), WI(ctx, 3),
-    WI(ctx, 4), WI(ctx, 5), WI(ctx, 6), WI(ctx, 7),
-    WI(ctx, 8), WI(ctx, 9), WI(ctx, 10), WI(ctx, 11),
-    WI(ctx, 12), WI(ctx, 13), WI(ctx, 14), WI(ctx, 15),
+    WI(0), WI(1), WI(2), WI(3), WI(4), WI(5), WI(6), WI(7),
+    WI(8), WI(9), WI(10), WI(11), WI(12), WI(13), WI(14), WI(15),
     0,
   };
 
@@ -67,14 +95,33 @@ sha256_block(sha256_t * const ctx) {
   //   s0 := (w[i-15] rr  7) xor (w[i-15] rr 18) xor (w[i-15] rs  3)
   //   s1 := (w[i- 2] rr 17) xor (w[i- 2] rr 19) xor (w[i- 2] rs 10)
   //   w[i] := w[i-16] + s0 + w[i-7] + s1
+  //
+  // for (size_t i = 16; i < 64; i++) {
+  //   const uint32_t w2 = w[i - 2],
+  //                  w7 = w[i - 7],
+  //                  w15 = w[i - 15],
+  //                  w16 = w[i - 16],
+  //                  s0 = rr(w15, 7) ^ rr(w15, 18) ^ (w15 >> 3),
+  //                  s1 = rr(w2, 17) ^ rr(w2, 19) ^ (w2 >> 10);
+  //   w[i] = w16 + s0 + w7 + s1;
+  // }
+  //
+  // // fully unrolled version:
+  // WE(16); WE(17); WE(18); WE(19); WE(20); WE(21); WE(22); WE(23); WE(24); WE(25);
+  // WE(26); WE(27); WE(28); WE(29); WE(30); WE(31); WE(32); WE(33); WE(34); WE(35);
+  // WE(36); WE(37); WE(38); WE(39); WE(40); WE(41); WE(42); WE(43); WE(44); WE(45);
+  // WE(46); WE(47); WE(48); WE(49); WE(50); WE(51); WE(52); WE(53); WE(54); WE(55);
+  // WE(56); WE(57); WE(58); WE(59); WE(60); WE(61); WE(62); WE(63);
+  //
+  // partially unrolled:
+  // for (size_t we_i = 16; we_i < 64; we_i += 16) {
+  //   WE(we_i + 0); WE(we_i + 1); WE(we_i + 2); WE(we_i + 3);
+  //   WE(we_i + 4); WE(we_i + 5); WE(we_i + 6); WE(we_i + 7);
+  //   WE(we_i + 8); WE(we_i + 9); WE(we_i + 10); WE(we_i + 11);
+  //   WE(we_i + 12); WE(we_i + 13); WE(we_i + 14); WE(we_i + 15);
+  // }
   for (size_t i = 16; i < 64; i++) {
-    const uint32_t w2 = w[i - 2],
-                   w7 = w[i - 7],
-                   w15 = w[i - 15],
-                   w16 = w[i - 16],
-                   s0 = rr(w15, 7) ^ rr(w15, 18) ^ (w15 >> 3),
-                   s1 = rr(w2, 17) ^ rr(w2, 19) ^ (w2 >> 10);
-    w[i] = w16 + s0 + w7 + s1;
+    WE(i);
   }
 
   // Initialize working variables to current hash value
@@ -101,22 +148,41 @@ sha256_block(sha256_t * const ctx) {
   //   c := b
   //   b := a
   //   a := temp1 + temp2
-  for (size_t i = 0; i < 64; i++) {
-    const uint32_t s1 = rr(hs[4], 6) ^ rr(hs[4], 11) ^ rr(hs[4], 25),
-                   ch = (hs[4] & hs[5]) ^ ((~(hs[4])) & hs[6]),
-                   t0 = hs[7] + s1 + ch + K[i] + w[i],
-                   s0 = rr(hs[0], 2) ^ rr(hs[0], 13) ^ rr(hs[0], 22),
-                   mj = (hs[0] & hs[1]) ^ (hs[0] & hs[2]) ^ (hs[1] & hs[2]),
-                   t1 = s0 + mj;
-
-    hs[7] = hs[6];
-    hs[6] = hs[5];
-    hs[5] = hs[4];
-    hs[4] = hs[3] + t0;
-    hs[3] = hs[2];
-    hs[2] = hs[1];
-    hs[1] = hs[0];
-    hs[0] = t0 + t1;
+  //
+  // for (size_t i = 0; i < 64; i++) {
+  //   const uint32_t s1 = rr(hs[4], 6) ^ rr(hs[4], 11) ^ rr(hs[4], 25),
+  //                  ch = (hs[4] & hs[5]) ^ ((~(hs[4])) & hs[6]),
+  //                  t0 = hs[7] + s1 + ch + K[i] + w[i],
+  //                  s0 = rr(hs[0], 2) ^ rr(hs[0], 13) ^ rr(hs[0], 22),
+  //                  mj = (hs[0] & hs[1]) ^ (hs[0] & hs[2]) ^ (hs[1] & hs[2]),
+  //                  t1 = s0 + mj;
+  //
+  //   hs[7] = hs[6];
+  //   hs[6] = hs[5];
+  //   hs[5] = hs[4];
+  //   hs[4] = hs[3] + t0;
+  //   hs[3] = hs[2];
+  //   hs[2] = hs[1];
+  //   hs[1] = hs[0];
+  //   hs[0] = t0 + t1;
+  // }
+  //
+  // // fully unrolled version:
+  // WC(0); WC(1); WC(2); WC(3); WC(4); WC(5); WC(6); WC(7);
+  // WC(8); WC(9); WC(10); WC(11); WC(12); WC(13); WC(14); WC(15);
+  // WC(16); WC(17); WC(18); WC(19); WC(20); WC(21); WC(22); WC(23);
+  // WC(24); WC(25); WC(26); WC(27); WC(28); WC(29); WC(30); WC(31);
+  // WC(32); WC(33); WC(34); WC(35); WC(36); WC(37); WC(38); WC(39);
+  // WC(40); WC(41); WC(42); WC(43); WC(44); WC(45); WC(46); WC(47);
+  // WC(48); WC(49); WC(50); WC(51); WC(52); WC(53); WC(54); WC(55);
+  // WC(56); WC(57); WC(58); WC(59); WC(60); WC(61); WC(62); WC(63);
+  //
+  // partially unrolled:
+  for (size_t i = 0; i < 64; i += 16) {
+    WC(i + 0); WC(i + 1); WC(i + 2); WC(i + 3);
+    WC(i + 4); WC(i + 5); WC(i + 6); WC(i + 7);
+    WC(i + 8); WC(i + 9); WC(i + 10); WC(i + 11);
+    WC(i + 12); WC(i + 13); WC(i + 14); WC(i + 15);
   }
 
   // Add the compressed chunk to the current hash value
@@ -131,22 +197,43 @@ sha256_block(sha256_t * const ctx) {
 }
 
 #undef WI
+#undef WE
+#undef WC
+
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
 
 void sha256_push(
   sha256_t * const ctx,
   const uint8_t * const src,
   const size_t src_len
 ) {
-  for (size_t i = 0; i < src_len; i++) {
-    ctx->buf[ctx->buf_len] = src[i];
-    ctx->buf_len++;
+  const size_t buf_left = 64 - ctx->buf_len;
+
+  if (src_len >= buf_left) {
+    // fill remaining buffer
+    memcpy(ctx->buf + ctx->buf_len, src, buf_left);
+    sha256_block(ctx);
+    ctx->buf_len = 0;
+
+    const size_t new_src_len = src_len - buf_left;
+    const size_t num_blocks = new_src_len / 64;
 
-    if (ctx->buf_len == 64) {
+    // process chunks
+    for (size_t i = 0; i < num_blocks; i++) {
+      memcpy(ctx->buf, src + buf_left + (64 * i), 64);
       sha256_block(ctx);
-      ctx->buf_len = 0;
     }
+
+    // copy remaining bytes to buffer
+    const size_t new_buf_len = (new_src_len - 64 * num_blocks);
+    memcpy(ctx->buf, src + buf_left + (64 * num_blocks), new_buf_len);
+    ctx->buf_len = new_buf_len;
+  } else {
+    memcpy(ctx->buf + ctx->buf_len, src, src_len);
+    ctx->buf_len += src_len;
   }
 
+  // update byte count
   ctx->num_bytes += src_len;
 }
 
@@ -191,18 +278,12 @@ void sha256_fini(
   const uint64_t num_bytes = ctx->num_bytes;
   const size_t pad_len = (65 - ((num_bytes + 1 + 8) % 64));
 
-  // fprintf(stderr, "ctx->num_bytes (before pad) = %lu\n", ctx->num_bytes);
-
   // push padding
   sha256_push(ctx, PADDING, pad_len);
 
-  // fprintf(stderr, "ctx->num_bytes (before len) = %lu\n", ctx->num_bytes);
-
   // push length (in bits)
   sha256_push_u64(ctx, num_bytes * 8);
 
-  // fprintf(stderr, "ctx->num_bytes (after len) = %lu\n", ctx->num_bytes);
-
   // extract hash
   const uint8_t hash[32] = {
     WB(ctx, 0), WB(ctx, 1), WB(ctx, 2), WB(ctx, 3),
-- 
cgit v1.2.3