/*
* cifra - embedded cryptography library
* Written in 2014 by Joseph Birr-Pixton <jpixton@gmail.com>
*
* To the extent possible under law, the author(s) have dedicated all
* copyright and related and neighboring rights to this software to the
* public domain worldwide. This software is distributed without any
* warranty.
*
* You should have received a copy of the CC0 Public Domain Dedication
* along with this software. If not, see
* <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#include <string.h>
#include "sha3.h"
#include "blockwise.h"
#include "handy.h"
#include "bitops.h"
#include "tassert.h"
/* The round constants, pre-interleaved. See bitinter.py */
static const cf_sha3_bi round_constants[24] = {
{ 0x00000001, 0x00000000 }, { 0x00000000, 0x00000089 },
{ 0x00000000, 0x8000008b }, { 0x00000000, 0x80008080 },
{ 0x00000001, 0x0000008b }, { 0x00000001, 0x00008000 },
{ 0x00000001, 0x80008088 }, { 0x00000001, 0x80000082 },
{ 0x00000000, 0x0000000b }, { 0x00000000, 0x0000000a },
{ 0x00000001, 0x00008082 }, { 0x00000000, 0x00008003 },
{ 0x00000001, 0x0000808b }, { 0x00000001, 0x8000000b },
{ 0x00000001, 0x8000008a }, { 0x00000001, 0x80000081 },
{ 0x00000000, 0x80000081 }, { 0x00000000, 0x80000008 },
{ 0x00000000, 0x00000083 }, { 0x00000000, 0x80008003 },
{ 0x00000001, 0x80008088 }, { 0x00000000, 0x80000088 },
{ 0x00000001, 0x00008000 }, { 0x00000000, 0x80008082 }
};
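/* For example, the standard round constant RC[1] = 0x0000000000008082
 * has set bits only at (0-based) positions 1, 7 and 15. Interleaving
 * compacts the odd-numbered bit positions into one 32-bit word and the
 * even-numbered positions into the other, so those bits land at
 * positions 0, 3 and 7 of one word, giving the second table entry
 * { 0x00000000, 0x00000089 }. */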
static const uint8_t rotation_constants[5][5] = {
{ 0, 1, 62, 28, 27, },
{ 36, 44, 6, 55, 20, },
{ 3, 10, 43, 25, 39, },
{ 41, 45, 15, 21, 8, },
{ 18, 2, 61, 56, 14, }
};
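/* nb. the table above holds the Keccak rho rotation offsets, indexed as
 * rotation_constants[y][x]: it gives the rotation for lane A[x][y],
 * matching the lookup in rho_pi_chi() below. */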
/* --- Bit interleaving and uninterleaving --- */
/* See bitinter.py for models of these bit twiddles. The originals
* come from "Hacker's Delight" by Henry Warren, where they are named
* shuffle2 and unshuffle.
* See:
* http://www.hackersdelight.org/hdcodetxt/shuffle.c.txt
*
* The overriding aim is to change bit ordering:
* AaBbCcDd -> ABCDabcd
* and back. Once they're in the shuffled form, we can extract
* odd/even bits by taking the half words from each pair.
*/
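/* A small worked example of shuffle_out: the odd-positioned input bit 1
 * moves to the bottom of the high halfword, while the even-positioned
 * bit 16 compacts into the low halfword:
 *
 *   shuffle_out(0x00000002) == 0x00010000
 *   shuffle_out(0x00010000) == 0x00000100
 *
 * shuffle_in() reverses both of these. */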
static inline uint32_t shuffle_out(uint32_t x)
{
uint32_t t;
t = (x ^ (x >> 1)) & 0x22222222; x = x ^ t ^ (t << 1);
t = (x ^ (x >> 2)) & 0x0c0c0c0c; x = x ^ t ^ (t << 2);
t = (x ^ (x >> 4)) & 0x00f000f0; x = x ^ t ^ (t << 4);
t = (x ^ (x >> 8)) & 0x0000ff00; x = x ^ t ^ (t << 8);
return x;
}
/* Convert ABCDabcd -> AaBbCcDd. */
static inline uint32_t shuffle_in(uint32_t x)
{
uint32_t t;
t = (x ^ (x >> 8)) & 0x0000ff00; x = x ^ t ^ (t << 8);
t = (x ^ (x >> 4)) & 0x00f000f0; x = x ^ t ^ (t << 4);
t = (x ^ (x >> 2)) & 0x0c0c0c0c; x = x ^ t ^ (t << 2);
t = (x ^ (x >> 1)) & 0x22222222; x = x ^ t ^ (t << 1);
return x;
}
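/* read64_bi() reads a little-endian 64-bit lane and splits it into the
 * bit-interleaved pair: after shuffle_out, each 32-bit half has its
 * even-positioned bits in its low halfword and its odd-positioned bits
 * in its high halfword, so one word of the pair collects the two low
 * halfwords and the other the two high halfwords. write64_bi() is the
 * exact inverse. */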
static inline void read64_bi(cf_sha3_bi *out, const uint8_t data[8])
{
uint32_t lo = read32_le(data + 0),
hi = read32_le(data + 4);
lo = shuffle_out(lo);
hi = shuffle_out(hi);
out->odd = (lo & 0x0000ffff) | (hi << 16);
out->evn = (lo >> 16) | (hi & 0xffff0000);
}
static inline void write64_bi(const cf_sha3_bi *bi, uint8_t data[8])
{
uint32_t lo = (bi->odd & 0x0000ffff) | (bi->evn << 16),
hi = (bi->odd >> 16) | (bi->evn & 0xffff0000);
lo = shuffle_in(lo);
hi = shuffle_in(hi);
write32_le(lo, data + 0);
write32_le(hi, data + 4);
}
static inline void rotl_bi_1(cf_sha3_bi *out, const cf_sha3_bi *in)
{
  /* In the bit-interleaved representation, a 64-bit rotation by 1
   * is a swap of the two words, plus a 1-bit rotation of the word
   * that lands in the odd slot. */
out->odd = rotl32(in->evn, 1);
out->evn = in->odd;
}
static inline void rotl_bi_n(cf_sha3_bi *out, const cf_sha3_bi *in, uint8_t rot)
{
uint8_t half = rot >> 1;
/* nb. rot is a constant, so this isn't a branch leak. */
if (rot & 1)
{
out->odd = rotl32(in->evn, half + 1);
out->evn = rotl32(in->odd, half);
} else {
out->evn = rotl32(in->evn, half);
out->odd = rotl32(in->odd, half);
}
}
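/* For example, rotating a lane left by 5 (half == 2) moves every bit
 * from an even position to an odd one and vice versa, so the two words
 * swap roles: the word stored into out->odd is rotated by half + 1 == 3
 * and the word stored into out->evn by half == 2. An even rotation
 * leaves the words in place and rotates both by half. rotl_bi_1() above
 * is the rot == 1 special case. */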
/* --- */
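/* Rate and capacity are given in bits and sum to 1600, the width of the
 * Keccak-f[1600] state (5x5 lanes of 64 bits); they are kept in the
 * context in bytes. For the SHA-3 variants below, the capacity is twice
 * the digest length. */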
static void sha3_init(cf_sha3_context *ctx, uint16_t rate_bits, uint16_t capacity_bits)
{
mem_clean(ctx, sizeof *ctx);
ctx->rate = rate_bits / 8;
ctx->capacity = capacity_bits / 8;
}
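/* XOR whole 8-byte lanes of input into the state. Lanes are consumed in
 * the usual Keccak order with x varying fastest, so lane i is XORed into
 * A[i % 5][i / 5], i.e. A[x][y] holds lane x + 5*y. */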
static void absorb(cf_sha3_context *ctx, const uint8_t *data, uint16_t sz)
{
uint16_t lanes = sz / 8;
for (uint16_t x = 0, y = 0, i = 0; i < lanes; i++)
{
cf_sha3_bi bi;
read64_bi(&bi, data);
ctx->A[x][y].odd ^= bi.odd;
ctx->A[x][y].evn ^= bi.evn;
data += 8;
x++;
if (x == 5)
{
y++;
x = 0;
}
}
}
/* Integers [-1,20] mod 5, used to avoid a divmod. Indices
 * are constants; not data-dependent. */
static const uint8_t mod5_table[] = {
4,
0,
1, 2, 3, 4, 0, 1, 2, 3, 4, 0,
1, 2, 3, 4, 0, 1, 2, 3, 4, 0
};
#define MOD5(x) (mod5_table[(x) + 1])
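/* e.g. MOD5(-1) == 4, MOD5(6) == 1; arguments range over [-1,20]. */
/* theta step of Keccak-f:
 *   C[x] = A[x][0] ^ A[x][1] ^ A[x][2] ^ A[x][3] ^ A[x][4]
 *   D[x] = C[x-1] ^ (C[x+1] rotated left by 1)
 *   A[x][y] ^= D[x]
 * computed here on the interleaved word pairs. */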
static void theta(cf_sha3_context *ctx)
{
cf_sha3_bi C[5], D[5];
for (int x = 0; x < 5; x++)
{
C[x].odd = ctx->A[x][0].odd ^ ctx->A[x][1].odd ^ ctx->A[x][2].odd ^ ctx->A[x][3].odd ^ ctx->A[x][4].odd;
C[x].evn = ctx->A[x][0].evn ^ ctx->A[x][1].evn ^ ctx->A[x][2].evn ^ ctx->A[x][3].evn ^ ctx->A[x][4].evn;
}
for (int x = 0; x < 5; x++)
{
cf_sha3_bi r;
rotl_bi_1(&r, &C[MOD5(x + 1)]);
D[x].odd = C[MOD5(x - 1)].odd ^ r.odd;
D[x].evn = C[MOD5(x - 1)].evn ^ r.evn;
for (int y = 0; y < 5; y++)
{
ctx->A[x][y].odd ^= D[x].odd;
ctx->A[x][y].evn ^= D[x].evn;
}
}
}
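/* Combined rho, pi and chi steps of Keccak-f:
 *   rho/pi: B[y][(2x + 3y) mod 5] = A[x][y] rotated left by r[x][y]
 *   chi:    A[x][y] = B[x][y] ^ (~B[x+1][y] & B[x+2][y])
 * with x indices taken mod 5 and r[x][y] the offsets in
 * rotation_constants above, again on interleaved word pairs. */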
static void rho_pi_chi(cf_sha3_context *ctx)
{
cf_sha3_bi B[5][5] = { { { 0 } } };
for (int x = 0; x < 5; x++)
for (int y = 0; y < 5; y++)
rotl_bi_n(&B[y][MOD5(2 * x + 3 * y)], &ctx->A[x][y], rotation_constants[y][x]);
for (int x = 0; x < 5; x++)
{
unsigned x1 = MOD5(x + 1);
unsigned x2 = MOD5(x + 2);
for (int y = 0; y < 5; y++)
{
ctx->A[x][y].odd = B[x][y].odd ^ ((~ B[x1][y].odd) & B[x2][y].odd);
ctx->A[x][y].evn = B[x][y].evn ^ ((~ B[x1][y].evn) & B[x2][y].evn);
}
}
}
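/* The full Keccak-f[1600] permutation: 24 rounds, each round being
 * theta, rho/pi/chi, then iota (XOR of the round constant into lane
 * A[0][0]). */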
static void permute(cf_sha3_context *ctx)
{
for (int r = 0; r < 24; r++)
{
theta(ctx);
rho_pi_chi(ctx);
/* iota */
ctx->A[0][0].odd ^= round_constants[r].odd;
ctx->A[0][0].evn ^= round_constants[r].evn;
}
}
static void extract(cf_sha3_context *ctx, uint8_t *out, size_t nbytes)
{
uint16_t lanes = (nbytes + 7) / 8;
for (uint16_t x = 0, y = 0, i = 0; i < lanes; i++)
{
if (nbytes >= 8)
{
write64_bi(&ctx->A[x][y], out);
out += 8;
nbytes -= 8;
} else {
uint8_t buf[8];
write64_bi(&ctx->A[x][y], buf);
memcpy(out, buf, nbytes);
out += nbytes;
nbytes = 0;
}
x++;
if (x == 5)
{
y++;
x = 0;
}
}
}
static void squeeze(cf_sha3_context *ctx, uint8_t *out, size_t nbytes)
{
while (nbytes)
{
size_t take = MIN(nbytes, ctx->rate);
extract(ctx, out, take);
out += take;
nbytes -= take;
assert(nbytes == 0);
#if 0
/* Note: if we ever have |H| >= rate, we need to permute
* after each rate-length block.
*
* This cannot currently happen. */
if (nbytes)
permute(ctx);
#endif
}
}
static void sha3_block(void *vctx, const uint8_t *data)
{
cf_sha3_context *ctx = vctx;
absorb(ctx, data, ctx->rate);
permute(ctx);
}
static void sha3_update(cf_sha3_context *ctx, const void *data, size_t nbytes)
{
cf_blockwise_accumulate(ctx->partial, &ctx->npartial, ctx->rate,
data, nbytes,
sha3_block, ctx);
}
/* Padding and domain separation constants.
*
* FIPS 202 specifies that 0b01 is appended to hash function
* input, and 0b1111 is appended to SHAKE input.
*
* This is done in internal (little endian) bit ordering, and
 * we combine it with the leftmost (first) padding bit, so:
*
* Hash: 0b110
* SHAKE: 0b11111
*/
#define DOMAIN_HASH_PAD 0x06
#define DOMAIN_SHAKE_PAD 0x1f
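/* Working through the hash case: the appended bits are 01 (domain),
 * followed by the pad10*1 padding 1 0...0 1. Bits arrive lsb-first
 * within a byte, so the first padding byte is 0b00000110 = 0x06, the
 * final byte carries the closing 1 in its top bit (0x80), and any bytes
 * in between are 0x00. These are the fbyte/mbyte/lbyte arguments passed
 * to cf_blockwise_acc_pad() in pad() below (which presumably folds the
 * first and last bytes together when only one byte of padding fits). */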
static void pad(cf_sha3_context *ctx, uint8_t domain, size_t npad)
{
assert(npad >= 1);
cf_blockwise_acc_pad(ctx->partial, &ctx->npartial, ctx->rate,
domain, 0x00, 0x80,
npad,
sha3_block, ctx);
}
static void pad_and_squeeze(cf_sha3_context *ctx, uint8_t *out, size_t nout)
{
pad(ctx, DOMAIN_HASH_PAD, ctx->rate - ctx->npartial);
assert(ctx->npartial == 0);
squeeze(ctx, out, nout);
mem_clean(ctx, sizeof *ctx);
}
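/* Each SHA3-* variant below exposes the same pattern: _init, _update,
 * _digest (non-destructive; copies the context) and _digest_final
 * (pads, squeezes and wipes the context). An illustrative one-shot use,
 * assuming the cf_hash() convenience helper from chash.h is available:
 *
 *   uint8_t out[CF_SHA3_256_HASHSZ];
 *   cf_sha3_context ctx;
 *   cf_sha3_256_init(&ctx);
 *   cf_sha3_256_update(&ctx, "abc", 3);
 *   cf_sha3_256_digest_final(&ctx, out);
 *
 * or, equivalently:
 *
 *   cf_hash(&cf_sha3_256, "abc", 3, out);
 */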
/* SHA3-224 */
void cf_sha3_224_init(cf_sha3_context *ctx)
{
sha3_init(ctx, 1152, 448);
}
void cf_sha3_224_update(cf_sha3_context *ctx, const void *data, size_t nbytes)
{
sha3_update(ctx, data, nbytes);
}
void cf_sha3_224_digest(const cf_sha3_context *ctx, uint8_t hash[CF_SHA3_224_HASHSZ])
{
cf_sha3_context ours = *ctx;
cf_sha3_224_digest_final(&ours, hash);
}
void cf_sha3_224_digest_final(cf_sha3_context *ctx, uint8_t hash[CF_SHA3_224_HASHSZ])
{
pad_and_squeeze(ctx, hash, CF_SHA3_224_HASHSZ);
}
const cf_chash cf_sha3_224 = {
.hashsz = CF_SHA3_224_HASHSZ,
.blocksz = CF_SHA3_224_BLOCKSZ,
.init = (cf_chash_init) cf_sha3_224_init,
.update = (cf_chash_update) cf_sha3_224_update,
.digest = (cf_chash_digest) cf_sha3_224_digest
};
/* SHA3-256 */
void cf_sha3_256_init(cf_sha3_context *ctx)
{
sha3_init(ctx, 1088, 512);
}
void cf_sha3_256_update(cf_sha3_context *ctx, const void *data, size_t nbytes)
{
sha3_update(ctx, data, nbytes);
}
void cf_sha3_256_digest(const cf_sha3_context *ctx, uint8_t hash[CF_SHA3_256_HASHSZ])
{
cf_sha3_context ours = *ctx;
cf_sha3_256_digest_final(&ours, hash);
}
void cf_sha3_256_digest_final(cf_sha3_context *ctx, uint8_t hash[CF_SHA3_256_HASHSZ])
{
pad_and_squeeze(ctx, hash, CF_SHA3_256_HASHSZ);
}
const cf_chash cf_sha3_256 = {
.hashsz = CF_SHA3_256_HASHSZ,
.blocksz = CF_SHA3_256_BLOCKSZ,
.init = (cf_chash_init) cf_sha3_256_init,
.update = (cf_chash_update) cf_sha3_256_update,
.digest = (cf_chash_digest) cf_sha3_256_digest
};
/* SHA3-384 */
void cf_sha3_384_init(cf_sha3_context *ctx)
{
sha3_init(ctx, 832, 768);
}
void cf_sha3_384_update(cf_sha3_context *ctx, const void *data, size_t nbytes)
{
sha3_update(ctx, data, nbytes);
}
void cf_sha3_384_digest(const cf_sha3_context *ctx, uint8_t hash[CF_SHA3_384_HASHSZ])
{
cf_sha3_context ours = *ctx;
cf_sha3_384_digest_final(&ours, hash);
}
void cf_sha3_384_digest_final(cf_sha3_context *ctx, uint8_t hash[CF_SHA3_384_HASHSZ])
{
pad_and_squeeze(ctx, hash, CF_SHA3_384_HASHSZ);
}
const cf_chash cf_sha3_384 = {
.hashsz = CF_SHA3_384_HASHSZ,
.blocksz = CF_SHA3_384_BLOCKSZ,
.init = (cf_chash_init) cf_sha3_384_init,
.update = (cf_chash_update) cf_sha3_384_update,
.digest = (cf_chash_digest) cf_sha3_384_digest
};
/* SHA3-512 */
void cf_sha3_512_init(cf_sha3_context *ctx)
{
sha3_init(ctx, 576, 1024);
}
void cf_sha3_512_update(cf_sha3_context *ctx, const void *data, size_t nbytes)
{
sha3_update(ctx, data, nbytes);
}
void cf_sha3_512_digest(const cf_sha3_context *ctx, uint8_t hash[CF_SHA3_512_HASHSZ])
{
cf_sha3_context ours = *ctx;
cf_sha3_512_digest_final(&ours, hash);
}
void cf_sha3_512_digest_final(cf_sha3_context *ctx, uint8_t hash[CF_SHA3_512_HASHSZ])
{
pad_and_squeeze(ctx, hash, CF_SHA3_512_HASHSZ);
}
const cf_chash cf_sha3_512 = {
.hashsz = CF_SHA3_512_HASHSZ,
.blocksz = CF_SHA3_512_BLOCKSZ,
.init = (cf_chash_init) cf_sha3_512_init,
.update = (cf_chash_update) cf_sha3_512_update,
.digest = (cf_chash_digest) cf_sha3_512_digest
};