diff mbox series

[RFC,v2,08/12] crypto: poly1305 - add Poly1305 core API

Message ID 20181015175424.97147-9-ebiggers@kernel.org (mailing list archive)
State RFC
Delegated to: Herbert Xu
Headers show
Series crypto: Adiantum support | expand

Commit Message

Eric Biggers Oct. 15, 2018, 5:54 p.m. UTC
From: Eric Biggers <ebiggers@google.com>

Expose a low-level Poly1305 API which implements the
ε-almost-∆-universal (εA∆U) hash function underlying the Poly1305 MAC
and supports block-aligned inputs only.

This is needed for Adiantum hashing, which builds an εA∆U hash function
from NH and a polynomial evaluation in GF(2^{130}-5); this polynomial
evaluation is identical to the one the Poly1305 MAC does.  However, the
crypto_shash Poly1305 API isn't very appropriate for this because its
calling convention assumes it is used as a MAC, with a 32-byte
"one-time key" provided for every digest.

But by design, in Adiantum hashing the performance of the polynomial
evaluation isn't nearly as critical as NH.  So it suffices to just have
some C helper functions.  Thus, this patch adds such functions.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/poly1305_glue.c |  20 ++--
 crypto/poly1305_generic.c       | 174 ++++++++++++++++++--------------
 include/crypto/poly1305.h       |  28 ++++-
 3 files changed, 136 insertions(+), 86 deletions(-)

Comments

Ard Biesheuvel Oct. 20, 2018, 3:45 a.m. UTC | #1
On 16 October 2018 at 01:54, Eric Biggers <ebiggers@kernel.org> wrote:
> From: Eric Biggers <ebiggers@google.com>
>
> Expose a low-level Poly1305 API which implements the
> ε-almost-∆-universal (εA∆U) hash function underlying the Poly1305 MAC
> and supports block-aligned inputs only.
>
> This is needed for Adiantum hashing, which builds an εA∆U hash function
> from NH and a polynomial evaluation in GF(2^{130}-5); this polynomial
> evaluation is identical to the one the Poly1305 MAC does.  However, the
> crypto_shash Poly1305 API isn't very appropriate for this because its
> calling convention assumes it is used as a MAC, with a 32-byte
> "one-time key" provided for every digest.
>
> But by design, in Adiantum hashing the performance of the polynomial
> evaluation isn't nearly as critical as NH.  So it suffices to just have
> some C helper functions.  Thus, this patch adds such functions.
>
> Signed-off-by: Eric Biggers <ebiggers@google.com>

Could we split this up into
- a patch that updates the poly1305_desc_ctx layout and fixes up all
the references
- a patch that actually breaks out the functionality you need to
access separately

I am aware that you'll end up touching some lines twice, but it should
be much easier to review.

> ---
>  arch/x86/crypto/poly1305_glue.c |  20 ++--
>  crypto/poly1305_generic.c       | 174 ++++++++++++++++++--------------
>  include/crypto/poly1305.h       |  28 ++++-
>  3 files changed, 136 insertions(+), 86 deletions(-)
>
> diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
> index f012b7e28ad1d..88cc01506c84a 100644
> --- a/arch/x86/crypto/poly1305_glue.c
> +++ b/arch/x86/crypto/poly1305_glue.c
> @@ -83,35 +83,37 @@ static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
>         if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) {
>                 if (unlikely(!sctx->wset)) {
>                         if (!sctx->uset) {
> -                               memcpy(sctx->u, dctx->r, sizeof(sctx->u));
> -                               poly1305_simd_mult(sctx->u, dctx->r);
> +                               memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
> +                               poly1305_simd_mult(sctx->u, dctx->r.r);
>                                 sctx->uset = true;
>                         }
>                         memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u));
> -                       poly1305_simd_mult(sctx->u + 5, dctx->r);
> +                       poly1305_simd_mult(sctx->u + 5, dctx->r.r);
>                         memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u));
> -                       poly1305_simd_mult(sctx->u + 10, dctx->r);
> +                       poly1305_simd_mult(sctx->u + 10, dctx->r.r);
>                         sctx->wset = true;
>                 }
>                 blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
> -               poly1305_4block_avx2(dctx->h, src, dctx->r, blocks, sctx->u);
> +               poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks,
> +                                    sctx->u);
>                 src += POLY1305_BLOCK_SIZE * 4 * blocks;
>                 srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
>         }
>  #endif
>         if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
>                 if (unlikely(!sctx->uset)) {
> -                       memcpy(sctx->u, dctx->r, sizeof(sctx->u));
> -                       poly1305_simd_mult(sctx->u, dctx->r);
> +                       memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
> +                       poly1305_simd_mult(sctx->u, dctx->r.r);
>                         sctx->uset = true;
>                 }
>                 blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
> -               poly1305_2block_sse2(dctx->h, src, dctx->r, blocks, sctx->u);
> +               poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks,
> +                                    sctx->u);
>                 src += POLY1305_BLOCK_SIZE * 2 * blocks;
>                 srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
>         }
>         if (srclen >= POLY1305_BLOCK_SIZE) {
> -               poly1305_block_sse2(dctx->h, src, dctx->r, 1);
> +               poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1);
>                 srclen -= POLY1305_BLOCK_SIZE;
>         }
>         return srclen;
> diff --git a/crypto/poly1305_generic.c b/crypto/poly1305_generic.c
> index 47d3a6b83931e..2a06874204e87 100644
> --- a/crypto/poly1305_generic.c
> +++ b/crypto/poly1305_generic.c
> @@ -38,7 +38,7 @@ int crypto_poly1305_init(struct shash_desc *desc)
>  {
>         struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
>
> -       memset(dctx->h, 0, sizeof(dctx->h));
> +       poly1305_core_init(&dctx->h);
>         dctx->buflen = 0;
>         dctx->rset = false;
>         dctx->sset = false;
> @@ -47,23 +47,16 @@ int crypto_poly1305_init(struct shash_desc *desc)
>  }
>  EXPORT_SYMBOL_GPL(crypto_poly1305_init);
>
> -static void poly1305_setrkey(struct poly1305_desc_ctx *dctx, const u8 *key)
> +void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key)
>  {
>         /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
> -       dctx->r[0] = (get_unaligned_le32(key +  0) >> 0) & 0x3ffffff;
> -       dctx->r[1] = (get_unaligned_le32(key +  3) >> 2) & 0x3ffff03;
> -       dctx->r[2] = (get_unaligned_le32(key +  6) >> 4) & 0x3ffc0ff;
> -       dctx->r[3] = (get_unaligned_le32(key +  9) >> 6) & 0x3f03fff;
> -       dctx->r[4] = (get_unaligned_le32(key + 12) >> 8) & 0x00fffff;
> -}
> -
> -static void poly1305_setskey(struct poly1305_desc_ctx *dctx, const u8 *key)
> -{
> -       dctx->s[0] = get_unaligned_le32(key +  0);
> -       dctx->s[1] = get_unaligned_le32(key +  4);
> -       dctx->s[2] = get_unaligned_le32(key +  8);
> -       dctx->s[3] = get_unaligned_le32(key + 12);
> +       key->r[0] = (get_unaligned_le32(raw_key +  0) >> 0) & 0x3ffffff;
> +       key->r[1] = (get_unaligned_le32(raw_key +  3) >> 2) & 0x3ffff03;
> +       key->r[2] = (get_unaligned_le32(raw_key +  6) >> 4) & 0x3ffc0ff;
> +       key->r[3] = (get_unaligned_le32(raw_key +  9) >> 6) & 0x3f03fff;
> +       key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
>  }
> +EXPORT_SYMBOL_GPL(poly1305_core_setkey);
>
>  /*
>   * Poly1305 requires a unique key for each tag, which implies that we can't set
> @@ -75,13 +68,16 @@ unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
>  {
>         if (!dctx->sset) {
>                 if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
> -                       poly1305_setrkey(dctx, src);
> +                       poly1305_core_setkey(&dctx->r, src);
>                         src += POLY1305_BLOCK_SIZE;
>                         srclen -= POLY1305_BLOCK_SIZE;
>                         dctx->rset = true;
>                 }
>                 if (srclen >= POLY1305_BLOCK_SIZE) {
> -                       poly1305_setskey(dctx, src);
> +                       dctx->s[0] = get_unaligned_le32(src +  0);
> +                       dctx->s[1] = get_unaligned_le32(src +  4);
> +                       dctx->s[2] = get_unaligned_le32(src +  8);
> +                       dctx->s[3] = get_unaligned_le32(src + 12);
>                         src += POLY1305_BLOCK_SIZE;
>                         srclen -= POLY1305_BLOCK_SIZE;
>                         dctx->sset = true;
> @@ -91,41 +87,37 @@ unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
>  }
>  EXPORT_SYMBOL_GPL(crypto_poly1305_setdesckey);
>
> -static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
> -                                   const u8 *src, unsigned int srclen,
> -                                   u32 hibit)
> +static void poly1305_blocks_internal(struct poly1305_state *state,
> +                                    const struct poly1305_key *key,
> +                                    const void *src, unsigned int nblocks,
> +                                    u32 hibit)
>  {
>         u32 r0, r1, r2, r3, r4;
>         u32 s1, s2, s3, s4;
>         u32 h0, h1, h2, h3, h4;
>         u64 d0, d1, d2, d3, d4;
> -       unsigned int datalen;
>
> -       if (unlikely(!dctx->sset)) {
> -               datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
> -               src += srclen - datalen;
> -               srclen = datalen;
> -       }
> +       if (!nblocks)
> +               return;
>
> -       r0 = dctx->r[0];
> -       r1 = dctx->r[1];
> -       r2 = dctx->r[2];
> -       r3 = dctx->r[3];
> -       r4 = dctx->r[4];
> +       r0 = key->r[0];
> +       r1 = key->r[1];
> +       r2 = key->r[2];
> +       r3 = key->r[3];
> +       r4 = key->r[4];
>
>         s1 = r1 * 5;
>         s2 = r2 * 5;
>         s3 = r3 * 5;
>         s4 = r4 * 5;
>
> -       h0 = dctx->h[0];
> -       h1 = dctx->h[1];
> -       h2 = dctx->h[2];
> -       h3 = dctx->h[3];
> -       h4 = dctx->h[4];
> -
> -       while (likely(srclen >= POLY1305_BLOCK_SIZE)) {
> +       h0 = state->h[0];
> +       h1 = state->h[1];
> +       h2 = state->h[2];
> +       h3 = state->h[3];
> +       h4 = state->h[4];
>
> +       do {
>                 /* h += m[i] */
>                 h0 += (get_unaligned_le32(src +  0) >> 0) & 0x3ffffff;
>                 h1 += (get_unaligned_le32(src +  3) >> 2) & 0x3ffffff;
> @@ -154,16 +146,36 @@ static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
>                 h1 += h0 >> 26;       h0 = h0 & 0x3ffffff;
>
>                 src += POLY1305_BLOCK_SIZE;
> -               srclen -= POLY1305_BLOCK_SIZE;
> -       }
> +       } while (--nblocks);
>
> -       dctx->h[0] = h0;
> -       dctx->h[1] = h1;
> -       dctx->h[2] = h2;
> -       dctx->h[3] = h3;
> -       dctx->h[4] = h4;
> +       state->h[0] = h0;
> +       state->h[1] = h1;
> +       state->h[2] = h2;
> +       state->h[3] = h3;
> +       state->h[4] = h4;
> +}
>
> -       return srclen;
> +void poly1305_core_blocks(struct poly1305_state *state,
> +                         const struct poly1305_key *key,
> +                         const void *src, unsigned int nblocks)
> +{
> +       poly1305_blocks_internal(state, key, src, nblocks, 1 << 24);
> +}
> +EXPORT_SYMBOL_GPL(poly1305_core_blocks);
> +
> +static void poly1305_blocks(struct poly1305_desc_ctx *dctx,
> +                           const u8 *src, unsigned int srclen, u32 hibit)
> +{
> +       unsigned int datalen;
> +
> +       if (unlikely(!dctx->sset)) {
> +               datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
> +               src += srclen - datalen;
> +               srclen = datalen;
> +       }
> +
> +       poly1305_blocks_internal(&dctx->h, &dctx->r,
> +                                src, srclen / POLY1305_BLOCK_SIZE, hibit);
>  }
>
>  int crypto_poly1305_update(struct shash_desc *desc,
> @@ -187,9 +199,9 @@ int crypto_poly1305_update(struct shash_desc *desc,
>         }
>
>         if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
> -               bytes = poly1305_blocks(dctx, src, srclen, 1 << 24);
> -               src += srclen - bytes;
> -               srclen = bytes;
> +               poly1305_blocks(dctx, src, srclen, 1 << 24);
> +               src += srclen - (srclen % POLY1305_BLOCK_SIZE);
> +               srclen %= POLY1305_BLOCK_SIZE;
>         }
>
>         if (unlikely(srclen)) {
> @@ -201,30 +213,18 @@ int crypto_poly1305_update(struct shash_desc *desc,
>  }
>  EXPORT_SYMBOL_GPL(crypto_poly1305_update);
>
> -int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
> +void poly1305_core_emit(const struct poly1305_state *state, void *dst)
>  {
> -       struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
>         u32 h0, h1, h2, h3, h4;
>         u32 g0, g1, g2, g3, g4;
>         u32 mask;
> -       u64 f = 0;
> -
> -       if (unlikely(!dctx->sset))
> -               return -ENOKEY;
> -
> -       if (unlikely(dctx->buflen)) {
> -               dctx->buf[dctx->buflen++] = 1;
> -               memset(dctx->buf + dctx->buflen, 0,
> -                      POLY1305_BLOCK_SIZE - dctx->buflen);
> -               poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 0);
> -       }
>
>         /* fully carry h */
> -       h0 = dctx->h[0];
> -       h1 = dctx->h[1];
> -       h2 = dctx->h[2];
> -       h3 = dctx->h[3];
> -       h4 = dctx->h[4];
> +       h0 = state->h[0];
> +       h1 = state->h[1];
> +       h2 = state->h[2];
> +       h3 = state->h[3];
> +       h4 = state->h[4];
>
>         h2 += (h1 >> 26);     h1 = h1 & 0x3ffffff;
>         h3 += (h2 >> 26);     h2 = h2 & 0x3ffffff;
> @@ -254,16 +254,40 @@ int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
>         h4 = (h4 & mask) | g4;
>
>         /* h = h % (2^128) */
> -       h0 = (h0 >>  0) | (h1 << 26);
> -       h1 = (h1 >>  6) | (h2 << 20);
> -       h2 = (h2 >> 12) | (h3 << 14);
> -       h3 = (h3 >> 18) | (h4 <<  8);
> +       put_unaligned_le32((h0 >>  0) | (h1 << 26), dst +  0);
> +       put_unaligned_le32((h1 >>  6) | (h2 << 20), dst +  4);
> +       put_unaligned_le32((h2 >> 12) | (h3 << 14), dst +  8);
> +       put_unaligned_le32((h3 >> 18) | (h4 <<  8), dst + 12);
> +}
> +EXPORT_SYMBOL_GPL(poly1305_core_emit);
> +
> +int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
> +{
> +       struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
> +       __le32 digest[4];
> +       u64 f = 0;
> +
> +       if (unlikely(!dctx->sset))
> +               return -ENOKEY;
> +
> +       if (unlikely(dctx->buflen)) {
> +               dctx->buf[dctx->buflen++] = 1;
> +               memset(dctx->buf + dctx->buflen, 0,
> +                      POLY1305_BLOCK_SIZE - dctx->buflen);
> +               poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 0);
> +       }
> +
> +       poly1305_core_emit(&dctx->h, digest);
>
>         /* mac = (h + s) % (2^128) */
> -       f = (f >> 32) + h0 + dctx->s[0]; put_unaligned_le32(f, dst +  0);
> -       f = (f >> 32) + h1 + dctx->s[1]; put_unaligned_le32(f, dst +  4);
> -       f = (f >> 32) + h2 + dctx->s[2]; put_unaligned_le32(f, dst +  8);
> -       f = (f >> 32) + h3 + dctx->s[3]; put_unaligned_le32(f, dst + 12);
> +       f = (f >> 32) + le32_to_cpu(digest[0]) + dctx->s[0];
> +       put_unaligned_le32(f, dst + 0);
> +       f = (f >> 32) + le32_to_cpu(digest[1]) + dctx->s[1];
> +       put_unaligned_le32(f, dst + 4);
> +       f = (f >> 32) + le32_to_cpu(digest[2]) + dctx->s[2];
> +       put_unaligned_le32(f, dst + 8);
> +       f = (f >> 32) + le32_to_cpu(digest[3]) + dctx->s[3];
> +       put_unaligned_le32(f, dst + 12);
>
>         return 0;
>  }
> diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h
> index f718a19da82f7..34317ed2071e6 100644
> --- a/include/crypto/poly1305.h
> +++ b/include/crypto/poly1305.h
> @@ -13,13 +13,21 @@
>  #define POLY1305_KEY_SIZE      32
>  #define POLY1305_DIGEST_SIZE   16
>
> +struct poly1305_key {
> +       u32 r[5];       /* key, base 2^26 */
> +};
> +
> +struct poly1305_state {
> +       u32 h[5];       /* accumulator, base 2^26 */
> +};
> +
>  struct poly1305_desc_ctx {
>         /* key */
> -       u32 r[5];
> +       struct poly1305_key r;
>         /* finalize key */
>         u32 s[4];
>         /* accumulator */
> -       u32 h[5];
> +       struct poly1305_state h;
>         /* partial buffer */
>         u8 buf[POLY1305_BLOCK_SIZE];
>         /* bytes used in partial buffer */
> @@ -30,6 +38,22 @@ struct poly1305_desc_ctx {
>         bool sset;
>  };
>
> +/*
> + * Poly1305 core functions.  These implement the ε-almost-∆-universal hash
> + * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce
> + * ("s key") at the end.  They also only support block-aligned inputs.
> + */
> +void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key);
> +static inline void poly1305_core_init(struct poly1305_state *state)
> +{
> +       memset(state->h, 0, sizeof(state->h));
> +}
> +void poly1305_core_blocks(struct poly1305_state *state,
> +                         const struct poly1305_key *key,
> +                         const void *src, unsigned int nblocks);
> +void poly1305_core_emit(const struct poly1305_state *state, void *dst);
> +
> +/* Crypto API helper functions for the Poly1305 MAC */
>  int crypto_poly1305_init(struct shash_desc *desc);
>  unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
>                                         const u8 *src, unsigned int srclen);
> --
> 2.19.1.331.ge82ca0e54c-goog
>
diff mbox series

Patch

diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index f012b7e28ad1d..88cc01506c84a 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -83,35 +83,37 @@  static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
 	if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) {
 		if (unlikely(!sctx->wset)) {
 			if (!sctx->uset) {
-				memcpy(sctx->u, dctx->r, sizeof(sctx->u));
-				poly1305_simd_mult(sctx->u, dctx->r);
+				memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
+				poly1305_simd_mult(sctx->u, dctx->r.r);
 				sctx->uset = true;
 			}
 			memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u));
-			poly1305_simd_mult(sctx->u + 5, dctx->r);
+			poly1305_simd_mult(sctx->u + 5, dctx->r.r);
 			memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u));
-			poly1305_simd_mult(sctx->u + 10, dctx->r);
+			poly1305_simd_mult(sctx->u + 10, dctx->r.r);
 			sctx->wset = true;
 		}
 		blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
-		poly1305_4block_avx2(dctx->h, src, dctx->r, blocks, sctx->u);
+		poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks,
+				     sctx->u);
 		src += POLY1305_BLOCK_SIZE * 4 * blocks;
 		srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
 	}
 #endif
 	if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
 		if (unlikely(!sctx->uset)) {
-			memcpy(sctx->u, dctx->r, sizeof(sctx->u));
-			poly1305_simd_mult(sctx->u, dctx->r);
+			memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
+			poly1305_simd_mult(sctx->u, dctx->r.r);
 			sctx->uset = true;
 		}
 		blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
-		poly1305_2block_sse2(dctx->h, src, dctx->r, blocks, sctx->u);
+		poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks,
+				     sctx->u);
 		src += POLY1305_BLOCK_SIZE * 2 * blocks;
 		srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
 	}
 	if (srclen >= POLY1305_BLOCK_SIZE) {
-		poly1305_block_sse2(dctx->h, src, dctx->r, 1);
+		poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1);
 		srclen -= POLY1305_BLOCK_SIZE;
 	}
 	return srclen;
diff --git a/crypto/poly1305_generic.c b/crypto/poly1305_generic.c
index 47d3a6b83931e..2a06874204e87 100644
--- a/crypto/poly1305_generic.c
+++ b/crypto/poly1305_generic.c
@@ -38,7 +38,7 @@  int crypto_poly1305_init(struct shash_desc *desc)
 {
 	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 
-	memset(dctx->h, 0, sizeof(dctx->h));
+	poly1305_core_init(&dctx->h);
 	dctx->buflen = 0;
 	dctx->rset = false;
 	dctx->sset = false;
@@ -47,23 +47,16 @@  int crypto_poly1305_init(struct shash_desc *desc)
 }
 EXPORT_SYMBOL_GPL(crypto_poly1305_init);
 
-static void poly1305_setrkey(struct poly1305_desc_ctx *dctx, const u8 *key)
+void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key)
 {
 	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
-	dctx->r[0] = (get_unaligned_le32(key +  0) >> 0) & 0x3ffffff;
-	dctx->r[1] = (get_unaligned_le32(key +  3) >> 2) & 0x3ffff03;
-	dctx->r[2] = (get_unaligned_le32(key +  6) >> 4) & 0x3ffc0ff;
-	dctx->r[3] = (get_unaligned_le32(key +  9) >> 6) & 0x3f03fff;
-	dctx->r[4] = (get_unaligned_le32(key + 12) >> 8) & 0x00fffff;
-}
-
-static void poly1305_setskey(struct poly1305_desc_ctx *dctx, const u8 *key)
-{
-	dctx->s[0] = get_unaligned_le32(key +  0);
-	dctx->s[1] = get_unaligned_le32(key +  4);
-	dctx->s[2] = get_unaligned_le32(key +  8);
-	dctx->s[3] = get_unaligned_le32(key + 12);
+	key->r[0] = (get_unaligned_le32(raw_key +  0) >> 0) & 0x3ffffff;
+	key->r[1] = (get_unaligned_le32(raw_key +  3) >> 2) & 0x3ffff03;
+	key->r[2] = (get_unaligned_le32(raw_key +  6) >> 4) & 0x3ffc0ff;
+	key->r[3] = (get_unaligned_le32(raw_key +  9) >> 6) & 0x3f03fff;
+	key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
 }
+EXPORT_SYMBOL_GPL(poly1305_core_setkey);
 
 /*
  * Poly1305 requires a unique key for each tag, which implies that we can't set
@@ -75,13 +68,16 @@  unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
 {
 	if (!dctx->sset) {
 		if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
-			poly1305_setrkey(dctx, src);
+			poly1305_core_setkey(&dctx->r, src);
 			src += POLY1305_BLOCK_SIZE;
 			srclen -= POLY1305_BLOCK_SIZE;
 			dctx->rset = true;
 		}
 		if (srclen >= POLY1305_BLOCK_SIZE) {
-			poly1305_setskey(dctx, src);
+			dctx->s[0] = get_unaligned_le32(src +  0);
+			dctx->s[1] = get_unaligned_le32(src +  4);
+			dctx->s[2] = get_unaligned_le32(src +  8);
+			dctx->s[3] = get_unaligned_le32(src + 12);
 			src += POLY1305_BLOCK_SIZE;
 			srclen -= POLY1305_BLOCK_SIZE;
 			dctx->sset = true;
@@ -91,41 +87,37 @@  unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
 }
 EXPORT_SYMBOL_GPL(crypto_poly1305_setdesckey);
 
-static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
-				    const u8 *src, unsigned int srclen,
-				    u32 hibit)
+static void poly1305_blocks_internal(struct poly1305_state *state,
+				     const struct poly1305_key *key,
+				     const void *src, unsigned int nblocks,
+				     u32 hibit)
 {
 	u32 r0, r1, r2, r3, r4;
 	u32 s1, s2, s3, s4;
 	u32 h0, h1, h2, h3, h4;
 	u64 d0, d1, d2, d3, d4;
-	unsigned int datalen;
 
-	if (unlikely(!dctx->sset)) {
-		datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
-		src += srclen - datalen;
-		srclen = datalen;
-	}
+	if (!nblocks)
+		return;
 
-	r0 = dctx->r[0];
-	r1 = dctx->r[1];
-	r2 = dctx->r[2];
-	r3 = dctx->r[3];
-	r4 = dctx->r[4];
+	r0 = key->r[0];
+	r1 = key->r[1];
+	r2 = key->r[2];
+	r3 = key->r[3];
+	r4 = key->r[4];
 
 	s1 = r1 * 5;
 	s2 = r2 * 5;
 	s3 = r3 * 5;
 	s4 = r4 * 5;
 
-	h0 = dctx->h[0];
-	h1 = dctx->h[1];
-	h2 = dctx->h[2];
-	h3 = dctx->h[3];
-	h4 = dctx->h[4];
-
-	while (likely(srclen >= POLY1305_BLOCK_SIZE)) {
+	h0 = state->h[0];
+	h1 = state->h[1];
+	h2 = state->h[2];
+	h3 = state->h[3];
+	h4 = state->h[4];
 
+	do {
 		/* h += m[i] */
 		h0 += (get_unaligned_le32(src +  0) >> 0) & 0x3ffffff;
 		h1 += (get_unaligned_le32(src +  3) >> 2) & 0x3ffffff;
@@ -154,16 +146,36 @@  static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
 		h1 += h0 >> 26;       h0 = h0 & 0x3ffffff;
 
 		src += POLY1305_BLOCK_SIZE;
-		srclen -= POLY1305_BLOCK_SIZE;
-	}
+	} while (--nblocks);
 
-	dctx->h[0] = h0;
-	dctx->h[1] = h1;
-	dctx->h[2] = h2;
-	dctx->h[3] = h3;
-	dctx->h[4] = h4;
+	state->h[0] = h0;
+	state->h[1] = h1;
+	state->h[2] = h2;
+	state->h[3] = h3;
+	state->h[4] = h4;
+}
 
-	return srclen;
+void poly1305_core_blocks(struct poly1305_state *state,
+			  const struct poly1305_key *key,
+			  const void *src, unsigned int nblocks)
+{
+	poly1305_blocks_internal(state, key, src, nblocks, 1 << 24);
+}
+EXPORT_SYMBOL_GPL(poly1305_core_blocks);
+
+static void poly1305_blocks(struct poly1305_desc_ctx *dctx,
+			    const u8 *src, unsigned int srclen, u32 hibit)
+{
+	unsigned int datalen;
+
+	if (unlikely(!dctx->sset)) {
+		datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
+		src += srclen - datalen;
+		srclen = datalen;
+	}
+
+	poly1305_blocks_internal(&dctx->h, &dctx->r,
+				 src, srclen / POLY1305_BLOCK_SIZE, hibit);
 }
 
 int crypto_poly1305_update(struct shash_desc *desc,
@@ -187,9 +199,9 @@  int crypto_poly1305_update(struct shash_desc *desc,
 	}
 
 	if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
-		bytes = poly1305_blocks(dctx, src, srclen, 1 << 24);
-		src += srclen - bytes;
-		srclen = bytes;
+		poly1305_blocks(dctx, src, srclen, 1 << 24);
+		src += srclen - (srclen % POLY1305_BLOCK_SIZE);
+		srclen %= POLY1305_BLOCK_SIZE;
 	}
 
 	if (unlikely(srclen)) {
@@ -201,30 +213,18 @@  int crypto_poly1305_update(struct shash_desc *desc,
 }
 EXPORT_SYMBOL_GPL(crypto_poly1305_update);
 
-int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+void poly1305_core_emit(const struct poly1305_state *state, void *dst)
 {
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 	u32 h0, h1, h2, h3, h4;
 	u32 g0, g1, g2, g3, g4;
 	u32 mask;
-	u64 f = 0;
-
-	if (unlikely(!dctx->sset))
-		return -ENOKEY;
-
-	if (unlikely(dctx->buflen)) {
-		dctx->buf[dctx->buflen++] = 1;
-		memset(dctx->buf + dctx->buflen, 0,
-		       POLY1305_BLOCK_SIZE - dctx->buflen);
-		poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 0);
-	}
 
 	/* fully carry h */
-	h0 = dctx->h[0];
-	h1 = dctx->h[1];
-	h2 = dctx->h[2];
-	h3 = dctx->h[3];
-	h4 = dctx->h[4];
+	h0 = state->h[0];
+	h1 = state->h[1];
+	h2 = state->h[2];
+	h3 = state->h[3];
+	h4 = state->h[4];
 
 	h2 += (h1 >> 26);     h1 = h1 & 0x3ffffff;
 	h3 += (h2 >> 26);     h2 = h2 & 0x3ffffff;
@@ -254,16 +254,40 @@  int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
 	h4 = (h4 & mask) | g4;
 
 	/* h = h % (2^128) */
-	h0 = (h0 >>  0) | (h1 << 26);
-	h1 = (h1 >>  6) | (h2 << 20);
-	h2 = (h2 >> 12) | (h3 << 14);
-	h3 = (h3 >> 18) | (h4 <<  8);
+	put_unaligned_le32((h0 >>  0) | (h1 << 26), dst +  0);
+	put_unaligned_le32((h1 >>  6) | (h2 << 20), dst +  4);
+	put_unaligned_le32((h2 >> 12) | (h3 << 14), dst +  8);
+	put_unaligned_le32((h3 >> 18) | (h4 <<  8), dst + 12);
+}
+EXPORT_SYMBOL_GPL(poly1305_core_emit);
+
+int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+{
+	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+	__le32 digest[4];
+	u64 f = 0;
+
+	if (unlikely(!dctx->sset))
+		return -ENOKEY;
+
+	if (unlikely(dctx->buflen)) {
+		dctx->buf[dctx->buflen++] = 1;
+		memset(dctx->buf + dctx->buflen, 0,
+		       POLY1305_BLOCK_SIZE - dctx->buflen);
+		poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+	}
+
+	poly1305_core_emit(&dctx->h, digest);
 
 	/* mac = (h + s) % (2^128) */
-	f = (f >> 32) + h0 + dctx->s[0]; put_unaligned_le32(f, dst +  0);
-	f = (f >> 32) + h1 + dctx->s[1]; put_unaligned_le32(f, dst +  4);
-	f = (f >> 32) + h2 + dctx->s[2]; put_unaligned_le32(f, dst +  8);
-	f = (f >> 32) + h3 + dctx->s[3]; put_unaligned_le32(f, dst + 12);
+	f = (f >> 32) + le32_to_cpu(digest[0]) + dctx->s[0];
+	put_unaligned_le32(f, dst + 0);
+	f = (f >> 32) + le32_to_cpu(digest[1]) + dctx->s[1];
+	put_unaligned_le32(f, dst + 4);
+	f = (f >> 32) + le32_to_cpu(digest[2]) + dctx->s[2];
+	put_unaligned_le32(f, dst + 8);
+	f = (f >> 32) + le32_to_cpu(digest[3]) + dctx->s[3];
+	put_unaligned_le32(f, dst + 12);
 
 	return 0;
 }
diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h
index f718a19da82f7..34317ed2071e6 100644
--- a/include/crypto/poly1305.h
+++ b/include/crypto/poly1305.h
@@ -13,13 +13,21 @@ 
 #define POLY1305_KEY_SIZE	32
 #define POLY1305_DIGEST_SIZE	16
 
+struct poly1305_key {
+	u32 r[5];	/* key, base 2^26 */
+};
+
+struct poly1305_state {
+	u32 h[5];	/* accumulator, base 2^26 */
+};
+
 struct poly1305_desc_ctx {
 	/* key */
-	u32 r[5];
+	struct poly1305_key r;
 	/* finalize key */
 	u32 s[4];
 	/* accumulator */
-	u32 h[5];
+	struct poly1305_state h;
 	/* partial buffer */
 	u8 buf[POLY1305_BLOCK_SIZE];
 	/* bytes used in partial buffer */
@@ -30,6 +38,22 @@  struct poly1305_desc_ctx {
 	bool sset;
 };
 
+/*
+ * Poly1305 core functions.  These implement the ε-almost-∆-universal hash
+ * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce
+ * ("s key") at the end.  They also only support block-aligned inputs.
+ */
+void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key);
+static inline void poly1305_core_init(struct poly1305_state *state)
+{
+	memset(state->h, 0, sizeof(state->h));
+}
+void poly1305_core_blocks(struct poly1305_state *state,
+			  const struct poly1305_key *key,
+			  const void *src, unsigned int nblocks);
+void poly1305_core_emit(const struct poly1305_state *state, void *dst);
+
+/* Crypto API helper functions for the Poly1305 MAC */
 int crypto_poly1305_init(struct shash_desc *desc);
 unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
 					const u8 *src, unsigned int srclen);