
[v5,4/4] crypto: Add Allwinner Security System crypto accelerator

Message ID 1413728182-13569-5-git-send-email-clabbe.montjoie@gmail.com (mailing list archive)
State New, archived

Commit Message

Corentin Labbe Oct. 19, 2014, 2:16 p.m. UTC
Add support for the Security System included in Allwinner SoC A20.
The Security System is a hardware cryptographic accelerator that supports the AES/MD5/SHA1/DES/3DES/PRNG algorithms.

Signed-off-by: LABBE Corentin <clabbe.montjoie@gmail.com>
---
 drivers/crypto/Kconfig                    |  17 ++
 drivers/crypto/Makefile                   |   1 +
 drivers/crypto/sunxi-ss/Makefile          |   2 +
 drivers/crypto/sunxi-ss/sunxi-ss-cipher.c | 489 ++++++++++++++++++++++++++++++
 drivers/crypto/sunxi-ss/sunxi-ss-core.c   | 318 +++++++++++++++++++
 drivers/crypto/sunxi-ss/sunxi-ss-hash.c   | 445 +++++++++++++++++++++++++++
 drivers/crypto/sunxi-ss/sunxi-ss.h        | 193 ++++++++++++
 7 files changed, 1465 insertions(+)
 create mode 100644 drivers/crypto/sunxi-ss/Makefile
 create mode 100644 drivers/crypto/sunxi-ss/sunxi-ss-cipher.c
 create mode 100644 drivers/crypto/sunxi-ss/sunxi-ss-core.c
 create mode 100644 drivers/crypto/sunxi-ss/sunxi-ss-hash.c
 create mode 100644 drivers/crypto/sunxi-ss/sunxi-ss.h

Comments

Joe Perches Oct. 20, 2014, 11:52 p.m. UTC | #1
On Tue, 2014-10-21 at 02:28 +0300, Vladimir Zapolskiy wrote:
> On 19.10.2014 17:16, LABBE Corentin wrote:
> > Add support for the Security System included in Allwinner SoC A20.
> > The Security System is a hardware cryptographic accelerator that support AES/MD5/SHA1/DES/3DES/PRNG algorithms.
[]
> > diff --git a/drivers/crypto/sunxi-ss/sunxi-ss-core.c b/drivers/crypto/sunxi-ss/sunxi-ss-core.c
[]
> > +	cr = clk_get_rate(ss->busclk);
> > +	if (cr >= cr_ahb)
> > +		dev_dbg(&pdev->dev, "Clock bus %lu (%lu MHz) (must be >= %lu)\n",
> > +				cr, cr / 1000000, cr_ahb);
> > +	else
> > +		dev_warn(&pdev->dev, "Clock bus %lu (%lu MHz) (must be >= %lu)\n",
> > +				cr, cr / 1000000, cr_ahb);
> 
> See next comment.
> 
> > +	cr = clk_get_rate(ss->ssclk);
> > +	if (cr <= cr_mod)
> > +		if (cr < cr_mod)
> > +			dev_info(&pdev->dev, "Clock ss %lu (%lu MHz) (must be <= %lu)\n",
> > +					cr, cr / 1000000, cr_mod);
> > +		else
> > +			dev_dbg(&pdev->dev, "Clock ss %lu (%lu MHz) (must be <= %lu)\n",
> > +					cr, cr / 1000000, cr_mod);
> > +	else
> > +		dev_warn(&pdev->dev, "Clock ss is at %lu (%lu MHz) (must be <= %lu)\n",
> > +				cr, cr / 1000000, cr_mod);
> 
> The management of kernel log levels looks pretty strange. As far as I
> understand there is no error on any clock rate, I'd recommend to keep
> only one information message.

And if not, please add some braces.
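
For illustration only, a consolidated version of those checks could look roughly like this (a sketch, keeping one message per out-of-spec clock and reusing the names from the quoted probe code):

	cr = clk_get_rate(ss->busclk);
	if (cr < cr_ahb) {
		/* below the recommended bus rate: only performance suffers */
		dev_warn(&pdev->dev, "Bus clock %lu Hz is below the recommended %lu Hz\n",
			 cr, cr_ahb);
	}

	cr = clk_get_rate(ss->ssclk);
	if (cr > cr_mod) {
		/* above the maximum rate given in the datasheet */
		dev_warn(&pdev->dev, "SS clock %lu Hz is above the maximum %lu Hz\n",
			 cr, cr_mod);
	}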

> hash_init: initialize request context */
> > +int sunxi_hash_init(struct ahash_request *areq)
> > +{
> > +	const char *hash_type;
> > +	struct sunxi_req_ctx *op = ahash_request_ctx(areq);
> > +
> > +	memset(op, 0, sizeof(struct sunxi_req_ctx));
> > +
> > +	hash_type = crypto_tfm_alg_name(areq->base.tfm);
> > +
> > +	if (strcmp(hash_type, "sha1") == 0)
> > +		op->mode = SS_OP_SHA1;
> > +	if (strcmp(hash_type, "md5") == 0)
> > +		op->mode = SS_OP_MD5;

else if ?

> > +	if (op->mode == 0)
> > +		return -EINVAL;

maybe this?

	if (!strcmp(hash_type, "sha1"))
		op->mode = SS_OP_SHA1;
	else if (!strcmp(hash_type, "md5"))
		op->mode = SS_OP_MD5;
	else
		return -EINVAL;

> > +
> > +	return 0;
> > +}
[]
> > +int sunxi_hash_update(struct ahash_request *areq)
> > +{
[]
> > +	dev_dbg(ss->dev, "%s %s bc=%llu len=%u mode=%x bw=%u ww=%u",
> > +			__func__, crypto_tfm_alg_name(areq->base.tfm),
> > +			op->byte_count, areq->nbytes, op->mode,
> > +			op->nbw, op->nwait);

dev_dbg statements generally don't need __func__ as
dynamic_debug can add it.

If you want to keep it, the most common output form for
__func__ is '"%s: ...", __func__'
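
For example, keeping the arguments of the quoted call but with an illustrative message only:

	dev_dbg(ss->dev, "%s: bc=%llu len=%u mode=%x\n",
		__func__, op->byte_count, areq->nbytes, op->mode);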
Corentin Labbe Oct. 21, 2014, 4:25 p.m. UTC | #2
On 10/21/14 01:28, Vladimir Zapolskiy wrote:
> Hello LABBE,
> 
> On 19.10.2014 17:16, LABBE Corentin wrote:
>> Add support for the Security System included in Allwinner SoC A20.
>> The Security System is a hardware cryptographic accelerator that support AES/MD5/SHA1/DES/3DES/PRNG algorithms.
>>
[]
>> +
>> +	/* If we have only one SG, we can use kmap_atomic */
>> +	if (sg_next(in_sg) == NULL && sg_next(out_sg) == NULL)
>> +		return sunxi_ss_aes_poll_atomic(areq);
> 
> for clarity it might be better to move all "mutex_unlock(&ss->lock)"
> calls from sunxi_ss_aes_poll_atomic() body right to here.
> 

Ok
I have moved all the mutex_unlock()/writel(0, SS_CTL) calls to the end of the function; it is cleaner now.
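
A rough sketch of that shape (assuming an err variable and a hypothetical common exit label in sunxi_ss_aes_poll(); the single-SG helper would then no longer touch ss->lock or SS_CTL itself):

	/* single-SG fast path in sunxi_ss_aes_poll(), sketch only */
	if (sg_next(in_sg) == NULL && sg_next(out_sg) == NULL) {
		err = sunxi_ss_aes_poll_atomic(areq);
		goto release_ss;
	}

	/* ... multi-SG path unchanged ... */

release_ss:
	writel(0, ss->base + SS_CTL);
	mutex_unlock(&ss->lock);
	return err;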

>> +
>> +};
>> +
>> +static int sunxi_ss_probe(struct platform_device *pdev)
>> +{
>> +	struct resource *res;
>> +	u32 v;
>> +	int err;
>> +	unsigned long cr;
>> +	const unsigned long cr_ahb = 24 * 1000 * 1000;
>> +	const unsigned long cr_mod = 150 * 1000 * 1000;
>> +
>> +	if (!pdev->dev.of_node)
>> +		return -ENODEV;
>> +
>> +	ss = devm_kzalloc(&pdev->dev, sizeof(*ss), GFP_KERNEL);
>> +	if (ss == NULL)
>> +		return -ENOMEM;
> 
> Why do you dynamically allocate memory for "struct sunxi_ss_ctx *ss"?
> Since you have a single global pointer, it makes sense to declare
> "struct sunxi_ss_ctx ss" statically instead.
> 
> And even a better solution is to remove a single global pointer.

All the other crypto drivers I have read use a global structure, and it made things easy.
Thanks to M. Ripard, who pointed me to the talitos driver, which avoids the global device pointer by using an alg template and container_of().

But since I think there will never be two Security Systems at the same time on the same SoC, I do not know whether it is worth adding this complexity just to remove a pointer.
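
For reference, a rough sketch of the talitos-style approach (illustrative only, not the patch's code: each registered algorithm is wrapped in a template that carries the device context, and an extra ss field is assumed in the tfm context):

	struct sunxi_ss_alg_template {
		struct crypto_alg alg;
		struct sunxi_ss_ctx *ss;	/* filled in at probe time */
	};

	static int sunxi_ss_cipher_init(struct crypto_tfm *tfm)
	{
		struct sunxi_tfm_ctx *op = crypto_tfm_ctx(tfm);
		struct sunxi_ss_alg_template *tmpl =
			container_of(tfm->__crt_alg,
				     struct sunxi_ss_alg_template, alg);

		memset(op, 0, sizeof(*op));
		op->ss = tmpl->ss;	/* no global pointer needed */
		return 0;
	}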

> 
>> +
>> +	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
>> +	ss->base = devm_ioremap_resource(&pdev->dev, res);
>> +	if (IS_ERR(ss->base)) {
>> +		dev_err(&pdev->dev, "Cannot request MMIO\n");
>> +		return PTR_ERR(ss->base);
>> +	}
>> +
>> +	ss->ssclk = devm_clk_get(&pdev->dev, "mod");
>> +	if (IS_ERR(ss->ssclk)) {
>> +		err = PTR_ERR(ss->ssclk);
>> +		dev_err(&pdev->dev, "Cannot get SS clock err=%d\n", err);
>> +		return err;
>> +	}
>> +	dev_dbg(&pdev->dev, "clock ss acquired\n");
>> +
>> +	ss->busclk = devm_clk_get(&pdev->dev, "ahb");
>> +	if (IS_ERR(ss->busclk)) {
>> +		err = PTR_ERR(ss->busclk);
>> +		dev_err(&pdev->dev, "Cannot get AHB SS clock err=%d\n", err);
>> +		return err;
>> +	}
>> +	dev_dbg(&pdev->dev, "clock ahb_ss acquired\n");
>> +
>> +	/* Enable both clocks */
>> +	err = clk_prepare_enable(ss->busclk);
>> +	if (err != 0) {
>> +		dev_err(&pdev->dev, "Cannot prepare_enable busclk\n");
>> +		return err;
>> +	}
>> +	err = clk_prepare_enable(ss->ssclk);
>> +	if (err != 0) {
>> +		dev_err(&pdev->dev, "Cannot prepare_enable ssclk\n");
>> +		clk_disable_unprepare(ss->busclk);
> 
> goto somewhere to the end of the function?

OK

> 
>> +		return err;
>> +	}
>> +
>> +	/*
>> +	 * Check that clock have the correct rates gived in the datasheet
>> +	 * Try to set the clock to the maximum allowed
>> +	 */
>> +	err = clk_set_rate(ss->ssclk, cr_mod);
>> +	if (err != 0) {
>> +		dev_err(&pdev->dev, "Cannot set clock rate to ssclk\n");
>> +		clk_disable_unprepare(ss->ssclk);
>> +		clk_disable_unprepare(ss->busclk);
> 
> goto "error_md5"?

Ok
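
A sketch of what that could give in the probe function (error_busclk is a hypothetical extra label so that a failed ssclk prepare only undoes the bus clock; the existing error_ciphers/error_sha1 labels stay as they are):

	err = clk_prepare_enable(ss->ssclk);
	if (err != 0) {
		dev_err(&pdev->dev, "Cannot prepare_enable ssclk\n");
		goto error_busclk;
	}

	err = clk_set_rate(ss->ssclk, cr_mod);
	if (err != 0) {
		dev_err(&pdev->dev, "Cannot set clock rate to ssclk\n");
		goto error_md5;		/* both clocks enabled by now */
	}

	/* ... rest of probe ... */

error_md5:
	clk_disable_unprepare(ss->ssclk);
error_busclk:
	clk_disable_unprepare(ss->busclk);
	return err;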

> 
>> +		return err;
>> +	}
>> +
>> +	cr = clk_get_rate(ss->busclk);
>> +	if (cr >= cr_ahb)
>> +		dev_dbg(&pdev->dev, "Clock bus %lu (%lu MHz) (must be >= %lu)\n",
>> +				cr, cr / 1000000, cr_ahb);
>> +	else
>> +		dev_warn(&pdev->dev, "Clock bus %lu (%lu MHz) (must be >= %lu)\n",
>> +				cr, cr / 1000000, cr_ahb);
> 
> See next comment.
> 
>> +	cr = clk_get_rate(ss->ssclk);
>> +	if (cr <= cr_mod)
>> +		if (cr < cr_mod)
>> +			dev_info(&pdev->dev, "Clock ss %lu (%lu MHz) (must be <= %lu)\n",
>> +					cr, cr / 1000000, cr_mod);
>> +		else
>> +			dev_dbg(&pdev->dev, "Clock ss %lu (%lu MHz) (must be <= %lu)\n",
>> +					cr, cr / 1000000, cr_mod);
>> +	else
>> +		dev_warn(&pdev->dev, "Clock ss is at %lu (%lu MHz) (must be <= %lu)\n",
>> +				cr, cr / 1000000, cr_mod);
> 
> The management of kernel log levels looks pretty strange. As far as I
> understand there is no error on any clock rate, I'd recommend to keep
> only one information message.
> 

If the clock rates are below the recommended values, the only impact I found was bad performance.
That explains the warn and the absence of an error. (Yes, the info must be a warn; fixed.)

I will add a comment to explain that.

>> +	/*
>> +	 * Datasheet named it "Die Bonding ID"
>> +	 * I expect to be a sort of Security System Revision number.
>> +	 * Since the A80 seems to have an other version of SS
>> +	 * this info could be useful
>> +	 */
>> +	writel(SS_ENABLED, ss->base + SS_CTL);
>> +	v = readl(ss->base + SS_CTL);
>> +	v >>= 16;
>> +	v &= 0x07;
>> +	dev_info(&pdev->dev, "Die ID %d\n", v);
>> +	writel(0, ss->base + SS_CTL);
>> +
>> +	ss->dev = &pdev->dev;
>> +
>> +	mutex_init(&ss->lock);
>> +	mutex_init(&ss->bufin_lock);
>> +	mutex_init(&ss->bufout_lock);
>> +
>> +	err = crypto_register_ahash(&sunxi_md5_alg);
>> +	if (err)
>> +		goto error_md5;
>> +	err = crypto_register_ahash(&sunxi_sha1_alg);
>> +	if (err)
>> +		goto error_sha1;
>> +	err = crypto_register_algs(sunxi_cipher_algs,
>> +			ARRAY_SIZE(sunxi_cipher_algs));
>> +	if (err)
>> +		goto error_ciphers;
>> +
>> +	return 0;
>> +error_ciphers:
>> +	crypto_unregister_ahash(&sunxi_sha1_alg);
>> +error_sha1:
>> +	crypto_unregister_ahash(&sunxi_md5_alg);
>> +error_md5:
>> +	clk_disable_unprepare(ss->ssclk);
>> +	clk_disable_unprepare(ss->busclk);
>> +	return err;
>> +}
>> +
>> +static int __exit sunxi_ss_remove(struct platform_device *pdev)
>> +{
>> +	if (!pdev->dev.of_node)
>> +		return 0;
> 
> Redundant check.
> 

Ok

> 
> 
> --
> With best wishes,
> Vladimir
> 

Thanks for the review
Corentin Labbe Oct. 21, 2014, 4:39 p.m. UTC | #3
Le 21/10/2014 01:52, Joe Perches a écrit :
> On Tue, 2014-10-21 at 02:28 +0300, Vladimir Zapolskiy wrote:
>> On 19.10.2014 17:16, LABBE Corentin wrote:
>>> Add support for the Security System included in Allwinner SoC A20.
>>> The Security System is a hardware cryptographic accelerator that support AES/MD5/SHA1/DES/3DES/PRNG algorithms.
> []
>>> diff --git a/drivers/crypto/sunxi-ss/sunxi-ss-core.c b/drivers/crypto/sunxi-ss/sunxi-ss-core.c
> []
>>> +	cr = clk_get_rate(ss->busclk);
>>> +	if (cr >= cr_ahb)
>>> +		dev_dbg(&pdev->dev, "Clock bus %lu (%lu MHz) (must be >= %lu)\n",
>>> +				cr, cr / 1000000, cr_ahb);
>>> +	else
>>> +		dev_warn(&pdev->dev, "Clock bus %lu (%lu MHz) (must be >= %lu)\n",
>>> +				cr, cr / 1000000, cr_ahb);
>>
>> See next comment.
>>
>>> +	cr = clk_get_rate(ss->ssclk);
>>> +	if (cr <= cr_mod)
>>> +		if (cr < cr_mod)
>>> +			dev_info(&pdev->dev, "Clock ss %lu (%lu MHz) (must be <= %lu)\n",
>>> +					cr, cr / 1000000, cr_mod);
>>> +		else
>>> +			dev_dbg(&pdev->dev, "Clock ss %lu (%lu MHz) (must be <= %lu)\n",
>>> +					cr, cr / 1000000, cr_mod);
>>> +	else
>>> +		dev_warn(&pdev->dev, "Clock ss is at %lu (%lu MHz) (must be <= %lu)\n",
>>> +				cr, cr / 1000000, cr_mod);
>>
>> The management of kernel log levels looks pretty strange. As far as I
>> understand there is no error on any clock rate, I'd recommend to keep
>> only one information message.
> 
> And if not, please add some braces.
> 
>> hash_init: initialize request context */
>>> +int sunxi_hash_init(struct ahash_request *areq)
>>> +{
>>> +	const char *hash_type;
>>> +	struct sunxi_req_ctx *op = ahash_request_ctx(areq);
>>> +
>>> +	memset(op, 0, sizeof(struct sunxi_req_ctx));
>>> +
>>> +	hash_type = crypto_tfm_alg_name(areq->base.tfm);
>>> +
>>> +	if (strcmp(hash_type, "sha1") == 0)
>>> +		op->mode = SS_OP_SHA1;
>>> +	if (strcmp(hash_type, "md5") == 0)
>>> +		op->mode = SS_OP_MD5;
> 
> else if ?
> 
>>> +	if (op->mode == 0)
>>> +		return -EINVAL;
> 
> maybe this?
> 
> 	if (!strcmp(hash_type, "sha1"))
> 		op->mode = SS_OP_SHA1;
> 	else if (!strcmp(hash_type, "md5"))
> 		op->mode = SH_OP_MD5;
> 	else
> 		return -EINVAL;
> 

OK, it is better.

>>> +
>>> +	return 0;
>>> +}
> []
>>> +int sunxi_hash_update(struct ahash_request *areq)
>>> +{
> []
>>> +	dev_dbg(ss->dev, "%s %s bc=%llu len=%u mode=%x bw=%u ww=%u",
>>> +			__func__, crypto_tfm_alg_name(areq->base.tfm),
>>> +			op->byte_count, areq->nbytes, op->mode,
>>> +			op->nbw, op->nwait);
> 
> dev_dbg statements generally don't need __func__ as
> dynamic_debug can add it.
> 
> If you want to keep it, the most common output form for
> __func__ is '"%s: ...", __func__'
> 

It is a leftover debug statement that I forgot to remove, but I have fixed that in the other dev_dbg calls.

thanks
Maxime Ripard Oct. 21, 2014, 7:11 p.m. UTC | #4
Hi Corentin,

Thanks for resending it.

On Sun, Oct 19, 2014 at 04:16:22PM +0200, LABBE Corentin wrote:
> Add support for the Security System included in Allwinner SoC A20.
> The Security System is a hardware cryptographic accelerator that support AES/MD5/SHA1/DES/3DES/PRNG algorithms.
> 
> Signed-off-by: LABBE Corentin <clabbe.montjoie@gmail.com>
> ---
>  drivers/crypto/Kconfig                    |  17 ++
>  drivers/crypto/Makefile                   |   1 +
>  drivers/crypto/sunxi-ss/Makefile          |   2 +
>  drivers/crypto/sunxi-ss/sunxi-ss-cipher.c | 489 ++++++++++++++++++++++++++++++
>  drivers/crypto/sunxi-ss/sunxi-ss-core.c   | 318 +++++++++++++++++++
>  drivers/crypto/sunxi-ss/sunxi-ss-hash.c   | 445 +++++++++++++++++++++++++++
>  drivers/crypto/sunxi-ss/sunxi-ss.h        | 193 ++++++++++++
>  7 files changed, 1465 insertions(+)
>  create mode 100644 drivers/crypto/sunxi-ss/Makefile
>  create mode 100644 drivers/crypto/sunxi-ss/sunxi-ss-cipher.c
>  create mode 100644 drivers/crypto/sunxi-ss/sunxi-ss-core.c
>  create mode 100644 drivers/crypto/sunxi-ss/sunxi-ss-hash.c
>  create mode 100644 drivers/crypto/sunxi-ss/sunxi-ss.h
> 
> diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
> index 2fb0fdf..9ba9759 100644
> --- a/drivers/crypto/Kconfig
> +++ b/drivers/crypto/Kconfig
> @@ -436,4 +436,21 @@ config CRYPTO_DEV_QCE
>  	  hardware. To compile this driver as a module, choose M here. The
>  	  module will be called qcrypto.
>  
> +config CRYPTO_DEV_SUNXI_SS
> +	tristate "Support for Allwinner Security System cryptographic accelerator"
> +	depends on ARCH_SUNXI
> +	select CRYPTO_MD5
> +	select CRYPTO_SHA1
> +	select CRYPTO_AES
> +	select CRYPTO_DES
> +	select CRYPTO_BLKCIPHER
> +	help
> +	  Some Allwinner SoC have a crypto accelerator named
> +	  Security System. Select this if you want to use it.
> +	  The Security System handle AES/DES/3DES ciphers in CBC mode
> +	  and SHA1 and MD5 hash algorithms.
> +
> +	  To compile this driver as a module, choose M here: the module
> +	  will be called sunxi-ss.
> +
>  endif # CRYPTO_HW
> diff --git a/drivers/crypto/Makefile b/drivers/crypto/Makefile
> index 3924f93..856545c 100644
> --- a/drivers/crypto/Makefile
> +++ b/drivers/crypto/Makefile
> @@ -25,3 +25,4 @@ obj-$(CONFIG_CRYPTO_DEV_TALITOS) += talitos.o
>  obj-$(CONFIG_CRYPTO_DEV_UX500) += ux500/
>  obj-$(CONFIG_CRYPTO_DEV_QAT) += qat/
>  obj-$(CONFIG_CRYPTO_DEV_QCE) += qce/
> +obj-$(CONFIG_CRYPTO_DEV_SUNXI_SS) += sunxi-ss/
> diff --git a/drivers/crypto/sunxi-ss/Makefile b/drivers/crypto/sunxi-ss/Makefile
> new file mode 100644
> index 0000000..8bb287d
> --- /dev/null
> +++ b/drivers/crypto/sunxi-ss/Makefile
> @@ -0,0 +1,2 @@
> +obj-$(CONFIG_CRYPTO_DEV_SUNXI_SS) += sunxi-ss.o
> +sunxi-ss-y += sunxi-ss-core.o sunxi-ss-hash.o sunxi-ss-cipher.o
> diff --git a/drivers/crypto/sunxi-ss/sunxi-ss-cipher.c b/drivers/crypto/sunxi-ss/sunxi-ss-cipher.c
> new file mode 100644
> index 0000000..8d0416e
> --- /dev/null
> +++ b/drivers/crypto/sunxi-ss/sunxi-ss-cipher.c
> @@ -0,0 +1,489 @@
> +/*
> + * sunxi-ss-cipher.c - hardware cryptographic accelerator for Allwinner A20 SoC
> + *
> + * Copyright (C) 2013-2014 Corentin LABBE <clabbe.montjoie@gmail.com>
> + *
> + * This file add support for AES cipher with 128,192,256 bits
> + * keysize in CBC mode.
> + * Add support also for DES and 3DES in CBC mode.
> + *
> + * You could find the datasheet in Documentation/arm/sunxi/README
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + */
> +#include "sunxi-ss.h"
> +
> +extern struct sunxi_ss_ctx *ss;
> +
> +static int sunxi_ss_cipher(struct ablkcipher_request *areq, u32 mode)
> +{
> +	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(areq);
> +	struct sunxi_tfm_ctx *op = crypto_ablkcipher_ctx(tfm);
> +	const char *cipher_type;
> +
> +	if (areq->nbytes == 0)
> +		return 0;
> +
> +	if (areq->info == NULL) {
> +		dev_err(ss->dev, "ERROR: Empty IV\n");
> +		return -EINVAL;
> +	}
> +
> +	if (areq->src == NULL || areq->dst == NULL) {
> +		dev_err(ss->dev, "ERROR: Some SGs are NULL\n");
> +		return -EINVAL;
> +	}
> +
> +	cipher_type = crypto_tfm_alg_name(crypto_ablkcipher_tfm(tfm));
> +
> +	if (strcmp("cbc(aes)", cipher_type) == 0) {
> +		mode |= SS_OP_AES | SS_CBC | SS_ENABLED | op->keymode;
> +		return sunxi_ss_aes_poll(areq, mode);
> +	}
> +
> +	if (strcmp("cbc(des)", cipher_type) == 0) {
> +		mode |= SS_OP_DES | SS_CBC | SS_ENABLED | op->keymode;
> +		return sunxi_ss_des_poll(areq, mode);
> +	}
> +
> +	if (strcmp("cbc(des3_ede)", cipher_type) == 0) {
> +		mode |= SS_OP_3DES | SS_CBC | SS_ENABLED | op->keymode;
> +		return sunxi_ss_des_poll(areq, mode);
> +	}
> +
> +	dev_err(ss->dev, "ERROR: Cipher %s not handled\n", cipher_type);
> +	return -EINVAL;
> +}
> +
> +int sunxi_ss_cipher_encrypt(struct ablkcipher_request *areq)
> +{
> +	return sunxi_ss_cipher(areq, SS_ENCRYPTION);
> +}
> +
> +int sunxi_ss_cipher_decrypt(struct ablkcipher_request *areq)
> +{
> +	return sunxi_ss_cipher(areq, SS_DECRYPTION);
> +}
> +
> +int sunxi_ss_cipher_init(struct crypto_tfm *tfm)
> +{
> +	struct sunxi_tfm_ctx *op = crypto_tfm_ctx(tfm);
> +
> +	memset(op, 0, sizeof(struct sunxi_tfm_ctx));
> +	return 0;
> +}
> +
> +/*
> + * Optimized function for the case where we have only one SG,
> + * so we can use kmap_atomic
> + */
> +static int sunxi_ss_aes_poll_atomic(struct ablkcipher_request *areq)
> +{
> +	u32 spaces;
> +	struct scatterlist *in_sg = areq->src;
> +	struct scatterlist *out_sg = areq->dst;
> +	void *src_addr;
> +	void *dst_addr;
> +	unsigned int ileft = areq->nbytes;
> +	unsigned int oleft = areq->nbytes;
> +	unsigned int todo;
> +	u32 *src32;
> +	u32 *dst32;
> +	u32 rx_cnt = 32;
> +	u32 tx_cnt = 0;
> +	int i;
> +
> +	src_addr = kmap_atomic(sg_page(in_sg)) + in_sg->offset;

Where is this scatterlist coming from? Can it even be allocated
in highmem?

> +	if (src_addr == NULL) {
> +		dev_err(ss->dev, "kmap_atomic error for src SG\n");
> +		writel(0, ss->base + SS_CTL);
> +		mutex_unlock(&ss->lock);
> +		return -EINVAL;
> +	}
> +
> +	dst_addr = kmap_atomic(sg_page(out_sg)) + out_sg->offset;
> +	if (dst_addr == NULL) {
> +		dev_err(ss->dev, "kmap_atomic error for dst SG\n");
> +		writel(0, ss->base + SS_CTL);
> +		kunmap_atomic(src_addr);
> +		mutex_unlock(&ss->lock);
> +		return -EINVAL;

Please use gotos in your error path.
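
Purely as an illustration of the shape (keeping the unlock in this function for the moment, and assuming an err variable initialised to 0):

	src_addr = kmap_atomic(sg_page(in_sg)) + in_sg->offset;
	if (src_addr == NULL) {
		dev_err(ss->dev, "kmap_atomic error for src SG\n");
		err = -EINVAL;
		goto err_out;
	}

	dst_addr = kmap_atomic(sg_page(out_sg)) + out_sg->offset;
	if (dst_addr == NULL) {
		dev_err(ss->dev, "kmap_atomic error for dst SG\n");
		err = -EINVAL;
		goto err_unmap_src;
	}

	/* ... FIFO loop, then fall through on success ... */

	kunmap_atomic(dst_addr);
err_unmap_src:
	kunmap_atomic(src_addr);
err_out:
	writel(0, ss->base + SS_CTL);
	mutex_unlock(&ss->lock);
	return err;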

> +	}
> +
> +	src32 = (u32 *)src_addr;
> +	dst32 = (u32 *)dst_addr;
> +	ileft = areq->nbytes / 4;
> +	oleft = areq->nbytes / 4;
> +	i = 0;
> +	do {
> +		if (ileft > 0 && rx_cnt > 0) {
> +			todo = min(rx_cnt, ileft);
> +			ileft -= todo;
> +			do {
> +				writel_relaxed(*src32++,

Please put some braces around that referencing/increment.

> +						ss->base +
> +						SS_RXFIFO);
> +				todo--;
> +			} while (todo > 0);
> +		}
> +		if (tx_cnt > 0) {
> +			todo = min(tx_cnt, oleft);
> +			oleft -= todo;
> +			do {
> +				*dst32++ = readl_relaxed(ss->base +
> +						SS_TXFIFO);
> +				todo--;
> +			} while (todo > 0);
> +		}
> +		spaces = readl_relaxed(ss->base + SS_FCSR);
> +		rx_cnt = SS_RXFIFO_SPACES(spaces);
> +		tx_cnt = SS_TXFIFO_SPACES(spaces);
> +	} while (oleft > 0);
> +	writel(0, ss->base + SS_CTL);
> +	kunmap_atomic(src_addr);
> +	kunmap_atomic(dst_addr);
> +	mutex_unlock(&ss->lock);

You never took that mutex in that function...

> +	return 0;
> +}
> +
> +int sunxi_ss_aes_poll(struct ablkcipher_request *areq, u32 mode)
> +{
> +	u32 spaces;
> +	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(areq);
> +	struct sunxi_tfm_ctx *op = crypto_ablkcipher_ctx(tfm);
> +	unsigned int ivsize = crypto_ablkcipher_ivsize(tfm);
> +	/* when activating SS, the default FIFO space is 32 */
> +	u32 rx_cnt = 32;
> +	u32 tx_cnt = 0;
> +	u32 v;
> +	int i;
> +	struct scatterlist *in_sg = areq->src;
> +	struct scatterlist *out_sg = areq->dst;
> +	void *src_addr;
> +	void *dst_addr;
> +	unsigned int ileft = areq->nbytes;
> +	unsigned int oleft = areq->nbytes;
> +	unsigned int sgileft = areq->src->length;
> +	unsigned int sgoleft = areq->dst->length;
> +	unsigned int todo;
> +	u32 *src32;
> +	u32 *dst32;
> +
> +	mutex_lock(&ss->lock);
> +
> +	for (i = 0; i < op->keylen; i += 4)
> +		writel(*(op->key + i/4), ss->base + SS_KEY0 + i);
> +
> +	if (areq->info != NULL) {
> +		for (i = 0; i < 4 && i < ivsize / 4; i++) {
> +			v = *(u32 *)(areq->info + i * 4);
> +			writel(v, ss->base + SS_IV0 + i * 4);
> +		}
> +	}
> +	writel(mode, ss->base + SS_CTL);
> +
> +	/* If we have only one SG, we can use kmap_atomic */
> +	if (sg_next(in_sg) == NULL && sg_next(out_sg) == NULL)
> +		return sunxi_ss_aes_poll_atomic(areq);
> +
> +	/*
> +	 * If we have more than one SG, we cannot use kmap_atomic since
> +	 * we hold the mapping too long
> +	 */
> +	src_addr = kmap(sg_page(in_sg)) + in_sg->offset;
> +	if (src_addr == NULL) {
> +		dev_err(ss->dev, "KMAP error for src SG\n");
> +		mutex_unlock(&ss->lock);
> +		return -EINVAL;
> +	}
> +	dst_addr = kmap(sg_page(out_sg)) + out_sg->offset;
> +	if (dst_addr == NULL) {
> +		kunmap(sg_page(in_sg));
> +		dev_err(ss->dev, "KMAP error for dst SG\n");
> +		mutex_unlock(&ss->lock);
> +		return -EINVAL;
> +	}
> +	src32 = (u32 *)src_addr;
> +	dst32 = (u32 *)dst_addr;
> +	ileft = areq->nbytes / 4;
> +	oleft = areq->nbytes / 4;
> +	sgileft = in_sg->length / 4;
> +	sgoleft = out_sg->length / 4;
> +	do {
> +		spaces = readl_relaxed(ss->base + SS_FCSR);
> +		rx_cnt = SS_RXFIFO_SPACES(spaces);
> +		tx_cnt = SS_TXFIFO_SPACES(spaces);
> +		todo = min3(rx_cnt, ileft, sgileft);
> +		if (todo > 0) {
> +			ileft -= todo;
> +			sgileft -= todo;
> +		}
> +		while (todo > 0) {
> +			writel_relaxed(*src32++, ss->base + SS_RXFIFO);
> +			todo--;
> +		}
> +		if (in_sg != NULL && sgileft == 0 && ileft > 0) {
> +			kunmap(sg_page(in_sg));
> +			in_sg = sg_next(in_sg);
> +			while (in_sg != NULL && in_sg->length == 0)
> +				in_sg = sg_next(in_sg);
> +			if (in_sg != NULL && ileft > 0) {
> +				src_addr = kmap(sg_page(in_sg)) + in_sg->offset;
> +				if (src_addr == NULL) {
> +					dev_err(ss->dev, "ERROR: KMAP for src SG\n");
> +					mutex_unlock(&ss->lock);
> +					return -EINVAL;
> +				}
> +				src32 = src_addr;
> +				sgileft = in_sg->length / 4;
> +			}
> +		}
> +		/* do not test oleft since when oleft == 0 we have finished */
> +		todo = min3(tx_cnt, oleft, sgoleft);
> +		if (todo > 0) {
> +			oleft -= todo;
> +			sgoleft -= todo;
> +		}
> +		while (todo > 0) {
> +			*dst32++ = readl_relaxed(ss->base + SS_TXFIFO);
> +			todo--;
> +		}
> +		if (out_sg != NULL && sgoleft == 0 && oleft >= 0) {
> +			kunmap(sg_page(out_sg));
> +			out_sg = sg_next(out_sg);
> +			while (out_sg != NULL && out_sg->length == 0)
> +				out_sg = sg_next(out_sg);
> +			if (out_sg != NULL && oleft > 0) {
> +				dst_addr = kmap(sg_page(out_sg)) +
> +					out_sg->offset;
> +				if (dst_addr == NULL) {
> +					dev_err(ss->dev, "KMAP error\n");
> +					mutex_unlock(&ss->lock);
> +					return -EINVAL;
> +				}
> +				dst32 = dst_addr;
> +				sgoleft = out_sg->length / 4;
> +			}
> +		}
> +	} while (oleft > 0);
> +
> +	writel_relaxed(0, ss->base + SS_CTL);
> +	mutex_unlock(&ss->lock);
> +	return 0;
> +}
> +
> +/*
> + * Pure CPU way of doing DES/3DES with SS
> + * Since DES and 3DES SGs could be smaller than 4 bytes, I use sg_copy_to_buffer
> + * for "linearize" them.
> + * The problem with that is that I alloc (2 x areq->nbytes) for buf_in/buf_out
> + * TODO: change this system, I need to support other mode than CBC where len
> + * is not a multiple of 4 and the hack of linearize use too much memory
> + * SGsrc -> buf_in -> SS -> buf_out -> SGdst
> + */
> +int sunxi_ss_des_poll(struct ablkcipher_request *areq, u32 mode)
> +{
> +	u32 value, spaces;
> +	size_t nb_in_sg_tx, nb_in_sg_rx;
> +	size_t ir, it;
> +	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(areq);
> +	struct sunxi_tfm_ctx *op = crypto_ablkcipher_ctx(tfm);
> +	unsigned int ivsize = crypto_ablkcipher_ivsize(tfm);
> +	u32 tx_cnt = 0;
> +	u32 rx_cnt = 0;
> +	u32 v;
> +	int i;
> +	int no_chunk = 1;
> +	struct scatterlist *in_sg = areq->src;
> +	struct scatterlist *out_sg = areq->dst;
> +
> +	/*
> +	 * if we have only SGs with size multiple of 4,
> +	 * we can use the SS AES function
> +	 */
> +	while (in_sg != NULL && no_chunk == 1) {
> +		if ((in_sg->length % 4) != 0)
> +			no_chunk = 0;
> +		in_sg = sg_next(in_sg);
> +	}
> +	while (out_sg != NULL && no_chunk == 1) {
> +		if ((out_sg->length % 4) != 0)
> +			no_chunk = 0;
> +		out_sg = sg_next(out_sg);
> +	}
> +
> +	if (no_chunk == 1)
> +		return sunxi_ss_aes_poll(areq, mode);
> +
> +	in_sg = areq->src;
> +	out_sg = areq->dst;
> +
> +	nb_in_sg_rx = sg_nents(in_sg);
> +	nb_in_sg_tx = sg_nents(out_sg);
> +
> +	/*
> +	 * buf_in and buf_out are allocated only one time
> +	 * then we keep the buffer until driver end
> +	 * the allocation can only grow more
> +	 * we do not reduce it for simplification
> +	 */
> +	mutex_lock(&ss->bufin_lock);
> +	if (ss->buf_in == NULL) {
> +		ss->buf_in = kmalloc(areq->nbytes, GFP_KERNEL);
> +		ss->buf_in_size = areq->nbytes;
> +	} else {
> +		if (areq->nbytes > ss->buf_in_size) {
> +			kfree(ss->buf_in);
> +			ss->buf_in = kmalloc(areq->nbytes, GFP_KERNEL);
> +			ss->buf_in_size = areq->nbytes;
> +		}
> +	}
> +	if (ss->buf_in == NULL) {
> +		ss->buf_in_size = 0;
> +		mutex_unlock(&ss->bufin_lock);
> +		dev_err(ss->dev, "Unable to allocate pages.\n");
> +		return -ENOMEM;
> +	}
> +	mutex_lock(&ss->bufout_lock);

What are these two mutexes used for? It looks like you're only using
them in this function.

What would prevent you from just using the "main" lock like you did
for the AES?
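
A possible simplification, sketched under the assumption that the bounce buffers are only ever (re)allocated while the main device lock is held:

	mutex_lock(&ss->lock);
	if (areq->nbytes > ss->buf_in_size) {
		kfree(ss->buf_in);	/* kfree(NULL) is fine on first use */
		ss->buf_in = kmalloc(areq->nbytes, GFP_KERNEL);
		if (ss->buf_in == NULL) {
			ss->buf_in_size = 0;
			mutex_unlock(&ss->lock);
			return -ENOMEM;
		}
		ss->buf_in_size = areq->nbytes;
	}
	/* same pattern for buf_out, then keep ss->lock for the transfer */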

> +	if (ss->buf_out == NULL) {
> +		ss->buf_out = kmalloc(areq->nbytes, GFP_KERNEL);
> +		if (ss->buf_out == NULL) {
> +			ss->buf_out_size = 0;
> +			mutex_unlock(&ss->bufin_lock);
> +			mutex_unlock(&ss->bufout_lock);
> +			dev_err(ss->dev, "Unable to allocate pages.\n");
> +			return -ENOMEM;
> +		}
> +		ss->buf_out_size = areq->nbytes;
> +	} else {
> +		if (areq->nbytes > ss->buf_out_size) {
> +			kfree(ss->buf_out);
> +			ss->buf_out = kmalloc(areq->nbytes, GFP_KERNEL);
> +			if (ss->buf_out == NULL) {
> +				ss->buf_out_size = 0;
> +				mutex_unlock(&ss->bufin_lock);
> +				mutex_unlock(&ss->bufout_lock);
> +				dev_err(ss->dev, "Unable to allocate pages.\n");
> +				return -ENOMEM;
> +			}
> +			ss->buf_out_size = areq->nbytes;
> +		}
> +	}
> +
> +	sg_copy_to_buffer(areq->src, nb_in_sg_rx, ss->buf_in, areq->nbytes);
> +
> +	ir = 0;
> +	it = 0;
> +	mutex_lock(&ss->lock);
> +
> +	for (i = 0; i < op->keylen; i += 4)
> +		writel(*(op->key + i/4), ss->base + SS_KEY0 + i);
> +	if (areq->info != NULL) {
> +		for (i = 0; i < 4 && i < ivsize / 4; i++) {
> +			v = *(u32 *)(areq->info + i * 4);
> +			writel(v, ss->base + SS_IV0 + i * 4);
> +		}
> +	}
> +	writel(mode, ss->base + SS_CTL);
> +
> +	do {
> +		if (rx_cnt == 0 || tx_cnt == 0) {
> +			spaces = readl(ss->base + SS_FCSR);
> +			rx_cnt = SS_RXFIFO_SPACES(spaces);
> +			tx_cnt = SS_TXFIFO_SPACES(spaces);
> +		}
> +		if (rx_cnt > 0 && ir < areq->nbytes) {
> +			do {
> +				value = *(u32 *)(ss->buf_in + ir);
> +				writel(value, ss->base + SS_RXFIFO);
> +				ir += 4;
> +				rx_cnt--;
> +			} while (rx_cnt > 0 && ir < areq->nbytes);
> +		}
> +		if (tx_cnt > 0 && it < areq->nbytes) {
> +			do {
> +				value = readl(ss->base + SS_TXFIFO);
> +				*(u32 *)(ss->buf_out + it) = value;
> +				it += 4;
> +				tx_cnt--;
> +			} while (tx_cnt > 0 && it < areq->nbytes);
> +		}
> +		if (ir == areq->nbytes) {
> +			mutex_unlock(&ss->bufin_lock);

If tx_cnt <= 0 and it < areq->nbytes, your loop will stop, and you'll
not release the mutex.

> +			ir++;
> +		}
> +	} while (it < areq->nbytes);
> +
> +	writel(0, ss->base + SS_CTL);
> +	mutex_unlock(&ss->lock);
> +
> +	/*
> +	 * a simple optimization, since we dont need the hardware for this copy
> +	 * we release the lock and do the copy. With that we gain 5/10% perf
> +	 */
> +	sg_copy_from_buffer(areq->dst, nb_in_sg_tx, ss->buf_out, areq->nbytes);
> +
> +	mutex_unlock(&ss->bufout_lock);
> +	return 0;
> +}
> +
> +/* check and set the AES key, prepare the mode to be used */
> +int sunxi_ss_aes_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
> +		unsigned int keylen)
> +{
> +	struct sunxi_tfm_ctx *op = crypto_ablkcipher_ctx(tfm);
> +
> +	switch (keylen) {
> +	case 128 / 8:
> +		op->keymode = SS_AES_128BITS;
> +		break;
> +	case 192 / 8:
> +		op->keymode = SS_AES_192BITS;
> +		break;
> +	case 256 / 8:
> +		op->keymode = SS_AES_256BITS;
> +		break;
> +	default:
> +		dev_err(ss->dev, "ERROR: Invalid keylen %u\n", keylen);
> +		crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
> +		return -EINVAL;
> +	}
> +	op->keylen = keylen;
> +	memcpy(op->key, key, keylen);
> +	return 0;
> +}
> +
> +/* check and set the DES key, prepare the mode to be used */
> +int sunxi_ss_des_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
> +		unsigned int keylen)
> +{
> +	struct sunxi_tfm_ctx *op = crypto_ablkcipher_ctx(tfm);
> +
> +	if (keylen != DES_KEY_SIZE) {
> +		dev_err(ss->dev, "Invalid keylen %u\n", keylen);
> +		crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
> +		return -EINVAL;
> +	}
> +	op->keylen = keylen;
> +	memcpy(op->key, key, keylen);
> +	return 0;
> +}
> +
> +/* check and set the 3DES key, prepare the mode to be used */
> +int sunxi_ss_des3_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
> +		unsigned int keylen)
> +{
> +	struct sunxi_tfm_ctx *op = crypto_ablkcipher_ctx(tfm);
> +
> +	if (keylen != 3 * DES_KEY_SIZE) {
> +		dev_err(ss->dev, "Invalid keylen %u\n", keylen);
> +		crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
> +		return -EINVAL;
> +	}
> +	op->keylen = keylen;
> +	memcpy(op->key, key, keylen);
> +	return 0;
> +}
> diff --git a/drivers/crypto/sunxi-ss/sunxi-ss-core.c b/drivers/crypto/sunxi-ss/sunxi-ss-core.c
> new file mode 100644
> index 0000000..e66d7e2
> --- /dev/null
> +++ b/drivers/crypto/sunxi-ss/sunxi-ss-core.c
> @@ -0,0 +1,318 @@
> +/*
> + * sunxi-ss-core.c - hardware cryptographic accelerator for Allwinner A20 SoC
> + *
> + * Copyright (C) 2013-2014 Corentin LABBE <clabbe.montjoie@gmail.com>
> + *
> + * Core file which registers crypto algorithms supported by the SS.
> + *
> + * You could find a link for the datasheet in Documentation/arm/sunxi/README
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + */
> +#include <linux/clk.h>
> +#include <linux/crypto.h>
> +#include <linux/io.h>
> +#include <linux/module.h>
> +#include <linux/of.h>
> +#include <linux/platform_device.h>
> +#include <crypto/scatterwalk.h>
> +#include <linux/scatterlist.h>
> +#include <linux/interrupt.h>
> +#include <linux/delay.h>
> +
> +#include "sunxi-ss.h"
> +
> +struct sunxi_ss_ctx *ss;
> +
> +/*
> + * General notes for whole driver:
> + *
> + * After each request the device must be disabled with a write of 0 in SS_CTL
> + *
> + * For performance reason, we use writel_relaxed/read_relaxed for all
> + * operations on RX and TX FIFO and also SS_FCSR.
> + * Excepts for the last write on TX FIFO.
> + * For all other registers, we use writel/readl.
> + * See http://permalink.gmane.org/gmane.linux.ports.arm.kernel/117644
> + * and http://permalink.gmane.org/gmane.linux.ports.arm.kernel/117640
> + */

I don't really know why that comment is here, when there's not a
single writel in this file.

> +
> +static struct ahash_alg sunxi_md5_alg = {
> +	.init = sunxi_hash_init,
> +	.update = sunxi_hash_update,
> +	.final = sunxi_hash_final,
> +	.finup = sunxi_hash_finup,
> +	.digest = sunxi_hash_digest,
> +	.halg = {
> +		.digestsize = MD5_DIGEST_SIZE,
> +		.base = {
> +			.cra_name = "md5",
> +			.cra_driver_name = "md5-sunxi-ss",
> +			.cra_priority = 300,
> +			.cra_alignmask = 3,
> +			.cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
> +			.cra_blocksize = MD5_HMAC_BLOCK_SIZE,
> +			.cra_ctxsize = sizeof(struct sunxi_req_ctx),
> +			.cra_module = THIS_MODULE,
> +			.cra_type = &crypto_ahash_type,
> +			.cra_init = sunxi_hash_crainit
> +		}
> +	}
> +};
> +
> +static struct ahash_alg sunxi_sha1_alg = {
> +	.init = sunxi_hash_init,
> +	.update = sunxi_hash_update,
> +	.final = sunxi_hash_final,
> +	.finup = sunxi_hash_finup,
> +	.digest = sunxi_hash_digest,
> +	.halg = {
> +		.digestsize = SHA1_DIGEST_SIZE,
> +		.base = {
> +			.cra_name = "sha1",
> +			.cra_driver_name = "sha1-sunxi-ss",
> +			.cra_priority = 300,
> +			.cra_alignmask = 3,
> +			.cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
> +			.cra_blocksize = SHA1_BLOCK_SIZE,
> +			.cra_ctxsize = sizeof(struct sunxi_req_ctx),
> +			.cra_module = THIS_MODULE,
> +			.cra_type = &crypto_ahash_type,
> +			.cra_init = sunxi_hash_crainit
> +		}
> +	}
> +};
> +
> +static struct crypto_alg sunxi_cipher_algs[] = {
> +{
> +	.cra_name = "cbc(aes)",
> +	.cra_driver_name = "cbc-aes-sunxi-ss",
> +	.cra_priority = 300,
> +	.cra_blocksize = AES_BLOCK_SIZE,
> +	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER,
> +	.cra_ctxsize = sizeof(struct sunxi_tfm_ctx),
> +	.cra_module = THIS_MODULE,
> +	.cra_alignmask = 3,
> +	.cra_type = &crypto_ablkcipher_type,
> +	.cra_init = sunxi_ss_cipher_init,
> +	.cra_u = {
> +		.ablkcipher = {
> +			.min_keysize    = AES_MIN_KEY_SIZE,
> +			.max_keysize    = AES_MAX_KEY_SIZE,
> +			.ivsize         = AES_BLOCK_SIZE,
> +			.setkey         = sunxi_ss_aes_setkey,
> +			.encrypt        = sunxi_ss_cipher_encrypt,
> +			.decrypt        = sunxi_ss_cipher_decrypt,
> +		}
> +	}
> +}, {
> +	.cra_name = "cbc(des)",
> +	.cra_driver_name = "cbc-des-sunxi-ss",
> +	.cra_priority = 300,
> +	.cra_blocksize = DES_BLOCK_SIZE,
> +	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER,
> +	.cra_ctxsize = sizeof(struct sunxi_req_ctx),
> +	.cra_module = THIS_MODULE,
> +	.cra_alignmask = 3,
> +	.cra_type = &crypto_ablkcipher_type,
> +	.cra_init = sunxi_ss_cipher_init,
> +	.cra_u.ablkcipher = {
> +		.min_keysize    = DES_KEY_SIZE,
> +		.max_keysize    = DES_KEY_SIZE,
> +		.ivsize         = DES_BLOCK_SIZE,
> +		.setkey         = sunxi_ss_des_setkey,
> +		.encrypt        = sunxi_ss_cipher_encrypt,
> +		.decrypt        = sunxi_ss_cipher_decrypt,
> +	}
> +}, {
> +	.cra_name = "cbc(des3_ede)",
> +	.cra_driver_name = "cbc-des3-sunxi-ss",
> +	.cra_priority = 300,
> +	.cra_blocksize = DES3_EDE_BLOCK_SIZE,
> +	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER,
> +	.cra_ctxsize = sizeof(struct sunxi_req_ctx),
> +	.cra_module = THIS_MODULE,
> +	.cra_alignmask = 3,
> +	.cra_type = &crypto_ablkcipher_type,
> +	.cra_init = sunxi_ss_cipher_init,
> +	.cra_u.ablkcipher = {
> +		.min_keysize    = DES3_EDE_KEY_SIZE,
> +		.max_keysize    = DES3_EDE_KEY_SIZE,
> +		.ivsize         = DES3_EDE_BLOCK_SIZE,
> +		.setkey         = sunxi_ss_des3_setkey,
> +		.encrypt        = sunxi_ss_cipher_encrypt,
> +		.decrypt        = sunxi_ss_cipher_decrypt,
> +	}
> +}
> +};
> +
> +static int sunxi_ss_probe(struct platform_device *pdev)
> +{
> +	struct resource *res;
> +	u32 v;
> +	int err;
> +	unsigned long cr;
> +	const unsigned long cr_ahb = 24 * 1000 * 1000;
> +	const unsigned long cr_mod = 150 * 1000 * 1000;
> +
> +	if (!pdev->dev.of_node)
> +		return -ENODEV;
> +
> +	ss = devm_kzalloc(&pdev->dev, sizeof(*ss), GFP_KERNEL);
> +	if (ss == NULL)
> +		return -ENOMEM;
> +
> +	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> +	ss->base = devm_ioremap_resource(&pdev->dev, res);
> +	if (IS_ERR(ss->base)) {
> +		dev_err(&pdev->dev, "Cannot request MMIO\n");
> +		return PTR_ERR(ss->base);
> +	}
> +
> +	ss->ssclk = devm_clk_get(&pdev->dev, "mod");
> +	if (IS_ERR(ss->ssclk)) {
> +		err = PTR_ERR(ss->ssclk);
> +		dev_err(&pdev->dev, "Cannot get SS clock err=%d\n", err);
> +		return err;
> +	}
> +	dev_dbg(&pdev->dev, "clock ss acquired\n");
> +
> +	ss->busclk = devm_clk_get(&pdev->dev, "ahb");
> +	if (IS_ERR(ss->busclk)) {
> +		err = PTR_ERR(ss->busclk);
> +		dev_err(&pdev->dev, "Cannot get AHB SS clock err=%d\n", err);
> +		return err;
> +	}
> +	dev_dbg(&pdev->dev, "clock ahb_ss acquired\n");
> +
> +	/* Enable both clocks */
> +	err = clk_prepare_enable(ss->busclk);
> +	if (err != 0) {
> +		dev_err(&pdev->dev, "Cannot prepare_enable busclk\n");
> +		return err;
> +	}
> +	err = clk_prepare_enable(ss->ssclk);
> +	if (err != 0) {
> +		dev_err(&pdev->dev, "Cannot prepare_enable ssclk\n");
> +		clk_disable_unprepare(ss->busclk);
> +		return err;
> +	}
> +
> +	/*
> +	 * Check that clock have the correct rates gived in the datasheet
> +	 * Try to set the clock to the maximum allowed
> +	 */
> +	err = clk_set_rate(ss->ssclk, cr_mod);
> +	if (err != 0) {
> +		dev_err(&pdev->dev, "Cannot set clock rate to ssclk\n");
> +		clk_disable_unprepare(ss->ssclk);
> +		clk_disable_unprepare(ss->busclk);
> +		return err;
> +	}
> +
> +	cr = clk_get_rate(ss->busclk);
> +	if (cr >= cr_ahb)
> +		dev_dbg(&pdev->dev, "Clock bus %lu (%lu MHz) (must be >= %lu)\n",
> +				cr, cr / 1000000, cr_ahb);
> +	else
> +		dev_warn(&pdev->dev, "Clock bus %lu (%lu MHz) (must be >= %lu)\n",
> +				cr, cr / 1000000, cr_ahb);
> +
> +	cr = clk_get_rate(ss->ssclk);
> +	if (cr <= cr_mod)
> +		if (cr < cr_mod)
> +			dev_info(&pdev->dev, "Clock ss %lu (%lu MHz) (must be <= %lu)\n",
> +					cr, cr / 1000000, cr_mod);
> +		else
> +			dev_dbg(&pdev->dev, "Clock ss %lu (%lu MHz) (must be <= %lu)\n",
> +					cr, cr / 1000000, cr_mod);
> +	else
> +		dev_warn(&pdev->dev, "Clock ss is at %lu (%lu MHz) (must be <= %lu)\n",
> +				cr, cr / 1000000, cr_mod);

If the set_rate fails, it will return an error. All this is useless.

> +
> +	/*
> +	 * Datasheet named it "Die Bonding ID"
> +	 * I expect to be a sort of Security System Revision number.
> +	 * Since the A80 seems to have an other version of SS
> +	 * this info could be useful
> +	 */
> +	writel(SS_ENABLED, ss->base + SS_CTL);
> +	v = readl(ss->base + SS_CTL);
> +	v >>= 16;
> +	v &= 0x07;
> +	dev_info(&pdev->dev, "Die ID %d\n", v);
> +	writel(0, ss->base + SS_CTL);

If the A80 has a different IP, it will most likely have a different
compatible anyway. You can remove that code.

> +
> +	ss->dev = &pdev->dev;
> +
> +	mutex_init(&ss->lock);
> +	mutex_init(&ss->bufin_lock);
> +	mutex_init(&ss->bufout_lock);
> +
> +	err = crypto_register_ahash(&sunxi_md5_alg);
> +	if (err)
> +		goto error_md5;
> +	err = crypto_register_ahash(&sunxi_sha1_alg);
> +	if (err)
> +		goto error_sha1;
> +	err = crypto_register_algs(sunxi_cipher_algs,
> +			ARRAY_SIZE(sunxi_cipher_algs));
> +	if (err)
> +		goto error_ciphers;
> +
> +	return 0;
> +error_ciphers:
> +	crypto_unregister_ahash(&sunxi_sha1_alg);
> +error_sha1:
> +	crypto_unregister_ahash(&sunxi_md5_alg);
> +error_md5:
> +	clk_disable_unprepare(ss->ssclk);
> +	clk_disable_unprepare(ss->busclk);
> +	return err;
> +}
> +
> +static int __exit sunxi_ss_remove(struct platform_device *pdev)

The remove callback should not be in the __exit section.

Here, that function will get removed: if the driver is compiled as
built-in, the remove function will not even be in the kernel image,
which will result in an instant crash when the kernel tries to call
this function (since it's not even there anymore).
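
For illustration, the declarations without the __exit/__exit_p annotations might read like this (the .owner line is also dropped, per the remark further down about module_platform_driver):

	static int sunxi_ss_remove(struct platform_device *pdev)
	{
		/* body unchanged from the patch */
		return 0;
	}

	static struct platform_driver sunxi_ss_driver = {
		.probe	= sunxi_ss_probe,
		.remove	= sunxi_ss_remove,
		.driver	= {
			.name		= "sunxi-ss",
			.of_match_table	= a20ss_crypto_of_match_table,
		},
	};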

> +{
> +	if (!pdev->dev.of_node)
> +		return 0;
> +
> +	crypto_unregister_ahash(&sunxi_md5_alg);
> +	crypto_unregister_ahash(&sunxi_sha1_alg);
> +	crypto_unregister_algs(sunxi_cipher_algs,
> +			ARRAY_SIZE(sunxi_cipher_algs));
> +
> +	if (ss->buf_in != NULL)
> +		kfree(ss->buf_in);
> +	if (ss->buf_out != NULL)
> +		kfree(ss->buf_out);
> +
> +	writel(0, ss->base + SS_CTL);
> +	clk_disable_unprepare(ss->busclk);
> +	clk_disable_unprepare(ss->ssclk);
> +	return 0;
> +}
> +
> +static const struct of_device_id a20ss_crypto_of_match_table[] = {
> +	{ .compatible = "allwinner,sun7i-a20-crypto" },
> +	{}
> +};
> +MODULE_DEVICE_TABLE(of, a20ss_crypto_of_match_table);
> +
> +static struct platform_driver sunxi_ss_driver = {
> +	.probe          = sunxi_ss_probe,
> +	.remove         = __exit_p(sunxi_ss_remove),

And this is why you're not seeing any warning.

> +	.driver         = {
> +		.owner          = THIS_MODULE,

You can drop this, it's already set by module_platform_driver.

> +		.name           = "sunxi-ss",
> +		.of_match_table	= a20ss_crypto_of_match_table,
> +	},
> +};
> +
> +module_platform_driver(sunxi_ss_driver);
> +
> +MODULE_DESCRIPTION("Allwinner Security System cryptographic accelerator");
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Corentin LABBE <clabbe.montjoie@gmail.com>");
> diff --git a/drivers/crypto/sunxi-ss/sunxi-ss-hash.c b/drivers/crypto/sunxi-ss/sunxi-ss-hash.c
> new file mode 100644
> index 0000000..ec8758f
> --- /dev/null
> +++ b/drivers/crypto/sunxi-ss/sunxi-ss-hash.c
> @@ -0,0 +1,445 @@
> +/*
> + * sunxi-ss-hash.c - hardware cryptographic accelerator for Allwinner A20 SoC
> + *
> + * Copyright (C) 2013-2014 Corentin LABBE <clabbe.montjoie@gmail.com>
> + *
> + * This file add support for MD5 and SHA1.
> + *
> + * You could find the datasheet in Documentation/arm/sunxi/README
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + */
> +#include "sunxi-ss.h"
> +
> +/* This is a totaly arbitrary value */
> +#define SS_TIMEOUT 100
> +
> +extern struct sunxi_ss_ctx *ss;
> +
> +int sunxi_hash_crainit(struct crypto_tfm *tfm)
> +{
> +	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
> +			sizeof(struct sunxi_req_ctx));
> +	return 0;
> +}
> +
> +/* sunxi_hash_init: initialize request context */
> +int sunxi_hash_init(struct ahash_request *areq)
> +{
> +	const char *hash_type;
> +	struct sunxi_req_ctx *op = ahash_request_ctx(areq);
> +
> +	memset(op, 0, sizeof(struct sunxi_req_ctx));
> +
> +	hash_type = crypto_tfm_alg_name(areq->base.tfm);
> +
> +	if (strcmp(hash_type, "sha1") == 0)
> +		op->mode = SS_OP_SHA1;
> +	if (strcmp(hash_type, "md5") == 0)
> +		op->mode = SS_OP_MD5;
> +	if (op->mode == 0)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +static u32 rx_cnt;
> +
> +inline void ss_writer(const u32 v)
> +{
> +	u32 spaces;
> +
> +	writel(v, ss->base + SS_RXFIFO);
> +	rx_cnt--;
> +	while (rx_cnt == 0) {
> +		spaces = readl_relaxed(ss->base + SS_FCSR);
> +		rx_cnt = SS_RXFIFO_SPACES(spaces);
> +	}
> +}

Errrr. What?!

Who sets that rx_cnt variable? And it persists between calls? It looks
broken.
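
One way around the file-scope state, sketched only (the changed signature is an assumption, not the patch: the caller keeps a local "u32 rx_cnt = 0" and passes it in, and the space check moves before the write):

	static void ss_writer(u32 *rx_cnt, u32 v)
	{
		/* wait until the RX FIFO has room, then consume one slot */
		while (*rx_cnt == 0)
			*rx_cnt = SS_RXFIFO_SPACES(readl_relaxed(ss->base + SS_FCSR));

		writel(v, ss->base + SS_RXFIFO);
		(*rx_cnt)--;
	}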

> +inline void ss_writer_relaxed(const u32 v)
> +{
> +	u32 spaces;
> +
> +	writel_relaxed(v, ss->base + SS_RXFIFO);
> +	rx_cnt--;
> +	while (rx_cnt == 0) {
> +		spaces = readl_relaxed(ss->base + SS_FCSR);
> +		rx_cnt = SS_RXFIFO_SPACES(spaces);
> +	}
> +}
> +
> +/*
> + * sunxi_hash_update: update hash engine
> + *
> + * Could be used for both SHA1 and MD5
> + * Write data by step of 32bits and put then in the SS.
> + *
> + * Since we cannot leave partial data and hash state in the engine,
> + * we need to get the hash state at the end of this function.
> + * After some work, I have found that we can get the hash state every 64o
> + *
> + * So the first work is to get the number of bytes to write to SS modulo 64
> + * The extra bytes will go to two different destination:
> + * op->wait for full 32bits word
> + * op->wb (waiting bytes) for partial 32 bits word
> + * So we can have up to (64/4)-1 op->wait words and 0/1/2/3 bytes in wb
> + *
> + * So at the begin of update()
> + * if op->nwait * 4 + areq->nbytes < 64
> + * => all data writed to wait buffers and end=0
> + * if not write all nwait to the device and position end to complete to 64o
> + *
> + * example 1:
> + * update1 60o => nwait=15
> + * update2 60o => need one more word to have 64o
> + * end=4
> + * so write all data in op->wait and one word of SGs
> + * write remaining data in op->wait
> + * final state op->nwait=14
> + */
> +int sunxi_hash_update(struct ahash_request *areq)
> +{
> +	u32 v, ivmode = 0;
> +	unsigned int i = 0;
> +	/*
> +	 * i is the total bytes read from SGs, to be compared to areq->nbytes
> +	 * i is important because we cannot rely on SG length since the sum of
> +	 * SG->length could be greater than areq->nbytes
> +	 */
> +
> +	struct sunxi_req_ctx *op = ahash_request_ctx(areq);
> +	struct scatterlist *in_sg;
> +	unsigned int in_i = 0; /* advancement in the current SG */
> +	u64 end;
> +	/*
> +	 * end is the position when we need to stop writing to the device,
> +	 * to be compared to i
> +	 */
> +	int in_r;
> +	void *src_addr;
> +
> +	dev_dbg(ss->dev, "%s %s bc=%llu len=%u mode=%x bw=%u ww=%u",
> +			__func__, crypto_tfm_alg_name(areq->base.tfm),
> +			op->byte_count, areq->nbytes, op->mode,
> +			op->nbw, op->nwait);
> +
> +	if (areq->nbytes == 0)
> +		return 0;
> +
> +	end = ((areq->nbytes + op->nwait * 4 + op->nbw) / 64) * 64
> +		- op->nbw - op->nwait * 4;
> +
> +	if (end > areq->nbytes || areq->nbytes - end > 63) {
> +		dev_err(ss->dev, "ERROR: Bound error %llu %u\n",
> +				end, areq->nbytes);
> +		return -EINVAL;
> +	}
> +
> +	if (op->nwait > 0 && end > 0) {
> +		/* a precedent update was done */
> +		for (i = 0; i < op->nwait; i++) {
> +			ss_writer(op->wait[i]);
> +			op->byte_count += 4;
> +		}
> +		op->nwait = 0;
> +	}
> +
> +	mutex_lock(&ss->lock);
> +	/*
> +	 * if some data have been processed before,
> +	 * we need to restore the partial hash state
> +	 */
> +	if (op->byte_count > 0) {
> +		ivmode = SS_IV_ARBITRARY;
> +		for (i = 0; i < 5; i++)
> +			writel(op->hash[i], ss->base + SS_IV0 + i * 4);
> +	}
> +	/* Enable the device */
> +	writel(op->mode | SS_ENABLED | ivmode, ss->base + SS_CTL);
> +
> +	rx_cnt = 0;
> +	i = 0;
> +
> +	in_sg = areq->src;
> +	src_addr = kmap(sg_page(in_sg)) + in_sg->offset;
> +	if (src_addr == NULL) {
> +		mutex_unlock(&ss->lock);
> +		dev_err(ss->dev, "ERROR: Cannot kmap source buffer\n");
> +		return -EFAULT;
> +	}
> +	do {
> +		/*
> +		 * step 1, if some bytes remains from last SG,
> +		 * try to complete them to 4 and send that word
> +		 */
> +		if (op->nbw > 0) {
> +			while (op->nbw < 4 && i < areq->nbytes &&
> +					in_i < in_sg->length) {
> +				op->wb |= (*(u8 *)(src_addr + in_i))
> +					<< (8 * op->nbw);
> +				dev_dbg(ss->dev, "%s Complete w=%d wb=%x\n",
> +						__func__, op->nbw, op->wb);
> +				i++;
> +				in_i++;
> +				op->nbw++;
> +			}
> +			if (op->nbw == 4) {
> +				if (i <= end) {
> +					ss_writer(op->wb);
> +					op->byte_count += 4;
> +				} else {
> +					op->wait[op->nwait] = op->wb;
> +					op->nwait++;
> +					dev_dbg(ss->dev, "%s Keep %u bytes after %llu\n",
> +						__func__, op->nwait, end);
> +				}
> +				op->nbw = 0;
> +				op->wb = 0;
> +			}
> +		}
> +		/* step 2, main loop, read data 4bytes at a time */
> +		while (i < areq->nbytes && in_i < in_sg->length) {
> +			/* how many bytes we can read, (we need 4) */
> +			in_r = min(in_sg->length - in_i, areq->nbytes - i);
> +			if (in_r < 4) {
> +				/* Not enough data to write to the device */
> +				op->wb = 0;
> +				while (in_r > 0) {
> +					op->wb |= (*(u8 *)(src_addr + in_i))
> +						<< (8 * op->nbw);
> +					dev_dbg(ss->dev, "%s ending bw=%d wb=%x\n",
> +						__func__, op->nbw, op->wb);
> +					in_r--;
> +					i++;
> +					in_i++;
> +					op->nbw++;
> +				}
> +				goto nextsg;
> +			}
> +			v = *(u32 *)(src_addr + in_i);
> +			if (i < end) {
> +				/* last write must be done without relaxed */
> +				if (i + 4 >= end)
> +					ss_writer(v);
> +				else
> +					ss_writer_relaxed(v);
> +				i += 4;
> +				op->byte_count += 4;
> +				in_i += 4;
> +			} else {
> +				op->wait[op->nwait] = v;
> +				i += 4;
> +				in_i += 4;
> +				op->nwait++;
> +				dev_dbg(ss->dev, "%s Keep word ww=%u after %llu\n",
> +						__func__, op->nwait, end);
> +				if (op->nwait > 15) {
> +					dev_err(ss->dev, "FATAL: Cannot enqueue more, bug?\n");
> +					writel(0, ss->base + SS_CTL);
> +					mutex_unlock(&ss->lock);
> +					return -EIO;
> +				}
> +			}
> +		}
> +nextsg:
> +		/* Nothing more to read in this SG */
> +		if (in_i == in_sg->length) {
> +			kunmap(sg_page(in_sg));
> +			do {
> +				in_sg = sg_next(in_sg);
> +			} while (in_sg != NULL && in_sg->length == 0);
> +			in_i = 0;
> +			if (in_sg != NULL) {
> +				src_addr = kmap(sg_page(in_sg)) + in_sg->offset;
> +				if (src_addr == NULL) {
> +					mutex_unlock(&ss->lock);
> +					dev_err(ss->dev, "ERROR: Cannot kmap source buffer\n");
> +					return -EFAULT;
> +				}
> +			}
> +		}
> +	} while (in_sg != NULL && i < areq->nbytes);
> +
> +	/* ask the device to finish the hashing */
> +	writel(op->mode | SS_ENABLED | SS_DATA_END, ss->base + SS_CTL);
> +	i = 0;
> +	do {
> +		v = readl(ss->base + SS_CTL);
> +		i++;
> +	} while (i < SS_TIMEOUT && (v & SS_DATA_END) > 0);
> +	if (i >= SS_TIMEOUT) {
> +		dev_err(ss->dev, "ERROR: %s hash end timeout after %d loop, CTL=%x\n",
> +				__func__, i, v);
> +		writel(0, ss->base + SS_CTL);
> +		mutex_unlock(&ss->lock);
> +		return -EIO;
> +	}
> +
> +	/* get the partial hash */
> +	if (op->mode == SS_OP_SHA1) {
> +		for (i = 0; i < 5; i++)
> +			op->hash[i] = readl(ss->base + SS_MD0 + i * 4);
> +	} else {
> +		for (i = 0; i < 4; i++)
> +			op->hash[i] = readl(ss->base + SS_MD0 + i * 4);
> +	}
> +
> +	writel(0, ss->base + SS_CTL);
> +	mutex_unlock(&ss->lock);
> +	return 0;
> +}
> +
> +/*
> + * sunxi_hash_final: finalize hashing operation
> + *
> + * If we have some remaining bytes, we write them.
> + * Then ask the SS for finalizing the hashing operation
> + */
> +int sunxi_hash_final(struct ahash_request *areq)
> +{
> +	u32 v, ivmode = 0;
> +	unsigned int i;
> +	int zeros;
> +	unsigned int index, padlen;
> +	__be64 bits;
> +	struct sunxi_req_ctx *op = ahash_request_ctx(areq);
> +
> +	dev_dbg(ss->dev, "%s byte=%llu len=%u mode=%x bw=%u %x h=%x ww=%u",
> +			__func__, op->byte_count, areq->nbytes, op->mode,
> +			op->nbw, op->wb, op->hash[0], op->nwait);
> +
> +	mutex_lock(&ss->lock);
> +	rx_cnt = 0;
> +
> +	/*
> +	 * if we have already writed something,
> +	 * restore the partial hash state
> +	 */
> +	if (op->byte_count > 0) {
> +		ivmode = SS_IV_ARBITRARY;
> +		for (i = 0; i < 5; i++)
> +			writel(op->hash[i], ss->base + SS_IV0 + i * 4);
> +	}
> +	writel(op->mode | SS_ENABLED | ivmode, ss->base + SS_CTL);
> +
> +	/* write the remaining words of the wait buffer */
> +	if (op->nwait > 0) {
> +		for (i = 0; i < op->nwait; i++) {
> +			v = op->wait[i];
> +			ss_writer(v);
> +			op->byte_count += 4;
> +			dev_dbg(ss->dev, "%s write %llu i=%u %x\n",
> +					__func__, op->byte_count, i, v);
> +		}
> +		op->nwait = 0;
> +	}
> +
> +	/* write the remaining bytes of the nbw buffer */
> +	if (op->nbw > 0) {
> +		op->wb |= ((1 << 7) << (op->nbw * 8));
> +		ss_writer(op->wb);
> +	} else {
> +		ss_writer((1 << 7));
> +	}
> +
> +	/*
> +	 * number of space to pad to obtain 64o minus 8(size) minus 4 (final 1)
> +	 * I take the operations from other md5/sha1 implementations
> +	 */
> +
> +	/* we have already send 4 more byte of which nbw data */
> +	if (op->mode == SS_OP_MD5) {
> +		index = (op->byte_count + 4) & 0x3f;
> +		op->byte_count += op->nbw;
> +		if (index > 56)
> +			zeros = (120 - index) / 4;
> +		else
> +			zeros = (56 - index) / 4;
> +	} else {
> +		op->byte_count += op->nbw;
> +		index = op->byte_count & 0x3f;
> +		padlen = (index < 56) ? (56 - index) : ((64+56) - index);
> +		zeros = (padlen - 1) / 4;
> +	}
> +	for (i = 0; i < zeros; i++)
> +		ss_writer(0);
> +
> +	/* write the length of data */
> +	if (op->mode == SS_OP_SHA1) {
> +		bits = cpu_to_be64(op->byte_count << 3);
> +		ss_writer(bits & 0xffffffff);
> +		ss_writer((bits >> 32) & 0xffffffff);
> +	} else {
> +		ss_writer((op->byte_count << 3) & 0xffffffff);
> +		ss_writer((op->byte_count >> 29) & 0xffffffff);
> +	}
> +
> +	/* Tell the SS to stop the hashing */
> +	writel(op->mode | SS_ENABLED | SS_DATA_END, ss->base + SS_CTL);
> +
> +	/*
> +	 * Wait for SS to finish the hash.
> +	 * The timeout could happend only in case of bad overcloking
> +	 * or driver bug.
> +	 */
> +	i = 0;
> +	do {
> +		v = readl(ss->base + SS_CTL);
> +		i++;
> +	} while (i < SS_TIMEOUT && (v & SS_DATA_END) > 0);
> +	if (i >= SS_TIMEOUT) {
> +		dev_err(ss->dev, "ERROR: hash end timeout %d>%d ctl=%x len=%u\n",
> +				i, SS_TIMEOUT, v, areq->nbytes);
> +		writel(0, ss->base + SS_CTL);
> +		mutex_unlock(&ss->lock);
> +		return -EIO;
> +	}
> +
> +	/* Get the hash from the device */
> +	if (op->mode == SS_OP_SHA1) {
> +		for (i = 0; i < 5; i++) {
> +			v = cpu_to_be32(readl(ss->base + SS_MD0 + i * 4));
> +			memcpy(areq->result + i * 4, &v, 4);
> +		}
> +	} else {
> +		for (i = 0; i < 4; i++) {
> +			v = readl(ss->base + SS_MD0 + i * 4);
> +			memcpy(areq->result + i * 4, &v, 4);
> +		}
> +	}
> +	writel(0, ss->base + SS_CTL);
> +	mutex_unlock(&ss->lock);
> +	return 0;
> +}
> +
> +/* sunxi_hash_finup: finalize hashing operation after an update */
> +int sunxi_hash_finup(struct ahash_request *areq)
> +{
> +	int err;
> +
> +	err = sunxi_hash_update(areq);
> +	if (err != 0)
> +		return err;
> +
> +	return sunxi_hash_final(areq);
> +}
> +
> +/* combo of init/update/final functions */
> +int sunxi_hash_digest(struct ahash_request *areq)
> +{
> +	int err;
> +
> +	err = sunxi_hash_init(areq);
> +	if (err != 0)
> +		return err;
> +
> +	err = sunxi_hash_update(areq);
> +	if (err != 0)
> +		return err;
> +
> +	return sunxi_hash_final(areq);
> +}
> diff --git a/drivers/crypto/sunxi-ss/sunxi-ss.h b/drivers/crypto/sunxi-ss/sunxi-ss.h
> new file mode 100644
> index 0000000..331e75b
> --- /dev/null
> +++ b/drivers/crypto/sunxi-ss/sunxi-ss.h
> @@ -0,0 +1,193 @@
> +/*
> + * sunxi-ss.c - hardware cryptographic accelerator for Allwinner A20 SoC
> + *
> + * Copyright (C) 2013-2014 Corentin LABBE <clabbe.montjoie@gmail.com>
> + *
> + * Support AES cipher with 128,192,256 bits keysize.
> + * Support MD5 and SHA1 hash algorithms.
> + * Support DES and 3DES
> + *
> + * You could find the datasheet in Documentation/arm/sunxi/README
> + *
> + * Licensed under the GPL-2.
> + */
> +
> +#include <linux/clk.h>
> +#include <linux/crypto.h>
> +#include <linux/io.h>
> +#include <linux/module.h>
> +#include <linux/of.h>
> +#include <linux/platform_device.h>
> +#include <crypto/scatterwalk.h>
> +#include <linux/scatterlist.h>
> +#include <linux/interrupt.h>
> +#include <linux/delay.h>
> +#include <crypto/md5.h>
> +#include <crypto/sha.h>
> +#include <crypto/hash.h>
> +#include <crypto/internal/hash.h>
> +#include <crypto/aes.h>
> +#include <crypto/des.h>
> +#include <crypto/internal/rng.h>
> +
> +#define SS_CTL            0x00
> +#define SS_KEY0           0x04
> +#define SS_KEY1           0x08
> +#define SS_KEY2           0x0C
> +#define SS_KEY3           0x10
> +#define SS_KEY4           0x14
> +#define SS_KEY5           0x18
> +#define SS_KEY6           0x1C
> +#define SS_KEY7           0x20
> +
> +#define SS_IV0            0x24
> +#define SS_IV1            0x28
> +#define SS_IV2            0x2C
> +#define SS_IV3            0x30
> +
> +#define SS_CNT0           0x34
> +#define SS_CNT1           0x38
> +#define SS_CNT2           0x3C
> +#define SS_CNT3           0x40
> +
> +#define SS_FCSR           0x44
> +#define SS_ICSR           0x48
> +
> +#define SS_MD0            0x4C
> +#define SS_MD1            0x50
> +#define SS_MD2            0x54
> +#define SS_MD3            0x58
> +#define SS_MD4            0x5C
> +
> +#define SS_RXFIFO         0x200
> +#define SS_TXFIFO         0x204
> +
> +/* SS_CTL configuration values */
> +
> +/* PRNG generator mode - bit 15 */
> +#define SS_PRNG_ONESHOT		(0 << 15)
> +#define SS_PRNG_CONTINUE	(1 << 15)
> +
> +/* IV mode for hash */
> +#define SS_IV_ARBITRARY		(1 << 14)
> +
> +/* SS operation mode - bits 12-13 */
> +#define SS_ECB			(0 << 12)
> +#define SS_CBC			(1 << 12)
> +#define SS_CNT			(2 << 12)
> +
> +/* Counter width for CNT mode - bits 10-11 */
> +#define SS_CNT_16BITS		(0 << 10)
> +#define SS_CNT_32BITS		(1 << 10)
> +#define SS_CNT_64BITS		(2 << 10)
> +
> +/* Key size for AES - bits 8-9 */
> +#define SS_AES_128BITS		(0 << 8)
> +#define SS_AES_192BITS		(1 << 8)
> +#define SS_AES_256BITS		(2 << 8)
> +
> +/* Operation direction - bit 7 */
> +#define SS_ENCRYPTION		(0 << 7)
> +#define SS_DECRYPTION		(1 << 7)
> +
> +/* SS Method - bits 4-6 */
> +#define SS_OP_AES		(0 << 4)
> +#define SS_OP_DES		(1 << 4)
> +#define SS_OP_3DES		(2 << 4)
> +#define SS_OP_SHA1		(3 << 4)
> +#define SS_OP_MD5		(4 << 4)
> +#define SS_OP_PRNG		(5 << 4)
> +
> +/* Data end bit - bit 2 */
> +#define SS_DATA_END		(1 << 2)
> +
> +/* PRNG start bit - bit 1 */
> +#define SS_PRNG_START		(1 << 1)
> +
> +/* SS Enable bit - bit 0 */
> +#define SS_DISABLED		(0 << 0)
> +#define SS_ENABLED		(1 << 0)
> +
> +/* SS_FCSR configuration values */
> +/* RX FIFO status - bit 30 */
> +#define SS_RXFIFO_FREE		(1 << 30)
> +
> +/* RX FIFO empty spaces - bits 24-29 */
> +#define SS_RXFIFO_SPACES(val)	(((val) >> 24) & 0x3f)
> +
> +/* TX FIFO status - bit 22 */
> +#define SS_TXFIFO_AVAILABLE	(1 << 22)
> +
> +/* TX FIFO available spaces - bits 16-21 */
> +#define SS_TXFIFO_SPACES(val)	(((val) >> 16) & 0x3f)
> +
> +#define SS_RXFIFO_EMP_INT_PENDING	(1 << 10)
> +#define SS_TXFIFO_AVA_INT_PENDING	(1 << 8)
> +#define SS_RXFIFO_EMP_INT_ENABLE	(1 << 2)
> +#define SS_TXFIFO_AVA_INT_ENABLE	(1 << 0)
> +
> +/* SS_ICSR configuration values */
> +#define SS_ICS_DRQ_ENABLE		(1 << 4)
> +
> +struct sunxi_ss_ctx {
> +	void __iomem *base;
> +	int irq;
> +	struct clk *busclk;
> +	struct clk *ssclk;
> +	struct device *dev;
> +	struct resource *res;
> +	void *buf_in; /* pointer to data to be uploaded to the device */
> +	size_t buf_in_size; /* size of buf_in */
> +	void *buf_out;
> +	size_t buf_out_size;
> +	struct mutex lock; /* control the use of the device */
> +	struct mutex bufout_lock; /* control the use of buf_out */
> +	struct mutex bufin_lock; /* control the use of buf_in */
> +};
> +
> +struct sunxi_tfm_ctx {
> +	u32 key[AES_MAX_KEY_SIZE / 4];/* divided by sizeof(u32) */
> +	u32 keylen;
> +	u32 keymode;
> +};
> +
> +struct sunxi_req_ctx {
> +	u32 mode;
> +	u64 byte_count; /* number of bytes "uploaded" to the device */
> +	u32 wb; /* a partial word waiting to be completed and
> +			uploaded to the device */
> +	/* number of bytes to be uploaded in the wb word */
> +	unsigned int nbw;
> +	u32 hash[5];
> +	u32 wait[64];
> +	unsigned int nwait;
> +};
> +
> +#define SS_SEED_LEN (192/8)
> +#define SS_DATA_LEN (160/8)
> +
> +struct prng_context {
> +	u32 seed[SS_SEED_LEN/4];
> +	unsigned int slen;
> +};
> +
> +int sunxi_hash_crainit(struct crypto_tfm *tfm);
> +int sunxi_hash_init(struct ahash_request *areq);
> +int sunxi_hash_update(struct ahash_request *areq);
> +int sunxi_hash_final(struct ahash_request *areq);
> +int sunxi_hash_finup(struct ahash_request *areq);
> +int sunxi_hash_digest(struct ahash_request *areq);
> +int sunxi_hash_export(struct ahash_request *areq, void *out);
> +int sunxi_hash_import(struct ahash_request *areq, const void *in);
> +
> +int sunxi_ss_aes_poll(struct ablkcipher_request *areq, u32 mode);
> +int sunxi_ss_des_poll(struct ablkcipher_request *areq, u32 mode);
> +int sunxi_ss_cipher_init(struct crypto_tfm *tfm);
> +int sunxi_ss_cipher_encrypt(struct ablkcipher_request *areq);
> +int sunxi_ss_cipher_decrypt(struct ablkcipher_request *areq);
> +int sunxi_ss_aes_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
> +		unsigned int keylen);
> +int sunxi_ss_des_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
> +		unsigned int keylen);
> +int sunxi_ss_des3_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
> +		unsigned int keylen);
> -- 
> 2.0.4
> 

Thanks,
Maxime
Arnd Bergmann Oct. 22, 2014, 9 a.m. UTC | #5
On Sunday 19 October 2014 16:16:22 LABBE Corentin wrote:
> Add support for the Security System included in Allwinner SoC A20.
> The Security System is a hardware cryptographic accelerator that support AES/MD5/SHA1/DES/3DES/PRNG algorithms.
> 
> Signed-off-by: LABBE Corentin <clabbe.montjoie@gmail.com>

Please wrap lines in the changelog after about 70 characters.

> --- /dev/null
> +++ b/drivers/crypto/sunxi-ss/sunxi-ss-cipher.c
> @@ -0,0 +1,489 @@

> +#include "sunxi-ss.h"
> +
> +extern struct sunxi_ss_ctx *ss;

'extern' declarations belong into header files, not .c files. It would
be even better to avoid this completely and carry the pointer to the
context in an object that gets passed around. In general we want drivers
to be written in a way that allows having multiple instances of the
device, which the global pointer prevents.
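
For illustration, a minimal, hedged sketch of carrying the context per device
instead of through a file-scope pointer (the wrapper idea is an assumption,
not a requirement):

	static int sunxi_ss_probe(struct platform_device *pdev)
	{
		struct sunxi_ss_ctx *ss;

		ss = devm_kzalloc(&pdev->dev, sizeof(*ss), GFP_KERNEL);
		if (!ss)
			return -ENOMEM;

		/* remember the context on the device ... */
		platform_set_drvdata(pdev, ss);

		/*
		 * ... and hand it to the algorithm code, e.g. by embedding
		 * each crypto_alg in a wrapper struct that carries this
		 * pointer, so no global variable is needed.
		 */
		return 0;
	}
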

> +
> +	src32 = (u32 *)src_addr;
> +	dst32 = (u32 *)dst_addr;


You appear to be missing '__iomem' annotations for the mmio pointers.
Please always run your code through the 'sparse' checker using 'make C=1'
to catch and fix this and other erros.

> +	ileft = areq->nbytes / 4;
> +	oleft = areq->nbytes / 4;
> +	i = 0;
> +	do {
> +		if (ileft > 0 && rx_cnt > 0) {
> +			todo = min(rx_cnt, ileft);
> +			ileft -= todo;
> +			do {
> +				writel_relaxed(*src32++,
> +						ss->base +
> +						SS_RXFIFO);
> +				todo--;
> +			} while (todo > 0);
> +		}

This looks like it should be using writesl() instead of the 
writel_relaxed() loop. That should not only be faster but it will
also change the byte ordering if you are running a big-endian
kernel.

Since this is a FIFO register, the ordering that writesl uses
is likely the correct one.
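
For reference, a hedged sketch of what the RX FIFO path could look like with
writesl(), reusing the variables from the quoted loop (untested on the
hardware):

	if (ileft > 0 && rx_cnt > 0) {
		todo = min(rx_cnt, ileft);
		/* push 'todo' 32-bit words into the same FIFO register */
		writesl(ss->base + SS_RXFIFO, src32, todo);
		src32 += todo;
		ileft -= todo;
	}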

	Arnd
Corentin Labbe Oct. 24, 2014, 6:50 p.m. UTC | #6
On 22/10/2014 11:00, Arnd Bergmann wrote:
> On Sunday 19 October 2014 16:16:22 LABBE Corentin wrote:
>> Add support for the Security System included in Allwinner SoC A20.
>> The Security System is a hardware cryptographic accelerator that support AES/MD5/SHA1/DES/3DES/PRNG algorithms.
>>
>> Signed-off-by: LABBE Corentin <clabbe.montjoie@gmail.com>
> 
> Please wrap lines in the changelog after about 70 characters.
> 

Oops, I just saw the corresponding part in submittingpatches.txt.
Sorry

>> --- /dev/null
>> +++ b/drivers/crypto/sunxi-ss/sunxi-ss-cipher.c
>> @@ -0,0 +1,489 @@
> 
>> +#include "sunxi-ss.h"
>> +
>> +extern struct sunxi_ss_ctx *ss;
> 
> 'extern' declarations belong into header files, not .c files. It would
> be even better to avoid this completely and carry the pointer to the
> context in an object that gets passed around. In general we want drivers
> to be written in a way that allows having multiple instances of the
> device, which the global pointer prevents.
> 

As I already said, I think the driver will never be used with multiple instances.
But since many people want this pointer dead, I will work on it.

>> +
>> +	src32 = (u32 *)src_addr;
>> +	dst32 = (u32 *)dst_addr;
> 
> 
> You appear to be missing '__iomem' annotations for the mmio pointers.
> Please always run your code through the 'sparse' checker using 'make C=1'
> to catch and fix this and other erros.
> 

OK, but with which version of sparse do you get such a warning? I use version 0.5.0 and got no warning at all.

>> +	ileft = areq->nbytes / 4;
>> +	oleft = areq->nbytes / 4;
>> +	i = 0;
>> +	do {
>> +		if (ileft > 0 && rx_cnt > 0) {
>> +			todo = min(rx_cnt, ileft);
>> +			ileft -= todo;
>> +			do {
>> +				writel_relaxed(*src32++,
>> +						ss->base +
>> +						SS_RXFIFO);
>> +				todo--;
>> +			} while (todo > 0);
>> +		}
> 
> This looks like it should be using writesl() instead of the 
> writel_relaxed() loop. That should not only be faster but it will
> also change the byte ordering if you are running a big-endian
> kernel.
> 
> Since this is a FIFO register, the ordering that writesl uses
> is likely the correct one.

Great, the code is much cleaner with it. (with up to 10% speed gain)

Thanks

Corentin
Corentin Labbe Oct. 24, 2014, 6:52 p.m. UTC | #7
On 10/21/14 21:11, Maxime Ripard wrote:
> Hi Corentin,
> 
> Thanks for resending it.
> 
> On Sun, Oct 19, 2014 at 04:16:22PM +0200, LABBE Corentin wrote:
>> Add support for the Security System included in Allwinner SoC A20.
>> The Security System is a hardware cryptographic accelerator that support AES/MD5/SHA1/DES/3DES/PRNG algorithms.
>>
>> Signed-off-by: LABBE Corentin <clabbe.montjoie@gmail.com>
>> ---
>>  drivers/crypto/Kconfig                    |  17 ++
>> +static int sunxi_ss_aes_poll_atomic(struct ablkcipher_request *areq)
>> +{
>> +	u32 spaces;
>> +	struct scatterlist *in_sg = areq->src;
>> +	struct scatterlist *out_sg = areq->dst;
>> +	void *src_addr;
>> +	void *dst_addr;
>> +	unsigned int ileft = areq->nbytes;
>> +	unsigned int oleft = areq->nbytes;
>> +	unsigned int todo;
>> +	u32 *src32;
>> +	u32 *dst32;
>> +	u32 rx_cnt = 32;
>> +	u32 tx_cnt = 0;
>> +	int i;
>> +
>> +	src_addr = kmap_atomic(sg_page(in_sg)) + in_sg->offset;
> 
> Where does this scatter_list come from? Can it even be allocated
> in highmem?
> 

With AF_ALG and cryptodev, the SG is in highmem. Verified with some PageHighMem().
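
For illustration, a minimal sketch of that kind of check (assuming 'areq' is
the request being inspected; this is debug code, not part of the driver):

	/* debug only: report whether the source SG page sits in highmem */
	if (PageHighMem(sg_page(areq->src)))
		dev_dbg(ss->dev, "source SG page lives in highmem\n");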

>> +	if (src_addr == NULL) {
>> +		dev_err(ss->dev, "kmap_atomic error for src SG\n");
>> +		writel(0, ss->base + SS_CTL);
>> +		mutex_unlock(&ss->lock);
>> +		return -EINVAL;
>> +	}
>> +
>> +	dst_addr = kmap_atomic(sg_page(out_sg)) + out_sg->offset;
>> +	if (dst_addr == NULL) {
>> +		dev_err(ss->dev, "kmap_atomic error for dst SG\n");
>> +		writel(0, ss->base + SS_CTL);
>> +		kunmap_atomic(src_addr);
>> +		mutex_unlock(&ss->lock);
>> +		return -EINVAL;
> 
> Please use gotos in your error path.
> 

Ok
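
For reference, a hedged sketch of the goto-style unwinding being asked for
(the label and the 'err' variable are made up for the example):

	dst_addr = kmap_atomic(sg_page(out_sg)) + out_sg->offset;
	if (dst_addr == NULL) {
		dev_err(ss->dev, "kmap_atomic error for dst SG\n");
		err = -EINVAL;
		goto out_unmap_src;
	}

	err = 0;
	/* ... FIFO transfer, setting err on failure ... */

	kunmap_atomic(dst_addr);
out_unmap_src:
	kunmap_atomic(src_addr);
	writel(0, ss->base + SS_CTL);
	mutex_unlock(&ss->lock);
	return err;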

>> +	}
>> +
>> +	src32 = (u32 *)src_addr;
>> +	dst32 = (u32 *)dst_addr;
>> +	ileft = areq->nbytes / 4;
>> +	oleft = areq->nbytes / 4;
>> +	i = 0;
>> +	do {
>> +		if (ileft > 0 && rx_cnt > 0) {
>> +			todo = min(rx_cnt, ileft);
>> +			ileft -= todo;
>> +			do {
>> +				writel_relaxed(*src32++,
> 
> Please put some braces around that referencing/increment.
> 

Ok
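
One possible reading of that request, as a sketch, is simply to pull the
post-increment out of the call:

	writel_relaxed(*src32, ss->base + SS_RXFIFO);
	src32++;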

>> +						ss->base +
>> +						SS_RXFIFO);
>> +				todo--;
>> +			} while (todo > 0);
>> +		}
>> +		if (tx_cnt > 0) {
>> +			todo = min(tx_cnt, oleft);
>> +			oleft -= todo;
>> +			do {
>> +				*dst32++ = readl_relaxed(ss->base +
>> +						SS_TXFIFO);
>> +				todo--;
>> +			} while (todo > 0);
>> +		}
>> +		spaces = readl_relaxed(ss->base + SS_FCSR);
>> +		rx_cnt = SS_RXFIFO_SPACES(spaces);
>> +		tx_cnt = SS_TXFIFO_SPACES(spaces);
>> +	} while (oleft > 0);
>> +	writel(0, ss->base + SS_CTL);
>> +	kunmap_atomic(src_addr);
>> +	kunmap_atomic(dst_addr);
>> +	mutex_unlock(&ss->lock);
> 
> You never took that mutex in that function...
> 

Solved with comment from Vladimir Zapolskiy

>> +	return 0;
[]
>> +	int no_chunk = 1;
>> +	struct scatterlist *in_sg = areq->src;
>> +	struct scatterlist *out_sg = areq->dst;
>> +
>> +	/*
>> +	 * if we have only SGs with size multiple of 4,
>> +	 * we can use the SS AES function
>> +	 */
>> +	while (in_sg != NULL && no_chunk == 1) {
>> +		if ((in_sg->length % 4) != 0)
>> +			no_chunk = 0;
>> +		in_sg = sg_next(in_sg);
>> +	}
>> +	while (out_sg != NULL && no_chunk == 1) {
>> +		if ((out_sg->length % 4) != 0)
>> +			no_chunk = 0;
>> +		out_sg = sg_next(out_sg);
>> +	}
>> +
>> +	if (no_chunk == 1)
>> +		return sunxi_ss_aes_poll(areq, mode);
>> +
>> +	in_sg = areq->src;
>> +	out_sg = areq->dst;
>> +
>> +	nb_in_sg_rx = sg_nents(in_sg);
>> +	nb_in_sg_tx = sg_nents(out_sg);
>> +
>> +	/*
>> +	 * buf_in and buf_out are allocated only one time
>> +	 * then we keep the buffer until driver end
>> +	 * the allocation can only grow more
>> +	 * we do not reduce it for simplification
>> +	 */
>> +	mutex_lock(&ss->bufin_lock);
>> +	if (ss->buf_in == NULL) {
>> +		ss->buf_in = kmalloc(areq->nbytes, GFP_KERNEL);
>> +		ss->buf_in_size = areq->nbytes;
>> +	} else {
>> +		if (areq->nbytes > ss->buf_in_size) {
>> +			kfree(ss->buf_in);
>> +			ss->buf_in = kmalloc(areq->nbytes, GFP_KERNEL);
>> +			ss->buf_in_size = areq->nbytes;
>> +		}
>> +	}
>> +	if (ss->buf_in == NULL) {
>> +		ss->buf_in_size = 0;
>> +		mutex_unlock(&ss->bufin_lock);
>> +		dev_err(ss->dev, "Unable to allocate pages.\n");
>> +		return -ENOMEM;
>> +	}
>> +	mutex_lock(&ss->bufout_lock);
> 
> What are these two mutexes used for? It looks like you're only using
> them in this function.
> 
> What would prevent you from just using the "main" lock like you did
> for the AES?
> 

I have just removed all that DES/3DES code and now I use a fallback for that case.
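
For illustration, a hedged sketch of how such a fallback is usually set up in
the tfm init path (assuming the algorithm is registered with
CRYPTO_ALG_NEED_FALLBACK; this is not the code that was merged):

	struct crypto_ablkcipher *fallback;

	fallback = crypto_alloc_ablkcipher(crypto_tfm_alg_name(tfm), 0,
					   CRYPTO_ALG_ASYNC |
					   CRYPTO_ALG_NEED_FALLBACK);
	if (IS_ERR(fallback))
		return PTR_ERR(fallback);
	/* stash 'fallback' in the tfm context and call it for the
	 * SG layouts the hardware cannot handle */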


>> +	if (ss->buf_out == NULL) {
>> +		ss->buf_out = kmalloc(areq->nbytes, GFP_KERNEL);
>> +		if (ss->buf_out == NULL) {
>> +			ss->buf_out_size = 0;
>> +			mutex_unlock(&ss->bufin_lock);
>> +			mutex_unlock(&ss->bufout_lock);
>> +			dev_err(ss->dev, "Unable to allocate pages.\n");
>> +			return -ENOMEM;
>> +		}
>> +		ss->buf_out_size = areq->nbytes;
>> +	} else {
>> +		if (areq->nbytes > ss->buf_out_size) {
>> +			kfree(ss->buf_out);
>> +			ss->buf_out = kmalloc(areq->nbytes, GFP_KERNEL);
>> +			if (ss->buf_out == NULL) {
>> +				ss->buf_out_size = 0;
>> +				mutex_unlock(&ss->bufin_lock);
>> +				mutex_unlock(&ss->bufout_lock);
>> +				dev_err(ss->dev, "Unable to allocate pages.\n");
>> +				return -ENOMEM;
>> +			}
>> +			ss->buf_out_size = areq->nbytes;
>> +		}
>> +	}
>> +
>> +	sg_copy_to_buffer(areq->src, nb_in_sg_rx, ss->buf_in, areq->nbytes);
>> +
>> +	ir = 0;
>> +	it = 0;
>> +	mutex_lock(&ss->lock);
>> +
>> +	for (i = 0; i < op->keylen; i += 4)
>> +		writel(*(op->key + i/4), ss->base + SS_KEY0 + i);
>> +	if (areq->info != NULL) {
>> +		for (i = 0; i < 4 && i < ivsize / 4; i++) {
>> +			v = *(u32 *)(areq->info + i * 4);
>> +			writel(v, ss->base + SS_IV0 + i * 4);
>> +		}
>> +	}
>> +	writel(mode, ss->base + SS_CTL);
>> +
>> +	do {
>> +		if (rx_cnt == 0 || tx_cnt == 0) {
>> +			spaces = readl(ss->base + SS_FCSR);
>> +			rx_cnt = SS_RXFIFO_SPACES(spaces);
>> +			tx_cnt = SS_TXFIFO_SPACES(spaces);
>> +		}
>> +		if (rx_cnt > 0 && ir < areq->nbytes) {
>> +			do {
>> +				value = *(u32 *)(ss->buf_in + ir);
>> +				writel(value, ss->base + SS_RXFIFO);
>> +				ir += 4;
>> +				rx_cnt--;
>> +			} while (rx_cnt > 0 && ir < areq->nbytes);
>> +		}
>> +		if (tx_cnt > 0 && it < areq->nbytes) {
>> +			do {
>> +				value = readl(ss->base + SS_TXFIFO);
>> +				*(u32 *)(ss->buf_out + it) = value;
>> +				it += 4;
>> +				tx_cnt--;
>> +			} while (tx_cnt > 0 && it < areq->nbytes);
>> +		}
>> +		if (ir == areq->nbytes) {
>> +			mutex_unlock(&ss->bufin_lock);
> 
> If tx_cnt <= 0 and it < areq->nbytes, your loop will stop, and you'll
> not release the mutex.
> 

Solved with the cleaning

>> +			ir++;
>> +		}
>> +	} while (it < areq->nbytes);
>> +
>> +	writel(0, ss->base + SS_CTL);
>> +	mutex_unlock(&ss->lock);
>> +
>> +	/*
>> +	 * a simple optimization, since we dont need the hardware for this copy
>> +	 * we release the lock and do the copy. With that we gain 5/10% perf
>> +	 */
>> +	sg_copy_from_buffer(areq->dst, nb_in_sg_tx, ss->buf_out, areq->nbytes);
>> +
>> +	mutex_unlock(&ss->bufout_lock);
>> +	return 0;
>> +}
>> +
>> +/* check and set the AES key, prepare the mode to be used */
>> +int sunxi_ss_aes_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
>> +		unsigned int keylen)
>> +{
>> +	struct sunxi_tfm_ctx *op = crypto_ablkcipher_ctx(tfm);
>> +
>> +	switch (keylen) {
>> +	case 128 / 8:
>> +		op->keymode = SS_AES_128BITS;
>> +		break;
>> +	case 192 / 8:
>> +		op->keymode = SS_AES_192BITS;
>> +		break;
>> +	case 256 / 8:
>> +		op->keymode = SS_AES_256BITS;
>> +		break;
>> +	default:
>> +		dev_err(ss->dev, "ERROR: Invalid keylen %u\n", keylen);
>> +		crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
>> +		return -EINVAL;
>> +	}
>> +	op->keylen = keylen;
>> +	memcpy(op->key, key, keylen);
>> +	return 0;
>> +}
>> +
>> +/* check and set the DES key, prepare the mode to be used */
>> +int sunxi_ss_des_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
>> +		unsigned int keylen)
>> +{
>> +	struct sunxi_tfm_ctx *op = crypto_ablkcipher_ctx(tfm);
>> +
>> +	if (keylen != DES_KEY_SIZE) {
>> +		dev_err(ss->dev, "Invalid keylen %u\n", keylen);
>> +		crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
>> +		return -EINVAL;
>> +	}
>> +	op->keylen = keylen;
>> +	memcpy(op->key, key, keylen);
>> +	return 0;
>> +}
>> +
>> +/* check and set the 3DES key, prepare the mode to be used */
>> +int sunxi_ss_des3_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
>> +		unsigned int keylen)
>> +{
>> +	struct sunxi_tfm_ctx *op = crypto_ablkcipher_ctx(tfm);
>> +
>> +	if (keylen != 3 * DES_KEY_SIZE) {
>> +		dev_err(ss->dev, "Invalid keylen %u\n", keylen);
>> +		crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
>> +		return -EINVAL;
>> +	}
>> +	op->keylen = keylen;
>> +	memcpy(op->key, key, keylen);
>> +	return 0;
>> +}
>> diff --git a/drivers/crypto/sunxi-ss/sunxi-ss-core.c b/drivers/crypto/sunxi-ss/sunxi-ss-core.c
>> new file mode 100644
>> index 0000000..e66d7e2
>> --- /dev/null
>> +++ b/drivers/crypto/sunxi-ss/sunxi-ss-core.c
>> @@ -0,0 +1,318 @@
>> +/*
>> + * sunxi-ss-core.c - hardware cryptographic accelerator for Allwinner A20 SoC
>> + *
>> + * Copyright (C) 2013-2014 Corentin LABBE <clabbe.montjoie@gmail.com>
>> + *
>> + * Core file which registers crypto algorithms supported by the SS.
>> + *
>> + * You could find a link for the datasheet in Documentation/arm/sunxi/README
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + */
>> +#include <linux/clk.h>
>> +#include <linux/crypto.h>
>> +#include <linux/io.h>
>> +#include <linux/module.h>
>> +#include <linux/of.h>
>> +#include <linux/platform_device.h>
>> +#include <crypto/scatterwalk.h>
>> +#include <linux/scatterlist.h>
>> +#include <linux/interrupt.h>
>> +#include <linux/delay.h>
>> +
>> +#include "sunxi-ss.h"
>> +
>> +struct sunxi_ss_ctx *ss;
>> +
>> +/*
>> + * General notes for whole driver:
>> + *
>> + * After each request the device must be disabled with a write of 0 in SS_CTL
>> + *
>> + * For performance reason, we use writel_relaxed/read_relaxed for all
>> + * operations on RX and TX FIFO and also SS_FCSR.
>> + * Excepts for the last write on TX FIFO.
>> + * For all other registers, we use writel/readl.
>> + * See http://permalink.gmane.org/gmane.linux.ports.arm.kernel/117644
>> + * and http://permalink.gmane.org/gmane.linux.ports.arm.kernel/117640
>> + */
> 
> I don't really know why that comment is here, when there's not a
> single writel in this file.
> 

It is a note for the whole driver; I did not want to duplicate that comment in other files.
But now that I use writesl(), that comment is unnecessary.

>> +
>> +static struct ahash_alg sunxi_md5_alg = {
>> +	.init = sunxi_hash_init,
>> +	.update = sunxi_hash_update,
>> +	.final = sunxi_hash_final,
>> +	.finup = sunxi_hash_finup,
>> +	.digest = sunxi_hash_digest,
>> +	.halg = {
>> +		.digestsize = MD5_DIGEST_SIZE,
>> +		.base = {
>> +			.cra_name = "md5",
>> +			.cra_driver_name = "md5-sunxi-ss",
>> +			.cra_priority = 300,
>> +			.cra_alignmask = 3,
>> +			.cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
>> +			.cra_blocksize = MD5_HMAC_BLOCK_SIZE,
>> +			.cra_ctxsize = sizeof(struct sunxi_req_ctx),
>> +			.cra_module = THIS_MODULE,
>> +			.cra_type = &crypto_ahash_type,
>> +			.cra_init = sunxi_hash_crainit
>> +		}
>> +	}
>> +};
>> +
>> +static struct ahash_alg sunxi_sha1_alg = {
>> +	.init = sunxi_hash_init,
>> +	.update = sunxi_hash_update,
>> +	.final = sunxi_hash_final,
>> +	.finup = sunxi_hash_finup,
>> +	.digest = sunxi_hash_digest,
>> +	.halg = {
>> +		.digestsize = SHA1_DIGEST_SIZE,
>> +		.base = {
>> +			.cra_name = "sha1",
>> +			.cra_driver_name = "sha1-sunxi-ss",
>> +			.cra_priority = 300,
>> +			.cra_alignmask = 3,
>> +			.cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
>> +			.cra_blocksize = SHA1_BLOCK_SIZE,
>> +			.cra_ctxsize = sizeof(struct sunxi_req_ctx),
>> +			.cra_module = THIS_MODULE,
>> +			.cra_type = &crypto_ahash_type,
>> +			.cra_init = sunxi_hash_crainit
>> +		}
>> +	}
>> +};
>> +
>> +static struct crypto_alg sunxi_cipher_algs[] = {
>> +{
>> +	.cra_name = "cbc(aes)",
>> +	.cra_driver_name = "cbc-aes-sunxi-ss",
>> +	.cra_priority = 300,
>> +	.cra_blocksize = AES_BLOCK_SIZE,
>> +	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER,
>> +	.cra_ctxsize = sizeof(struct sunxi_tfm_ctx),
>> +	.cra_module = THIS_MODULE,
>> +	.cra_alignmask = 3,
>> +	.cra_type = &crypto_ablkcipher_type,
>> +	.cra_init = sunxi_ss_cipher_init,
>> +	.cra_u = {
>> +		.ablkcipher = {
>> +			.min_keysize    = AES_MIN_KEY_SIZE,
>> +			.max_keysize    = AES_MAX_KEY_SIZE,
>> +			.ivsize         = AES_BLOCK_SIZE,
>> +			.setkey         = sunxi_ss_aes_setkey,
>> +			.encrypt        = sunxi_ss_cipher_encrypt,
>> +			.decrypt        = sunxi_ss_cipher_decrypt,
>> +		}
>> +	}
>> +}, {
>> +	.cra_name = "cbc(des)",
>> +	.cra_driver_name = "cbc-des-sunxi-ss",
>> +	.cra_priority = 300,
>> +	.cra_blocksize = DES_BLOCK_SIZE,
>> +	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER,
>> +	.cra_ctxsize = sizeof(struct sunxi_req_ctx),
>> +	.cra_module = THIS_MODULE,
>> +	.cra_alignmask = 3,
>> +	.cra_type = &crypto_ablkcipher_type,
>> +	.cra_init = sunxi_ss_cipher_init,
>> +	.cra_u.ablkcipher = {
>> +		.min_keysize    = DES_KEY_SIZE,
>> +		.max_keysize    = DES_KEY_SIZE,
>> +		.ivsize         = DES_BLOCK_SIZE,
>> +		.setkey         = sunxi_ss_des_setkey,
>> +		.encrypt        = sunxi_ss_cipher_encrypt,
>> +		.decrypt        = sunxi_ss_cipher_decrypt,
>> +	}
>> +}, {
>> +	.cra_name = "cbc(des3_ede)",
>> +	.cra_driver_name = "cbc-des3-sunxi-ss",
>> +	.cra_priority = 300,
>> +	.cra_blocksize = DES3_EDE_BLOCK_SIZE,
>> +	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER,
>> +	.cra_ctxsize = sizeof(struct sunxi_req_ctx),
>> +	.cra_module = THIS_MODULE,
>> +	.cra_alignmask = 3,
>> +	.cra_type = &crypto_ablkcipher_type,
>> +	.cra_init = sunxi_ss_cipher_init,
>> +	.cra_u.ablkcipher = {
>> +		.min_keysize    = DES3_EDE_KEY_SIZE,
>> +		.max_keysize    = DES3_EDE_KEY_SIZE,
>> +		.ivsize         = DES3_EDE_BLOCK_SIZE,
>> +		.setkey         = sunxi_ss_des3_setkey,
>> +		.encrypt        = sunxi_ss_cipher_encrypt,
>> +		.decrypt        = sunxi_ss_cipher_decrypt,
>> +	}
>> +}
>> +};
>> +
>> +static int sunxi_ss_probe(struct platform_device *pdev)
>> +{
>> +	struct resource *res;
>> +	u32 v;
>> +	int err;
>> +	unsigned long cr;
>> +	const unsigned long cr_ahb = 24 * 1000 * 1000;
>> +	const unsigned long cr_mod = 150 * 1000 * 1000;
>> +
>> +	if (!pdev->dev.of_node)
>> +		return -ENODEV;
>> +
>> +	ss = devm_kzalloc(&pdev->dev, sizeof(*ss), GFP_KERNEL);
>> +	if (ss == NULL)
>> +		return -ENOMEM;
>> +
>> +	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
>> +	ss->base = devm_ioremap_resource(&pdev->dev, res);
>> +	if (IS_ERR(ss->base)) {
>> +		dev_err(&pdev->dev, "Cannot request MMIO\n");
>> +		return PTR_ERR(ss->base);
>> +	}
>> +
>> +	ss->ssclk = devm_clk_get(&pdev->dev, "mod");
>> +	if (IS_ERR(ss->ssclk)) {
>> +		err = PTR_ERR(ss->ssclk);
>> +		dev_err(&pdev->dev, "Cannot get SS clock err=%d\n", err);
>> +		return err;
>> +	}
>> +	dev_dbg(&pdev->dev, "clock ss acquired\n");
>> +
>> +	ss->busclk = devm_clk_get(&pdev->dev, "ahb");
>> +	if (IS_ERR(ss->busclk)) {
>> +		err = PTR_ERR(ss->busclk);
>> +		dev_err(&pdev->dev, "Cannot get AHB SS clock err=%d\n", err);
>> +		return err;
>> +	}
>> +	dev_dbg(&pdev->dev, "clock ahb_ss acquired\n");
>> +
>> +	/* Enable both clocks */
>> +	err = clk_prepare_enable(ss->busclk);
>> +	if (err != 0) {
>> +		dev_err(&pdev->dev, "Cannot prepare_enable busclk\n");
>> +		return err;
>> +	}
>> +	err = clk_prepare_enable(ss->ssclk);
>> +	if (err != 0) {
>> +		dev_err(&pdev->dev, "Cannot prepare_enable ssclk\n");
>> +		clk_disable_unprepare(ss->busclk);
>> +		return err;
>> +	}
>> +
>> +	/*
>> +	 * Check that the clocks have the correct rates given in the datasheet
>> +	 * Try to set the clock to the maximum allowed
>> +	 */
>> +	err = clk_set_rate(ss->ssclk, cr_mod);
>> +	if (err != 0) {
>> +		dev_err(&pdev->dev, "Cannot set clock rate to ssclk\n");
>> +		clk_disable_unprepare(ss->ssclk);
>> +		clk_disable_unprepare(ss->busclk);
>> +		return err;
>> +	}
>> +
>> +	cr = clk_get_rate(ss->busclk);
>> +	if (cr >= cr_ahb)
>> +		dev_dbg(&pdev->dev, "Clock bus %lu (%lu MHz) (must be >= %lu)\n",
>> +				cr, cr / 1000000, cr_ahb);
>> +	else
>> +		dev_warn(&pdev->dev, "Clock bus %lu (%lu MHz) (must be >= %lu)\n",
>> +				cr, cr / 1000000, cr_ahb);
>> +
>> +	cr = clk_get_rate(ss->ssclk);
>> +	if (cr <= cr_mod)
>> +		if (cr < cr_mod)
>> +			dev_info(&pdev->dev, "Clock ss %lu (%lu MHz) (must be <= %lu)\n",
>> +					cr, cr / 1000000, cr_mod);
>> +		else
>> +			dev_dbg(&pdev->dev, "Clock ss %lu (%lu MHz) (must be <= %lu)\n",
>> +					cr, cr / 1000000, cr_mod);
>> +	else
>> +		dev_warn(&pdev->dev, "Clock ss is at %lu (%lu MHz) (must be <= %lu)\n",
>> +				cr, cr / 1000000, cr_mod);
> 
> If the set_rate fails, it will return an error. All this is useless.
> 

Ok

>> +
>> +	/*
>> +	 * Datasheet named it "Die Bonding ID"
>> +	 * I expect to be a sort of Security System Revision number.
>> +	 * Since the A80 seems to have an other version of SS
>> +	 * this info could be useful
>> +	 */
>> +	writel(SS_ENABLED, ss->base + SS_CTL);
>> +	v = readl(ss->base + SS_CTL);
>> +	v >>= 16;
>> +	v &= 0x07;
>> +	dev_info(&pdev->dev, "Die ID %d\n", v);
>> +	writel(0, ss->base + SS_CTL);
> 
> If the A80 has a different IP, it will most likely have a different
> compatible anyway. You can remove that code.
> 
>> +
>> +	ss->dev = &pdev->dev;
>> +
>> +	mutex_init(&ss->lock);
>> +	mutex_init(&ss->bufin_lock);
>> +	mutex_init(&ss->bufout_lock);
>> +
>> +	err = crypto_register_ahash(&sunxi_md5_alg);
>> +	if (err)
>> +		goto error_md5;
>> +	err = crypto_register_ahash(&sunxi_sha1_alg);
>> +	if (err)
>> +		goto error_sha1;
>> +	err = crypto_register_algs(sunxi_cipher_algs,
>> +			ARRAY_SIZE(sunxi_cipher_algs));
>> +	if (err)
>> +		goto error_ciphers;
>> +
>> +	return 0;
>> +error_ciphers:
>> +	crypto_unregister_ahash(&sunxi_sha1_alg);
>> +error_sha1:
>> +	crypto_unregister_ahash(&sunxi_md5_alg);
>> +error_md5:
>> +	clk_disable_unprepare(ss->ssclk);
>> +	clk_disable_unprepare(ss->busclk);
>> +	return err;
>> +}
>> +
>> +static int __exit sunxi_ss_remove(struct platform_device *pdev)
> 
> The remove callback should not be in the __exit section.
> 
> Here, that function will get removed if the driver is compiled as
> built-in the remove function will not even be in the kernel
> image. Which will result in an instant crash when the kernel will try
> to call this function (since it's not even there anymore).
> 

Ok

>> +{
>> +	if (!pdev->dev.of_node)
>> +		return 0;
>> +
>> +	crypto_unregister_ahash(&sunxi_md5_alg);
>> +	crypto_unregister_ahash(&sunxi_sha1_alg);
>> +	crypto_unregister_algs(sunxi_cipher_algs,
>> +			ARRAY_SIZE(sunxi_cipher_algs));
>> +
>> +	if (ss->buf_in != NULL)
>> +		kfree(ss->buf_in);
>> +	if (ss->buf_out != NULL)
>> +		kfree(ss->buf_out);
>> +
>> +	writel(0, ss->base + SS_CTL);
>> +	clk_disable_unprepare(ss->busclk);
>> +	clk_disable_unprepare(ss->ssclk);
>> +	return 0;
>> +}
>> +
>> +static const struct of_device_id a20ss_crypto_of_match_table[] = {
>> +	{ .compatible = "allwinner,sun7i-a20-crypto" },
>> +	{}
>> +};
>> +MODULE_DEVICE_TABLE(of, a20ss_crypto_of_match_table);
>> +
>> +static struct platform_driver sunxi_ss_driver = {
>> +	.probe          = sunxi_ss_probe,
>> +	.remove         = __exit_p(sunxi_ss_remove),
> 
> And this is why you're not seeing any warning.
> 
>> +	.driver         = {
>> +		.owner          = THIS_MODULE,
> 
> You can drop this, it's already set by module_platform_driver.
> 

Ok
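
Putting these two remarks together, a sketch of the registration without
__exit_p() and without .owner could look like this (body of remove elided):

	static int sunxi_ss_remove(struct platform_device *pdev)
	{
		/* unregister algorithms, free buffers, disable clocks */
		return 0;
	}

	static struct platform_driver sunxi_ss_driver = {
		.probe  = sunxi_ss_probe,
		.remove = sunxi_ss_remove,
		.driver = {
			.name           = "sunxi-ss",
			.of_match_table = a20ss_crypto_of_match_table,
		},
	};

	module_platform_driver(sunxi_ss_driver);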

>> +		.name           = "sunxi-ss",
>> +		.of_match_table	= a20ss_crypto_of_match_table,
>> +	},
>> +};
>> +
>> +module_platform_driver(sunxi_ss_driver);
>> +
>> +MODULE_DESCRIPTION("Allwinner Security System cryptographic accelerator");
>> +MODULE_LICENSE("GPL");
>> +MODULE_AUTHOR("Corentin LABBE <clabbe.montjoie@gmail.com>");
>> diff --git a/drivers/crypto/sunxi-ss/sunxi-ss-hash.c b/drivers/crypto/sunxi-ss/sunxi-ss-hash.c
>> new file mode 100644
>> index 0000000..ec8758f
>> --- /dev/null
>> +++ b/drivers/crypto/sunxi-ss/sunxi-ss-hash.c
>> @@ -0,0 +1,445 @@
>> +/*
>> + * sunxi-ss-hash.c - hardware cryptographic accelerator for Allwinner A20 SoC
>> + *
>> + * Copyright (C) 2013-2014 Corentin LABBE <clabbe.montjoie@gmail.com>
>> + *
>> + * This file add support for MD5 and SHA1.
>> + *
>> + * You could find the datasheet in Documentation/arm/sunxi/README
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + */
>> +#include "sunxi-ss.h"
>> +
>> +/* This is a totally arbitrary value */
>> +#define SS_TIMEOUT 100
>> +
>> +extern struct sunxi_ss_ctx *ss;
>> +
>> +int sunxi_hash_crainit(struct crypto_tfm *tfm)
>> +{
>> +	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
>> +			sizeof(struct sunxi_req_ctx));
>> +	return 0;
>> +}
>> +
>> +/* sunxi_hash_init: initialize request context */
>> +int sunxi_hash_init(struct ahash_request *areq)
>> +{
>> +	const char *hash_type;
>> +	struct sunxi_req_ctx *op = ahash_request_ctx(areq);
>> +
>> +	memset(op, 0, sizeof(struct sunxi_req_ctx));
>> +
>> +	hash_type = crypto_tfm_alg_name(areq->base.tfm);
>> +
>> +	if (strcmp(hash_type, "sha1") == 0)
>> +		op->mode = SS_OP_SHA1;
>> +	if (strcmp(hash_type, "md5") == 0)
>> +		op->mode = SS_OP_MD5;
>> +	if (op->mode == 0)
>> +		return -EINVAL;
>> +
>> +	return 0;
>> +}
>> +
>> +static u32 rx_cnt;
>> +
>> +inline void ss_writer(const u32 v)
>> +{
>> +	u32 spaces;
>> +
>> +	writel(v, ss->base + SS_RXFIFO);
>> +	rx_cnt--;
>> +	while (rx_cnt == 0) {
>> +		spaces = readl_relaxed(ss->base + SS_FCSR);
>> +		rx_cnt = SS_RXFIFO_SPACES(spaces);
>> +	}
>> +}
> 
> Errrr. What?!
> 
> Who sets that rx_cnt variable? And it persists between calls? It looks
> broken.
> 

rx_cnt was set to 0 at the start of the update/final functions.
I have found a cleaner way, so that code is removed.

>> +inline void ss_writer_relaxed(const u32 v)
>> +{
>> +	u32 spaces;
>> +
>> +	writel_relaxed(v, ss->base + SS_RXFIFO);
>> +	rx_cnt--;
>> +	while (rx_cnt == 0) {
>> +		spaces = readl_relaxed(ss->base + SS_FCSR);
>> +		rx_cnt = SS_RXFIFO_SPACES(spaces);
>> +	}
>> +}
>> +
>> +/*
>> + * sunxi_hash_update: update hash engine
>> + *
>> + * Could be used for both SHA1 and MD5
>> + * Write data in steps of 32 bits and put them in the SS.
>> + *
>> + * Since we cannot leave partial data and hash state in the engine,
>> + * we need to get the hash state at the end of this function.
>> + * After some work, I have found that we can get the hash state every 64 bytes
>> + *
>> + * So the first task is to get the number of bytes to write to the SS modulo 64
>> + * The extra bytes will go to two different destinations:
>> + * op->wait for full 32bits word
>> + * op->wb (waiting bytes) for partial 32 bits word
>> + * So we can have up to (64/4)-1 op->wait words and 0/1/2/3 bytes in wb
>> + *
>> + * So at the beginning of update()
>> + * if op->nwait * 4 + areq->nbytes < 64
>> + * => all data written to wait buffers and end=0
>> + * if not, write all nwait to the device and position end to complete to 64 bytes
>> + *
>> + * example 1:
>> + * update1 60 bytes => nwait=15
>> + * update2 60 bytes => need one more word to have 64 bytes
>> + * end=4
>> + * so write all data in op->wait and one word of SGs
>> + * write remaining data in op->wait
>> + * final state op->nwait=14
>> + */
>> +int sunxi_hash_update(struct ahash_request *areq)
>> +{
>> +	u32 v, ivmode = 0;
>> +	unsigned int i = 0;
>> +	/*
>> +	 * i is the total bytes read from SGs, to be compared to areq->nbytes
>> +	 * i is important because we cannot rely on SG length since the sum of
>> +	 * SG->length could be greater than areq->nbytes
>> +	 */
>> +
>> +	struct sunxi_req_ctx *op = ahash_request_ctx(areq);
>> +	struct scatterlist *in_sg;
>> +	unsigned int in_i = 0; /* advancement in the current SG */
>> +	u64 end;
>> +	/*
>> +	 * end is the position when we need to stop writing to the device,
>> +	 * to be compared to i
>> +	 */
>> +	int in_r;
>> +	void *src_addr;
>> +
>> +	dev_dbg(ss->dev, "%s %s bc=%llu len=%u mode=%x bw=%u ww=%u",
>> +			__func__, crypto_tfm_alg_name(areq->base.tfm),
>> +			op->byte_count, areq->nbytes, op->mode,
>> +			op->nbw, op->nwait);
>> +
>> +	if (areq->nbytes == 0)
>> +		return 0;
>> +
>> +	end = ((areq->nbytes + op->nwait * 4 + op->nbw) / 64) * 64
>> +		- op->nbw - op->nwait * 4;
>> +
>> +	if (end > areq->nbytes || areq->nbytes - end > 63) {
>> +		dev_err(ss->dev, "ERROR: Bound error %llu %u\n",
>> +				end, areq->nbytes);
>> +		return -EINVAL;
>> +	}
>> +
>> +	if (op->nwait > 0 && end > 0) {
>> +		/* a precedent update was done */
>> +		for (i = 0; i < op->nwait; i++) {
>> +			ss_writer(op->wait[i]);
>> +			op->byte_count += 4;
>> +		}
>> +		op->nwait = 0;
>> +	}
>> +
>> +	mutex_lock(&ss->lock);
>> +	/*
>> +	 * if some data have been processed before,
>> +	 * we need to restore the partial hash state
>> +	 */
>> +	if (op->byte_count > 0) {
>> +		ivmode = SS_IV_ARBITRARY;
>> +		for (i = 0; i < 5; i++)
>> +			writel(op->hash[i], ss->base + SS_IV0 + i * 4);
>> +	}
>> +	/* Enable the device */
>> +	writel(op->mode | SS_ENABLED | ivmode, ss->base + SS_CTL);
>> +
>> +	rx_cnt = 0;
>> +	i = 0;
>> +
>> +	in_sg = areq->src;
>> +	src_addr = kmap(sg_page(in_sg)) + in_sg->offset;
>> +	if (src_addr == NULL) {
>> +		mutex_unlock(&ss->lock);
>> +		dev_err(ss->dev, "ERROR: Cannot kmap source buffer\n");
>> +		return -EFAULT;
>> +	}
>> +	do {
>> +		/*
>> +		 * step 1, if some bytes remains from last SG,
>> +		 * try to complete them to 4 and send that word
>> +		 */
>> +		if (op->nbw > 0) {
>> +			while (op->nbw < 4 && i < areq->nbytes &&
>> +					in_i < in_sg->length) {
>> +				op->wb |= (*(u8 *)(src_addr + in_i))
>> +					<< (8 * op->nbw);
>> +				dev_dbg(ss->dev, "%s Complete w=%d wb=%x\n",
>> +						__func__, op->nbw, op->wb);
>> +				i++;
>> +				in_i++;
>> +				op->nbw++;
>> +			}
>> +			if (op->nbw == 4) {
>> +				if (i <= end) {
>> +					ss_writer(op->wb);
>> +					op->byte_count += 4;
>> +				} else {
>> +					op->wait[op->nwait] = op->wb;
>> +					op->nwait++;
>> +					dev_dbg(ss->dev, "%s Keep %u bytes after %llu\n",
>> +						__func__, op->nwait, end);
>> +				}
>> +				op->nbw = 0;
>> +				op->wb = 0;
>> +			}
>> +		}
>> +		/* step 2, main loop, read data 4bytes at a time */
>> +		while (i < areq->nbytes && in_i < in_sg->length) {
>> +			/* how many bytes we can read, (we need 4) */
>> +			in_r = min(in_sg->length - in_i, areq->nbytes - i);
>> +			if (in_r < 4) {
>> +				/* Not enough data to write to the device */
>> +				op->wb = 0;
>> +				while (in_r > 0) {
>> +					op->wb |= (*(u8 *)(src_addr + in_i))
>> +						<< (8 * op->nbw);
>> +					dev_dbg(ss->dev, "%s ending bw=%d wb=%x\n",
>> +						__func__, op->nbw, op->wb);
>> +					in_r--;
>> +					i++;
>> +					in_i++;
>> +					op->nbw++;
>> +				}
>> +				goto nextsg;
>> +			}
>> +			v = *(u32 *)(src_addr + in_i);
>> +			if (i < end) {
>> +				/* last write must be done without relaxed */
>> +				if (i + 4 >= end)
>> +					ss_writer(v);
>> +				else
>> +					ss_writer_relaxed(v);
>> +				i += 4;
>> +				op->byte_count += 4;
>> +				in_i += 4;
>> +			} else {
>> +				op->wait[op->nwait] = v;
>> +				i += 4;
>> +				in_i += 4;
>> +				op->nwait++;
>> +				dev_dbg(ss->dev, "%s Keep word ww=%u after %llu\n",
>> +						__func__, op->nwait, end);
>> +				if (op->nwait > 15) {
>> +					dev_err(ss->dev, "FATAL: Cannot enqueue more, bug?\n");
>> +					writel(0, ss->base + SS_CTL);
>> +					mutex_unlock(&ss->lock);
>> +					return -EIO;
>> +				}
>> +			}
>> +		}
>> +nextsg:
>> +		/* Nothing more to read in this SG */
>> +		if (in_i == in_sg->length) {
>> +			kunmap(sg_page(in_sg));
>> +			do {
>> +				in_sg = sg_next(in_sg);
>> +			} while (in_sg != NULL && in_sg->length == 0);
>> +			in_i = 0;
>> +			if (in_sg != NULL) {
>> +				src_addr = kmap(sg_page(in_sg)) + in_sg->offset;
>> +				if (src_addr == NULL) {
>> +					mutex_unlock(&ss->lock);
>> +					dev_err(ss->dev, "ERROR: Cannot kmap source buffer\n");
>> +					return -EFAULT;
>> +				}
>> +			}
>> +		}
>> +	} while (in_sg != NULL && i < areq->nbytes);
>> +
>> +	/* ask the device to finish the hashing */
>> +	writel(op->mode | SS_ENABLED | SS_DATA_END, ss->base + SS_CTL);
>> +	i = 0;
>> +	do {
>> +		v = readl(ss->base + SS_CTL);
>> +		i++;
>> +	} while (i < SS_TIMEOUT && (v & SS_DATA_END) > 0);
>> +	if (i >= SS_TIMEOUT) {
>> +		dev_err(ss->dev, "ERROR: %s hash end timeout after %d loop, CTL=%x\n",
>> +				__func__, i, v);
>> +		writel(0, ss->base + SS_CTL);
>> +		mutex_unlock(&ss->lock);
>> +		return -EIO;
>> +	}
>> +
>> +	/* get the partial hash */
>> +	if (op->mode == SS_OP_SHA1) {
>> +		for (i = 0; i < 5; i++)
>> +			op->hash[i] = readl(ss->base + SS_MD0 + i * 4);
>> +	} else {
>> +		for (i = 0; i < 4; i++)
>> +			op->hash[i] = readl(ss->base + SS_MD0 + i * 4);
>> +	}
>> +
>> +	writel(0, ss->base + SS_CTL);
>> +	mutex_unlock(&ss->lock);
>> +	return 0;
>> +}
>> +
>> +/*
>> + * sunxi_hash_final: finalize hashing operation
>> + *
>> + * If we have some remaining bytes, we write them.
>> + * Then ask the SS for finalizing the hashing operation
>> + */
>> +int sunxi_hash_final(struct ahash_request *areq)
>> +{
>> +	u32 v, ivmode = 0;
>> +	unsigned int i;
>> +	int zeros;
>> +	unsigned int index, padlen;
>> +	__be64 bits;
>> +	struct sunxi_req_ctx *op = ahash_request_ctx(areq);
>> +
>> +	dev_dbg(ss->dev, "%s byte=%llu len=%u mode=%x bw=%u %x h=%x ww=%u",
>> +			__func__, op->byte_count, areq->nbytes, op->mode,
>> +			op->nbw, op->wb, op->hash[0], op->nwait);
>> +
>> +	mutex_lock(&ss->lock);
>> +	rx_cnt = 0;
>> +
>> +	/*
>> +	 * if we have already written something,
>> +	 * restore the partial hash state
>> +	 */
>> +	if (op->byte_count > 0) {
>> +		ivmode = SS_IV_ARBITRARY;
>> +		for (i = 0; i < 5; i++)
>> +			writel(op->hash[i], ss->base + SS_IV0 + i * 4);
>> +	}
>> +	writel(op->mode | SS_ENABLED | ivmode, ss->base + SS_CTL);
>> +
>> +	/* write the remaining words of the wait buffer */
>> +	if (op->nwait > 0) {
>> +		for (i = 0; i < op->nwait; i++) {
>> +			v = op->wait[i];
>> +			ss_writer(v);
>> +			op->byte_count += 4;
>> +			dev_dbg(ss->dev, "%s write %llu i=%u %x\n",
>> +					__func__, op->byte_count, i, v);
>> +		}
>> +		op->nwait = 0;
>> +	}
>> +
>> +	/* write the remaining bytes of the nbw buffer */
>> +	if (op->nbw > 0) {
>> +		op->wb |= ((1 << 7) << (op->nbw * 8));
>> +		ss_writer(op->wb);
>> +	} else {
>> +		ss_writer((1 << 7));
>> +	}
>> +
>> +	/*
>> +	 * number of zero words to pad to obtain 64 bytes minus 8 (size) minus 4 (final 1)
>> +	 * I take the operations from other md5/sha1 implementations
>> +	 */
>> +
>> +	/* we have already sent 4 more bytes, of which nbw are data */
>> +	if (op->mode == SS_OP_MD5) {
>> +		index = (op->byte_count + 4) & 0x3f;
>> +		op->byte_count += op->nbw;
>> +		if (index > 56)
>> +			zeros = (120 - index) / 4;
>> +		else
>> +			zeros = (56 - index) / 4;
>> +	} else {
>> +		op->byte_count += op->nbw;
>> +		index = op->byte_count & 0x3f;
>> +		padlen = (index < 56) ? (56 - index) : ((64+56) - index);
>> +		zeros = (padlen - 1) / 4;
>> +	}
>> +	for (i = 0; i < zeros; i++)
>> +		ss_writer(0);
>> +
>> +	/* write the length of data */
>> +	if (op->mode == SS_OP_SHA1) {
>> +		bits = cpu_to_be64(op->byte_count << 3);
>> +		ss_writer(bits & 0xffffffff);
>> +		ss_writer((bits >> 32) & 0xffffffff);
>> +	} else {
>> +		ss_writer((op->byte_count << 3) & 0xffffffff);
>> +		ss_writer((op->byte_count >> 29) & 0xffffffff);
>> +	}
>> +
>> +	/* Tell the SS to stop the hashing */
>> +	writel(op->mode | SS_ENABLED | SS_DATA_END, ss->base + SS_CTL);
>> +
>> +	/*
>> +	 * Wait for SS to finish the hash.
>> +	 * The timeout can happen only in case of bad overclocking
>> +	 * or a driver bug.
>> +	 */
>> +	i = 0;
>> +	do {
>> +		v = readl(ss->base + SS_CTL);
>> +		i++;
>> +	} while (i < SS_TIMEOUT && (v & SS_DATA_END) > 0);
>> +	if (i >= SS_TIMEOUT) {
>> +		dev_err(ss->dev, "ERROR: hash end timeout %d>%d ctl=%x len=%u\n",
>> +				i, SS_TIMEOUT, v, areq->nbytes);
>> +		writel(0, ss->base + SS_CTL);
>> +		mutex_unlock(&ss->lock);
>> +		return -EIO;
>> +	}
>> +
>> +	/* Get the hash from the device */
>> +	if (op->mode == SS_OP_SHA1) {
>> +		for (i = 0; i < 5; i++) {
>> +			v = cpu_to_be32(readl(ss->base + SS_MD0 + i * 4));
>> +			memcpy(areq->result + i * 4, &v, 4);
>> +		}
>> +	} else {
>> +		for (i = 0; i < 4; i++) {
>> +			v = readl(ss->base + SS_MD0 + i * 4);
>> +			memcpy(areq->result + i * 4, &v, 4);
>> +		}
>> +	}
>> +	writel(0, ss->base + SS_CTL);
>> +	mutex_unlock(&ss->lock);
>> +	return 0;
>> +}
>> +
>> +/* sunxi_hash_finup: finalize hashing operation after an update */
>> +int sunxi_hash_finup(struct ahash_request *areq)
>> +{
>> +	int err;
>> +
>> +	err = sunxi_hash_update(areq);
>> +	if (err != 0)
>> +		return err;
>> +
>> +	return sunxi_hash_final(areq);
>> +}
>> +
>> +/* combo of init/update/final functions */
>> +int sunxi_hash_digest(struct ahash_request *areq)
>> +{
>> +	int err;
>> +
>> +	err = sunxi_hash_init(areq);
>> +	if (err != 0)
>> +		return err;
>> +
>> +	err = sunxi_hash_update(areq);
>> +	if (err != 0)
>> +		return err;
>> +
>> +	return sunxi_hash_final(areq);
>> +}
>> diff --git a/drivers/crypto/sunxi-ss/sunxi-ss.h b/drivers/crypto/sunxi-ss/sunxi-ss.h
>> new file mode 100644
>> index 0000000..331e75b
>> --- /dev/null
>> +++ b/drivers/crypto/sunxi-ss/sunxi-ss.h
>> @@ -0,0 +1,193 @@
>> +/*
>> + * sunxi-ss.c - hardware cryptographic accelerator for Allwinner A20 SoC
>> + *
>> + * Copyright (C) 2013-2014 Corentin LABBE <clabbe.montjoie@gmail.com>
>> + *
>> + * Support AES cipher with 128,192,256 bits keysize.
>> + * Support MD5 and SHA1 hash algorithms.
>> + * Support DES and 3DES
>> + *
>> + * You could find the datasheet in Documentation/arm/sunxi/README
>> + *
>> + * Licensed under the GPL-2.
>> + */
>> +
>> +#include <linux/clk.h>
>> +#include <linux/crypto.h>
>> +#include <linux/io.h>
>> +#include <linux/module.h>
>> +#include <linux/of.h>
>> +#include <linux/platform_device.h>
>> +#include <crypto/scatterwalk.h>
>> +#include <linux/scatterlist.h>
>> +#include <linux/interrupt.h>
>> +#include <linux/delay.h>
>> +#include <crypto/md5.h>
>> +#include <crypto/sha.h>
>> +#include <crypto/hash.h>
>> +#include <crypto/internal/hash.h>
>> +#include <crypto/aes.h>
>> +#include <crypto/des.h>
>> +#include <crypto/internal/rng.h>
>> +
>> +#define SS_CTL            0x00
>> +#define SS_KEY0           0x04
>> +#define SS_KEY1           0x08
>> +#define SS_KEY2           0x0C
>> +#define SS_KEY3           0x10
>> +#define SS_KEY4           0x14
>> +#define SS_KEY5           0x18
>> +#define SS_KEY6           0x1C
>> +#define SS_KEY7           0x20
>> +
>> +#define SS_IV0            0x24
>> +#define SS_IV1            0x28
>> +#define SS_IV2            0x2C
>> +#define SS_IV3            0x30
>> +
>> +#define SS_CNT0           0x34
>> +#define SS_CNT1           0x38
>> +#define SS_CNT2           0x3C
>> +#define SS_CNT3           0x40
>> +
>> +#define SS_FCSR           0x44
>> +#define SS_ICSR           0x48
>> +
>> +#define SS_MD0            0x4C
>> +#define SS_MD1            0x50
>> +#define SS_MD2            0x54
>> +#define SS_MD3            0x58
>> +#define SS_MD4            0x5C
>> +
>> +#define SS_RXFIFO         0x200
>> +#define SS_TXFIFO         0x204
>> +
>> +/* SS_CTL configuration values */
>> +
>> +/* PRNG generator mode - bit 15 */
>> +#define SS_PRNG_ONESHOT		(0 << 15)
>> +#define SS_PRNG_CONTINUE	(1 << 15)
>> +
>> +/* IV mode for hash */
>> +#define SS_IV_ARBITRARY		(1 << 14)
>> +
>> +/* SS operation mode - bits 12-13 */
>> +#define SS_ECB			(0 << 12)
>> +#define SS_CBC			(1 << 12)
>> +#define SS_CNT			(2 << 12)
>> +
>> +/* Counter width for CNT mode - bits 10-11 */
>> +#define SS_CNT_16BITS		(0 << 10)
>> +#define SS_CNT_32BITS		(1 << 10)
>> +#define SS_CNT_64BITS		(2 << 10)
>> +
>> +/* Key size for AES - bits 8-9 */
>> +#define SS_AES_128BITS		(0 << 8)
>> +#define SS_AES_192BITS		(1 << 8)
>> +#define SS_AES_256BITS		(2 << 8)
>> +
>> +/* Operation direction - bit 7 */
>> +#define SS_ENCRYPTION		(0 << 7)
>> +#define SS_DECRYPTION		(1 << 7)
>> +
>> +/* SS Method - bits 4-6 */
>> +#define SS_OP_AES		(0 << 4)
>> +#define SS_OP_DES		(1 << 4)
>> +#define SS_OP_3DES		(2 << 4)
>> +#define SS_OP_SHA1		(3 << 4)
>> +#define SS_OP_MD5		(4 << 4)
>> +#define SS_OP_PRNG		(5 << 4)
>> +
>> +/* Data end bit - bit 2 */
>> +#define SS_DATA_END		(1 << 2)
>> +
>> +/* PRNG start bit - bit 1 */
>> +#define SS_PRNG_START		(1 << 1)
>> +
>> +/* SS Enable bit - bit 0 */
>> +#define SS_DISABLED		(0 << 0)
>> +#define SS_ENABLED		(1 << 0)
>> +
>> +/* SS_FCSR configuration values */
>> +/* RX FIFO status - bit 30 */
>> +#define SS_RXFIFO_FREE		(1 << 30)
>> +
>> +/* RX FIFO empty spaces - bits 24-29 */
>> +#define SS_RXFIFO_SPACES(val)	(((val) >> 24) & 0x3f)
>> +
>> +/* TX FIFO status - bit 22 */
>> +#define SS_TXFIFO_AVAILABLE	(1 << 22)
>> +
>> +/* TX FIFO available spaces - bits 16-21 */
>> +#define SS_TXFIFO_SPACES(val)	(((val) >> 16) & 0x3f)
>> +
>> +#define SS_RXFIFO_EMP_INT_PENDING	(1 << 10)
>> +#define SS_TXFIFO_AVA_INT_PENDING	(1 << 8)
>> +#define SS_RXFIFO_EMP_INT_ENABLE	(1 << 2)
>> +#define SS_TXFIFO_AVA_INT_ENABLE	(1 << 0)
>> +
>> +/* SS_ICSR configuration values */
>> +#define SS_ICS_DRQ_ENABLE		(1 << 4)
>> +
>> +struct sunxi_ss_ctx {
>> +	void __iomem *base;
>> +	int irq;
>> +	struct clk *busclk;
>> +	struct clk *ssclk;
>> +	struct device *dev;
>> +	struct resource *res;
>> +	void *buf_in; /* pointer to data to be uploaded to the device */
>> +	size_t buf_in_size; /* size of buf_in */
>> +	void *buf_out;
>> +	size_t buf_out_size;
>> +	struct mutex lock; /* control the use of the device */
>> +	struct mutex bufout_lock; /* control the use of buf_out */
>> +	struct mutex bufin_lock; /* control the use of buf_in */
>> +};
>> +
>> +struct sunxi_tfm_ctx {
>> +	u32 key[AES_MAX_KEY_SIZE / 4];/* divided by sizeof(u32) */
>> +	u32 keylen;
>> +	u32 keymode;
>> +};
>> +
>> +struct sunxi_req_ctx {
>> +	u32 mode;
>> +	u64 byte_count; /* number of bytes "uploaded" to the device */
>> +	u32 wb; /* a partial word waiting to be completed and
>> +			uploaded to the device */
>> +	/* number of bytes to be uploaded in the wb word */
>> +	unsigned int nbw;
>> +	u32 hash[5];
>> +	u32 wait[64];
>> +	unsigned int nwait;
>> +};
>> +
>> +#define SS_SEED_LEN (192/8)
>> +#define SS_DATA_LEN (160/8)
>> +
>> +struct prng_context {
>> +	u32 seed[SS_SEED_LEN/4];
>> +	unsigned int slen;
>> +};
>> +
>> +int sunxi_hash_crainit(struct crypto_tfm *tfm);
>> +int sunxi_hash_init(struct ahash_request *areq);
>> +int sunxi_hash_update(struct ahash_request *areq);
>> +int sunxi_hash_final(struct ahash_request *areq);
>> +int sunxi_hash_finup(struct ahash_request *areq);
>> +int sunxi_hash_digest(struct ahash_request *areq);
>> +int sunxi_hash_export(struct ahash_request *areq, void *out);
>> +int sunxi_hash_import(struct ahash_request *areq, const void *in);
>> +
>> +int sunxi_ss_aes_poll(struct ablkcipher_request *areq, u32 mode);
>> +int sunxi_ss_des_poll(struct ablkcipher_request *areq, u32 mode);
>> +int sunxi_ss_cipher_init(struct crypto_tfm *tfm);
>> +int sunxi_ss_cipher_encrypt(struct ablkcipher_request *areq);
>> +int sunxi_ss_cipher_decrypt(struct ablkcipher_request *areq);
>> +int sunxi_ss_aes_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
>> +		unsigned int keylen);
>> +int sunxi_ss_des_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
>> +		unsigned int keylen);
>> +int sunxi_ss_des3_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
>> +		unsigned int keylen);
>> -- 
>> 2.0.4
>>
> 
> Thanks,
> Maxime
> 

Thanks for your review and time.

Corentin
Maxime Ripard Oct. 30, 2014, 5:19 p.m. UTC | #8
On Fri, Oct 24, 2014 at 08:52:26PM +0200, Corentin LABBE wrote:
> On 10/21/14 21:11, Maxime Ripard wrote:
> > Hi Corentin,
> > 
> > Thanks for resending it.
> > 
> > On Sun, Oct 19, 2014 at 04:16:22PM +0200, LABBE Corentin wrote:
> >> Add support for the Security System included in Allwinner SoC A20.
> >> The Security System is a hardware cryptographic accelerator that support AES/MD5/SHA1/DES/3DES/PRNG algorithms.
> >>
> >> Signed-off-by: LABBE Corentin <clabbe.montjoie@gmail.com>
> >> ---
> >>  drivers/crypto/Kconfig                    |  17 ++
> >> +static int sunxi_ss_aes_poll_atomic(struct ablkcipher_request *areq)
> >> +{
> >> +	u32 spaces;
> >> +	struct scatterlist *in_sg = areq->src;
> >> +	struct scatterlist *out_sg = areq->dst;
> >> +	void *src_addr;
> >> +	void *dst_addr;
> >> +	unsigned int ileft = areq->nbytes;
> >> +	unsigned int oleft = areq->nbytes;
> >> +	unsigned int todo;
> >> +	u32 *src32;
> >> +	u32 *dst32;
> >> +	u32 rx_cnt = 32;
> >> +	u32 tx_cnt = 0;
> >> +	int i;
> >> +
> >> +	src_addr = kmap_atomic(sg_page(in_sg)) + in_sg->offset;
> > 
> > Where does this scatter_list come from? Can it even be allocated
> > in highmem?
> > 
> 
> With AF_ALG and cryptodev, the SG is in highmem. Verified with some
> PageHighMem().

Then fix AF_ALG and cryptodev, because all of the other drivers might
be affected.

Thanks!
Maxime
Herbert Xu Oct. 31, 2014, 7:20 a.m. UTC | #9
On Thu, Oct 30, 2014 at 06:19:33PM +0100, Maxime Ripard wrote:
>
> > With AF_ALG and cryptodev, the SG is in highmem. Verified with some
> > PageHighMem().
> 
> Then fix AF_ALG and cryptodev, because all of the other drivers might
> be affected.

No it's the driver that needs to be fixed.  Of course if there
are enough drivers it may be worthwhile adding either copying or
a software fallback for highmem requests.

Cheers,
Maxime Ripard Oct. 31, 2014, 8:13 a.m. UTC | #10
On Fri, Oct 31, 2014 at 03:20:30PM +0800, Herbert Xu wrote:
> On Thu, Oct 30, 2014 at 06:19:33PM +0100, Maxime Ripard wrote:
> >
> > > With AF_ALG and cryptodev, the SG is in highmem. Verified with some
> > > PageHighMem().
> > 
> > Then fix AF_ALG and cryptodev, because all of the other drivers might
> > be affected.
> 
> No it's the driver that needs to be fixed.  Of course if there
> are enough drivers it may be worthwhile adding either copying or
> a software fallback for highmem requests.

I don't understand here. Why would other drivers *not* be affected?

If the scatter list passed by AF_ALG can be in highmem, I guess it's
the case for every driver out there. Almost every kernel code I've
seen so far makes the assumption that the memory it has is mapped and
accessible.

Somehow, it's the driver's fault now, and not the part of kernel that
actually does the allocation?
Herbert Xu Oct. 31, 2014, 8:18 a.m. UTC | #11
On Fri, Oct 31, 2014 at 09:13:23AM +0100, Maxime Ripard wrote:
>
> I don't understand here. Why would other drivers *not* be affected?
> 
> If the scatter list passed by AF_ALG can be in highmem, I guess it's
> the case for every driver out there. Almost every kernel code I've
> seen so far makes the assumption that the memory it has is mapped and
> accessible.
> 
> Somehow, it's the driver's fault now, and not the part of kernel that
> actually does the allocation?

If you are implementing a crypto driver that is meant to handle
requests from the crypto API then yes you need to handle highmem.

As I said if enough drivers are unable to address highmem and
require copying/software fallbacks then we could provide this
through the API and the driver would then only need to declare
its lack of highmem support or use a helper.

Cheers,
Maxime Ripard Oct. 31, 2014, 9:57 a.m. UTC | #12
On Fri, Oct 31, 2014 at 04:18:03PM +0800, Herbert Xu wrote:
> On Fri, Oct 31, 2014 at 09:13:23AM +0100, Maxime Ripard wrote:
> >
> > I don't understand here. Why would other drivers *not* be affected?
> > 
> > If the scatter list passed by AF_ALG can be in highmem, I guess it's
> > the case for every driver out there. Almost every kernel code I've
> > seen so far makes the assumption that the memory it has is mapped and
> > accessible.
> > 
> > Somehow, it's the driver's fault now, and not the part of kernel that
> > actually does the allocation?
> 
> If you are implementing a crypto driver that is meant to handle
> requests from the crypto API then yes you need to handle highmem.

Is that documented somewhere?

> As I said if enough drivers are unable to address highmem and
> require copying/software fallbacks then we could provide this
> through the API and the driver would then only need to declare
> its lack of highmem support or use a helper.

On a 3.18-rc2 kernel:

$ git grep kmap -- crypto/
crypto/ahash.c:                         walk->data = kmap(walk->pg);
crypto/ahash.c:                         walk->data = kmap_atomic(walk->pg);
crypto/async_tx/async_memcpy.c:         dest_buf = kmap_atomic(dest) + dest_offset;
crypto/async_tx/async_memcpy.c:         src_buf = kmap_atomic(src) + src_offset;
crypto/scatterwalk.c:                   return kmap_atomic(scatterwalk_page(walk)) +
crypto/shash.c:                         data = kmap_atomic(sg_page(sg));
crypto/shash.c:                         data = kmap_atomic(sg_page(sg));

None of the drivers are.

Maxime
Herbert Xu Oct. 31, 2014, 10:05 a.m. UTC | #13
On Fri, Oct 31, 2014 at 10:57:06AM +0100, Maxime Ripard wrote:
>
> On a 3.18-rc2 kernel:
> 
> $ git grep kmap -- crypto/
> crypto/ahash.c:                         walk->data = kmap(walk->pg);
> crypto/ahash.c:                         walk->data = kmap_atomic(walk->pg);
> crypto/async_tx/async_memcpy.c:         dest_buf = kmap_atomic(dest) + dest_offset;
> crypto/async_tx/async_memcpy.c:         src_buf = kmap_atomic(src) + src_offset;
> crypto/scatterwalk.c:                   return kmap_atomic(scatterwalk_page(walk)) +
> crypto/shash.c:                         data = kmap_atomic(sg_page(sg));
> crypto/shash.c:                         data = kmap_atomic(sg_page(sg));
> 
> None of the drivers are.

What do you mean? It's precisely because the page can be in highmem
that we are mapping it.  If it's not in highmem it'll be a noop.

Admittedly I haven't tested highmem since moving over to x86-64
some years ago, but it definitely used to work on x86-32.

Cheers,
Maxime Ripard Nov. 3, 2014, 9:34 a.m. UTC | #14
On Fri, Oct 31, 2014 at 06:05:22PM +0800, Herbert Xu wrote:
> On Fri, Oct 31, 2014 at 10:57:06AM +0100, Maxime Ripard wrote:
> >
> > On a 3.18-rc2 kernel:
> > 
> > $ git grep kmap -- crypto/
> > crypto/ahash.c:                         walk->data = kmap(walk->pg);
> > crypto/ahash.c:                         walk->data = kmap_atomic(walk->pg);
> > crypto/async_tx/async_memcpy.c:         dest_buf = kmap_atomic(dest) + dest_offset;
> > crypto/async_tx/async_memcpy.c:         src_buf = kmap_atomic(src) + src_offset;
> > crypto/scatterwalk.c:                   return kmap_atomic(scatterwalk_page(walk)) +
> > crypto/shash.c:                         data = kmap_atomic(sg_page(sg));
> > crypto/shash.c:                         data = kmap_atomic(sg_page(sg));
> > 
> > None of the drivers are.
> 
> What do you mean? It's precisely because the page can be in highmem
> that we are mapping it.  If it's not in highmem it'll be a noop.

What I mean is that since you are saying that drivers should do the
kmap themselves, then *all* of the drivers are broken if they are not
using it. And all of them are missing this kmap.

Maxime
Herbert Xu Nov. 3, 2014, 10:35 a.m. UTC | #15
On Mon, Nov 03, 2014 at 10:34:46AM +0100, Maxime Ripard wrote:
> What I mean is that since you are saying that drivers should do the
> kmap themselves, then *all* of the drivers are broken if they are not
> using it. And all of them are missing this kmap.

kmap is used by the software implementations to map the input/output
into virtual address space.  Drivers typically use DMA and operate
on physical addresses so they don't need kmap.
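
As a rough sketch of that DMA-based pattern (assuming 'dev' is the device and
the engine is then fed the DMA addresses; not how this particular driver
works today):

	int nents = dma_map_sg(dev, areq->src, sg_nents(areq->src),
			       DMA_TO_DEVICE);
	if (!nents)
		return -EINVAL;

	/* program the engine with sg_dma_address()/sg_dma_len() of each
	 * entry; no kmap is needed since only physical addresses are used */

	dma_unmap_sg(dev, areq->src, sg_nents(areq->src), DMA_TO_DEVICE);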

Cheers,
Herbert Xu Nov. 6, 2014, 2:13 p.m. UTC | #16
On Sun, Oct 19, 2014 at 04:16:22PM +0200, LABBE Corentin wrote:
> Add support for the Security System included in Allwinner SoC A20.
> The Security System is a hardware cryptographic accelerator that support AES/MD5/SHA1/DES/3DES/PRNG algorithms.
> 
> Signed-off-by: LABBE Corentin <clabbe.montjoie@gmail.com>

OK this is much better.  However it seems that export/import
is still missing?
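
For illustration, a minimal sketch of export/import for this driver would
just serialize the request context, with .statesize set to
sizeof(struct sunxi_req_ctx) in the halg definition (assuming the context is
self-contained, which is an assumption, not a review conclusion):

	int sunxi_hash_export(struct ahash_request *areq, void *out)
	{
		struct sunxi_req_ctx *op = ahash_request_ctx(areq);

		memcpy(out, op, sizeof(*op));
		return 0;
	}

	int sunxi_hash_import(struct ahash_request *areq, const void *in)
	{
		struct sunxi_req_ctx *op = ahash_request_ctx(areq);

		memcpy(op, in, sizeof(*op));
		return 0;
	}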

> +	src_addr = kmap_atomic(sg_page(in_sg)) + in_sg->offset;
> +	if (src_addr == NULL) {
> +		dev_err(ss->dev, "kmap_atomic error for src SG\n");
> +		writel(0, ss->base + SS_CTL);
> +		mutex_unlock(&ss->lock);

I overlooked this the last time around.  You cannot use mutexes
here as you can be called from softirq context so you need spin
locks.
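
A hedged sketch of the locking change being asked for (the field name is made
up; every path that touches the engine would take the same lock):

	/* in struct sunxi_ss_ctx, replacing 'struct mutex lock' */
	spinlock_t slock;

	/* around each hardware access */
	spin_lock_bh(&ss->slock);
	/* ... program the engine and walk the FIFOs ... */
	spin_unlock_bh(&ss->slock);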

Cheers,
Maxime Ripard Nov. 6, 2014, 2:26 p.m. UTC | #17
On Mon, Nov 03, 2014 at 06:35:28PM +0800, Herbert Xu wrote:
> On Mon, Nov 03, 2014 at 10:34:46AM +0100, Maxime Ripard wrote:
> > What I mean is that since you are saying that drivers should do the
> > kmap themselves, then *all* of the drivers are broken if they are not
> > using it. And all of them are missing this kmap.
> 
> kmap is used by the software implementations to map the input/output
> into virtual address space.  Drivers typically use DMA and operate
> on physical addresses so they don't need kmap.

Yes, plus all memory allocated with GFP_KERNEL is in lowmem.

But you still haven't explained why the driver, which never handles the
user-space buffer itself, should be worried that the data the
framework has given it is actually mapped.

Maxime
Herbert Xu Nov. 6, 2014, 2:32 p.m. UTC | #18
On Thu, Nov 06, 2014 at 03:26:33PM +0100, Maxime Ripard wrote:
> 
> But you still haven't explained why the driver, which never handles
> the user space buffer itself, should be worried that the data the
> framework has given it is actually mapped.

Encryption is used by IPsec and SKBs can be allocated in highmem.
algif also exposes all ciphers to user-space memory which can also
be in highmem.

Cheers,
Maxime Ripard Nov. 16, 2014, 5:13 p.m. UTC | #19
On Thu, Nov 06, 2014 at 10:32:18PM +0800, Herbert Xu wrote:
> On Thu, Nov 06, 2014 at 03:26:33PM +0100, Maxime Ripard wrote:
> > 
> > But you still haven't explained why the driver, which never handles
> > the user space buffer itself, should be worried that the data the
> > framework has given it is actually mapped.
> 
> Encryption is used by IPsec and SKBs can be allocated in highmem.
> algif also exposes all ciphers to user-space memory which can also
> be in highmem.

Ok. We keep going in circles here.

I know that algif handles userspace memory that can be in
highmem. What I don't get is why, just as a *driver* doesn't have
to call copy_from_user, it would need to call kmap...

That's something that should be in the framework itself, not the
driver. And the argument that most drivers use DMA seems like a broken
assumption.

But hey, you're the one that will maintain this mess, so I guess you
have the final word.

Maxime
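On the helper question: the crypto layer does already ship helpers that hide the mapping details from drivers. Below is a minimal, illustrative sketch (not part of the patch; buf_in and buf_out are assumed to be preallocated lowmem buffers) using scatterwalk_map_and_copy(), which kmaps each page internally, so the driver never has to care whether a page is in highmem.

#include <crypto/scatterwalk.h>
#include <linux/crypto.h>

static int ss_process_linear(struct ablkcipher_request *areq,
			     void *buf_in, void *buf_out)
{
	/* gather: scatterlist -> linear buffer (last argument 0 = read from the SG) */
	scatterwalk_map_and_copy(buf_in, areq->src, 0, areq->nbytes, 0);

	/* ... run the engine over buf_in, producing areq->nbytes bytes in buf_out ... */

	/* scatter: linear buffer -> scatterlist (last argument 1 = write to the SG) */
	scatterwalk_map_and_copy(buf_out, areq->dst, 0, areq->nbytes, 1);

	return 0;
}

This is essentially what sunxi_ss_des_poll() in the patch does by hand with sg_copy_to_buffer()/sg_copy_from_buffer(), which are also highmem-aware.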
diff mbox

Patch

diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 2fb0fdf..9ba9759 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -436,4 +436,21 @@  config CRYPTO_DEV_QCE
 	  hardware. To compile this driver as a module, choose M here. The
 	  module will be called qcrypto.
 
+config CRYPTO_DEV_SUNXI_SS
+	tristate "Support for Allwinner Security System cryptographic accelerator"
+	depends on ARCH_SUNXI
+	select CRYPTO_MD5
+	select CRYPTO_SHA1
+	select CRYPTO_AES
+	select CRYPTO_DES
+	select CRYPTO_BLKCIPHER
+	help
+	  Some Allwinner SoCs have a crypto accelerator named
+	  Security System. Select this if you want to use it.
+	  The Security System handles AES/DES/3DES ciphers in CBC mode
+	  and the SHA1 and MD5 hash algorithms.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sunxi-ss.
+
 endif # CRYPTO_HW
diff --git a/drivers/crypto/Makefile b/drivers/crypto/Makefile
index 3924f93..856545c 100644
--- a/drivers/crypto/Makefile
+++ b/drivers/crypto/Makefile
@@ -25,3 +25,4 @@  obj-$(CONFIG_CRYPTO_DEV_TALITOS) += talitos.o
 obj-$(CONFIG_CRYPTO_DEV_UX500) += ux500/
 obj-$(CONFIG_CRYPTO_DEV_QAT) += qat/
 obj-$(CONFIG_CRYPTO_DEV_QCE) += qce/
+obj-$(CONFIG_CRYPTO_DEV_SUNXI_SS) += sunxi-ss/
diff --git a/drivers/crypto/sunxi-ss/Makefile b/drivers/crypto/sunxi-ss/Makefile
new file mode 100644
index 0000000..8bb287d
--- /dev/null
+++ b/drivers/crypto/sunxi-ss/Makefile
@@ -0,0 +1,2 @@ 
+obj-$(CONFIG_CRYPTO_DEV_SUNXI_SS) += sunxi-ss.o
+sunxi-ss-y += sunxi-ss-core.o sunxi-ss-hash.o sunxi-ss-cipher.o
diff --git a/drivers/crypto/sunxi-ss/sunxi-ss-cipher.c b/drivers/crypto/sunxi-ss/sunxi-ss-cipher.c
new file mode 100644
index 0000000..8d0416e
--- /dev/null
+++ b/drivers/crypto/sunxi-ss/sunxi-ss-cipher.c
@@ -0,0 +1,489 @@ 
+/*
+ * sunxi-ss-cipher.c - hardware cryptographic accelerator for Allwinner A20 SoC
+ *
+ * Copyright (C) 2013-2014 Corentin LABBE <clabbe.montjoie@gmail.com>
+ *
+ * This file adds support for the AES cipher with 128/192/256-bit
+ * keysizes in CBC mode.
+ * It also adds support for DES and 3DES in CBC mode.
+ *
+ * You can find the datasheet in Documentation/arm/sunxi/README
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include "sunxi-ss.h"
+
+extern struct sunxi_ss_ctx *ss;
+
+static int sunxi_ss_cipher(struct ablkcipher_request *areq, u32 mode)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(areq);
+	struct sunxi_tfm_ctx *op = crypto_ablkcipher_ctx(tfm);
+	const char *cipher_type;
+
+	if (areq->nbytes == 0)
+		return 0;
+
+	if (areq->info == NULL) {
+		dev_err(ss->dev, "ERROR: Empty IV\n");
+		return -EINVAL;
+	}
+
+	if (areq->src == NULL || areq->dst == NULL) {
+		dev_err(ss->dev, "ERROR: Some SGs are NULL\n");
+		return -EINVAL;
+	}
+
+	cipher_type = crypto_tfm_alg_name(crypto_ablkcipher_tfm(tfm));
+
+	if (strcmp("cbc(aes)", cipher_type) == 0) {
+		mode |= SS_OP_AES | SS_CBC | SS_ENABLED | op->keymode;
+		return sunxi_ss_aes_poll(areq, mode);
+	}
+
+	if (strcmp("cbc(des)", cipher_type) == 0) {
+		mode |= SS_OP_DES | SS_CBC | SS_ENABLED | op->keymode;
+		return sunxi_ss_des_poll(areq, mode);
+	}
+
+	if (strcmp("cbc(des3_ede)", cipher_type) == 0) {
+		mode |= SS_OP_3DES | SS_CBC | SS_ENABLED | op->keymode;
+		return sunxi_ss_des_poll(areq, mode);
+	}
+
+	dev_err(ss->dev, "ERROR: Cipher %s not handled\n", cipher_type);
+	return -EINVAL;
+}
+
+int sunxi_ss_cipher_encrypt(struct ablkcipher_request *areq)
+{
+	return sunxi_ss_cipher(areq, SS_ENCRYPTION);
+}
+
+int sunxi_ss_cipher_decrypt(struct ablkcipher_request *areq)
+{
+	return sunxi_ss_cipher(areq, SS_DECRYPTION);
+}
+
+int sunxi_ss_cipher_init(struct crypto_tfm *tfm)
+{
+	struct sunxi_tfm_ctx *op = crypto_tfm_ctx(tfm);
+
+	memset(op, 0, sizeof(struct sunxi_tfm_ctx));
+	return 0;
+}
+
+/*
+ * Optimized function for the case where we have only one SG,
+ * so we can use kmap_atomic
+ */
+static int sunxi_ss_aes_poll_atomic(struct ablkcipher_request *areq)
+{
+	u32 spaces;
+	struct scatterlist *in_sg = areq->src;
+	struct scatterlist *out_sg = areq->dst;
+	void *src_addr;
+	void *dst_addr;
+	unsigned int ileft = areq->nbytes;
+	unsigned int oleft = areq->nbytes;
+	unsigned int todo;
+	u32 *src32;
+	u32 *dst32;
+	u32 rx_cnt = 32;
+	u32 tx_cnt = 0;
+	int i;
+
+	src_addr = kmap_atomic(sg_page(in_sg)) + in_sg->offset;
+	if (src_addr == NULL) {
+		dev_err(ss->dev, "kmap_atomic error for src SG\n");
+		writel(0, ss->base + SS_CTL);
+		mutex_unlock(&ss->lock);
+		return -EINVAL;
+	}
+
+	dst_addr = kmap_atomic(sg_page(out_sg)) + out_sg->offset;
+	if (dst_addr == NULL) {
+		dev_err(ss->dev, "kmap_atomic error for dst SG\n");
+		writel(0, ss->base + SS_CTL);
+		kunmap_atomic(src_addr);
+		mutex_unlock(&ss->lock);
+		return -EINVAL;
+	}
+
+	src32 = (u32 *)src_addr;
+	dst32 = (u32 *)dst_addr;
+	ileft = areq->nbytes / 4;
+	oleft = areq->nbytes / 4;
+	i = 0;
+	do {
+		if (ileft > 0 && rx_cnt > 0) {
+			todo = min(rx_cnt, ileft);
+			ileft -= todo;
+			do {
+				writel_relaxed(*src32++,
+						ss->base +
+						SS_RXFIFO);
+				todo--;
+			} while (todo > 0);
+		}
+		if (tx_cnt > 0) {
+			todo = min(tx_cnt, oleft);
+			oleft -= todo;
+			do {
+				*dst32++ = readl_relaxed(ss->base +
+						SS_TXFIFO);
+				todo--;
+			} while (todo > 0);
+		}
+		spaces = readl_relaxed(ss->base + SS_FCSR);
+		rx_cnt = SS_RXFIFO_SPACES(spaces);
+		tx_cnt = SS_TXFIFO_SPACES(spaces);
+	} while (oleft > 0);
+	writel(0, ss->base + SS_CTL);
+	kunmap_atomic(src_addr);
+	kunmap_atomic(dst_addr);
+	mutex_unlock(&ss->lock);
+	return 0;
+}
+
+int sunxi_ss_aes_poll(struct ablkcipher_request *areq, u32 mode)
+{
+	u32 spaces;
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(areq);
+	struct sunxi_tfm_ctx *op = crypto_ablkcipher_ctx(tfm);
+	unsigned int ivsize = crypto_ablkcipher_ivsize(tfm);
+	/* when activating SS, the default FIFO space is 32 */
+	u32 rx_cnt = 32;
+	u32 tx_cnt = 0;
+	u32 v;
+	int i;
+	struct scatterlist *in_sg = areq->src;
+	struct scatterlist *out_sg = areq->dst;
+	void *src_addr;
+	void *dst_addr;
+	unsigned int ileft = areq->nbytes;
+	unsigned int oleft = areq->nbytes;
+	unsigned int sgileft = areq->src->length;
+	unsigned int sgoleft = areq->dst->length;
+	unsigned int todo;
+	u32 *src32;
+	u32 *dst32;
+
+	mutex_lock(&ss->lock);
+
+	for (i = 0; i < op->keylen; i += 4)
+		writel(*(op->key + i/4), ss->base + SS_KEY0 + i);
+
+	if (areq->info != NULL) {
+		for (i = 0; i < 4 && i < ivsize / 4; i++) {
+			v = *(u32 *)(areq->info + i * 4);
+			writel(v, ss->base + SS_IV0 + i * 4);
+		}
+	}
+	writel(mode, ss->base + SS_CTL);
+
+	/* If we have only one SG, we can use kmap_atomic */
+	if (sg_next(in_sg) == NULL && sg_next(out_sg) == NULL)
+		return sunxi_ss_aes_poll_atomic(areq);
+
+	/*
+	 * If we have more than one SG, we cannot use kmap_atomic since
+	 * we hold the mapping too long
+	 */
+	src_addr = kmap(sg_page(in_sg)) + in_sg->offset;
+	if (src_addr == NULL) {
+		dev_err(ss->dev, "KMAP error for src SG\n");
+		mutex_unlock(&ss->lock);
+		return -EINVAL;
+	}
+	dst_addr = kmap(sg_page(out_sg)) + out_sg->offset;
+	if (dst_addr == NULL) {
+		kunmap(sg_page(in_sg));
+		dev_err(ss->dev, "KMAP error for dst SG\n");
+		mutex_unlock(&ss->lock);
+		return -EINVAL;
+	}
+	src32 = (u32 *)src_addr;
+	dst32 = (u32 *)dst_addr;
+	ileft = areq->nbytes / 4;
+	oleft = areq->nbytes / 4;
+	sgileft = in_sg->length / 4;
+	sgoleft = out_sg->length / 4;
+	do {
+		spaces = readl_relaxed(ss->base + SS_FCSR);
+		rx_cnt = SS_RXFIFO_SPACES(spaces);
+		tx_cnt = SS_TXFIFO_SPACES(spaces);
+		todo = min3(rx_cnt, ileft, sgileft);
+		if (todo > 0) {
+			ileft -= todo;
+			sgileft -= todo;
+		}
+		while (todo > 0) {
+			writel_relaxed(*src32++, ss->base + SS_RXFIFO);
+			todo--;
+		}
+		if (in_sg != NULL && sgileft == 0 && ileft > 0) {
+			kunmap(sg_page(in_sg));
+			in_sg = sg_next(in_sg);
+			while (in_sg != NULL && in_sg->length == 0)
+				in_sg = sg_next(in_sg);
+			if (in_sg != NULL && ileft > 0) {
+				src_addr = kmap(sg_page(in_sg)) + in_sg->offset;
+				if (src_addr == NULL) {
+					dev_err(ss->dev, "ERROR: KMAP for src SG\n");
+					mutex_unlock(&ss->lock);
+					return -EINVAL;
+				}
+				src32 = src_addr;
+				sgileft = in_sg->length / 4;
+			}
+		}
+		/* do not test oleft since when oleft == 0 we have finished */
+		todo = min3(tx_cnt, oleft, sgoleft);
+		if (todo > 0) {
+			oleft -= todo;
+			sgoleft -= todo;
+		}
+		while (todo > 0) {
+			*dst32++ = readl_relaxed(ss->base + SS_TXFIFO);
+			todo--;
+		}
+		if (out_sg != NULL && sgoleft == 0 && oleft >= 0) {
+			kunmap(sg_page(out_sg));
+			out_sg = sg_next(out_sg);
+			while (out_sg != NULL && out_sg->length == 0)
+				out_sg = sg_next(out_sg);
+			if (out_sg != NULL && oleft > 0) {
+				dst_addr = kmap(sg_page(out_sg)) +
+					out_sg->offset;
+				if (dst_addr == NULL) {
+					dev_err(ss->dev, "KMAP error\n");
+					mutex_unlock(&ss->lock);
+					return -EINVAL;
+				}
+				dst32 = dst_addr;
+				sgoleft = out_sg->length / 4;
+			}
+		}
+	} while (oleft > 0);
+
+	writel_relaxed(0, ss->base + SS_CTL);
+	mutex_unlock(&ss->lock);
+	return 0;
+}
+
+/*
+ * Pure CPU way of doing DES/3DES with SS
+ * Since DES and 3DES SGs could be smaller than 4 bytes, we use
+ * sg_copy_to_buffer to linearize them.
+ * The drawback is that we allocate (2 x areq->nbytes) for buf_in/buf_out.
+ * TODO: rework this scheme; we need to support modes other than CBC where len
+ * is not a multiple of 4, and the linearization hack uses too much memory.
+ * SGsrc -> buf_in -> SS -> buf_out -> SGdst
+ */
+int sunxi_ss_des_poll(struct ablkcipher_request *areq, u32 mode)
+{
+	u32 value, spaces;
+	size_t nb_in_sg_tx, nb_in_sg_rx;
+	size_t ir, it;
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(areq);
+	struct sunxi_tfm_ctx *op = crypto_ablkcipher_ctx(tfm);
+	unsigned int ivsize = crypto_ablkcipher_ivsize(tfm);
+	u32 tx_cnt = 0;
+	u32 rx_cnt = 0;
+	u32 v;
+	int i;
+	int no_chunk = 1;
+	struct scatterlist *in_sg = areq->src;
+	struct scatterlist *out_sg = areq->dst;
+
+	/*
+	 * if we have only SGs with size multiple of 4,
+	 * we can use the SS AES function
+	 */
+	while (in_sg != NULL && no_chunk == 1) {
+		if ((in_sg->length % 4) != 0)
+			no_chunk = 0;
+		in_sg = sg_next(in_sg);
+	}
+	while (out_sg != NULL && no_chunk == 1) {
+		if ((out_sg->length % 4) != 0)
+			no_chunk = 0;
+		out_sg = sg_next(out_sg);
+	}
+
+	if (no_chunk == 1)
+		return sunxi_ss_aes_poll(areq, mode);
+
+	in_sg = areq->src;
+	out_sg = areq->dst;
+
+	nb_in_sg_rx = sg_nents(in_sg);
+	nb_in_sg_tx = sg_nents(out_sg);
+
+	/*
+	 * buf_in and buf_out are allocated only once and kept
+	 * until the driver is removed.
+	 * The allocation can only grow; we never shrink it,
+	 * for simplicity.
+	 */
+	mutex_lock(&ss->bufin_lock);
+	if (ss->buf_in == NULL) {
+		ss->buf_in = kmalloc(areq->nbytes, GFP_KERNEL);
+		ss->buf_in_size = areq->nbytes;
+	} else {
+		if (areq->nbytes > ss->buf_in_size) {
+			kfree(ss->buf_in);
+			ss->buf_in = kmalloc(areq->nbytes, GFP_KERNEL);
+			ss->buf_in_size = areq->nbytes;
+		}
+	}
+	if (ss->buf_in == NULL) {
+		ss->buf_in_size = 0;
+		mutex_unlock(&ss->bufin_lock);
+		dev_err(ss->dev, "Unable to allocate pages.\n");
+		return -ENOMEM;
+	}
+	mutex_lock(&ss->bufout_lock);
+	if (ss->buf_out == NULL) {
+		ss->buf_out = kmalloc(areq->nbytes, GFP_KERNEL);
+		if (ss->buf_out == NULL) {
+			ss->buf_out_size = 0;
+			mutex_unlock(&ss->bufin_lock);
+			mutex_unlock(&ss->bufout_lock);
+			dev_err(ss->dev, "Unable to allocate pages.\n");
+			return -ENOMEM;
+		}
+		ss->buf_out_size = areq->nbytes;
+	} else {
+		if (areq->nbytes > ss->buf_out_size) {
+			kfree(ss->buf_out);
+			ss->buf_out = kmalloc(areq->nbytes, GFP_KERNEL);
+			if (ss->buf_out == NULL) {
+				ss->buf_out_size = 0;
+				mutex_unlock(&ss->bufin_lock);
+				mutex_unlock(&ss->bufout_lock);
+				dev_err(ss->dev, "Unable to allocate pages.\n");
+				return -ENOMEM;
+			}
+			ss->buf_out_size = areq->nbytes;
+		}
+	}
+
+	sg_copy_to_buffer(areq->src, nb_in_sg_rx, ss->buf_in, areq->nbytes);
+
+	ir = 0;
+	it = 0;
+	mutex_lock(&ss->lock);
+
+	for (i = 0; i < op->keylen; i += 4)
+		writel(*(op->key + i/4), ss->base + SS_KEY0 + i);
+	if (areq->info != NULL) {
+		for (i = 0; i < 4 && i < ivsize / 4; i++) {
+			v = *(u32 *)(areq->info + i * 4);
+			writel(v, ss->base + SS_IV0 + i * 4);
+		}
+	}
+	writel(mode, ss->base + SS_CTL);
+
+	do {
+		if (rx_cnt == 0 || tx_cnt == 0) {
+			spaces = readl(ss->base + SS_FCSR);
+			rx_cnt = SS_RXFIFO_SPACES(spaces);
+			tx_cnt = SS_TXFIFO_SPACES(spaces);
+		}
+		if (rx_cnt > 0 && ir < areq->nbytes) {
+			do {
+				value = *(u32 *)(ss->buf_in + ir);
+				writel(value, ss->base + SS_RXFIFO);
+				ir += 4;
+				rx_cnt--;
+			} while (rx_cnt > 0 && ir < areq->nbytes);
+		}
+		if (tx_cnt > 0 && it < areq->nbytes) {
+			do {
+				value = readl(ss->base + SS_TXFIFO);
+				*(u32 *)(ss->buf_out + it) = value;
+				it += 4;
+				tx_cnt--;
+			} while (tx_cnt > 0 && it < areq->nbytes);
+		}
+		if (ir == areq->nbytes) {
+			mutex_unlock(&ss->bufin_lock);
+			ir++;
+		}
+	} while (it < areq->nbytes);
+
+	writel(0, ss->base + SS_CTL);
+	mutex_unlock(&ss->lock);
+
+	/*
+	 * a simple optimization: since we don't need the hardware for this copy,
+	 * we release the lock before doing it. With that we gain 5-10% perf
+	 */
+	sg_copy_from_buffer(areq->dst, nb_in_sg_tx, ss->buf_out, areq->nbytes);
+
+	mutex_unlock(&ss->bufout_lock);
+	return 0;
+}
+
+/* check and set the AES key, prepare the mode to be used */
+int sunxi_ss_aes_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
+		unsigned int keylen)
+{
+	struct sunxi_tfm_ctx *op = crypto_ablkcipher_ctx(tfm);
+
+	switch (keylen) {
+	case 128 / 8:
+		op->keymode = SS_AES_128BITS;
+		break;
+	case 192 / 8:
+		op->keymode = SS_AES_192BITS;
+		break;
+	case 256 / 8:
+		op->keymode = SS_AES_256BITS;
+		break;
+	default:
+		dev_err(ss->dev, "ERROR: Invalid keylen %u\n", keylen);
+		crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	op->keylen = keylen;
+	memcpy(op->key, key, keylen);
+	return 0;
+}
+
+/* check and set the DES key, prepare the mode to be used */
+int sunxi_ss_des_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
+		unsigned int keylen)
+{
+	struct sunxi_tfm_ctx *op = crypto_ablkcipher_ctx(tfm);
+
+	if (keylen != DES_KEY_SIZE) {
+		dev_err(ss->dev, "Invalid keylen %u\n", keylen);
+		crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	op->keylen = keylen;
+	memcpy(op->key, key, keylen);
+	return 0;
+}
+
+/* check and set the 3DES key, prepare the mode to be used */
+int sunxi_ss_des3_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
+		unsigned int keylen)
+{
+	struct sunxi_tfm_ctx *op = crypto_ablkcipher_ctx(tfm);
+
+	if (keylen != 3 * DES_KEY_SIZE) {
+		dev_err(ss->dev, "Invalid keylen %u\n", keylen);
+		crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	op->keylen = keylen;
+	memcpy(op->key, key, keylen);
+	return 0;
+}
diff --git a/drivers/crypto/sunxi-ss/sunxi-ss-core.c b/drivers/crypto/sunxi-ss/sunxi-ss-core.c
new file mode 100644
index 0000000..e66d7e2
--- /dev/null
+++ b/drivers/crypto/sunxi-ss/sunxi-ss-core.c
@@ -0,0 +1,318 @@ 
+/*
+ * sunxi-ss-core.c - hardware cryptographic accelerator for Allwinner A20 SoC
+ *
+ * Copyright (C) 2013-2014 Corentin LABBE <clabbe.montjoie@gmail.com>
+ *
+ * Core file which registers crypto algorithms supported by the SS.
+ *
+ * You can find a link to the datasheet in Documentation/arm/sunxi/README
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <linux/clk.h>
+#include <linux/crypto.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <crypto/scatterwalk.h>
+#include <linux/scatterlist.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+
+#include "sunxi-ss.h"
+
+struct sunxi_ss_ctx *ss;
+
+/*
+ * General notes for whole driver:
+ *
+ * After each request the device must be disabled with a write of 0 in SS_CTL
+ *
+ * For performance reasons, we use writel_relaxed/readl_relaxed for all
+ * operations on the RX and TX FIFOs and also on SS_FCSR,
+ * except for the last write to the TX FIFO.
+ * For all other registers, we use writel/readl.
+ * See http://permalink.gmane.org/gmane.linux.ports.arm.kernel/117644
+ * and http://permalink.gmane.org/gmane.linux.ports.arm.kernel/117640
+ */
+
+static struct ahash_alg sunxi_md5_alg = {
+	.init = sunxi_hash_init,
+	.update = sunxi_hash_update,
+	.final = sunxi_hash_final,
+	.finup = sunxi_hash_finup,
+	.digest = sunxi_hash_digest,
+	.halg = {
+		.digestsize = MD5_DIGEST_SIZE,
+		.base = {
+			.cra_name = "md5",
+			.cra_driver_name = "md5-sunxi-ss",
+			.cra_priority = 300,
+			.cra_alignmask = 3,
+			.cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
+			.cra_blocksize = MD5_HMAC_BLOCK_SIZE,
+			.cra_ctxsize = sizeof(struct sunxi_req_ctx),
+			.cra_module = THIS_MODULE,
+			.cra_type = &crypto_ahash_type,
+			.cra_init = sunxi_hash_crainit
+		}
+	}
+};
+
+static struct ahash_alg sunxi_sha1_alg = {
+	.init = sunxi_hash_init,
+	.update = sunxi_hash_update,
+	.final = sunxi_hash_final,
+	.finup = sunxi_hash_finup,
+	.digest = sunxi_hash_digest,
+	.halg = {
+		.digestsize = SHA1_DIGEST_SIZE,
+		.base = {
+			.cra_name = "sha1",
+			.cra_driver_name = "sha1-sunxi-ss",
+			.cra_priority = 300,
+			.cra_alignmask = 3,
+			.cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
+			.cra_blocksize = SHA1_BLOCK_SIZE,
+			.cra_ctxsize = sizeof(struct sunxi_req_ctx),
+			.cra_module = THIS_MODULE,
+			.cra_type = &crypto_ahash_type,
+			.cra_init = sunxi_hash_crainit
+		}
+	}
+};
+
+static struct crypto_alg sunxi_cipher_algs[] = {
+{
+	.cra_name = "cbc(aes)",
+	.cra_driver_name = "cbc-aes-sunxi-ss",
+	.cra_priority = 300,
+	.cra_blocksize = AES_BLOCK_SIZE,
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER,
+	.cra_ctxsize = sizeof(struct sunxi_tfm_ctx),
+	.cra_module = THIS_MODULE,
+	.cra_alignmask = 3,
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_init = sunxi_ss_cipher_init,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize    = AES_MIN_KEY_SIZE,
+			.max_keysize    = AES_MAX_KEY_SIZE,
+			.ivsize         = AES_BLOCK_SIZE,
+			.setkey         = sunxi_ss_aes_setkey,
+			.encrypt        = sunxi_ss_cipher_encrypt,
+			.decrypt        = sunxi_ss_cipher_decrypt,
+		}
+	}
+}, {
+	.cra_name = "cbc(des)",
+	.cra_driver_name = "cbc-des-sunxi-ss",
+	.cra_priority = 300,
+	.cra_blocksize = DES_BLOCK_SIZE,
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER,
+	.cra_ctxsize = sizeof(struct sunxi_req_ctx),
+	.cra_module = THIS_MODULE,
+	.cra_alignmask = 3,
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_init = sunxi_ss_cipher_init,
+	.cra_u.ablkcipher = {
+		.min_keysize    = DES_KEY_SIZE,
+		.max_keysize    = DES_KEY_SIZE,
+		.ivsize         = DES_BLOCK_SIZE,
+		.setkey         = sunxi_ss_des_setkey,
+		.encrypt        = sunxi_ss_cipher_encrypt,
+		.decrypt        = sunxi_ss_cipher_decrypt,
+	}
+}, {
+	.cra_name = "cbc(des3_ede)",
+	.cra_driver_name = "cbc-des3-sunxi-ss",
+	.cra_priority = 300,
+	.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER,
+	.cra_ctxsize = sizeof(struct sunxi_req_ctx),
+	.cra_module = THIS_MODULE,
+	.cra_alignmask = 3,
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_init = sunxi_ss_cipher_init,
+	.cra_u.ablkcipher = {
+		.min_keysize    = DES3_EDE_KEY_SIZE,
+		.max_keysize    = DES3_EDE_KEY_SIZE,
+		.ivsize         = DES3_EDE_BLOCK_SIZE,
+		.setkey         = sunxi_ss_des3_setkey,
+		.encrypt        = sunxi_ss_cipher_encrypt,
+		.decrypt        = sunxi_ss_cipher_decrypt,
+	}
+}
+};
+
+static int sunxi_ss_probe(struct platform_device *pdev)
+{
+	struct resource *res;
+	u32 v;
+	int err;
+	unsigned long cr;
+	const unsigned long cr_ahb = 24 * 1000 * 1000;
+	const unsigned long cr_mod = 150 * 1000 * 1000;
+
+	if (!pdev->dev.of_node)
+		return -ENODEV;
+
+	ss = devm_kzalloc(&pdev->dev, sizeof(*ss), GFP_KERNEL);
+	if (ss == NULL)
+		return -ENOMEM;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	ss->base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(ss->base)) {
+		dev_err(&pdev->dev, "Cannot request MMIO\n");
+		return PTR_ERR(ss->base);
+	}
+
+	ss->ssclk = devm_clk_get(&pdev->dev, "mod");
+	if (IS_ERR(ss->ssclk)) {
+		err = PTR_ERR(ss->ssclk);
+		dev_err(&pdev->dev, "Cannot get SS clock err=%d\n", err);
+		return err;
+	}
+	dev_dbg(&pdev->dev, "clock ss acquired\n");
+
+	ss->busclk = devm_clk_get(&pdev->dev, "ahb");
+	if (IS_ERR(ss->busclk)) {
+		err = PTR_ERR(ss->busclk);
+		dev_err(&pdev->dev, "Cannot get AHB SS clock err=%d\n", err);
+		return err;
+	}
+	dev_dbg(&pdev->dev, "clock ahb_ss acquired\n");
+
+	/* Enable both clocks */
+	err = clk_prepare_enable(ss->busclk);
+	if (err != 0) {
+		dev_err(&pdev->dev, "Cannot prepare_enable busclk\n");
+		return err;
+	}
+	err = clk_prepare_enable(ss->ssclk);
+	if (err != 0) {
+		dev_err(&pdev->dev, "Cannot prepare_enable ssclk\n");
+		clk_disable_unprepare(ss->busclk);
+		return err;
+	}
+
+	/*
+	 * Check that the clocks have the correct rates given in the datasheet
+	 * and try to set the clock to the maximum allowed.
+	 */
+	err = clk_set_rate(ss->ssclk, cr_mod);
+	if (err != 0) {
+		dev_err(&pdev->dev, "Cannot set clock rate to ssclk\n");
+		clk_disable_unprepare(ss->ssclk);
+		clk_disable_unprepare(ss->busclk);
+		return err;
+	}
+
+	cr = clk_get_rate(ss->busclk);
+	if (cr >= cr_ahb)
+		dev_dbg(&pdev->dev, "Clock bus %lu (%lu MHz) (must be >= %lu)\n",
+				cr, cr / 1000000, cr_ahb);
+	else
+		dev_warn(&pdev->dev, "Clock bus %lu (%lu MHz) (must be >= %lu)\n",
+				cr, cr / 1000000, cr_ahb);
+
+	cr = clk_get_rate(ss->ssclk);
+	if (cr <= cr_mod)
+		if (cr < cr_mod)
+			dev_info(&pdev->dev, "Clock ss %lu (%lu MHz) (must be <= %lu)\n",
+					cr, cr / 1000000, cr_mod);
+		else
+			dev_dbg(&pdev->dev, "Clock ss %lu (%lu MHz) (must be <= %lu)\n",
+					cr, cr / 1000000, cr_mod);
+	else
+		dev_warn(&pdev->dev, "Clock ss is at %lu (%lu MHz) (must be <= %lu)\n",
+				cr, cr / 1000000, cr_mod);
+
+	/*
+	 * The datasheet names it "Die Bonding ID".
+	 * I expect it to be a sort of Security System revision number.
+	 * Since the A80 seems to have another version of the SS,
+	 * this info could be useful.
+	 */
+	writel(SS_ENABLED, ss->base + SS_CTL);
+	v = readl(ss->base + SS_CTL);
+	v >>= 16;
+	v &= 0x07;
+	dev_info(&pdev->dev, "Die ID %d\n", v);
+	writel(0, ss->base + SS_CTL);
+
+	ss->dev = &pdev->dev;
+
+	mutex_init(&ss->lock);
+	mutex_init(&ss->bufin_lock);
+	mutex_init(&ss->bufout_lock);
+
+	err = crypto_register_ahash(&sunxi_md5_alg);
+	if (err)
+		goto error_md5;
+	err = crypto_register_ahash(&sunxi_sha1_alg);
+	if (err)
+		goto error_sha1;
+	err = crypto_register_algs(sunxi_cipher_algs,
+			ARRAY_SIZE(sunxi_cipher_algs));
+	if (err)
+		goto error_ciphers;
+
+	return 0;
+error_ciphers:
+	crypto_unregister_ahash(&sunxi_sha1_alg);
+error_sha1:
+	crypto_unregister_ahash(&sunxi_md5_alg);
+error_md5:
+	clk_disable_unprepare(ss->ssclk);
+	clk_disable_unprepare(ss->busclk);
+	return err;
+}
+
+static int __exit sunxi_ss_remove(struct platform_device *pdev)
+{
+	if (!pdev->dev.of_node)
+		return 0;
+
+	crypto_unregister_ahash(&sunxi_md5_alg);
+	crypto_unregister_ahash(&sunxi_sha1_alg);
+	crypto_unregister_algs(sunxi_cipher_algs,
+			ARRAY_SIZE(sunxi_cipher_algs));
+
+	if (ss->buf_in != NULL)
+		kfree(ss->buf_in);
+	if (ss->buf_out != NULL)
+		kfree(ss->buf_out);
+
+	writel(0, ss->base + SS_CTL);
+	clk_disable_unprepare(ss->busclk);
+	clk_disable_unprepare(ss->ssclk);
+	return 0;
+}
+
+static const struct of_device_id a20ss_crypto_of_match_table[] = {
+	{ .compatible = "allwinner,sun7i-a20-crypto" },
+	{}
+};
+MODULE_DEVICE_TABLE(of, a20ss_crypto_of_match_table);
+
+static struct platform_driver sunxi_ss_driver = {
+	.probe          = sunxi_ss_probe,
+	.remove         = __exit_p(sunxi_ss_remove),
+	.driver         = {
+		.owner          = THIS_MODULE,
+		.name           = "sunxi-ss",
+		.of_match_table	= a20ss_crypto_of_match_table,
+	},
+};
+
+module_platform_driver(sunxi_ss_driver);
+
+MODULE_DESCRIPTION("Allwinner Security System cryptographic accelerator");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Corentin LABBE <clabbe.montjoie@gmail.com>");
diff --git a/drivers/crypto/sunxi-ss/sunxi-ss-hash.c b/drivers/crypto/sunxi-ss/sunxi-ss-hash.c
new file mode 100644
index 0000000..ec8758f
--- /dev/null
+++ b/drivers/crypto/sunxi-ss/sunxi-ss-hash.c
@@ -0,0 +1,445 @@ 
+/*
+ * sunxi-ss-hash.c - hardware cryptographic accelerator for Allwinner A20 SoC
+ *
+ * Copyright (C) 2013-2014 Corentin LABBE <clabbe.montjoie@gmail.com>
+ *
+ * This file adds support for MD5 and SHA1.
+ *
+ * You can find the datasheet in Documentation/arm/sunxi/README
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include "sunxi-ss.h"
+
+/* This is a totally arbitrary value */
+#define SS_TIMEOUT 100
+
+extern struct sunxi_ss_ctx *ss;
+
+int sunxi_hash_crainit(struct crypto_tfm *tfm)
+{
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+			sizeof(struct sunxi_req_ctx));
+	return 0;
+}
+
+/* sunxi_hash_init: initialize request context */
+int sunxi_hash_init(struct ahash_request *areq)
+{
+	const char *hash_type;
+	struct sunxi_req_ctx *op = ahash_request_ctx(areq);
+
+	memset(op, 0, sizeof(struct sunxi_req_ctx));
+
+	hash_type = crypto_tfm_alg_name(areq->base.tfm);
+
+	if (strcmp(hash_type, "sha1") == 0)
+		op->mode = SS_OP_SHA1;
+	if (strcmp(hash_type, "md5") == 0)
+		op->mode = SS_OP_MD5;
+	if (op->mode == 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+static u32 rx_cnt;
+
+inline void ss_writer(const u32 v)
+{
+	u32 spaces;
+
+	writel(v, ss->base + SS_RXFIFO);
+	rx_cnt--;
+	while (rx_cnt == 0) {
+		spaces = readl_relaxed(ss->base + SS_FCSR);
+		rx_cnt = SS_RXFIFO_SPACES(spaces);
+	}
+}
+
+inline void ss_writer_relaxed(const u32 v)
+{
+	u32 spaces;
+
+	writel_relaxed(v, ss->base + SS_RXFIFO);
+	rx_cnt--;
+	while (rx_cnt == 0) {
+		spaces = readl_relaxed(ss->base + SS_FCSR);
+		rx_cnt = SS_RXFIFO_SPACES(spaces);
+	}
+}
+
+/*
+ * sunxi_hash_update: update hash engine
+ *
+ * Can be used for both SHA1 and MD5.
+ * Data is written to the SS in 32-bit words.
+ *
+ * Since we cannot leave partial data and hash state in the engine,
+ * we need to get the hash state at the end of this function.
+ * After some work, I have found that we can get the hash state every 64 bytes.
+ *
+ * So the first task is to get the number of bytes to write to the SS modulo 64.
+ * The extra bytes will go to two different destinations:
+ * op->wait for full 32-bit words
+ * op->wb (waiting bytes) for a partial 32-bit word
+ * So we can have up to (64/4)-1 op->wait words and 0/1/2/3 bytes in wb.
+ *
+ * So at the beginning of update():
+ * if op->nwait * 4 + areq->nbytes < 64
+ * => all data is written to the wait buffers and end=0
+ * otherwise, write all nwait words to the device and set end so that the
+ * total written up to end is a multiple of 64 bytes.
+ *
+ * example 1:
+ * update1 of 60 bytes => nwait=15
+ * update2 of 60 bytes => one more word is needed to reach 64 bytes, so end=4:
+ * write all words in op->wait plus one word from the SGs,
+ * store the remaining data in op->wait,
+ * final state: op->nwait=14
+ */
+int sunxi_hash_update(struct ahash_request *areq)
+{
+	u32 v, ivmode = 0;
+	unsigned int i = 0;
+	/*
+	 * i is the total bytes read from SGs, to be compared to areq->nbytes
+	 * i is important because we cannot rely on SG length since the sum of
+	 * SG->length could be greater than areq->nbytes
+	 */
+
+	struct sunxi_req_ctx *op = ahash_request_ctx(areq);
+	struct scatterlist *in_sg;
+	unsigned int in_i = 0; /* advancement in the current SG */
+	u64 end;
+	/*
+	 * end is the position at which we need to stop writing to the device,
+	 * to be compared to i
+	 */
+	int in_r;
+	void *src_addr;
+
+	dev_dbg(ss->dev, "%s %s bc=%llu len=%u mode=%x bw=%u ww=%u",
+			__func__, crypto_tfm_alg_name(areq->base.tfm),
+			op->byte_count, areq->nbytes, op->mode,
+			op->nbw, op->nwait);
+
+	if (areq->nbytes == 0)
+		return 0;
+
+	end = ((areq->nbytes + op->nwait * 4 + op->nbw) / 64) * 64
+		- op->nbw - op->nwait * 4;
+
+	if (end > areq->nbytes || areq->nbytes - end > 63) {
+		dev_err(ss->dev, "ERROR: Bound error %llu %u\n",
+				end, areq->nbytes);
+		return -EINVAL;
+	}
+
+	if (op->nwait > 0 && end > 0) {
+		/* a previous update was done */
+		for (i = 0; i < op->nwait; i++) {
+			ss_writer(op->wait[i]);
+			op->byte_count += 4;
+		}
+		op->nwait = 0;
+	}
+
+	mutex_lock(&ss->lock);
+	/*
+	 * if some data have been processed before,
+	 * we need to restore the partial hash state
+	 */
+	if (op->byte_count > 0) {
+		ivmode = SS_IV_ARBITRARY;
+		for (i = 0; i < 5; i++)
+			writel(op->hash[i], ss->base + SS_IV0 + i * 4);
+	}
+	/* Enable the device */
+	writel(op->mode | SS_ENABLED | ivmode, ss->base + SS_CTL);
+
+	rx_cnt = 0;
+	i = 0;
+
+	in_sg = areq->src;
+	src_addr = kmap(sg_page(in_sg)) + in_sg->offset;
+	if (src_addr == NULL) {
+		mutex_unlock(&ss->lock);
+		dev_err(ss->dev, "ERROR: Cannot kmap source buffer\n");
+		return -EFAULT;
+	}
+	do {
+		/*
+		 * step 1, if some bytes remains from last SG,
+		 * try to complete them to 4 and send that word
+		 */
+		if (op->nbw > 0) {
+			while (op->nbw < 4 && i < areq->nbytes &&
+					in_i < in_sg->length) {
+				op->wb |= (*(u8 *)(src_addr + in_i))
+					<< (8 * op->nbw);
+				dev_dbg(ss->dev, "%s Complete w=%d wb=%x\n",
+						__func__, op->nbw, op->wb);
+				i++;
+				in_i++;
+				op->nbw++;
+			}
+			if (op->nbw == 4) {
+				if (i <= end) {
+					ss_writer(op->wb);
+					op->byte_count += 4;
+				} else {
+					op->wait[op->nwait] = op->wb;
+					op->nwait++;
+					dev_dbg(ss->dev, "%s Keep %u bytes after %llu\n",
+						__func__, op->nwait, end);
+				}
+				op->nbw = 0;
+				op->wb = 0;
+			}
+		}
+		/* step 2, main loop, read data 4bytes at a time */
+		while (i < areq->nbytes && in_i < in_sg->length) {
+			/* how many bytes we can read, (we need 4) */
+			in_r = min(in_sg->length - in_i, areq->nbytes - i);
+			if (in_r < 4) {
+				/* Not enough data to write to the device */
+				op->wb = 0;
+				while (in_r > 0) {
+					op->wb |= (*(u8 *)(src_addr + in_i))
+						<< (8 * op->nbw);
+					dev_dbg(ss->dev, "%s ending bw=%d wb=%x\n",
+						__func__, op->nbw, op->wb);
+					in_r--;
+					i++;
+					in_i++;
+					op->nbw++;
+				}
+				goto nextsg;
+			}
+			v = *(u32 *)(src_addr + in_i);
+			if (i < end) {
+				/* last write must be done without relaxed */
+				if (i + 4 >= end)
+					ss_writer(v);
+				else
+					ss_writer_relaxed(v);
+				i += 4;
+				op->byte_count += 4;
+				in_i += 4;
+			} else {
+				op->wait[op->nwait] = v;
+				i += 4;
+				in_i += 4;
+				op->nwait++;
+				dev_dbg(ss->dev, "%s Keep word ww=%u after %llu\n",
+						__func__, op->nwait, end);
+				if (op->nwait > 15) {
+					dev_err(ss->dev, "FATAL: Cannot enqueue more, bug?\n");
+					writel(0, ss->base + SS_CTL);
+					mutex_unlock(&ss->lock);
+					return -EIO;
+				}
+			}
+		}
+nextsg:
+		/* Nothing more to read in this SG */
+		if (in_i == in_sg->length) {
+			kunmap(sg_page(in_sg));
+			do {
+				in_sg = sg_next(in_sg);
+			} while (in_sg != NULL && in_sg->length == 0);
+			in_i = 0;
+			if (in_sg != NULL) {
+				src_addr = kmap(sg_page(in_sg)) + in_sg->offset;
+				if (src_addr == NULL) {
+					mutex_unlock(&ss->lock);
+					dev_err(ss->dev, "ERROR: Cannot kmap source buffer\n");
+					return -EFAULT;
+				}
+			}
+		}
+	} while (in_sg != NULL && i < areq->nbytes);
+
+	/* ask the device to finish the hashing */
+	writel(op->mode | SS_ENABLED | SS_DATA_END, ss->base + SS_CTL);
+	i = 0;
+	do {
+		v = readl(ss->base + SS_CTL);
+		i++;
+	} while (i < SS_TIMEOUT && (v & SS_DATA_END) > 0);
+	if (i >= SS_TIMEOUT) {
+		dev_err(ss->dev, "ERROR: %s hash end timeout after %d loop, CTL=%x\n",
+				__func__, i, v);
+		writel(0, ss->base + SS_CTL);
+		mutex_unlock(&ss->lock);
+		return -EIO;
+	}
+
+	/* get the partial hash */
+	if (op->mode == SS_OP_SHA1) {
+		for (i = 0; i < 5; i++)
+			op->hash[i] = readl(ss->base + SS_MD0 + i * 4);
+	} else {
+		for (i = 0; i < 4; i++)
+			op->hash[i] = readl(ss->base + SS_MD0 + i * 4);
+	}
+
+	writel(0, ss->base + SS_CTL);
+	mutex_unlock(&ss->lock);
+	return 0;
+}
+
+/*
+ * sunxi_hash_final: finalize hashing operation
+ *
+ * If we have some remaining bytes, we write them.
+ * Then ask the SS for finalizing the hashing operation
+ */
+int sunxi_hash_final(struct ahash_request *areq)
+{
+	u32 v, ivmode = 0;
+	unsigned int i;
+	int zeros;
+	unsigned int index, padlen;
+	__be64 bits;
+	struct sunxi_req_ctx *op = ahash_request_ctx(areq);
+
+	dev_dbg(ss->dev, "%s byte=%llu len=%u mode=%x bw=%u %x h=%x ww=%u",
+			__func__, op->byte_count, areq->nbytes, op->mode,
+			op->nbw, op->wb, op->hash[0], op->nwait);
+
+	mutex_lock(&ss->lock);
+	rx_cnt = 0;
+
+	/*
+	 * if we have already written something,
+	 * restore the partial hash state
+	 */
+	if (op->byte_count > 0) {
+		ivmode = SS_IV_ARBITRARY;
+		for (i = 0; i < 5; i++)
+			writel(op->hash[i], ss->base + SS_IV0 + i * 4);
+	}
+	writel(op->mode | SS_ENABLED | ivmode, ss->base + SS_CTL);
+
+	/* write the remaining words of the wait buffer */
+	if (op->nwait > 0) {
+		for (i = 0; i < op->nwait; i++) {
+			v = op->wait[i];
+			ss_writer(v);
+			op->byte_count += 4;
+			dev_dbg(ss->dev, "%s write %llu i=%u %x\n",
+					__func__, op->byte_count, i, v);
+		}
+		op->nwait = 0;
+	}
+
+	/* write the remaining bytes of the nbw buffer */
+	if (op->nbw > 0) {
+		op->wb |= ((1 << 7) << (op->nbw * 8));
+		ss_writer(op->wb);
+	} else {
+		ss_writer((1 << 7));
+	}
+
+	/*
+	 * number of zero words to pad with to reach 64 bytes, minus 8 (size) and 4 (final 1)
+	 * The computations are taken from other md5/sha1 implementations
+	 */
+
+	/* we have already sent 4 more bytes, of which nbw are data */
+	if (op->mode == SS_OP_MD5) {
+		index = (op->byte_count + 4) & 0x3f;
+		op->byte_count += op->nbw;
+		if (index > 56)
+			zeros = (120 - index) / 4;
+		else
+			zeros = (56 - index) / 4;
+	} else {
+		op->byte_count += op->nbw;
+		index = op->byte_count & 0x3f;
+		padlen = (index < 56) ? (56 - index) : ((64+56) - index);
+		zeros = (padlen - 1) / 4;
+	}
+	for (i = 0; i < zeros; i++)
+		ss_writer(0);
+
+	/* write the length of data */
+	if (op->mode == SS_OP_SHA1) {
+		bits = cpu_to_be64(op->byte_count << 3);
+		ss_writer(bits & 0xffffffff);
+		ss_writer((bits >> 32) & 0xffffffff);
+	} else {
+		ss_writer((op->byte_count << 3) & 0xffffffff);
+		ss_writer((op->byte_count >> 29) & 0xffffffff);
+	}
+
+	/* Tell the SS to stop the hashing */
+	writel(op->mode | SS_ENABLED | SS_DATA_END, ss->base + SS_CTL);
+
+	/*
+	 * Wait for SS to finish the hash.
+	 * The timeout can only happen in case of bad overclocking
+	 * or a driver bug.
+	 */
+	i = 0;
+	do {
+		v = readl(ss->base + SS_CTL);
+		i++;
+	} while (i < SS_TIMEOUT && (v & SS_DATA_END) > 0);
+	if (i >= SS_TIMEOUT) {
+		dev_err(ss->dev, "ERROR: hash end timeout %d>%d ctl=%x len=%u\n",
+				i, SS_TIMEOUT, v, areq->nbytes);
+		writel(0, ss->base + SS_CTL);
+		mutex_unlock(&ss->lock);
+		return -EIO;
+	}
+
+	/* Get the hash from the device */
+	if (op->mode == SS_OP_SHA1) {
+		for (i = 0; i < 5; i++) {
+			v = cpu_to_be32(readl(ss->base + SS_MD0 + i * 4));
+			memcpy(areq->result + i * 4, &v, 4);
+		}
+	} else {
+		for (i = 0; i < 4; i++) {
+			v = readl(ss->base + SS_MD0 + i * 4);
+			memcpy(areq->result + i * 4, &v, 4);
+		}
+	}
+	writel(0, ss->base + SS_CTL);
+	mutex_unlock(&ss->lock);
+	return 0;
+}
+
+/* sunxi_hash_finup: finalize hashing operation after an update */
+int sunxi_hash_finup(struct ahash_request *areq)
+{
+	int err;
+
+	err = sunxi_hash_update(areq);
+	if (err != 0)
+		return err;
+
+	return sunxi_hash_final(areq);
+}
+
+/* combo of init/update/final functions */
+int sunxi_hash_digest(struct ahash_request *areq)
+{
+	int err;
+
+	err = sunxi_hash_init(areq);
+	if (err != 0)
+		return err;
+
+	err = sunxi_hash_update(areq);
+	if (err != 0)
+		return err;
+
+	return sunxi_hash_final(areq);
+}
diff --git a/drivers/crypto/sunxi-ss/sunxi-ss.h b/drivers/crypto/sunxi-ss/sunxi-ss.h
new file mode 100644
index 0000000..331e75b
--- /dev/null
+++ b/drivers/crypto/sunxi-ss/sunxi-ss.h
@@ -0,0 +1,193 @@ 
+/*
+ * sunxi-ss.h - hardware cryptographic accelerator for Allwinner A20 SoC
+ *
+ * Copyright (C) 2013-2014 Corentin LABBE <clabbe.montjoie@gmail.com>
+ *
+ * Supports the AES cipher with 128/192/256-bit keysizes.
+ * Supports the MD5 and SHA1 hash algorithms.
+ * Supports DES and 3DES.
+ *
+ * You can find the datasheet in Documentation/arm/sunxi/README
+ *
+ * Licensed under the GPL-2.
+ */
+
+#include <linux/clk.h>
+#include <linux/crypto.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <crypto/scatterwalk.h>
+#include <linux/scatterlist.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <crypto/md5.h>
+#include <crypto/sha.h>
+#include <crypto/hash.h>
+#include <crypto/internal/hash.h>
+#include <crypto/aes.h>
+#include <crypto/des.h>
+#include <crypto/internal/rng.h>
+
+#define SS_CTL            0x00
+#define SS_KEY0           0x04
+#define SS_KEY1           0x08
+#define SS_KEY2           0x0C
+#define SS_KEY3           0x10
+#define SS_KEY4           0x14
+#define SS_KEY5           0x18
+#define SS_KEY6           0x1C
+#define SS_KEY7           0x20
+
+#define SS_IV0            0x24
+#define SS_IV1            0x28
+#define SS_IV2            0x2C
+#define SS_IV3            0x30
+
+#define SS_CNT0           0x34
+#define SS_CNT1           0x38
+#define SS_CNT2           0x3C
+#define SS_CNT3           0x40
+
+#define SS_FCSR           0x44
+#define SS_ICSR           0x48
+
+#define SS_MD0            0x4C
+#define SS_MD1            0x50
+#define SS_MD2            0x54
+#define SS_MD3            0x58
+#define SS_MD4            0x5C
+
+#define SS_RXFIFO         0x200
+#define SS_TXFIFO         0x204
+
+/* SS_CTL configuration values */
+
+/* PRNG generator mode - bit 15 */
+#define SS_PRNG_ONESHOT		(0 << 15)
+#define SS_PRNG_CONTINUE	(1 << 15)
+
+/* IV mode for hash */
+#define SS_IV_ARBITRARY		(1 << 14)
+
+/* SS operation mode - bits 12-13 */
+#define SS_ECB			(0 << 12)
+#define SS_CBC			(1 << 12)
+#define SS_CNT			(2 << 12)
+
+/* Counter width for CNT mode - bits 10-11 */
+#define SS_CNT_16BITS		(0 << 10)
+#define SS_CNT_32BITS		(1 << 10)
+#define SS_CNT_64BITS		(2 << 10)
+
+/* Key size for AES - bits 8-9 */
+#define SS_AES_128BITS		(0 << 8)
+#define SS_AES_192BITS		(1 << 8)
+#define SS_AES_256BITS		(2 << 8)
+
+/* Operation direction - bit 7 */
+#define SS_ENCRYPTION		(0 << 7)
+#define SS_DECRYPTION		(1 << 7)
+
+/* SS Method - bits 4-6 */
+#define SS_OP_AES		(0 << 4)
+#define SS_OP_DES		(1 << 4)
+#define SS_OP_3DES		(2 << 4)
+#define SS_OP_SHA1		(3 << 4)
+#define SS_OP_MD5		(4 << 4)
+#define SS_OP_PRNG		(5 << 4)
+
+/* Data end bit - bit 2 */
+#define SS_DATA_END		(1 << 2)
+
+/* PRNG start bit - bit 1 */
+#define SS_PRNG_START		(1 << 1)
+
+/* SS Enable bit - bit 0 */
+#define SS_DISABLED		(0 << 0)
+#define SS_ENABLED		(1 << 0)
+
+/* SS_FCSR configuration values */
+/* RX FIFO status - bit 30 */
+#define SS_RXFIFO_FREE		(1 << 30)
+
+/* RX FIFO empty spaces - bits 24-29 */
+#define SS_RXFIFO_SPACES(val)	(((val) >> 24) & 0x3f)
+
+/* TX FIFO status - bit 22 */
+#define SS_TXFIFO_AVAILABLE	(1 << 22)
+
+/* TX FIFO available spaces - bits 16-21 */
+#define SS_TXFIFO_SPACES(val)	(((val) >> 16) & 0x3f)
+
+#define SS_RXFIFO_EMP_INT_PENDING	(1 << 10)
+#define SS_TXFIFO_AVA_INT_PENDING	(1 << 8)
+#define SS_RXFIFO_EMP_INT_ENABLE	(1 << 2)
+#define SS_TXFIFO_AVA_INT_ENABLE	(1 << 0)
+
+/* SS_ICSR configuration values */
+#define SS_ICS_DRQ_ENABLE		(1 << 4)
+
+struct sunxi_ss_ctx {
+	void __iomem *base;
+	int irq;
+	struct clk *busclk;
+	struct clk *ssclk;
+	struct device *dev;
+	struct resource *res;
+	void *buf_in; /* pointer to data to be uploaded to the device */
+	size_t buf_in_size; /* size of buf_in */
+	void *buf_out;
+	size_t buf_out_size;
+	struct mutex lock; /* control the use of the device */
+	struct mutex bufout_lock; /* control the use of buf_out */
+	struct mutex bufin_lock; /* control the use of buf_in */
+};
+
+struct sunxi_tfm_ctx {
+	u32 key[AES_MAX_KEY_SIZE / 4];/* divided by sizeof(u32) */
+	u32 keylen;
+	u32 keymode;
+};
+
+struct sunxi_req_ctx {
+	u32 mode;
+	u64 byte_count; /* number of bytes "uploaded" to the device */
+	u32 wb; /* a partial word waiting to be completed and
+			uploaded to the device */
+	/* number of bytes to be uploaded in the wb word */
+	unsigned int nbw;
+	u32 hash[5];
+	u32 wait[64];
+	unsigned int nwait;
+};
+
+#define SS_SEED_LEN (192/8)
+#define SS_DATA_LEN (160/8)
+
+struct prng_context {
+	u32 seed[SS_SEED_LEN/4];
+	unsigned int slen;
+};
+
+int sunxi_hash_crainit(struct crypto_tfm *tfm);
+int sunxi_hash_init(struct ahash_request *areq);
+int sunxi_hash_update(struct ahash_request *areq);
+int sunxi_hash_final(struct ahash_request *areq);
+int sunxi_hash_finup(struct ahash_request *areq);
+int sunxi_hash_digest(struct ahash_request *areq);
+int sunxi_hash_export(struct ahash_request *areq, void *out);
+int sunxi_hash_import(struct ahash_request *areq, const void *in);
+
+int sunxi_ss_aes_poll(struct ablkcipher_request *areq, u32 mode);
+int sunxi_ss_des_poll(struct ablkcipher_request *areq, u32 mode);
+int sunxi_ss_cipher_init(struct crypto_tfm *tfm);
+int sunxi_ss_cipher_encrypt(struct ablkcipher_request *areq);
+int sunxi_ss_cipher_decrypt(struct ablkcipher_request *areq);
+int sunxi_ss_aes_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
+		unsigned int keylen);
+int sunxi_ss_des_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
+		unsigned int keylen);
+int sunxi_ss_des3_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
+		unsigned int keylen);