Message ID | E1Zke3a-00042e-Jx@rmk-PC.arm.linux.org.uk (mailing list archive) |
---|---|
State | Accepted |
Delegated to: | Herbert Xu |
Headers | show |
On Fri, 09 Oct 2015 21:14:22 +0100 Russell King <rmk+kernel@arm.linux.org.uk> wrote: > As all the import functions and export functions are virtually > identical, factor out their common parts into a generic > mv_cesa_ahash_import() and mv_cesa_ahash_export() respectively. This > performs the actual import or export, and we pass the data pointers and > length into these functions. > > We have to switch a % const operation to do_div() in the common import > function to avoid provoking gcc to use the expensive 64-bit by 64-bit > modulus operation. > > Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk> Looks good to me. Acked-by: Boris Brezillon <boris.brezillon@free-electrons.com> Thanks again for taking care of that. Regards, Boris > --- > Replacement to patch 5, going a little further, as requested by Boris. > > drivers/crypto/marvell/hash.c | 157 ++++++++++++++---------------------------- > 1 file changed, 53 insertions(+), 104 deletions(-) > > diff --git a/drivers/crypto/marvell/hash.c b/drivers/crypto/marvell/hash.c > index b7c2c05f1a01..a24ceda7acfe 100644 > --- a/drivers/crypto/marvell/hash.c > +++ b/drivers/crypto/marvell/hash.c > @@ -795,39 +795,32 @@ static int mv_cesa_ahash_finup(struct ahash_request *req) > return ret; > } > > -static int mv_cesa_md5_init(struct ahash_request *req) > +static int mv_cesa_ahash_export(struct ahash_request *req, void *hash, > + u64 *len, void *cache) > { > - struct mv_cesa_op_ctx tmpl; > - > - mv_cesa_set_op_cfg(&tmpl, CESA_SA_DESC_CFG_MACM_MD5); > - > - mv_cesa_ahash_init(req, &tmpl); > - > - return 0; > -} > - > -static int mv_cesa_md5_export(struct ahash_request *req, void *out) > -{ > - struct md5_state *out_state = out; > struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); > struct mv_cesa_ahash_req *creq = ahash_request_ctx(req); > unsigned int digsize = crypto_ahash_digestsize(ahash); > + unsigned int blocksize; > + > + blocksize = crypto_tfm_alg_blocksize(crypto_ahash_tfm(ahash)); > > - out_state->byte_count = creq->len; > - memcpy(out_state->hash, creq->state, digsize); > - memset(out_state->block, 0, sizeof(out_state->block)); > + *len = creq->len; > + memcpy(hash, creq->state, digsize); > + memset(cache, 0, blocksize); > if (creq->cache) > - memcpy(out_state->block, creq->cache, creq->cache_ptr); > + memcpy(cache, creq->cache, creq->cache_ptr); > > return 0; > } > > -static int mv_cesa_md5_import(struct ahash_request *req, const void *in) > +static int mv_cesa_ahash_import(struct ahash_request *req, const void *hash, > + u64 len, const void *cache) > { > - const struct md5_state *in_state = in; > struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); > struct mv_cesa_ahash_req *creq = ahash_request_ctx(req); > unsigned int digsize = crypto_ahash_digestsize(ahash); > + unsigned int blocksize; > unsigned int cache_ptr; > int ret; > > @@ -835,16 +828,17 @@ static int mv_cesa_md5_import(struct ahash_request *req, const void *in) > if (ret) > return ret; > > - if (in_state->byte_count >= sizeof(in_state->block)) > + blocksize = crypto_tfm_alg_blocksize(crypto_ahash_tfm(ahash)); > + if (len >= blocksize) > mv_cesa_update_op_cfg(&creq->op_tmpl, > CESA_SA_DESC_CFG_MID_FRAG, > CESA_SA_DESC_CFG_FRAG_MSK); > > - creq->len = in_state->byte_count; > - memcpy(creq->state, in_state->hash, digsize); > + creq->len = len; > + memcpy(creq->state, hash, digsize); > creq->cache_ptr = 0; > > - cache_ptr = creq->len % sizeof(in_state->block); > + cache_ptr = do_div(len, blocksize); > if (!cache_ptr) > return 0; > > @@ -852,12 +846,39 @@ static int mv_cesa_md5_import(struct ahash_request *req, const void *in) > if (ret) > return ret; > > - memcpy(creq->cache, in_state->block, cache_ptr); > + memcpy(creq->cache, cache, cache_ptr); > creq->cache_ptr = cache_ptr; > > return 0; > } > > +static int mv_cesa_md5_init(struct ahash_request *req) > +{ > + struct mv_cesa_op_ctx tmpl; > + > + mv_cesa_set_op_cfg(&tmpl, CESA_SA_DESC_CFG_MACM_MD5); > + > + mv_cesa_ahash_init(req, &tmpl); > + > + return 0; > +} > + > +static int mv_cesa_md5_export(struct ahash_request *req, void *out) > +{ > + struct md5_state *out_state = out; > + > + return mv_cesa_ahash_export(req, out_state->hash, > + &out_state->byte_count, out_state->block); > +} > + > +static int mv_cesa_md5_import(struct ahash_request *req, const void *in) > +{ > + const struct md5_state *in_state = in; > + > + return mv_cesa_ahash_import(req, in_state->hash, in_state->byte_count, > + in_state->block); > +} > + > static int mv_cesa_md5_digest(struct ahash_request *req) > { > int ret; > @@ -908,53 +929,17 @@ static int mv_cesa_sha1_init(struct ahash_request *req) > static int mv_cesa_sha1_export(struct ahash_request *req, void *out) > { > struct sha1_state *out_state = out; > - struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); > - struct mv_cesa_ahash_req *creq = ahash_request_ctx(req); > - unsigned int digsize = crypto_ahash_digestsize(ahash); > - > - out_state->count = creq->len; > - memcpy(out_state->state, creq->state, digsize); > - memset(out_state->buffer, 0, sizeof(out_state->buffer)); > - if (creq->cache) > - memcpy(out_state->buffer, creq->cache, creq->cache_ptr); > > - return 0; > + return mv_cesa_ahash_export(req, out_state->state, &out_state->count, > + out_state->buffer); > } > > static int mv_cesa_sha1_import(struct ahash_request *req, const void *in) > { > const struct sha1_state *in_state = in; > - struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); > - struct mv_cesa_ahash_req *creq = ahash_request_ctx(req); > - unsigned int digsize = crypto_ahash_digestsize(ahash); > - unsigned int cache_ptr; > - int ret; > - > - ret = crypto_ahash_init(req); > - if (ret) > - return ret; > - > - if (in_state->count >= SHA1_BLOCK_SIZE) > - mv_cesa_update_op_cfg(&creq->op_tmpl, > - CESA_SA_DESC_CFG_MID_FRAG, > - CESA_SA_DESC_CFG_FRAG_MSK); > - > - creq->len = in_state->count; > - memcpy(creq->state, in_state->state, digsize); > - creq->cache_ptr = 0; > - > - cache_ptr = creq->len % SHA1_BLOCK_SIZE; > - if (!cache_ptr) > - return 0; > - > - ret = mv_cesa_ahash_alloc_cache(req); > - if (ret) > - return ret; > - > - memcpy(creq->cache, in_state->buffer, cache_ptr); > - creq->cache_ptr = cache_ptr; > > - return 0; > + return mv_cesa_ahash_import(req, in_state->state, in_state->count, > + in_state->buffer); > } > > static int mv_cesa_sha1_digest(struct ahash_request *req) > @@ -1018,53 +1003,17 @@ static int mv_cesa_sha256_digest(struct ahash_request *req) > static int mv_cesa_sha256_export(struct ahash_request *req, void *out) > { > struct sha256_state *out_state = out; > - struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); > - struct mv_cesa_ahash_req *creq = ahash_request_ctx(req); > - unsigned int ds = crypto_ahash_digestsize(ahash); > > - out_state->count = creq->len; > - memcpy(out_state->state, creq->state, ds); > - memset(out_state->buf, 0, sizeof(out_state->buf)); > - if (creq->cache) > - memcpy(out_state->buf, creq->cache, creq->cache_ptr); > - > - return 0; > + return mv_cesa_ahash_export(req, out_state->state, &out_state->count, > + out_state->buf); > } > > static int mv_cesa_sha256_import(struct ahash_request *req, const void *in) > { > const struct sha256_state *in_state = in; > - struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); > - struct mv_cesa_ahash_req *creq = ahash_request_ctx(req); > - unsigned int digsize = crypto_ahash_digestsize(ahash); > - unsigned int cache_ptr; > - int ret; > - > - ret = crypto_ahash_init(req); > - if (ret) > - return ret; > - > - if (in_state->count >= SHA256_BLOCK_SIZE) > - mv_cesa_update_op_cfg(&creq->op_tmpl, > - CESA_SA_DESC_CFG_MID_FRAG, > - CESA_SA_DESC_CFG_FRAG_MSK); > > - creq->len = in_state->count; > - memcpy(creq->state, in_state->state, digsize); > - creq->cache_ptr = 0; > - > - cache_ptr = creq->len % SHA256_BLOCK_SIZE; > - if (!cache_ptr) > - return 0; > - > - ret = mv_cesa_ahash_alloc_cache(req); > - if (ret) > - return ret; > - > - memcpy(creq->cache, in_state->buf, cache_ptr); > - creq->cache_ptr = cache_ptr; > - > - return 0; > + return mv_cesa_ahash_import(req, in_state->state, in_state->count, > + in_state->buf); > } > > struct ahash_alg mv_sha256_alg = {
Hi Russel, Russell King <rmk+kernel@arm.linux.org.uk> writes: > As all the import functions and export functions are virtually > identical, factor out their common parts into a generic > mv_cesa_ahash_import() and mv_cesa_ahash_export() respectively. This > performs the actual import or export, and we pass the data pointers and > length into these functions. > > We have to switch a % const operation to do_div() in the common import > function to avoid provoking gcc to use the expensive 64-bit by 64-bit > modulus operation. > > Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk> Thanks for the refactoring and for the fixes. All patches look good to me. Out of curiosity, can I ask what perf you get w/ openssh or openssl using AF_ALG and the CESA? Cheers, a+ -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sat, Oct 10, 2015 at 12:37:33AM +0200, Arnaud Ebalard wrote: > Hi Russel, > > Russell King <rmk+kernel@arm.linux.org.uk> writes: > > > As all the import functions and export functions are virtually > > identical, factor out their common parts into a generic > > mv_cesa_ahash_import() and mv_cesa_ahash_export() respectively. This > > performs the actual import or export, and we pass the data pointers and > > length into these functions. > > > > We have to switch a % const operation to do_div() in the common import > > function to avoid provoking gcc to use the expensive 64-bit by 64-bit > > modulus operation. > > > > Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk> > > Thanks for the refactoring and for the fixes. All patches look good to > me. Out of curiosity, can I ask what perf you get w/ openssh or openssl > using AF_ALG and the CESA? I would do, but it seems this AF_ALG plugin for openssl isn't actually using it for encryption. When I try: openssl speed -engine af_alg aes-128-cbc I get results for using openssl's software implementation. If I do: openssl speed -engine af_alg md5 then I get results from using the kernel's MD5. Hence, I think the only thing that I think openssh is using it for is the digest stuff, not the crypto itself. I can't be certain about that though. I've tried debugging the af_alg engine plugin, but I'm not getting very far (I'm not an openssl hacker!) I see it registering the function to get the ciphers (via ENGINE_set_ciphers), and I see this called several times, returning a list of NID_xxx values describing the methods it supports, which includes aes-128-cbc. However, unlike the equivalent digest function, I never see it called requesting any of the ciphers. Maybe it's an openssl bug, or a "feature" preventing hardware crypto? Maybe something is missing from its initialisation? I've no idea yet. It seems I'm not alone in this - this report from April 2015 is exactly what I'm seeing: https://mta.openssl.org/pipermail/openssl-users/2015-April/001124.html However, I'm coming to the conclusion that AF_ALG with openssl is a dead project, and the only interface that everyone is using for that is cryptodev - probably contary to Herbert and/or DaveM's wishes. For example, the openwrt guys seem to only support cryptodev, according to their wiki page on the subject of hardware crypto: http://wiki.openwrt.org/doc/hardware/cryptographic.hardware.accelerators Here's the references to code for AF_ALG with openssl I've found so far: Original af_alg plugin (dead): http://src.carnivore.it/users/common/af_alg/ 3rd party "maintained" af_alg openssl plugin, derived from commit 1851bbb010c38878c83729be844f168192059189 in the above repo but with no history: https://github.com/RidgeRun/af-alg-rr and that doesn't contain any changes to the C code originally committed. Whether this C code contains changes or not is anyone's guess: there's no way to refer back to the original repository. Anyway, here's the digest results: Software: The 'numbers' are in 1000s of bytes per second processed. type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes md5 13948.89k 42477.61k 104619.41k 165140.82k 199273.13k sha1 13091.91k 36463.89k 75393.88k 103893.33k 117104.50k sha256 13573.92k 30492.25k 52700.33k 64247.81k 68722.69k Hardware: The 'numbers' are in 1000s of bytes per second processed. type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes md5 3964.55k 13782.11k 43181.71k 180263.38k 1446616.18k sha1 4609.16k 8922.35k 35422.87k 333575.31k 2122547.20k sha256 13519.62k 30484.10k 52547.47k 64285.21k 68530.60k There's actually something suspicious while running these tests: Doing md5 for 3s on 16 size blocks: 32212 md5's in 0.13s Doing md5 for 3s on 64 size blocks: 23688 md5's in 0.11s Doing md5 for 3s on 256 size blocks: 23615 md5's in 0.14s Doing md5 for 3s on 1024 size blocks: 22885 md5's in 0.13s Doing md5 for 3s on 8192 size blocks: 15893 md5's in 0.09s Doing sha1 for 3s on 16 size blocks: 31688 sha1's in 0.11s Doing sha1 for 3s on 64 size blocks: 23700 sha1's in 0.17s Doing sha1 for 3s on 256 size blocks: 23523 sha1's in 0.17s Doing sha1 for 3s on 1024 size blocks: 22803 sha1's in 0.07s Doing sha1 for 3s on 8192 size blocks: 15546 sha1's in 0.06s Doing sha256 for 3s on 16 size blocks: 2518030 sha256's in 2.98s Doing sha256 for 3s on 64 size blocks: 1419416 sha256's in 2.98s Doing sha256 for 3s on 256 size blocks: 613738 sha256's in 2.99s Doing sha256 for 3s on 1024 size blocks: 187080 sha256's in 2.98s Doing sha256 for 3s on 8192 size blocks: 25013 sha256's in 2.99s from the hardware - note the "in" figures are rediculously low, yet they do wait 3s for each test. Also, the sha256 results are close enough to being the software version. No ideas on any of this yet... but I'm not about to start digging in the openssl code to try and work out what it's up to. As I say, I think this is AF_ALG with openssl is a dead project.
Hi Russel, Russell King - ARM Linux <linux@arm.linux.org.uk> writes: > On Sat, Oct 10, 2015 at 12:37:33AM +0200, Arnaud Ebalard wrote: >> Hi Russel, >> >> Russell King <rmk+kernel@arm.linux.org.uk> writes: >> >> > As all the import functions and export functions are virtually >> > identical, factor out their common parts into a generic >> > mv_cesa_ahash_import() and mv_cesa_ahash_export() respectively. This >> > performs the actual import or export, and we pass the data pointers and >> > length into these functions. >> > >> > We have to switch a % const operation to do_div() in the common import >> > function to avoid provoking gcc to use the expensive 64-bit by 64-bit >> > modulus operation. >> > >> > Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk> >> >> Thanks for the refactoring and for the fixes. All patches look good to >> me. Out of curiosity, can I ask what perf you get w/ openssh or openssl >> using AF_ALG and the CESA? > > I would do, but it seems this AF_ALG plugin for openssl isn't > actually using it for encryption. When I try: > > openssl speed -engine af_alg aes-128-cbc > > I get results for using openssl's software implementation. If I do: > > openssl speed -engine af_alg md5 > > then I get results from using the kernel's MD5. Hence, I think the > only thing that I think openssh is using it for is the digest stuff, > not the crypto itself. I can't be certain about that though. > > I've tried debugging the af_alg engine plugin, but I'm not getting > very far (I'm not an openssl hacker!) I see it registering the > function to get the ciphers (via ENGINE_set_ciphers), and I see this > called several times, returning a list of NID_xxx values describing > the methods it supports, which includes aes-128-cbc. However, > unlike the equivalent digest function, I never see it called > requesting any of the ciphers. Maybe it's an openssl bug, or a > "feature" preventing hardware crypto? Maybe something is missing > from its initialisation? I've no idea yet. It seems I'm not alone > in this - this report from April 2015 is exactly what I'm seeing: > > https://mta.openssl.org/pipermail/openssl-users/2015-April/001124.html > > However, I'm coming to the conclusion that AF_ALG with openssl is a > dead project, and the only interface that everyone is using for that > is cryptodev - probably contary to Herbert and/or DaveM's wishes. For > example, the openwrt guys seem to only support cryptodev, according to > their wiki page on the subject of hardware crypto: > > http://wiki.openwrt.org/doc/hardware/cryptographic.hardware.accelerators > > Here's the references to code for AF_ALG with openssl I've found so far: > > Original af_alg plugin (dead): > > http://src.carnivore.it/users/common/af_alg/ > > 3rd party "maintained" af_alg openssl plugin, derived from commit > 1851bbb010c38878c83729be844f168192059189 in the above repo but with > no history: > > https://github.com/RidgeRun/af-alg-rr > > and that doesn't contain any changes to the C code originally committed. > Whether this C code contains changes or not is anyone's guess: there's > no way to refer back to the original repository. > > Anyway, here's the digest results: > > Software: > The 'numbers' are in 1000s of bytes per second processed. > type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes > md5 13948.89k 42477.61k 104619.41k 165140.82k 199273.13k > sha1 13091.91k 36463.89k 75393.88k 103893.33k 117104.50k > sha256 13573.92k 30492.25k 52700.33k 64247.81k 68722.69k > > Hardware: > The 'numbers' are in 1000s of bytes per second processed. > type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes > md5 3964.55k 13782.11k 43181.71k 180263.38k 1446616.18k > sha1 4609.16k 8922.35k 35422.87k 333575.31k 2122547.20k > sha256 13519.62k 30484.10k 52547.47k 64285.21k 68530.60k > > There's actually something suspicious while running these tests: > > Doing md5 for 3s on 16 size blocks: 32212 md5's in 0.13s > Doing md5 for 3s on 64 size blocks: 23688 md5's in 0.11s > Doing md5 for 3s on 256 size blocks: 23615 md5's in 0.14s > Doing md5 for 3s on 1024 size blocks: 22885 md5's in 0.13s > Doing md5 for 3s on 8192 size blocks: 15893 md5's in 0.09s > Doing sha1 for 3s on 16 size blocks: 31688 sha1's in 0.11s > Doing sha1 for 3s on 64 size blocks: 23700 sha1's in 0.17s > Doing sha1 for 3s on 256 size blocks: 23523 sha1's in 0.17s > Doing sha1 for 3s on 1024 size blocks: 22803 sha1's in 0.07s > Doing sha1 for 3s on 8192 size blocks: 15546 sha1's in 0.06s > Doing sha256 for 3s on 16 size blocks: 2518030 sha256's in 2.98s > Doing sha256 for 3s on 64 size blocks: 1419416 sha256's in 2.98s > Doing sha256 for 3s on 256 size blocks: 613738 sha256's in 2.99s > Doing sha256 for 3s on 1024 size blocks: 187080 sha256's in 2.98s > Doing sha256 for 3s on 8192 size blocks: 25013 sha256's in 2.99s > > from the hardware - note the "in" figures are rediculously low, yet > they do wait 3s for each test. Also, the sha256 results are close > enough to being the software version. > > No ideas on any of this yet... but I'm not about to start digging in > the openssl code to try and work out what it's up to. As I say, I > think this is AF_ALG with openssl is a dead project. Thanks for the time you took to assemble the information in previous email. Yesterday, when reading your patches, I ended up on [1], where Marek (added him to Cc: list) basically has the same kind of conclusion as yours, i.e. openssl w/ cryptodev is what currently works better even if AF_ALG is the expected target for kernel to provide access to hardware engines to userland apps. I had a lot of performance results at various levels (tcrypt module on variations of the drivers (tasklet, threaded irq, full polling, etc), IPsec tunnel and transport mode through to see how it behaves w/ two mvneta instances also eating CPU cycles for incoming/outgoing packets) but those where done on an encryption use case. Some are provided in [2]. In an early (read dirty) polling-based version of the driver, the CESA on an Armada 370 (mirabox) was verified to be capable of near 100MB/s on buffers of 1500+ bytes for AES CBC encryption. Current version of the driver is not as good (say half that value) but it behaves better. A Mirabox can easily route 1500 bytes packets at 100MB/s between its two interfaces but when you mix both using IPsec in tunnel mode on one side, you end up w/ perfs between 10 to 15MB/s, IIRC. I think it's interesting to see where it ends up w/ the engine exposed to userland consumers (e.g. sth like SSH). I cannot promise a huge amount of time but I'll try and find some to play w/ AF_ALG using openssl and CESA in the coming weeks. Cheers, a+ [1]: http://events.linuxfoundation.org/sites/events/files/slides/lcj-2014-crypto-user.pdf [2]: http://lists.infradead.org/pipermail/linux-arm-kernel/2015-April/336599.html -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sat, Oct 10, 2015 at 12:31:29PM +0200, Arnaud Ebalard wrote: > Hi Russel, ^ > Russell King - ARM Linux <linux@arm.linux.org.uk> writes: > > Software: > > The 'numbers' are in 1000s of bytes per second processed. > > type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes > > md5 13948.89k 42477.61k 104619.41k 165140.82k 199273.13k > > sha1 13091.91k 36463.89k 75393.88k 103893.33k 117104.50k > > sha256 13573.92k 30492.25k 52700.33k 64247.81k 68722.69k > > > > Hardware: > > The 'numbers' are in 1000s of bytes per second processed. > > type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes > > md5 3964.55k 13782.11k 43181.71k 180263.38k 1446616.18k > > sha1 4609.16k 8922.35k 35422.87k 333575.31k 2122547.20k > > sha256 13519.62k 30484.10k 52547.47k 64285.21k 68530.60k Okay, the reason for the difference in SHA256 speed is because the "openssl speed" code *totally* *bypasses* the engine support, whereas the md5 and sha1 do not. It even bypasses the normal method used to get hold of the sha256 implementation (EVP_sha256), and goes straight to using SHA256() directly in openssl/crypto/sha/sha256.c. It looks like the same goes for the AES tests too. > I had a lot of performance results at various levels (tcrypt module on > variations of the drivers (tasklet, threaded irq, full polling, etc), > IPsec tunnel and transport mode through to see how it behaves w/ two > mvneta instances also eating CPU cycles for incoming/outgoing packets) > but those where done on an encryption use case. Some are provided > in [2]. In an early (read dirty) polling-based version of the driver, > the CESA on an Armada 370 (mirabox) was verified to be capable of near > 100MB/s on buffers of 1500+ bytes for AES CBC encryption. Current > version of the driver is not as good (say half that value) but it > behaves better. A Mirabox can easily route 1500 bytes packets at 100MB/s > between its two interfaces but when you mix both using IPsec in tunnel > mode on one side, you end up w/ perfs between 10 to 15MB/s, IIRC. I > think it's interesting to see where it ends up w/ the engine exposed to > userland consumers (e.g. sth like SSH). > > I cannot promise a huge amount of time but I'll try and find some to > play w/ AF_ALG using openssl and CESA in the coming weeks. I think what we draw from my investigation is that "openssl speed" is utterly crap - you don't actually know what's being tested there. Some things test the engine, others bypass the engine infrastructure totally and test the openssl software implementation instead. So, if you think "openssl speed" is a good way to measure the speed of digests and ciphers that openssl supplies to applications, *think again*. It doesn't.
On Sat, Oct 10, 2015 at 12:29:25PM +0100, Russell King - ARM Linux wrote: > On Sat, Oct 10, 2015 at 12:31:29PM +0200, Arnaud Ebalard wrote: > > Russell King - ARM Linux <linux@arm.linux.org.uk> writes: > > > Software: > > > The 'numbers' are in 1000s of bytes per second processed. > > > type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes > > > md5 13948.89k 42477.61k 104619.41k 165140.82k 199273.13k > > > sha1 13091.91k 36463.89k 75393.88k 103893.33k 117104.50k > > > sha256 13573.92k 30492.25k 52700.33k 64247.81k 68722.69k > > > > > > Hardware: > > > The 'numbers' are in 1000s of bytes per second processed. > > > type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes > > > md5 3964.55k 13782.11k 43181.71k 180263.38k 1446616.18k > > > sha1 4609.16k 8922.35k 35422.87k 333575.31k 2122547.20k > > > sha256 13519.62k 30484.10k 52547.47k 64285.21k 68530.60k > > Okay, the reason for the difference in SHA256 speed is because the > "openssl speed" code *totally* *bypasses* the engine support, whereas > the md5 and sha1 do not. It even bypasses the normal method used to > get hold of the sha256 implementation (EVP_sha256), and goes straight > to using SHA256() directly in openssl/crypto/sha/sha256.c. It looks > like the same goes for the AES tests too. Okay, what's required is openssl speed -evp <cipher|digest> and it can only do one at a time. That's really confusing and non-intuitive, given that some of the non-evp options end up testing the EVP methods. So, running: $ openssl speed -evp aes-128-cbc I get the kernel spitting out a number of these warnings during its run: DRBG: could not allocate digest TFM handle: hmac(sha512) I also notice is that 50% of CPU time is spent in the kernel, presumably because the lack of hardware sha512 means that we end up having to do sha512 in software in the kernel. I _also_ notice that despite having the ARM assembly crypto functions enabled and built-in to the kernel, they don't appear in /proc/crypto - and this is because of patch 1 in this series, which blocks out any crypto driver which has a zero statesize (as the ARM crypto functions appear to have.) I found this by rebuilding the ARM crypto stuff as modules, and then trying to insert them: # modprobe sha512-arm modprobe: ERROR: could not insert 'sha512_arm': Invalid argument So, I think it's best if this patch series is *NOT* merged until someone who knows the kernel crypto code gets to grips with what's supposed to be done in various parts of the code. Yes, I know that Herbert suggested the approach in patch 1, but that _will_ cause regressions like the above when if it's merged. Either the ARM crypto code in arch/arm/crypto is buggy, or the approach in patch 1 is buggy.
On Saturday, October 10, 2015 at 01:29:25 PM, Russell King - ARM Linux wrote: > On Sat, Oct 10, 2015 at 12:31:29PM +0200, Arnaud Ebalard wrote: > > Hi Russel, > > ^ > > > Russell King - ARM Linux <linux@arm.linux.org.uk> writes: > > > Software: > > > The 'numbers' are in 1000s of bytes per second processed. > > > type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 > > > bytes md5 13948.89k 42477.61k 104619.41k > > > 165140.82k 199273.13k sha1 13091.91k 36463.89k > > > 75393.88k 103893.33k 117104.50k sha256 13573.92k > > > 30492.25k 52700.33k 64247.81k 68722.69k > > > > > > Hardware: > > > The 'numbers' are in 1000s of bytes per second processed. > > > type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 > > > bytes md5 3964.55k 13782.11k 43181.71k > > > 180263.38k 1446616.18k sha1 4609.16k 8922.35k > > > 35422.87k 333575.31k 2122547.20k sha256 13519.62k > > > 30484.10k 52547.47k 64285.21k 68530.60k > > Okay, the reason for the difference in SHA256 speed is because the > "openssl speed" code *totally* *bypasses* the engine support, whereas > the md5 and sha1 do not. It even bypasses the normal method used to > get hold of the sha256 implementation (EVP_sha256), and goes straight > to using SHA256() directly in openssl/crypto/sha/sha256.c. It looks > like the same goes for the AES tests too. > > > I had a lot of performance results at various levels (tcrypt module on > > variations of the drivers (tasklet, threaded irq, full polling, etc), > > IPsec tunnel and transport mode through to see how it behaves w/ two > > mvneta instances also eating CPU cycles for incoming/outgoing packets) > > but those where done on an encryption use case. Some are provided > > in [2]. In an early (read dirty) polling-based version of the driver, > > the CESA on an Armada 370 (mirabox) was verified to be capable of near > > 100MB/s on buffers of 1500+ bytes for AES CBC encryption. Current > > version of the driver is not as good (say half that value) but it > > behaves better. A Mirabox can easily route 1500 bytes packets at 100MB/s > > between its two interfaces but when you mix both using IPsec in tunnel > > mode on one side, you end up w/ perfs between 10 to 15MB/s, IIRC. I > > think it's interesting to see where it ends up w/ the engine exposed to > > userland consumers (e.g. sth like SSH). > > > > I cannot promise a huge amount of time but I'll try and find some to > > play w/ AF_ALG using openssl and CESA in the coming weeks. > > I think what we draw from my investigation is that "openssl speed" is > utterly crap - you don't actually know what's being tested there. Some > things test the engine, others bypass the engine infrastructure totally > and test the openssl software implementation instead. > > So, if you think "openssl speed" is a good way to measure the speed of > digests and ciphers that openssl supplies to applications, *think again*. > It doesn't. Add to that the fact that openssl speed does NOT verify the results of the transformations, so it's not usable for detecting errors during high load. It's utter crap, just like you said. Best regards, Marek Vasut -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sat, Oct 10, 2015 at 05:17:54PM +0100, Russell King - ARM Linux wrote: > > I _also_ notice that despite having the ARM assembly crypto functions > enabled and built-in to the kernel, they don't appear in /proc/crypto - > and this is because of patch 1 in this series, which blocks out any > crypto driver which has a zero statesize (as the ARM crypto functions > appear to have.) I found this by rebuilding the ARM crypto stuff as > modules, and then trying to insert them: They're buggy and unfortunately this wasn't caught during the review process. The import/export functions are required and certainly not optional. > # modprobe sha512-arm > modprobe: ERROR: could not insert 'sha512_arm': Invalid argument This is the correct response and what will happen is that the software implementation of sha512 will be used rather than the accelerated sha512_arm. Once the sha512_arm driver has been fixed then it can be loaded again. > So, I think it's best if this patch series is *NOT* merged until someone > who knows the kernel crypto code gets to grips with what's supposed to be > done in various parts of the code. Yes, I know that Herbert suggested > the approach in patch 1, but that _will_ cause regressions like the above > when if it's merged. Considering that the current driver can be used to trigger a kernel oops from user-space I think preventing the buggy driver from loading is the right thing to do. We can then fix them one-by-one. Cheers,
On Sun, Oct 11, 2015 at 02:55:05PM +0800, Herbert Xu wrote: > On Sat, Oct 10, 2015 at 05:17:54PM +0100, Russell King - ARM Linux wrote: > > > > I _also_ notice that despite having the ARM assembly crypto functions > > enabled and built-in to the kernel, they don't appear in /proc/crypto - > > and this is because of patch 1 in this series, which blocks out any > > crypto driver which has a zero statesize (as the ARM crypto functions > > appear to have.) I found this by rebuilding the ARM crypto stuff as > > modules, and then trying to insert them: > > They're buggy and unfortunately this wasn't caught during the > review process. The import/export functions are required and > certainly not optional. OK it looks like I was confused. Yes the import and export functions are required but for shash algorithms we provide a simple default. This means that implementations can simply leave it NULL and they will then use the default import/export functions which exports the shash_desc as the state. So Russell what I'll do is apply your patch without the hunk for shash_prepare_alg. IOW we will block any ahash algorithms that leave state as zero since ahash does not provide a default import/export function (it can't because it may involve hardware state). shash algorithms on the other hand won't be affected. Cheers,
On Tue, Oct 13, 2015 at 09:00:48PM +0800, Herbert Xu wrote: > On Sun, Oct 11, 2015 at 02:55:05PM +0800, Herbert Xu wrote: > > On Sat, Oct 10, 2015 at 05:17:54PM +0100, Russell King - ARM Linux wrote: > > > > > > I _also_ notice that despite having the ARM assembly crypto functions > > > enabled and built-in to the kernel, they don't appear in /proc/crypto - > > > and this is because of patch 1 in this series, which blocks out any > > > crypto driver which has a zero statesize (as the ARM crypto functions > > > appear to have.) I found this by rebuilding the ARM crypto stuff as > > > modules, and then trying to insert them: > > > > They're buggy and unfortunately this wasn't caught during the > > review process. The import/export functions are required and > > certainly not optional. > > OK it looks like I was confused. Yes the import and export functions > are required but for shash algorithms we provide a simple default. > This means that implementations can simply leave it NULL and they > will then use the default import/export functions which exports > the shash_desc as the state. > > So Russell what I'll do is apply your patch without the hunk > for shash_prepare_alg. IOW we will block any ahash algorithms > that leave state as zero since ahash does not provide a default > import/export function (it can't because it may involve hardware > state). shash algorithms on the other hand won't be affected. Okay. I've a larger queue of patches building at the moment for the Marvell driver - I've found that it's touch-and-go whether it produces the correct hash result, and previous hashes can affect subsequent hashes in some circumstances. I'm currently at another 7 patches, plus a "big" as-yet uncommitted patch which is virtually a rewrite of the DMA preparation - diffstat wise, in total it's looking like: drivers/crypto/marvell/cesa.h | 8 +- drivers/crypto/marvell/hash.c | 258 ++++++++++++++++++++++-------------------- drivers/crypto/marvell/tdma.c | 7 ++ 3 files changed, 151 insertions(+), 122 deletions(-) and there's still a few more cases that need solving: zero bytes of input should not give a zero hash, and an input which is the multiple of the blocksize causes the hardware to hang - though I'm not yet sure whether that's a result of my above changes. At least with my above changes, I now get stable and correct hash values for sizes which do not hit any of those remaining bugs. Given it's size so far, I'm not sure it makes much sense to backport any of these fixes to stable; I think maybe we should just say "okay, before these fixes, it's just broken."
On Tue, Oct 13, 2015 at 02:55:18PM +0100, Russell King - ARM Linux wrote: > > Given it's size so far, I'm not sure it makes much sense to backport > any of these fixes to stable; I think maybe we should just say "okay, > before these fixes, it's just broken." OK as long as your first patch goes into stable it should be fine as it'll essentially disable the marvell driver. Cheers,
On Tue, Oct 13, 2015 at 09:57:22PM +0800, Herbert Xu wrote: > On Tue, Oct 13, 2015 at 02:55:18PM +0100, Russell King - ARM Linux wrote: > > > > Given it's size so far, I'm not sure it makes much sense to backport > > any of these fixes to stable; I think maybe we should just say "okay, > > before these fixes, it's just broken." > > OK as long as your first patch goes into stable it should be fine > as it'll essentially disable the marvell driver. It will disable the Marvell driver, along with the SW implementations in arch/arm/crypto/ which don't set .statesize, .import and .export. I would not be surprised if it also affects others too.
On Tue, Oct 13, 2015 at 02:59:47PM +0100, Russell King - ARM Linux wrote: > > It will disable the Marvell driver, along with the SW implementations in > arch/arm/crypto/ which don't set .statesize, .import and .export. I > would not be surprised if it also affects others too. As I said earlier I'll make sure to delete the hunk for shash_prepare_alg when I apply your patch so only ahash drivers will be affected. The ARM stuff is all shash so they are fine and will use the default import/export functions. Cheers,
diff --git a/drivers/crypto/marvell/hash.c b/drivers/crypto/marvell/hash.c index b7c2c05f1a01..a24ceda7acfe 100644 --- a/drivers/crypto/marvell/hash.c +++ b/drivers/crypto/marvell/hash.c @@ -795,39 +795,32 @@ static int mv_cesa_ahash_finup(struct ahash_request *req) return ret; } -static int mv_cesa_md5_init(struct ahash_request *req) +static int mv_cesa_ahash_export(struct ahash_request *req, void *hash, + u64 *len, void *cache) { - struct mv_cesa_op_ctx tmpl; - - mv_cesa_set_op_cfg(&tmpl, CESA_SA_DESC_CFG_MACM_MD5); - - mv_cesa_ahash_init(req, &tmpl); - - return 0; -} - -static int mv_cesa_md5_export(struct ahash_request *req, void *out) -{ - struct md5_state *out_state = out; struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); struct mv_cesa_ahash_req *creq = ahash_request_ctx(req); unsigned int digsize = crypto_ahash_digestsize(ahash); + unsigned int blocksize; + + blocksize = crypto_tfm_alg_blocksize(crypto_ahash_tfm(ahash)); - out_state->byte_count = creq->len; - memcpy(out_state->hash, creq->state, digsize); - memset(out_state->block, 0, sizeof(out_state->block)); + *len = creq->len; + memcpy(hash, creq->state, digsize); + memset(cache, 0, blocksize); if (creq->cache) - memcpy(out_state->block, creq->cache, creq->cache_ptr); + memcpy(cache, creq->cache, creq->cache_ptr); return 0; } -static int mv_cesa_md5_import(struct ahash_request *req, const void *in) +static int mv_cesa_ahash_import(struct ahash_request *req, const void *hash, + u64 len, const void *cache) { - const struct md5_state *in_state = in; struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); struct mv_cesa_ahash_req *creq = ahash_request_ctx(req); unsigned int digsize = crypto_ahash_digestsize(ahash); + unsigned int blocksize; unsigned int cache_ptr; int ret; @@ -835,16 +828,17 @@ static int mv_cesa_md5_import(struct ahash_request *req, const void *in) if (ret) return ret; - if (in_state->byte_count >= sizeof(in_state->block)) + blocksize = crypto_tfm_alg_blocksize(crypto_ahash_tfm(ahash)); + if (len >= blocksize) mv_cesa_update_op_cfg(&creq->op_tmpl, CESA_SA_DESC_CFG_MID_FRAG, CESA_SA_DESC_CFG_FRAG_MSK); - creq->len = in_state->byte_count; - memcpy(creq->state, in_state->hash, digsize); + creq->len = len; + memcpy(creq->state, hash, digsize); creq->cache_ptr = 0; - cache_ptr = creq->len % sizeof(in_state->block); + cache_ptr = do_div(len, blocksize); if (!cache_ptr) return 0; @@ -852,12 +846,39 @@ static int mv_cesa_md5_import(struct ahash_request *req, const void *in) if (ret) return ret; - memcpy(creq->cache, in_state->block, cache_ptr); + memcpy(creq->cache, cache, cache_ptr); creq->cache_ptr = cache_ptr; return 0; } +static int mv_cesa_md5_init(struct ahash_request *req) +{ + struct mv_cesa_op_ctx tmpl; + + mv_cesa_set_op_cfg(&tmpl, CESA_SA_DESC_CFG_MACM_MD5); + + mv_cesa_ahash_init(req, &tmpl); + + return 0; +} + +static int mv_cesa_md5_export(struct ahash_request *req, void *out) +{ + struct md5_state *out_state = out; + + return mv_cesa_ahash_export(req, out_state->hash, + &out_state->byte_count, out_state->block); +} + +static int mv_cesa_md5_import(struct ahash_request *req, const void *in) +{ + const struct md5_state *in_state = in; + + return mv_cesa_ahash_import(req, in_state->hash, in_state->byte_count, + in_state->block); +} + static int mv_cesa_md5_digest(struct ahash_request *req) { int ret; @@ -908,53 +929,17 @@ static int mv_cesa_sha1_init(struct ahash_request *req) static int mv_cesa_sha1_export(struct ahash_request *req, void *out) { struct sha1_state *out_state = out; - struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct mv_cesa_ahash_req *creq = ahash_request_ctx(req); - unsigned int digsize = crypto_ahash_digestsize(ahash); - - out_state->count = creq->len; - memcpy(out_state->state, creq->state, digsize); - memset(out_state->buffer, 0, sizeof(out_state->buffer)); - if (creq->cache) - memcpy(out_state->buffer, creq->cache, creq->cache_ptr); - return 0; + return mv_cesa_ahash_export(req, out_state->state, &out_state->count, + out_state->buffer); } static int mv_cesa_sha1_import(struct ahash_request *req, const void *in) { const struct sha1_state *in_state = in; - struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct mv_cesa_ahash_req *creq = ahash_request_ctx(req); - unsigned int digsize = crypto_ahash_digestsize(ahash); - unsigned int cache_ptr; - int ret; - - ret = crypto_ahash_init(req); - if (ret) - return ret; - - if (in_state->count >= SHA1_BLOCK_SIZE) - mv_cesa_update_op_cfg(&creq->op_tmpl, - CESA_SA_DESC_CFG_MID_FRAG, - CESA_SA_DESC_CFG_FRAG_MSK); - - creq->len = in_state->count; - memcpy(creq->state, in_state->state, digsize); - creq->cache_ptr = 0; - - cache_ptr = creq->len % SHA1_BLOCK_SIZE; - if (!cache_ptr) - return 0; - - ret = mv_cesa_ahash_alloc_cache(req); - if (ret) - return ret; - - memcpy(creq->cache, in_state->buffer, cache_ptr); - creq->cache_ptr = cache_ptr; - return 0; + return mv_cesa_ahash_import(req, in_state->state, in_state->count, + in_state->buffer); } static int mv_cesa_sha1_digest(struct ahash_request *req) @@ -1018,53 +1003,17 @@ static int mv_cesa_sha256_digest(struct ahash_request *req) static int mv_cesa_sha256_export(struct ahash_request *req, void *out) { struct sha256_state *out_state = out; - struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct mv_cesa_ahash_req *creq = ahash_request_ctx(req); - unsigned int ds = crypto_ahash_digestsize(ahash); - out_state->count = creq->len; - memcpy(out_state->state, creq->state, ds); - memset(out_state->buf, 0, sizeof(out_state->buf)); - if (creq->cache) - memcpy(out_state->buf, creq->cache, creq->cache_ptr); - - return 0; + return mv_cesa_ahash_export(req, out_state->state, &out_state->count, + out_state->buf); } static int mv_cesa_sha256_import(struct ahash_request *req, const void *in) { const struct sha256_state *in_state = in; - struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct mv_cesa_ahash_req *creq = ahash_request_ctx(req); - unsigned int digsize = crypto_ahash_digestsize(ahash); - unsigned int cache_ptr; - int ret; - - ret = crypto_ahash_init(req); - if (ret) - return ret; - - if (in_state->count >= SHA256_BLOCK_SIZE) - mv_cesa_update_op_cfg(&creq->op_tmpl, - CESA_SA_DESC_CFG_MID_FRAG, - CESA_SA_DESC_CFG_FRAG_MSK); - creq->len = in_state->count; - memcpy(creq->state, in_state->state, digsize); - creq->cache_ptr = 0; - - cache_ptr = creq->len % SHA256_BLOCK_SIZE; - if (!cache_ptr) - return 0; - - ret = mv_cesa_ahash_alloc_cache(req); - if (ret) - return ret; - - memcpy(creq->cache, in_state->buf, cache_ptr); - creq->cache_ptr = cache_ptr; - - return 0; + return mv_cesa_ahash_import(req, in_state->state, in_state->count, + in_state->buf); } struct ahash_alg mv_sha256_alg = {
As all the import functions and export functions are virtually identical, factor out their common parts into a generic mv_cesa_ahash_import() and mv_cesa_ahash_export() respectively. This performs the actual import or export, and we pass the data pointers and length into these functions. We have to switch a % const operation to do_div() in the common import function to avoid provoking gcc to use the expensive 64-bit by 64-bit modulus operation. Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk> --- Replacement to patch 5, going a little further, as requested by Boris. drivers/crypto/marvell/hash.c | 157 ++++++++++++++---------------------------- 1 file changed, 53 insertions(+), 104 deletions(-)