diff mbox series

[9/9] crypto: qce - switch to using a mutex

Message ID 20241203-crypto-qce-refactor-v1-9-c5901d2dd45c@linaro.org (mailing list archive)
State Accepted
Delegated to: Herbert Xu
Headers show
Series crypto: qce - refactor the driver | expand

Commit Message

Bartosz Golaszewski Dec. 3, 2024, 9:19 a.m. UTC
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>

Having switched to workqueue from tasklet, we are no longer limited to
atomic APIs and can now convert the spinlock to a mutex. This, along
with the conversion from tasklet to workqueue grants us ~15% improvement
in cryptsetup benchmarks for AES encryption.

While at it: use guards to simplify locking code.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/crypto/qce/core.c | 46 +++++++++++++++++++++-------------------------
 drivers/crypto/qce/core.h |  3 ++-
 2 files changed, 23 insertions(+), 26 deletions(-)

Comments

Neil Armstrong Dec. 3, 2024, 1:53 p.m. UTC | #1
On 03/12/2024 10:19, Bartosz Golaszewski wrote:
> From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
> 
> Having switched to workqueue from tasklet, we are no longer limited to
> atomic APIs and can now convert the spinlock to a mutex. This, along
> with the conversion from tasklet to workqueue grants us ~15% improvement
> in cryptsetup benchmarks for AES encryption.

Can you share on which platforms you did the tests and the results you got ?

> 
> While at it: use guards to simplify locking code.
> 
> Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
> ---
>   drivers/crypto/qce/core.c | 46 +++++++++++++++++++++-------------------------
>   drivers/crypto/qce/core.h |  3 ++-
>   2 files changed, 23 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/crypto/qce/core.c b/drivers/crypto/qce/core.c
> index 6de9f1e23e282..e95e84486d9ae 100644
> --- a/drivers/crypto/qce/core.c
> +++ b/drivers/crypto/qce/core.c
> @@ -3,6 +3,7 @@
>    * Copyright (c) 2010-2014, The Linux Foundation. All rights reserved.
>    */
>   
> +#include <linux/cleanup.h>
>   #include <linux/clk.h>
>   #include <linux/device.h>
>   #include <linux/dma-mapping.h>
> @@ -11,7 +12,6 @@
>   #include <linux/module.h>
>   #include <linux/mod_devicetable.h>
>   #include <linux/platform_device.h>
> -#include <linux/spinlock.h>
>   #include <linux/types.h>
>   #include <crypto/algapi.h>
>   #include <crypto/internal/hash.h>
> @@ -89,34 +89,28 @@ static int qce_handle_queue(struct qce_device *qce,
>   			    struct crypto_async_request *req)
>   {
>   	struct crypto_async_request *async_req, *backlog;
> -	unsigned long flags;
>   	int ret = 0, err;
>   
> -	spin_lock_irqsave(&qce->lock, flags);
> +	scoped_guard(mutex, &qce->lock) {
> +		if (req)
> +			ret = crypto_enqueue_request(&qce->queue, req);
>   
> -	if (req)
> -		ret = crypto_enqueue_request(&qce->queue, req);
> +		/* busy, do not dequeue request */
> +		if (qce->req)
> +			return ret;
>   
> -	/* busy, do not dequeue request */
> -	if (qce->req) {
> -		spin_unlock_irqrestore(&qce->lock, flags);
> -		return ret;
> +		backlog = crypto_get_backlog(&qce->queue);
> +		async_req = crypto_dequeue_request(&qce->queue);
> +		if (async_req)
> +			qce->req = async_req;
>   	}
>   
> -	backlog = crypto_get_backlog(&qce->queue);
> -	async_req = crypto_dequeue_request(&qce->queue);
> -	if (async_req)
> -		qce->req = async_req;
> -
> -	spin_unlock_irqrestore(&qce->lock, flags);
> -
>   	if (!async_req)
>   		return ret;
>   
>   	if (backlog) {
> -		spin_lock_bh(&qce->lock);
> -		crypto_request_complete(backlog, -EINPROGRESS);
> -		spin_unlock_bh(&qce->lock);
> +		scoped_guard(mutex, &qce->lock)
> +			crypto_request_complete(backlog, -EINPROGRESS);
>   	}
>   
>   	err = qce_handle_request(async_req);
> @@ -133,12 +127,11 @@ static void qce_req_done_work(struct work_struct *work)
>   	struct qce_device *qce = container_of(work, struct qce_device,
>   					      done_work);
>   	struct crypto_async_request *req;
> -	unsigned long flags;
>   
> -	spin_lock_irqsave(&qce->lock, flags);
> -	req = qce->req;
> -	qce->req = NULL;
> -	spin_unlock_irqrestore(&qce->lock, flags);
> +	scoped_guard(mutex, &qce->lock) {
> +		req = qce->req;
> +		qce->req = NULL;
> +	}
>   
>   	if (req)
>   		crypto_request_complete(req, qce->result);
> @@ -243,7 +236,10 @@ static int qce_crypto_probe(struct platform_device *pdev)
>   	if (ret)
>   		return ret;
>   
> -	spin_lock_init(&qce->lock);
> +	ret = devm_mutex_init(qce->dev, &qce->lock);
> +	if (ret)
> +		return ret;
> +
>   	INIT_WORK(&qce->done_work, qce_req_done_work);
>   	crypto_init_queue(&qce->queue, QCE_QUEUE_LENGTH);
>   
> diff --git a/drivers/crypto/qce/core.h b/drivers/crypto/qce/core.h
> index 39e75a75a4293..eb6fa7a8b64a8 100644
> --- a/drivers/crypto/qce/core.h
> +++ b/drivers/crypto/qce/core.h
> @@ -6,6 +6,7 @@
>   #ifndef _CORE_H_
>   #define _CORE_H_
>   
> +#include <linux/mutex.h>
>   #include <linux/workqueue.h>
>   
>   #include "dma.h"
> @@ -30,7 +31,7 @@
>    */
>   struct qce_device {
>   	struct crypto_queue queue;
> -	spinlock_t lock;
> +	struct mutex lock;
>   	struct work_struct done_work;
>   	struct crypto_async_request *req;
>   	int result;
> 

Reviewed-by: Neil Armstrong <neil.armstrong@linaro.org>
Bartosz Golaszewski Dec. 3, 2024, 3:10 p.m. UTC | #2
On Tue, 3 Dec 2024 14:53:21 +0100, neil.armstrong@linaro.org said:
> On 03/12/2024 10:19, Bartosz Golaszewski wrote:
>> From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
>>
>> Having switched to workqueue from tasklet, we are no longer limited to
>> atomic APIs and can now convert the spinlock to a mutex. This, along
>> with the conversion from tasklet to workqueue grants us ~15% improvement
>> in cryptsetup benchmarks for AES encryption.
>
> Can you share on which platforms you did the tests and the results you got ?
>

Sure, I tested on sm8650 with the following results (they vary from
one run to other but are more or less in this range):

With this series:

#     Algorithm |       Key |      Encryption |      Decryption
        aes-cbc        128b        94.1 MiB/s       138.6 MiB/s
    serpent-cbc        128b               N/A               N/A
    twofish-cbc        128b               N/A               N/A
        aes-cbc        256b        94.8 MiB/s       128.5 MiB/s
    serpent-cbc        256b               N/A               N/A
    twofish-cbc        256b               N/A               N/A
        aes-xts        256b       132.9 MiB/s       131.8 MiB/s
    serpent-xts        256b               N/A               N/A
    twofish-xts        256b               N/A               N/A
        aes-xts        512b       122.6 MiB/s       122.4 MiB/s
    serpent-xts        512b               N/A               N/A
    twofish-xts        512b               N/A               N/A

Without it:

#     Algorithm |       Key |      Encryption |      Decryption
        aes-cbc        128b        96.4 MiB/s       141.0 MiB/s
    serpent-cbc        128b               N/A               N/A
    twofish-cbc        128b               N/A               N/A
        aes-cbc        256b        67.0 MiB/s        97.8 MiB/s
    serpent-cbc        256b               N/A               N/A
    twofish-cbc        256b               N/A               N/A
        aes-xts        256b       131.7 MiB/s       132.0 MiB/s
    serpent-xts        256b               N/A               N/A
    twofish-xts        256b               N/A               N/A
        aes-xts        512b        93.9 MiB/s        96.8 MiB/s
    serpent-xts        512b               N/A               N/A
    twofish-xts        512b               N/A               N/A

AES-CBC and AES-XTS with shorter keys remain pretty much the same. I'm not
sure why that is. I also tested on sa8775p but there are no visible
improvements there. :(

Bart
Neil Armstrong Dec. 4, 2024, 10:17 a.m. UTC | #3
On 03/12/2024 16:10, Bartosz Golaszewski wrote:
> On Tue, 3 Dec 2024 14:53:21 +0100, neil.armstrong@linaro.org said:
>> On 03/12/2024 10:19, Bartosz Golaszewski wrote:
>>> From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
>>>
>>> Having switched to workqueue from tasklet, we are no longer limited to
>>> atomic APIs and can now convert the spinlock to a mutex. This, along
>>> with the conversion from tasklet to workqueue grants us ~15% improvement
>>> in cryptsetup benchmarks for AES encryption.
>>
>> Can you share on which platforms you did the tests and the results you got ?
>>
> 
> Sure, I tested on sm8650 with the following results (they vary from
> one run to other but are more or less in this range):
> 
> With this series:
> 
> #     Algorithm |       Key |      Encryption |      Decryption
>          aes-cbc        128b        94.1 MiB/s       138.6 MiB/s
>      serpent-cbc        128b               N/A               N/A
>      twofish-cbc        128b               N/A               N/A
>          aes-cbc        256b        94.8 MiB/s       128.5 MiB/s
>      serpent-cbc        256b               N/A               N/A
>      twofish-cbc        256b               N/A               N/A
>          aes-xts        256b       132.9 MiB/s       131.8 MiB/s
>      serpent-xts        256b               N/A               N/A
>      twofish-xts        256b               N/A               N/A
>          aes-xts        512b       122.6 MiB/s       122.4 MiB/s
>      serpent-xts        512b               N/A               N/A
>      twofish-xts        512b               N/A               N/A
> 
> Without it:
> 
> #     Algorithm |       Key |      Encryption |      Decryption
>          aes-cbc        128b        96.4 MiB/s       141.0 MiB/s
>      serpent-cbc        128b               N/A               N/A
>      twofish-cbc        128b               N/A               N/A
>          aes-cbc        256b        67.0 MiB/s        97.8 MiB/s
>      serpent-cbc        256b               N/A               N/A
>      twofish-cbc        256b               N/A               N/A
>          aes-xts        256b       131.7 MiB/s       132.0 MiB/s
>      serpent-xts        256b               N/A               N/A
>      twofish-xts        256b               N/A               N/A
>          aes-xts        512b        93.9 MiB/s        96.8 MiB/s
>      serpent-xts        512b               N/A               N/A
>      twofish-xts        512b               N/A               N/A
> 
> AES-CBC and AES-XTS with shorter keys remain pretty much the same. I'm not
> sure why that is. I also tested on sa8775p but there are no visible
> improvements there. :(

Thanks for the results !

Neil

> 
> Bart
diff mbox series

Patch

diff --git a/drivers/crypto/qce/core.c b/drivers/crypto/qce/core.c
index 6de9f1e23e282..e95e84486d9ae 100644
--- a/drivers/crypto/qce/core.c
+++ b/drivers/crypto/qce/core.c
@@ -3,6 +3,7 @@ 
  * Copyright (c) 2010-2014, The Linux Foundation. All rights reserved.
  */
 
+#include <linux/cleanup.h>
 #include <linux/clk.h>
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
@@ -11,7 +12,6 @@ 
 #include <linux/module.h>
 #include <linux/mod_devicetable.h>
 #include <linux/platform_device.h>
-#include <linux/spinlock.h>
 #include <linux/types.h>
 #include <crypto/algapi.h>
 #include <crypto/internal/hash.h>
@@ -89,34 +89,28 @@  static int qce_handle_queue(struct qce_device *qce,
 			    struct crypto_async_request *req)
 {
 	struct crypto_async_request *async_req, *backlog;
-	unsigned long flags;
 	int ret = 0, err;
 
-	spin_lock_irqsave(&qce->lock, flags);
+	scoped_guard(mutex, &qce->lock) {
+		if (req)
+			ret = crypto_enqueue_request(&qce->queue, req);
 
-	if (req)
-		ret = crypto_enqueue_request(&qce->queue, req);
+		/* busy, do not dequeue request */
+		if (qce->req)
+			return ret;
 
-	/* busy, do not dequeue request */
-	if (qce->req) {
-		spin_unlock_irqrestore(&qce->lock, flags);
-		return ret;
+		backlog = crypto_get_backlog(&qce->queue);
+		async_req = crypto_dequeue_request(&qce->queue);
+		if (async_req)
+			qce->req = async_req;
 	}
 
-	backlog = crypto_get_backlog(&qce->queue);
-	async_req = crypto_dequeue_request(&qce->queue);
-	if (async_req)
-		qce->req = async_req;
-
-	spin_unlock_irqrestore(&qce->lock, flags);
-
 	if (!async_req)
 		return ret;
 
 	if (backlog) {
-		spin_lock_bh(&qce->lock);
-		crypto_request_complete(backlog, -EINPROGRESS);
-		spin_unlock_bh(&qce->lock);
+		scoped_guard(mutex, &qce->lock)
+			crypto_request_complete(backlog, -EINPROGRESS);
 	}
 
 	err = qce_handle_request(async_req);
@@ -133,12 +127,11 @@  static void qce_req_done_work(struct work_struct *work)
 	struct qce_device *qce = container_of(work, struct qce_device,
 					      done_work);
 	struct crypto_async_request *req;
-	unsigned long flags;
 
-	spin_lock_irqsave(&qce->lock, flags);
-	req = qce->req;
-	qce->req = NULL;
-	spin_unlock_irqrestore(&qce->lock, flags);
+	scoped_guard(mutex, &qce->lock) {
+		req = qce->req;
+		qce->req = NULL;
+	}
 
 	if (req)
 		crypto_request_complete(req, qce->result);
@@ -243,7 +236,10 @@  static int qce_crypto_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	spin_lock_init(&qce->lock);
+	ret = devm_mutex_init(qce->dev, &qce->lock);
+	if (ret)
+		return ret;
+
 	INIT_WORK(&qce->done_work, qce_req_done_work);
 	crypto_init_queue(&qce->queue, QCE_QUEUE_LENGTH);
 
diff --git a/drivers/crypto/qce/core.h b/drivers/crypto/qce/core.h
index 39e75a75a4293..eb6fa7a8b64a8 100644
--- a/drivers/crypto/qce/core.h
+++ b/drivers/crypto/qce/core.h
@@ -6,6 +6,7 @@ 
 #ifndef _CORE_H_
 #define _CORE_H_
 
+#include <linux/mutex.h>
 #include <linux/workqueue.h>
 
 #include "dma.h"
@@ -30,7 +31,7 @@ 
  */
 struct qce_device {
 	struct crypto_queue queue;
-	spinlock_t lock;
+	struct mutex lock;
 	struct work_struct done_work;
 	struct crypto_async_request *req;
 	int result;