diff mbox series

[RFC] Crypto-engine support for parallel requests

Message ID 1579563149-3678-1-git-send-email-iuliana.prodan@nxp.com (mailing list archive)
State RFC
Delegated to: Herbert Xu
Headers show
Series [RFC] Crypto-engine support for parallel requests | expand

Commit Message

Iuliana Prodan Jan. 20, 2020, 11:32 p.m. UTC
Added support for executing multiple requests, in parallel,
for crypto engine.
A no_reqs is initialized and set in the new
crypto_engine_alloc_init_and_set function.
Here, is also set the maximum size for crypto-engine software
queue (not hardcoded anymore).
On crypto_pump_requests the no_reqs is increased, until the
max_no_reqs is reached, and decreased on crypto_finalize_request,
or on error path (in case a prepare_request or do_one_request
operation was unsuccessful).

Signed-off-by: Iuliana Prodan <iuliana.prodan@nxp.com>
---
 crypto/crypto_engine.c  | 112 +++++++++++++++++++++++++++++++++---------------
 include/crypto/engine.h |  11 +++--
 2 files changed, 84 insertions(+), 39 deletions(-)

Comments

Herbert Xu Jan. 21, 2020, 9:11 a.m. UTC | #1
On Tue, Jan 21, 2020 at 01:32:29AM +0200, Iuliana Prodan wrote:
>
> +	if (engine->no_reqs < engine->max_no_reqs)
> +		goto retry;

We should not hard-code this number into the engine.  Instead,
we should just let the driver tell us when it is ready to accept
more requests.

Perhaps we should add a new function for drivers that wish to
support this that would accept a list of requests instead of
a single one.  It would then process as many requests as it
can from that list and only return either when the list is
exhausted or when it can't process any more requests.

Cheers,
Corentin Labbe Jan. 21, 2020, 10 a.m. UTC | #2
On Tue, Jan 21, 2020 at 01:32:29AM +0200, Iuliana Prodan wrote:
> Added support for executing multiple requests, in parallel,
> for crypto engine.
> A no_reqs is initialized and set in the new
> crypto_engine_alloc_init_and_set function.
> Here, is also set the maximum size for crypto-engine software
> queue (not hardcoded anymore).
> On crypto_pump_requests the no_reqs is increased, until the
> max_no_reqs is reached, and decreased on crypto_finalize_request,
> or on error path (in case a prepare_request or do_one_request
> operation was unsuccessful).
> 
> Signed-off-by: Iuliana Prodan <iuliana.prodan@nxp.com>
> ---
>  crypto/crypto_engine.c  | 112 +++++++++++++++++++++++++++++++++---------------
>  include/crypto/engine.h |  11 +++--
>  2 files changed, 84 insertions(+), 39 deletions(-)
> 
> diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c
> index eb029ff..5219141 100644
> --- a/crypto/crypto_engine.c
> +++ b/crypto/crypto_engine.c
> @@ -14,6 +14,7 @@
>  #include "internal.h"
>  
>  #define CRYPTO_ENGINE_MAX_QLEN 10
> +#define CRYPTO_ENGINE_MAX_CONCURRENT_REQS 1
>  
>  /**
>   * crypto_finalize_request - finalize one request if the request is done
> @@ -22,32 +23,27 @@
>   * @err: error number
>   */
>  static void crypto_finalize_request(struct crypto_engine *engine,
> -			     struct crypto_async_request *req, int err)
> +				    struct crypto_async_request *req, int err)
>  {
>  	unsigned long flags;
> -	bool finalize_cur_req = false;
> +	bool finalize_req = false;
>  	int ret;
>  	struct crypto_engine_ctx *enginectx;
>  
>  	spin_lock_irqsave(&engine->queue_lock, flags);
> -	if (engine->cur_req == req)
> -		finalize_cur_req = true;
> +	if (engine->no_reqs > 0) {
> +		finalize_req = true;
> +		engine->no_reqs--;
> +	}
>  	spin_unlock_irqrestore(&engine->queue_lock, flags);
>  
> -	if (finalize_cur_req) {
> -		enginectx = crypto_tfm_ctx(req->tfm);
> -		if (engine->cur_req_prepared &&
> -		    enginectx->op.unprepare_request) {
> -			ret = enginectx->op.unprepare_request(engine, req);
> -			if (ret)
> -				dev_err(engine->dev, "failed to unprepare request\n");
> -		}
> -		spin_lock_irqsave(&engine->queue_lock, flags);
> -		engine->cur_req = NULL;
> -		engine->cur_req_prepared = false;
> -		spin_unlock_irqrestore(&engine->queue_lock, flags);
> +	enginectx = crypto_tfm_ctx(req->tfm);
> +	if (finalize_req && enginectx->op.prepare_request &&
> +	    enginectx->op.unprepare_request) {
> +		ret = enginectx->op.unprepare_request(engine, req);
> +		if (ret)
> +			dev_err(engine->dev, "failed to unprepare request\n");
>  	}
> -
>  	req->complete(req, err);
>  
>  	kthread_queue_work(engine->kworker, &engine->pump_requests);
> @@ -73,8 +69,8 @@ static void crypto_pump_requests(struct crypto_engine *engine,
>  
>  	spin_lock_irqsave(&engine->queue_lock, flags);
>  
> -	/* Make sure we are not already running a request */
> -	if (engine->cur_req)
> +	/* Make sure we have space, for more requests to run */
> +	if (engine->no_reqs >= engine->max_no_reqs)
>  		goto out;
>  
>  	/* If another context is idling then defer */
> @@ -108,13 +104,16 @@ static void crypto_pump_requests(struct crypto_engine *engine,
>  		goto out;
>  	}
>  
> +retry:
>  	/* Get the fist request from the engine queue to handle */
>  	backlog = crypto_get_backlog(&engine->queue);
>  	async_req = crypto_dequeue_request(&engine->queue);
>  	if (!async_req)
>  		goto out;
>  
> -	engine->cur_req = async_req;
> +	/* Increase the number of concurrent requests that are in execution */
> +	engine->no_reqs++;
> +
>  	if (backlog)
>  		backlog->complete(backlog, -EINPROGRESS);
>  
> @@ -130,7 +129,7 @@ static void crypto_pump_requests(struct crypto_engine *engine,
>  		ret = engine->prepare_crypt_hardware(engine);
>  		if (ret) {
>  			dev_err(engine->dev, "failed to prepare crypt hardware\n");
> -			goto req_err;
> +			goto req_err_2;
>  		}
>  	}
>  
> @@ -141,26 +140,45 @@ static void crypto_pump_requests(struct crypto_engine *engine,
>  		if (ret) {
>  			dev_err(engine->dev, "failed to prepare request: %d\n",
>  				ret);
> -			goto req_err;
> +			goto req_err_2;
>  		}
> -		engine->cur_req_prepared = true;
>  	}
>  	if (!enginectx->op.do_one_request) {
>  		dev_err(engine->dev, "failed to do request\n");
>  		ret = -EINVAL;
> -		goto req_err;
> +		goto req_err_1;
>  	}
> +
>  	ret = enginectx->op.do_one_request(engine, async_req);
>  	if (ret) {
>  		dev_err(engine->dev, "Failed to do one request from queue: %d\n", ret);
> -		goto req_err;
> +		goto req_err_1;
>  	}
> -	return;
> -
> -req_err:
> -	crypto_finalize_request(engine, async_req, ret);
> -	return;
>  
> +	/*
> +	 * If there is still space for concurrent requests,
> +	 * try and send a new one
> +	 */
> +	spin_lock_irqsave(&engine->queue_lock, flags);
> +	if (engine->no_reqs < engine->max_no_reqs)
> +		goto retry;

You should check if engine->queue.qlen > 0 before retry.

> +	goto out;
> +
> +req_err_1:
> +	if (enginectx->op.unprepare_request) {
> +		ret = enginectx->op.unprepare_request(engine, async_req);
> +		if (ret)
> +			dev_err(engine->dev, "failed to unprepare request\n");
> +	}
> +req_err_2:
> +	async_req->complete(async_req, ret);
> +	spin_lock_irqsave(&engine->queue_lock, flags);
> +	/*
> +	 * If unable to prepare or execute the request,
> +	 * decrease the number of concurrent requests
> +	 */
> +	engine->no_reqs--;
> +	goto retry;

You should check if engine->queue.qlen > 0 before retry.

>  out:
>  	spin_unlock_irqrestore(&engine->queue_lock, flags);
>  }
> @@ -386,15 +404,21 @@ int crypto_engine_stop(struct crypto_engine *engine)
>  EXPORT_SYMBOL_GPL(crypto_engine_stop);
>  
>  /**
> - * crypto_engine_alloc_init - allocate crypto hardware engine structure and
> - * initialize it.
> + * crypto_engine_alloc_init_and_set - allocate crypto hardware engine structure
> + * and initialize it by setting the maximum number of entries in the software
> + * crypto-engine queue and the maximum number of concurrent requests that can
> + * be executed at once.
>   * @dev: the device attached with one hardware engine
>   * @rt: whether this queue is set to run as a realtime task
> + * @max_no_reqs: maximum number of request that can be executed in parallel
> + * @qlen: maximum size of the crypto-engine queue
>   *
>   * This must be called from context that can sleep.
>   * Return: the crypto engine structure on success, else NULL.
>   */
> -struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
> +struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev,
> +						       bool rt, int max_no_reqs,
> +						       int qlen)
>  {
>  	struct sched_param param = { .sched_priority = MAX_RT_PRIO / 2 };
>  	struct crypto_engine *engine;
> @@ -411,12 +435,13 @@ struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
>  	engine->running = false;
>  	engine->busy = false;
>  	engine->idling = false;
> -	engine->cur_req_prepared = false;
>  	engine->priv_data = dev;
>  	snprintf(engine->name, sizeof(engine->name),
>  		 "%s-engine", dev_name(dev));
> +	engine->max_no_reqs = max_no_reqs;
> +	engine->no_reqs = 0;
>  
> -	crypto_init_queue(&engine->queue, CRYPTO_ENGINE_MAX_QLEN);
> +	crypto_init_queue(&engine->queue, qlen);
>  	spin_lock_init(&engine->queue_lock);
>  
>  	engine->kworker = kthread_create_worker(0, "%s", engine->name);
> @@ -433,6 +458,23 @@ struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
>  
>  	return engine;
>  }
> +EXPORT_SYMBOL_GPL(crypto_engine_alloc_init_and_set);
> +
> +/**
> + * crypto_engine_alloc_init - allocate crypto hardware engine structure and
> + * initialize it.
> + * @dev: the device attached with one hardware engine
> + * @rt: whether this queue is set to run as a realtime task
> + *
> + * This must be called from context that can sleep.
> + * Return: the crypto engine structure on success, else NULL.
> + */
> +struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
> +{
> +	return crypto_engine_alloc_init_and_set(dev, rt,
> +						CRYPTO_ENGINE_MAX_CONCURRENT_REQS,
> +						CRYPTO_ENGINE_MAX_QLEN);
> +}
>  EXPORT_SYMBOL_GPL(crypto_engine_alloc_init);
>  
>  /**
> diff --git a/include/crypto/engine.h b/include/crypto/engine.h
> index e29cd67..5f9a6df 100644
> --- a/include/crypto/engine.h
> +++ b/include/crypto/engine.h
> @@ -24,7 +24,6 @@
>   * @idling: the engine is entering idle state
>   * @busy: request pump is busy
>   * @running: the engine is on working
> - * @cur_req_prepared: current request is prepared
>   * @list: link with the global crypto engine list
>   * @queue_lock: spinlock to syncronise access to request queue
>   * @queue: the crypto queue of the engine
> @@ -38,14 +37,14 @@
>   * @kworker: kthread worker struct for request pump
>   * @pump_requests: work struct for scheduling work to the request pump
>   * @priv_data: the engine private data
> - * @cur_req: the current request which is on processing
> + * @max_no_reqs: maximum number of request which can be processed in parallel
> + * @no_reqs: current number of request which are processed in parallel
>   */
>  struct crypto_engine {
>  	char			name[ENGINE_NAME_LEN];
>  	bool			idling;
>  	bool			busy;
>  	bool			running;
> -	bool			cur_req_prepared;
>  
>  	struct list_head	list;
>  	spinlock_t		queue_lock;
> @@ -61,7 +60,8 @@ struct crypto_engine {
>  	struct kthread_work             pump_requests;
>  
>  	void				*priv_data;
> -	struct crypto_async_request	*cur_req;
> +	int			max_no_reqs;
> +	int			no_reqs;
>  };
>  
>  /*
> @@ -102,6 +102,9 @@ void crypto_finalize_skcipher_request(struct crypto_engine *engine,
>  int crypto_engine_start(struct crypto_engine *engine);
>  int crypto_engine_stop(struct crypto_engine *engine);
>  struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt);
> +struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev,
> +						       bool rt, int max_no_reqs,
> +						       int qlen);
>  int crypto_engine_exit(struct crypto_engine *engine);
>  
>  #endif /* _CRYPTO_ENGINE_H */
> -- 
> 2.1.0
> 

Hello

In your model, who is running finalize_request() ?
In caam it seems that you have a taskqueue dedicated for that but you cannot assume that all drivers will have this.
I think the crypto_engine should be sufficient by itself and does not need external thread/taskqueue.

But in your case, it seems that you dont have the choice, since do_one_request does not "do" but simply enqueue the request in the "jobring".

What about adding along prepare/do_one_request/unprepare a new enqueue()/can_do_more() function ?

The stream will be:
retry:
optionnal prepare
optionnal enqueue
optionnal can_do_more() (goto retry)
optionnal do_one_request

then
finalize()
optionnal unprepare

The can_do_more simply will tell if we can enqueue more (this will handle your case(ringjob), and my case(batching)
Instead of storing the limit in the crypto_engine, you keep control on the driver side.

For your case the do_one_request will be unset, for mine I will use to ran the batch.
But for other drivers, no change will be necessary (appart adding some enqueue=NULL,can_do_more=NULL).

We can also imagine an easier solution like enqueue returning a positive value saying to queue more.

Regards
Iuliana Prodan Jan. 21, 2020, 10:08 a.m. UTC | #3
On 1/21/2020 11:11 AM, Herbert Xu wrote:
> On Tue, Jan 21, 2020 at 01:32:29AM +0200, Iuliana Prodan wrote:
>>
>> +	if (engine->no_reqs < engine->max_no_reqs)
>> +		goto retry;
> 
> We should not hard-code this number into the engine.  Instead,
> we should just let the driver tell us when it is ready to accept
> more requests.
> 
This is not hardcoded in crypto-engine.
I've added the crypto_engine_alloc_init_and_set function to configure 
how crypto-engine works.
The max_no_req means how many request the driver can process in parallel.
If this doesn't apply on some drivers they can call 
crypto_engine_alloc_init, which has max_no_req = 1 - this is the current 
behavior. That is serialization of requests by crypto-engine which is a 
bottleneck.

> Perhaps we should add a new function for drivers that wish to
> support this that would accept a list of requests instead of
> a single one.  It would then process as many requests as it
> can from that list and only return either when the list is
> exhausted or when it can't process any more requests.
> 

Linking requests is something different that is not addressed by my 
proposal.
I want to remove this serialization, done by crypto-engine, of unrelated 
requests.
Iuliana Prodan Jan. 21, 2020, 2:20 p.m. UTC | #4
On 1/21/2020 12:00 PM, Corentin Labbe wrote:
> On Tue, Jan 21, 2020 at 01:32:29AM +0200, Iuliana Prodan wrote:
>> Added support for executing multiple requests, in parallel,
>> for crypto engine.
>> A no_reqs is initialized and set in the new
>> crypto_engine_alloc_init_and_set function.
>> Here, is also set the maximum size for crypto-engine software
>> queue (not hardcoded anymore).
>> On crypto_pump_requests the no_reqs is increased, until the
>> max_no_reqs is reached, and decreased on crypto_finalize_request,
>> or on error path (in case a prepare_request or do_one_request
>> operation was unsuccessful).
>>
>> Signed-off-by: Iuliana Prodan <iuliana.prodan@nxp.com>
>> ---
>>   crypto/crypto_engine.c  | 112 +++++++++++++++++++++++++++++++++---------------
>>   include/crypto/engine.h |  11 +++--
>>   2 files changed, 84 insertions(+), 39 deletions(-)
>>
>> diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c
>> index eb029ff..5219141 100644
>> --- a/crypto/crypto_engine.c
>> +++ b/crypto/crypto_engine.c
>> @@ -14,6 +14,7 @@
>>   #include "internal.h"
>>   
>>   #define CRYPTO_ENGINE_MAX_QLEN 10
>> +#define CRYPTO_ENGINE_MAX_CONCURRENT_REQS 1
>>   
>>   /**
>>    * crypto_finalize_request - finalize one request if the request is done
>> @@ -22,32 +23,27 @@
>>    * @err: error number
>>    */
>>   static void crypto_finalize_request(struct crypto_engine *engine,
>> -			     struct crypto_async_request *req, int err)
>> +				    struct crypto_async_request *req, int err)
>>   {
>>   	unsigned long flags;
>> -	bool finalize_cur_req = false;
>> +	bool finalize_req = false;
>>   	int ret;
>>   	struct crypto_engine_ctx *enginectx;
>>   
>>   	spin_lock_irqsave(&engine->queue_lock, flags);
>> -	if (engine->cur_req == req)
>> -		finalize_cur_req = true;
>> +	if (engine->no_reqs > 0) {
>> +		finalize_req = true;
>> +		engine->no_reqs--;
>> +	}
>>   	spin_unlock_irqrestore(&engine->queue_lock, flags);
>>   
>> -	if (finalize_cur_req) {
>> -		enginectx = crypto_tfm_ctx(req->tfm);
>> -		if (engine->cur_req_prepared &&
>> -		    enginectx->op.unprepare_request) {
>> -			ret = enginectx->op.unprepare_request(engine, req);
>> -			if (ret)
>> -				dev_err(engine->dev, "failed to unprepare request\n");
>> -		}
>> -		spin_lock_irqsave(&engine->queue_lock, flags);
>> -		engine->cur_req = NULL;
>> -		engine->cur_req_prepared = false;
>> -		spin_unlock_irqrestore(&engine->queue_lock, flags);
>> +	enginectx = crypto_tfm_ctx(req->tfm);
>> +	if (finalize_req && enginectx->op.prepare_request &&
>> +	    enginectx->op.unprepare_request) {
>> +		ret = enginectx->op.unprepare_request(engine, req);
>> +		if (ret)
>> +			dev_err(engine->dev, "failed to unprepare request\n");
>>   	}
>> -
>>   	req->complete(req, err);
>>   
>>   	kthread_queue_work(engine->kworker, &engine->pump_requests);
>> @@ -73,8 +69,8 @@ static void crypto_pump_requests(struct crypto_engine *engine,
>>   
>>   	spin_lock_irqsave(&engine->queue_lock, flags);
>>   
>> -	/* Make sure we are not already running a request */
>> -	if (engine->cur_req)
>> +	/* Make sure we have space, for more requests to run */
>> +	if (engine->no_reqs >= engine->max_no_reqs)
>>   		goto out;
>>   
>>   	/* If another context is idling then defer */
>> @@ -108,13 +104,16 @@ static void crypto_pump_requests(struct crypto_engine *engine,
>>   		goto out;
>>   	}
>>   
>> +retry:
>>   	/* Get the fist request from the engine queue to handle */
>>   	backlog = crypto_get_backlog(&engine->queue);
>>   	async_req = crypto_dequeue_request(&engine->queue);
>>   	if (!async_req)
>>   		goto out;
>>   
>> -	engine->cur_req = async_req;
>> +	/* Increase the number of concurrent requests that are in execution */
>> +	engine->no_reqs++;
>> +
>>   	if (backlog)
>>   		backlog->complete(backlog, -EINPROGRESS);
>>   
>> @@ -130,7 +129,7 @@ static void crypto_pump_requests(struct crypto_engine *engine,
>>   		ret = engine->prepare_crypt_hardware(engine);
>>   		if (ret) {
>>   			dev_err(engine->dev, "failed to prepare crypt hardware\n");
>> -			goto req_err;
>> +			goto req_err_2;
>>   		}
>>   	}
>>   
>> @@ -141,26 +140,45 @@ static void crypto_pump_requests(struct crypto_engine *engine,
>>   		if (ret) {
>>   			dev_err(engine->dev, "failed to prepare request: %d\n",
>>   				ret);
>> -			goto req_err;
>> +			goto req_err_2;
>>   		}
>> -		engine->cur_req_prepared = true;
>>   	}
>>   	if (!enginectx->op.do_one_request) {
>>   		dev_err(engine->dev, "failed to do request\n");
>>   		ret = -EINVAL;
>> -		goto req_err;
>> +		goto req_err_1;
>>   	}
>> +
>>   	ret = enginectx->op.do_one_request(engine, async_req);
>>   	if (ret) {
>>   		dev_err(engine->dev, "Failed to do one request from queue: %d\n", ret);
>> -		goto req_err;
>> +		goto req_err_1;
>>   	}
>> -	return;
>> -
>> -req_err:
>> -	crypto_finalize_request(engine, async_req, ret);
>> -	return;
>>   
>> +	/*
>> +	 * If there is still space for concurrent requests,
>> +	 * try and send a new one
>> +	 */
>> +	spin_lock_irqsave(&engine->queue_lock, flags);
>> +	if (engine->no_reqs < engine->max_no_reqs)
>> +		goto retry;
> 
> You should check if engine->queue.qlen > 0 before retry.
> 
>> +	goto out;
>> +
>> +req_err_1:
>> +	if (enginectx->op.unprepare_request) {
>> +		ret = enginectx->op.unprepare_request(engine, async_req);
>> +		if (ret)
>> +			dev_err(engine->dev, "failed to unprepare request\n");
>> +	}
>> +req_err_2:
>> +	async_req->complete(async_req, ret);
>> +	spin_lock_irqsave(&engine->queue_lock, flags);
>> +	/*
>> +	 * If unable to prepare or execute the request,
>> +	 * decrease the number of concurrent requests
>> +	 */
>> +	engine->no_reqs--;
>> +	goto retry;
> 
> You should check if engine->queue.qlen > 0 before retry.
> 
Here (and above) is not needed to check for qlen > 0, since on retry, 
first thing is tryin to dequeue an async_req from crypto-engine queue. 
In crypto_dequeue_request function is a check for qlen, that means than 
in pump_request will goto out.

>>   out:
>>   	spin_unlock_irqrestore(&engine->queue_lock, flags);
>>   }
>> @@ -386,15 +404,21 @@ int crypto_engine_stop(struct crypto_engine *engine)
>>   EXPORT_SYMBOL_GPL(crypto_engine_stop);
>>   
>>   /**
>> - * crypto_engine_alloc_init - allocate crypto hardware engine structure and
>> - * initialize it.
>> + * crypto_engine_alloc_init_and_set - allocate crypto hardware engine structure
>> + * and initialize it by setting the maximum number of entries in the software
>> + * crypto-engine queue and the maximum number of concurrent requests that can
>> + * be executed at once.
>>    * @dev: the device attached with one hardware engine
>>    * @rt: whether this queue is set to run as a realtime task
>> + * @max_no_reqs: maximum number of request that can be executed in parallel
>> + * @qlen: maximum size of the crypto-engine queue
>>    *
>>    * This must be called from context that can sleep.
>>    * Return: the crypto engine structure on success, else NULL.
>>    */
>> -struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
>> +struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev,
>> +						       bool rt, int max_no_reqs,
>> +						       int qlen)
>>   {
>>   	struct sched_param param = { .sched_priority = MAX_RT_PRIO / 2 };
>>   	struct crypto_engine *engine;
>> @@ -411,12 +435,13 @@ struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
>>   	engine->running = false;
>>   	engine->busy = false;
>>   	engine->idling = false;
>> -	engine->cur_req_prepared = false;
>>   	engine->priv_data = dev;
>>   	snprintf(engine->name, sizeof(engine->name),
>>   		 "%s-engine", dev_name(dev));
>> +	engine->max_no_reqs = max_no_reqs;
>> +	engine->no_reqs = 0;
>>   
>> -	crypto_init_queue(&engine->queue, CRYPTO_ENGINE_MAX_QLEN);
>> +	crypto_init_queue(&engine->queue, qlen);
>>   	spin_lock_init(&engine->queue_lock);
>>   
>>   	engine->kworker = kthread_create_worker(0, "%s", engine->name);
>> @@ -433,6 +458,23 @@ struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
>>   
>>   	return engine;
>>   }
>> +EXPORT_SYMBOL_GPL(crypto_engine_alloc_init_and_set);
>> +
>> +/**
>> + * crypto_engine_alloc_init - allocate crypto hardware engine structure and
>> + * initialize it.
>> + * @dev: the device attached with one hardware engine
>> + * @rt: whether this queue is set to run as a realtime task
>> + *
>> + * This must be called from context that can sleep.
>> + * Return: the crypto engine structure on success, else NULL.
>> + */
>> +struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
>> +{
>> +	return crypto_engine_alloc_init_and_set(dev, rt,
>> +						CRYPTO_ENGINE_MAX_CONCURRENT_REQS,
>> +						CRYPTO_ENGINE_MAX_QLEN);
>> +}
>>   EXPORT_SYMBOL_GPL(crypto_engine_alloc_init);
>>   
>>   /**
>> diff --git a/include/crypto/engine.h b/include/crypto/engine.h
>> index e29cd67..5f9a6df 100644
>> --- a/include/crypto/engine.h
>> +++ b/include/crypto/engine.h
>> @@ -24,7 +24,6 @@
>>    * @idling: the engine is entering idle state
>>    * @busy: request pump is busy
>>    * @running: the engine is on working
>> - * @cur_req_prepared: current request is prepared
>>    * @list: link with the global crypto engine list
>>    * @queue_lock: spinlock to syncronise access to request queue
>>    * @queue: the crypto queue of the engine
>> @@ -38,14 +37,14 @@
>>    * @kworker: kthread worker struct for request pump
>>    * @pump_requests: work struct for scheduling work to the request pump
>>    * @priv_data: the engine private data
>> - * @cur_req: the current request which is on processing
>> + * @max_no_reqs: maximum number of request which can be processed in parallel
>> + * @no_reqs: current number of request which are processed in parallel
>>    */
>>   struct crypto_engine {
>>   	char			name[ENGINE_NAME_LEN];
>>   	bool			idling;
>>   	bool			busy;
>>   	bool			running;
>> -	bool			cur_req_prepared;
>>   
>>   	struct list_head	list;
>>   	spinlock_t		queue_lock;
>> @@ -61,7 +60,8 @@ struct crypto_engine {
>>   	struct kthread_work             pump_requests;
>>   
>>   	void				*priv_data;
>> -	struct crypto_async_request	*cur_req;
>> +	int			max_no_reqs;
>> +	int			no_reqs;
>>   };
>>   
>>   /*
>> @@ -102,6 +102,9 @@ void crypto_finalize_skcipher_request(struct crypto_engine *engine,
>>   int crypto_engine_start(struct crypto_engine *engine);
>>   int crypto_engine_stop(struct crypto_engine *engine);
>>   struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt);
>> +struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev,
>> +						       bool rt, int max_no_reqs,
>> +						       int qlen);
>>   int crypto_engine_exit(struct crypto_engine *engine);
>>   
>>   #endif /* _CRYPTO_ENGINE_H */
>> -- 
>> 2.1.0
>>
> 
> Hello
> 
> In your model, who is running finalize_request() ?
finalize_request() in CAAM, and in other drivers, is called on the _done 
callback (stm32, virtio and omap).

> In caam it seems that you have a taskqueue dedicated for that but you cannot assume that all drivers will have this.
> I think the crypto_engine should be sufficient by itself and does not need external thread/taskqueue.
> 
> But in your case, it seems that you dont have the choice, since do_one_request does not "do" but simply enqueue the request in the "jobring".
> 
But, do_one_request it shouldn't, necessary,  execute the request. Is ok 
to enqueue it, since we have asynchronous requests. do_one_request is 
not blocking.

> What about adding along prepare/do_one_request/unprepare a new enqueue()/can_do_more() function ?
> 
> The stream will be:
> retry:
> optionnal prepare
> optionnal enqueue
> optionnal can_do_more() (goto retry)
> optionnal do_one_request
> 
> then
> finalize()
> optionnal unprepare
> 

I'm planning to improve crypto-engine incrementally, so I'm taking one 
step at a time :)
But I'm not sure if adding an enqueue operation is a good idea, since, 
my understanding, is that do_one_request is a non-blocking operation and 
it shouldn't execute the request.

IMO, the crypto-engine flow should be kept simple:
1. a request comes to hw -> this is doing transfer_request_to_engine;
2. CE enqueue the requests
3. on pump_requests:
	3. a) optional prepare operation
	3. b) sends the reqs to hw, by do_one_request operation. To wait for 
completion here it contradicts the asynchronous crypto API. 
do_one_request operation has a crypto_async_request type as argument.
Note: Step 3. b) can be done several times, depending on size of hw queue.
4. in driver, when req is done:
	4. a) optional unprepare operation
	4. b) crypto_finalize_request is called
	

Thanks,
Iulia

> The can_do_more simply will tell if we can enqueue more (this will handle your case(ringjob), and my case(batching)
> Instead of storing the limit in the crypto_engine, you keep control on the driver side.
> 
> For your case the do_one_request will be unset, for mine I will use to ran the batch.
> But for other drivers, no change will be necessary (appart adding some enqueue=NULL,can_do_more=NULL).
> 
> We can also imagine an easier solution like enqueue returning a positive value saying to queue more.
> 
> Regards
>
Corentin Labbe Jan. 22, 2020, 10:41 a.m. UTC | #5
On Tue, Jan 21, 2020 at 02:20:27PM +0000, Iuliana Prodan wrote:
> On 1/21/2020 12:00 PM, Corentin Labbe wrote:
> > On Tue, Jan 21, 2020 at 01:32:29AM +0200, Iuliana Prodan wrote:
> >> Added support for executing multiple requests, in parallel,
> >> for crypto engine.
> >> A no_reqs is initialized and set in the new
> >> crypto_engine_alloc_init_and_set function.
> >> Here, is also set the maximum size for crypto-engine software
> >> queue (not hardcoded anymore).
> >> On crypto_pump_requests the no_reqs is increased, until the
> >> max_no_reqs is reached, and decreased on crypto_finalize_request,
> >> or on error path (in case a prepare_request or do_one_request
> >> operation was unsuccessful).
> >>
> >> Signed-off-by: Iuliana Prodan <iuliana.prodan@nxp.com>
> >> ---
> >>   crypto/crypto_engine.c  | 112 +++++++++++++++++++++++++++++++++---------------
> >>   include/crypto/engine.h |  11 +++--
> >>   2 files changed, 84 insertions(+), 39 deletions(-)
> >>
> >> diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c
> >> index eb029ff..5219141 100644
> >> --- a/crypto/crypto_engine.c
> >> +++ b/crypto/crypto_engine.c
> >> @@ -14,6 +14,7 @@
> >>   #include "internal.h"
> >>   
> >>   #define CRYPTO_ENGINE_MAX_QLEN 10
> >> +#define CRYPTO_ENGINE_MAX_CONCURRENT_REQS 1
> >>   
> >>   /**
> >>    * crypto_finalize_request - finalize one request if the request is done
> >> @@ -22,32 +23,27 @@
> >>    * @err: error number
> >>    */
> >>   static void crypto_finalize_request(struct crypto_engine *engine,
> >> -			     struct crypto_async_request *req, int err)
> >> +				    struct crypto_async_request *req, int err)
> >>   {
> >>   	unsigned long flags;
> >> -	bool finalize_cur_req = false;
> >> +	bool finalize_req = false;
> >>   	int ret;
> >>   	struct crypto_engine_ctx *enginectx;
> >>   
> >>   	spin_lock_irqsave(&engine->queue_lock, flags);
> >> -	if (engine->cur_req == req)
> >> -		finalize_cur_req = true;
> >> +	if (engine->no_reqs > 0) {
> >> +		finalize_req = true;
> >> +		engine->no_reqs--;
> >> +	}
> >>   	spin_unlock_irqrestore(&engine->queue_lock, flags);
> >>   
> >> -	if (finalize_cur_req) {
> >> -		enginectx = crypto_tfm_ctx(req->tfm);
> >> -		if (engine->cur_req_prepared &&
> >> -		    enginectx->op.unprepare_request) {
> >> -			ret = enginectx->op.unprepare_request(engine, req);
> >> -			if (ret)
> >> -				dev_err(engine->dev, "failed to unprepare request\n");
> >> -		}
> >> -		spin_lock_irqsave(&engine->queue_lock, flags);
> >> -		engine->cur_req = NULL;
> >> -		engine->cur_req_prepared = false;
> >> -		spin_unlock_irqrestore(&engine->queue_lock, flags);
> >> +	enginectx = crypto_tfm_ctx(req->tfm);
> >> +	if (finalize_req && enginectx->op.prepare_request &&
> >> +	    enginectx->op.unprepare_request) {
> >> +		ret = enginectx->op.unprepare_request(engine, req);
> >> +		if (ret)
> >> +			dev_err(engine->dev, "failed to unprepare request\n");
> >>   	}
> >> -
> >>   	req->complete(req, err);
> >>   
> >>   	kthread_queue_work(engine->kworker, &engine->pump_requests);
> >> @@ -73,8 +69,8 @@ static void crypto_pump_requests(struct crypto_engine *engine,
> >>   
> >>   	spin_lock_irqsave(&engine->queue_lock, flags);
> >>   
> >> -	/* Make sure we are not already running a request */
> >> -	if (engine->cur_req)
> >> +	/* Make sure we have space, for more requests to run */
> >> +	if (engine->no_reqs >= engine->max_no_reqs)
> >>   		goto out;
> >>   
> >>   	/* If another context is idling then defer */
> >> @@ -108,13 +104,16 @@ static void crypto_pump_requests(struct crypto_engine *engine,
> >>   		goto out;
> >>   	}
> >>   
> >> +retry:
> >>   	/* Get the fist request from the engine queue to handle */
> >>   	backlog = crypto_get_backlog(&engine->queue);
> >>   	async_req = crypto_dequeue_request(&engine->queue);
> >>   	if (!async_req)
> >>   		goto out;
> >>   
> >> -	engine->cur_req = async_req;
> >> +	/* Increase the number of concurrent requests that are in execution */
> >> +	engine->no_reqs++;
> >> +
> >>   	if (backlog)
> >>   		backlog->complete(backlog, -EINPROGRESS);
> >>   
> >> @@ -130,7 +129,7 @@ static void crypto_pump_requests(struct crypto_engine *engine,
> >>   		ret = engine->prepare_crypt_hardware(engine);
> >>   		if (ret) {
> >>   			dev_err(engine->dev, "failed to prepare crypt hardware\n");
> >> -			goto req_err;
> >> +			goto req_err_2;
> >>   		}
> >>   	}
> >>   
> >> @@ -141,26 +140,45 @@ static void crypto_pump_requests(struct crypto_engine *engine,
> >>   		if (ret) {
> >>   			dev_err(engine->dev, "failed to prepare request: %d\n",
> >>   				ret);
> >> -			goto req_err;
> >> +			goto req_err_2;
> >>   		}
> >> -		engine->cur_req_prepared = true;
> >>   	}
> >>   	if (!enginectx->op.do_one_request) {
> >>   		dev_err(engine->dev, "failed to do request\n");
> >>   		ret = -EINVAL;
> >> -		goto req_err;
> >> +		goto req_err_1;
> >>   	}
> >> +
> >>   	ret = enginectx->op.do_one_request(engine, async_req);
> >>   	if (ret) {
> >>   		dev_err(engine->dev, "Failed to do one request from queue: %d\n", ret);
> >> -		goto req_err;
> >> +		goto req_err_1;
> >>   	}
> >> -	return;
> >> -
> >> -req_err:
> >> -	crypto_finalize_request(engine, async_req, ret);
> >> -	return;
> >>   
> >> +	/*
> >> +	 * If there is still space for concurrent requests,
> >> +	 * try and send a new one
> >> +	 */
> >> +	spin_lock_irqsave(&engine->queue_lock, flags);
> >> +	if (engine->no_reqs < engine->max_no_reqs)
> >> +		goto retry;
> > 
> > You should check if engine->queue.qlen > 0 before retry.
> > 
> >> +	goto out;
> >> +
> >> +req_err_1:
> >> +	if (enginectx->op.unprepare_request) {
> >> +		ret = enginectx->op.unprepare_request(engine, async_req);
> >> +		if (ret)
> >> +			dev_err(engine->dev, "failed to unprepare request\n");
> >> +	}
> >> +req_err_2:
> >> +	async_req->complete(async_req, ret);
> >> +	spin_lock_irqsave(&engine->queue_lock, flags);
> >> +	/*
> >> +	 * If unable to prepare or execute the request,
> >> +	 * decrease the number of concurrent requests
> >> +	 */
> >> +	engine->no_reqs--;
> >> +	goto retry;
> > 
> > You should check if engine->queue.qlen > 0 before retry.
> > 
> Here (and above) is not needed to check for qlen > 0, since on retry, 
> first thing is tryin to dequeue an async_req from crypto-engine queue. 
> In crypto_dequeue_request function is a check for qlen, that means than 
> in pump_request will goto out.
> 
> >>   out:
> >>   	spin_unlock_irqrestore(&engine->queue_lock, flags);
> >>   }
> >> @@ -386,15 +404,21 @@ int crypto_engine_stop(struct crypto_engine *engine)
> >>   EXPORT_SYMBOL_GPL(crypto_engine_stop);
> >>   
> >>   /**
> >> - * crypto_engine_alloc_init - allocate crypto hardware engine structure and
> >> - * initialize it.
> >> + * crypto_engine_alloc_init_and_set - allocate crypto hardware engine structure
> >> + * and initialize it by setting the maximum number of entries in the software
> >> + * crypto-engine queue and the maximum number of concurrent requests that can
> >> + * be executed at once.
> >>    * @dev: the device attached with one hardware engine
> >>    * @rt: whether this queue is set to run as a realtime task
> >> + * @max_no_reqs: maximum number of request that can be executed in parallel
> >> + * @qlen: maximum size of the crypto-engine queue
> >>    *
> >>    * This must be called from context that can sleep.
> >>    * Return: the crypto engine structure on success, else NULL.
> >>    */
> >> -struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
> >> +struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev,
> >> +						       bool rt, int max_no_reqs,
> >> +						       int qlen)
> >>   {
> >>   	struct sched_param param = { .sched_priority = MAX_RT_PRIO / 2 };
> >>   	struct crypto_engine *engine;
> >> @@ -411,12 +435,13 @@ struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
> >>   	engine->running = false;
> >>   	engine->busy = false;
> >>   	engine->idling = false;
> >> -	engine->cur_req_prepared = false;
> >>   	engine->priv_data = dev;
> >>   	snprintf(engine->name, sizeof(engine->name),
> >>   		 "%s-engine", dev_name(dev));
> >> +	engine->max_no_reqs = max_no_reqs;
> >> +	engine->no_reqs = 0;
> >>   
> >> -	crypto_init_queue(&engine->queue, CRYPTO_ENGINE_MAX_QLEN);
> >> +	crypto_init_queue(&engine->queue, qlen);
> >>   	spin_lock_init(&engine->queue_lock);
> >>   
> >>   	engine->kworker = kthread_create_worker(0, "%s", engine->name);
> >> @@ -433,6 +458,23 @@ struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
> >>   
> >>   	return engine;
> >>   }
> >> +EXPORT_SYMBOL_GPL(crypto_engine_alloc_init_and_set);
> >> +
> >> +/**
> >> + * crypto_engine_alloc_init - allocate crypto hardware engine structure and
> >> + * initialize it.
> >> + * @dev: the device attached with one hardware engine
> >> + * @rt: whether this queue is set to run as a realtime task
> >> + *
> >> + * This must be called from context that can sleep.
> >> + * Return: the crypto engine structure on success, else NULL.
> >> + */
> >> +struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
> >> +{
> >> +	return crypto_engine_alloc_init_and_set(dev, rt,
> >> +						CRYPTO_ENGINE_MAX_CONCURRENT_REQS,
> >> +						CRYPTO_ENGINE_MAX_QLEN);
> >> +}
> >>   EXPORT_SYMBOL_GPL(crypto_engine_alloc_init);
> >>   
> >>   /**
> >> diff --git a/include/crypto/engine.h b/include/crypto/engine.h
> >> index e29cd67..5f9a6df 100644
> >> --- a/include/crypto/engine.h
> >> +++ b/include/crypto/engine.h
> >> @@ -24,7 +24,6 @@
> >>    * @idling: the engine is entering idle state
> >>    * @busy: request pump is busy
> >>    * @running: the engine is on working
> >> - * @cur_req_prepared: current request is prepared
> >>    * @list: link with the global crypto engine list
> >>    * @queue_lock: spinlock to syncronise access to request queue
> >>    * @queue: the crypto queue of the engine
> >> @@ -38,14 +37,14 @@
> >>    * @kworker: kthread worker struct for request pump
> >>    * @pump_requests: work struct for scheduling work to the request pump
> >>    * @priv_data: the engine private data
> >> - * @cur_req: the current request which is on processing
> >> + * @max_no_reqs: maximum number of request which can be processed in parallel
> >> + * @no_reqs: current number of request which are processed in parallel
> >>    */
> >>   struct crypto_engine {
> >>   	char			name[ENGINE_NAME_LEN];
> >>   	bool			idling;
> >>   	bool			busy;
> >>   	bool			running;
> >> -	bool			cur_req_prepared;
> >>   
> >>   	struct list_head	list;
> >>   	spinlock_t		queue_lock;
> >> @@ -61,7 +60,8 @@ struct crypto_engine {
> >>   	struct kthread_work             pump_requests;
> >>   
> >>   	void				*priv_data;
> >> -	struct crypto_async_request	*cur_req;
> >> +	int			max_no_reqs;
> >> +	int			no_reqs;
> >>   };
> >>   
> >>   /*
> >> @@ -102,6 +102,9 @@ void crypto_finalize_skcipher_request(struct crypto_engine *engine,
> >>   int crypto_engine_start(struct crypto_engine *engine);
> >>   int crypto_engine_stop(struct crypto_engine *engine);
> >>   struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt);
> >> +struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev,
> >> +						       bool rt, int max_no_reqs,
> >> +						       int qlen);
> >>   int crypto_engine_exit(struct crypto_engine *engine);
> >>   
> >>   #endif /* _CRYPTO_ENGINE_H */
> >> -- 
> >> 2.1.0
> >>
> > 
> > Hello
> > 
> > In your model, who is running finalize_request() ?
> finalize_request() in CAAM, and in other drivers, is called on the _done 
> callback (stm32, virtio and omap).
> 
> > In caam it seems that you have a taskqueue dedicated for that but you cannot assume that all drivers will have this.
> > I think the crypto_engine should be sufficient by itself and does not need external thread/taskqueue.
> > 
> > But in your case, it seems that you dont have the choice, since do_one_request does not "do" but simply enqueue the request in the "jobring".
> > 
> But, do_one_request it shouldn't, necessary,  execute the request. Is ok 
> to enqueue it, since we have asynchronous requests. do_one_request is 
> not blocking.
> 
> > What about adding along prepare/do_one_request/unprepare a new enqueue()/can_do_more() function ?
> > 
> > The stream will be:
> > retry:
> > optionnal prepare
> > optionnal enqueue
> > optionnal can_do_more() (goto retry)
> > optionnal do_one_request
> > 
> > then
> > finalize()
> > optionnal unprepare
> > 
> 
> I'm planning to improve crypto-engine incrementally, so I'm taking one 
> step at a time :)
> But I'm not sure if adding an enqueue operation is a good idea, since, 
> my understanding, is that do_one_request is a non-blocking operation and 
> it shouldn't execute the request.

do_one_request is a blocking operation on amlogic/sun8i-ce/sun8i-ss and the "documentation" is clear "@do_one_request: do encryption for current request".
But I agree that is a bit small for a documentation.

> 
> IMO, the crypto-engine flow should be kept simple:
> 1. a request comes to hw -> this is doing transfer_request_to_engine;
> 2. CE enqueue the requests
> 3. on pump_requests:
> 	3. a) optional prepare operation
> 	3. b) sends the reqs to hw, by do_one_request operation. To wait for 
> completion here it contradicts the asynchronous crypto API. 

There are no contradiction, the call is asynchronous for the user of the API.

> do_one_request operation has a crypto_async_request type as argument.
> Note: Step 3. b) can be done several times, depending on size of hw queue.
> 4. in driver, when req is done:
> 	4. a) optional unprepare operation
> 	4. b) crypto_finalize_request is called
> 	

Since Herbert say the same thing than me:
"Instead, we should just let the driver tell us when it is ready to accept more requests."
Let me insist on my proposal, I have updated my serie, and it should handle your case and mine.
I will send it within minutes.

Regards
Iuliana Prodan Jan. 22, 2020, 12:29 p.m. UTC | #6
On 1/22/2020 12:41 PM, Corentin Labbe wrote:
> On Tue, Jan 21, 2020 at 02:20:27PM +0000, Iuliana Prodan wrote:
>> On 1/21/2020 12:00 PM, Corentin Labbe wrote:
>>> On Tue, Jan 21, 2020 at 01:32:29AM +0200, Iuliana Prodan wrote:
>>>> Added support for executing multiple requests, in parallel,
>>>> for crypto engine.
>>>> A no_reqs is initialized and set in the new
>>>> crypto_engine_alloc_init_and_set function.
>>>> 
>>>>
>>>
>>> Hello
>>>
>>> In your model, who is running finalize_request() ?
>> finalize_request() in CAAM, and in other drivers, is called on the _done
>> callback (stm32, virtio and omap).
>>
>>> In caam it seems that you have a taskqueue dedicated for that but you cannot assume that all drivers will have this.
>>> I think the crypto_engine should be sufficient by itself and does not need external thread/taskqueue.
>>>
>>> But in your case, it seems that you dont have the choice, since do_one_request does not "do" but simply enqueue the request in the "jobring".
>>>
>> But, do_one_request it shouldn't, necessary,  execute the request. Is ok
>> to enqueue it, since we have asynchronous requests. do_one_request is
>> not blocking.
>>
>>> What about adding along prepare/do_one_request/unprepare a new enqueue()/can_do_more() function ?
>>>
>>> The stream will be:
>>> retry:
>>> optionnal prepare
>>> optionnal enqueue
>>> optionnal can_do_more() (goto retry)
>>> optionnal do_one_request
>>>
>>> then
>>> finalize()
>>> optionnal unprepare
>>>
>>
>> I'm planning to improve crypto-engine incrementally, so I'm taking one
>> step at a time :)
>> But I'm not sure if adding an enqueue operation is a good idea, since,
>> my understanding, is that do_one_request is a non-blocking operation and
>> it shouldn't execute the request.
> 
> do_one_request is a blocking operation on amlogic/sun8i-ce/sun8i-ss and the "documentation" is clear "@do_one_request: do encryption for current request".
> But I agree that is a bit small for a documentation.
> 

Herbert, Baolin,

What do you think about do_one_requet operation: is blocking or not?

There are several drivers (stm32, omap, virtio, caam) that include 
crypto-engine, and uses do_one_request as non-blocking, only the ones 
mentioned and implemented by Corentin use do_one_request as blocking.

>>
>> IMO, the crypto-engine flow should be kept simple:
>> 1. a request comes to hw -> this is doing transfer_request_to_engine;
>> 2. CE enqueue the requests
>> 3. on pump_requests:
>> 	3. a) optional prepare operation
>> 	3. b) sends the reqs to hw, by do_one_request operation. To wait for
>> completion here it contradicts the asynchronous crypto API.
> 
> There are no contradiction, the call is asynchronous for the user of the API.
> 
>> do_one_request operation has a crypto_async_request type as argument.
>> Note: Step 3. b) can be done several times, depending on size of hw queue.
>> 4. in driver, when req is done:
>> 	4. a) optional unprepare operation
>> 	4. b) crypto_finalize_request is called
>> 	
> 
> Since Herbert say the same thing than me:
> "Instead, we should just let the driver tell us when it is ready to accept more requests."
> Let me insist on my proposal, I have updated my serie, and it should handle your case and mine.
> I will send it within minutes.
> 

Corentin,

In your new proposal, a few patches include my modifications. The others 
include a solution that fits your drivers very well, but implies 
modifications in all the other 4 drivers. It's not backwards compatible.
I believe it can be done better, so we won't need to modify, _at all_, 
the other drivers.

I'm working on a new version for my RFC, that has the can_enqueue_more, 
as Herbert suggested, but I would really want to know how 
crypto-engine's do_one_request was thought: blocking or non-blocking?

Just your driver(s) use it as blocking, the other examples use it to 
enqueue (don't block in waiting for request to finish).

Thanks,
Iulia
Corentin Labbe Jan. 22, 2020, 1:22 p.m. UTC | #7
On Wed, Jan 22, 2020 at 12:29:22PM +0000, Iuliana Prodan wrote:
> On 1/22/2020 12:41 PM, Corentin Labbe wrote:
> > On Tue, Jan 21, 2020 at 02:20:27PM +0000, Iuliana Prodan wrote:
> >> On 1/21/2020 12:00 PM, Corentin Labbe wrote:
> >>> On Tue, Jan 21, 2020 at 01:32:29AM +0200, Iuliana Prodan wrote:
> >>>> Added support for executing multiple requests, in parallel,
> >>>> for crypto engine.
> >>>> A no_reqs is initialized and set in the new
> >>>> crypto_engine_alloc_init_and_set function.
> >>>> 
> >>>>
> >>>
> >>> Hello
> >>>
> >>> In your model, who is running finalize_request() ?
> >> finalize_request() in CAAM, and in other drivers, is called on the _done
> >> callback (stm32, virtio and omap).
> >>
> >>> In caam it seems that you have a taskqueue dedicated for that but you cannot assume that all drivers will have this.
> >>> I think the crypto_engine should be sufficient by itself and does not need external thread/taskqueue.
> >>>
> >>> But in your case, it seems that you dont have the choice, since do_one_request does not "do" but simply enqueue the request in the "jobring".
> >>>
> >> But, do_one_request it shouldn't, necessary,  execute the request. Is ok
> >> to enqueue it, since we have asynchronous requests. do_one_request is
> >> not blocking.
> >>
> >>> What about adding along prepare/do_one_request/unprepare a new enqueue()/can_do_more() function ?
> >>>
> >>> The stream will be:
> >>> retry:
> >>> optionnal prepare
> >>> optionnal enqueue
> >>> optionnal can_do_more() (goto retry)
> >>> optionnal do_one_request
> >>>
> >>> then
> >>> finalize()
> >>> optionnal unprepare
> >>>
> >>
> >> I'm planning to improve crypto-engine incrementally, so I'm taking one
> >> step at a time :)
> >> But I'm not sure if adding an enqueue operation is a good idea, since,
> >> my understanding, is that do_one_request is a non-blocking operation and
> >> it shouldn't execute the request.
> > 
> > do_one_request is a blocking operation on amlogic/sun8i-ce/sun8i-ss and the "documentation" is clear "@do_one_request: do encryption for current request".
> > But I agree that is a bit small for a documentation.
> > 
> 
> Herbert, Baolin,
> 
> What do you think about do_one_requet operation: is blocking or not?
> 
> There are several drivers (stm32, omap, virtio, caam) that include 
> crypto-engine, and uses do_one_request as non-blocking, only the ones 
> mentioned and implemented by Corentin use do_one_request as blocking.
> 
> >>
> >> IMO, the crypto-engine flow should be kept simple:
> >> 1. a request comes to hw -> this is doing transfer_request_to_engine;
> >> 2. CE enqueue the requests
> >> 3. on pump_requests:
> >> 	3. a) optional prepare operation
> >> 	3. b) sends the reqs to hw, by do_one_request operation. To wait for
> >> completion here it contradicts the asynchronous crypto API.
> > 
> > There are no contradiction, the call is asynchronous for the user of the API.
> > 
> >> do_one_request operation has a crypto_async_request type as argument.
> >> Note: Step 3. b) can be done several times, depending on size of hw queue.
> >> 4. in driver, when req is done:
> >> 	4. a) optional unprepare operation
> >> 	4. b) crypto_finalize_request is called
> >> 	
> > 
> > Since Herbert say the same thing than me:
> > "Instead, we should just let the driver tell us when it is ready to accept more requests."
> > Let me insist on my proposal, I have updated my serie, and it should handle your case and mine.
> > I will send it within minutes.
> > 
> 
> Corentin,
> 
> In your new proposal, a few patches include my modifications. The others 
> include a solution that fits your drivers very well, but implies 
> modifications in all the other 4 drivers. It's not backwards compatible.
> I believe it can be done better, so we won't need to modify, _at all_, 
> the other drivers.

Others driver should work the same, I dont see what changes they need.
As I said, I tested caam with my changes, it works the same.
Please show me what changes they need to continue to work and proof that they are broken with my changes.

> 
> I'm working on a new version for my RFC, that has the can_enqueue_more, 
> as Herbert suggested, but I would really want to know how 
> crypto-engine's do_one_request was thought: blocking or non-blocking?

We know that both way works, so this is not really a problem.
Herbert Xu Jan. 22, 2020, 2:41 p.m. UTC | #8
On Wed, Jan 22, 2020 at 12:29:22PM +0000, Iuliana Prodan wrote:
>
> > do_one_request is a blocking operation on amlogic/sun8i-ce/sun8i-ss and the "documentation" is clear "@do_one_request: do encryption for current request".
> > But I agree that is a bit small for a documentation.
> > 
> 
> Herbert, Baolin,
> 
> What do you think about do_one_requet operation: is blocking or not?
> 
> There are several drivers (stm32, omap, virtio, caam) that include 
> crypto-engine, and uses do_one_request as non-blocking, only the ones 
> mentioned and implemented by Corentin use do_one_request as blocking.

It certainly shouldn't be blocking in the general case.  The only
time when it might make sense to block is if the hardware is
incapable of using IRQs to signal completion.

Thanks,
diff mbox series

Patch

diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c
index eb029ff..5219141 100644
--- a/crypto/crypto_engine.c
+++ b/crypto/crypto_engine.c
@@ -14,6 +14,7 @@ 
 #include "internal.h"
 
 #define CRYPTO_ENGINE_MAX_QLEN 10
+#define CRYPTO_ENGINE_MAX_CONCURRENT_REQS 1
 
 /**
  * crypto_finalize_request - finalize one request if the request is done
@@ -22,32 +23,27 @@ 
  * @err: error number
  */
 static void crypto_finalize_request(struct crypto_engine *engine,
-			     struct crypto_async_request *req, int err)
+				    struct crypto_async_request *req, int err)
 {
 	unsigned long flags;
-	bool finalize_cur_req = false;
+	bool finalize_req = false;
 	int ret;
 	struct crypto_engine_ctx *enginectx;
 
 	spin_lock_irqsave(&engine->queue_lock, flags);
-	if (engine->cur_req == req)
-		finalize_cur_req = true;
+	if (engine->no_reqs > 0) {
+		finalize_req = true;
+		engine->no_reqs--;
+	}
 	spin_unlock_irqrestore(&engine->queue_lock, flags);
 
-	if (finalize_cur_req) {
-		enginectx = crypto_tfm_ctx(req->tfm);
-		if (engine->cur_req_prepared &&
-		    enginectx->op.unprepare_request) {
-			ret = enginectx->op.unprepare_request(engine, req);
-			if (ret)
-				dev_err(engine->dev, "failed to unprepare request\n");
-		}
-		spin_lock_irqsave(&engine->queue_lock, flags);
-		engine->cur_req = NULL;
-		engine->cur_req_prepared = false;
-		spin_unlock_irqrestore(&engine->queue_lock, flags);
+	enginectx = crypto_tfm_ctx(req->tfm);
+	if (finalize_req && enginectx->op.prepare_request &&
+	    enginectx->op.unprepare_request) {
+		ret = enginectx->op.unprepare_request(engine, req);
+		if (ret)
+			dev_err(engine->dev, "failed to unprepare request\n");
 	}
-
 	req->complete(req, err);
 
 	kthread_queue_work(engine->kworker, &engine->pump_requests);
@@ -73,8 +69,8 @@  static void crypto_pump_requests(struct crypto_engine *engine,
 
 	spin_lock_irqsave(&engine->queue_lock, flags);
 
-	/* Make sure we are not already running a request */
-	if (engine->cur_req)
+	/* Make sure we have space, for more requests to run */
+	if (engine->no_reqs >= engine->max_no_reqs)
 		goto out;
 
 	/* If another context is idling then defer */
@@ -108,13 +104,16 @@  static void crypto_pump_requests(struct crypto_engine *engine,
 		goto out;
 	}
 
+retry:
 	/* Get the fist request from the engine queue to handle */
 	backlog = crypto_get_backlog(&engine->queue);
 	async_req = crypto_dequeue_request(&engine->queue);
 	if (!async_req)
 		goto out;
 
-	engine->cur_req = async_req;
+	/* Increase the number of concurrent requests that are in execution */
+	engine->no_reqs++;
+
 	if (backlog)
 		backlog->complete(backlog, -EINPROGRESS);
 
@@ -130,7 +129,7 @@  static void crypto_pump_requests(struct crypto_engine *engine,
 		ret = engine->prepare_crypt_hardware(engine);
 		if (ret) {
 			dev_err(engine->dev, "failed to prepare crypt hardware\n");
-			goto req_err;
+			goto req_err_2;
 		}
 	}
 
@@ -141,26 +140,45 @@  static void crypto_pump_requests(struct crypto_engine *engine,
 		if (ret) {
 			dev_err(engine->dev, "failed to prepare request: %d\n",
 				ret);
-			goto req_err;
+			goto req_err_2;
 		}
-		engine->cur_req_prepared = true;
 	}
 	if (!enginectx->op.do_one_request) {
 		dev_err(engine->dev, "failed to do request\n");
 		ret = -EINVAL;
-		goto req_err;
+		goto req_err_1;
 	}
+
 	ret = enginectx->op.do_one_request(engine, async_req);
 	if (ret) {
 		dev_err(engine->dev, "Failed to do one request from queue: %d\n", ret);
-		goto req_err;
+		goto req_err_1;
 	}
-	return;
-
-req_err:
-	crypto_finalize_request(engine, async_req, ret);
-	return;
 
+	/*
+	 * If there is still space for concurrent requests,
+	 * try and send a new one
+	 */
+	spin_lock_irqsave(&engine->queue_lock, flags);
+	if (engine->no_reqs < engine->max_no_reqs)
+		goto retry;
+	goto out;
+
+req_err_1:
+	if (enginectx->op.unprepare_request) {
+		ret = enginectx->op.unprepare_request(engine, async_req);
+		if (ret)
+			dev_err(engine->dev, "failed to unprepare request\n");
+	}
+req_err_2:
+	async_req->complete(async_req, ret);
+	spin_lock_irqsave(&engine->queue_lock, flags);
+	/*
+	 * If unable to prepare or execute the request,
+	 * decrease the number of concurrent requests
+	 */
+	engine->no_reqs--;
+	goto retry;
 out:
 	spin_unlock_irqrestore(&engine->queue_lock, flags);
 }
@@ -386,15 +404,21 @@  int crypto_engine_stop(struct crypto_engine *engine)
 EXPORT_SYMBOL_GPL(crypto_engine_stop);
 
 /**
- * crypto_engine_alloc_init - allocate crypto hardware engine structure and
- * initialize it.
+ * crypto_engine_alloc_init_and_set - allocate crypto hardware engine structure
+ * and initialize it by setting the maximum number of entries in the software
+ * crypto-engine queue and the maximum number of concurrent requests that can
+ * be executed at once.
  * @dev: the device attached with one hardware engine
  * @rt: whether this queue is set to run as a realtime task
+ * @max_no_reqs: maximum number of request that can be executed in parallel
+ * @qlen: maximum size of the crypto-engine queue
  *
  * This must be called from context that can sleep.
  * Return: the crypto engine structure on success, else NULL.
  */
-struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
+struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev,
+						       bool rt, int max_no_reqs,
+						       int qlen)
 {
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO / 2 };
 	struct crypto_engine *engine;
@@ -411,12 +435,13 @@  struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
 	engine->running = false;
 	engine->busy = false;
 	engine->idling = false;
-	engine->cur_req_prepared = false;
 	engine->priv_data = dev;
 	snprintf(engine->name, sizeof(engine->name),
 		 "%s-engine", dev_name(dev));
+	engine->max_no_reqs = max_no_reqs;
+	engine->no_reqs = 0;
 
-	crypto_init_queue(&engine->queue, CRYPTO_ENGINE_MAX_QLEN);
+	crypto_init_queue(&engine->queue, qlen);
 	spin_lock_init(&engine->queue_lock);
 
 	engine->kworker = kthread_create_worker(0, "%s", engine->name);
@@ -433,6 +458,23 @@  struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
 
 	return engine;
 }
+EXPORT_SYMBOL_GPL(crypto_engine_alloc_init_and_set);
+
+/**
+ * crypto_engine_alloc_init - allocate crypto hardware engine structure and
+ * initialize it.
+ * @dev: the device attached with one hardware engine
+ * @rt: whether this queue is set to run as a realtime task
+ *
+ * This must be called from context that can sleep.
+ * Return: the crypto engine structure on success, else NULL.
+ */
+struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
+{
+	return crypto_engine_alloc_init_and_set(dev, rt,
+						CRYPTO_ENGINE_MAX_CONCURRENT_REQS,
+						CRYPTO_ENGINE_MAX_QLEN);
+}
 EXPORT_SYMBOL_GPL(crypto_engine_alloc_init);
 
 /**
diff --git a/include/crypto/engine.h b/include/crypto/engine.h
index e29cd67..5f9a6df 100644
--- a/include/crypto/engine.h
+++ b/include/crypto/engine.h
@@ -24,7 +24,6 @@ 
  * @idling: the engine is entering idle state
  * @busy: request pump is busy
  * @running: the engine is on working
- * @cur_req_prepared: current request is prepared
  * @list: link with the global crypto engine list
  * @queue_lock: spinlock to syncronise access to request queue
  * @queue: the crypto queue of the engine
@@ -38,14 +37,14 @@ 
  * @kworker: kthread worker struct for request pump
  * @pump_requests: work struct for scheduling work to the request pump
  * @priv_data: the engine private data
- * @cur_req: the current request which is on processing
+ * @max_no_reqs: maximum number of request which can be processed in parallel
+ * @no_reqs: current number of request which are processed in parallel
  */
 struct crypto_engine {
 	char			name[ENGINE_NAME_LEN];
 	bool			idling;
 	bool			busy;
 	bool			running;
-	bool			cur_req_prepared;
 
 	struct list_head	list;
 	spinlock_t		queue_lock;
@@ -61,7 +60,8 @@  struct crypto_engine {
 	struct kthread_work             pump_requests;
 
 	void				*priv_data;
-	struct crypto_async_request	*cur_req;
+	int			max_no_reqs;
+	int			no_reqs;
 };
 
 /*
@@ -102,6 +102,9 @@  void crypto_finalize_skcipher_request(struct crypto_engine *engine,
 int crypto_engine_start(struct crypto_engine *engine);
 int crypto_engine_stop(struct crypto_engine *engine);
 struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt);
+struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev,
+						       bool rt, int max_no_reqs,
+						       int qlen);
 int crypto_engine_exit(struct crypto_engine *engine);
 
 #endif /* _CRYPTO_ENGINE_H */