diff mbox series

[RFC,v3,22/41] block: implement persistent commands

Message ID 20200430131904.5847-23-hare@suse.de (mailing list archive)
State Changes Requested
Headers show
Series scsi: enable reserved commands for LLDDs | expand

Commit Message

Hannes Reinecke April 30, 2020, 1:18 p.m. UTC
Some LLDDs implement event handling by sending a command to the
firmware, which then will be completed once the firmware wants
to register an event.
So worst case a command is being sent to the firmware then the
driver initializes, and will be returned once the driver unloads.
To avoid these commands to block the queues during freezing or
quiescing this patch implements support for 'persistent' commands,
which will be excluded from blk_queue_enter() and blk_queue_exit()
calls.

Signed-off-by: Hannes Reinecke <hare@suse.de>
---
 block/blk-mq.c            | 12 +++++++++---
 include/linux/blk-mq.h    |  2 ++
 include/linux/blk_types.h |  4 ++++
 3 files changed, 15 insertions(+), 3 deletions(-)

Comments

Bart Van Assche May 1, 2020, 4:59 a.m. UTC | #1
On 2020-04-30 06:18, Hannes Reinecke wrote:
> Some LLDDs implement event handling by sending a command to the
> firmware, which then will be completed once the firmware wants
> to register an event.
     ^^^^^^^^
     report?

> So worst case a command is being sent to the firmware then the
                                                        ^^^^
                                                        when?
> driver initializes, and will be returned once the driver unloads.
> To avoid these commands to block the queues during freezing or
> quiescing this patch implements support for 'persistent' commands,
> which will be excluded from blk_queue_enter() and blk_queue_exit()
> calls.

How is it prevented that the SCSI timeout handler is activated for
persistent commands?

> Signed-off-by: Hannes Reinecke <hare@suse.de>
> ---
>  block/blk-mq.c            | 12 +++++++++---
>  include/linux/blk-mq.h    |  2 ++
>  include/linux/blk_types.h |  4 ++++
>  3 files changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 44482aaed11e..402cf104d183 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -402,9 +402,14 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
>  {
>  	struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
>  	struct request *rq;
> -	int ret;
> +	int ret = 0;
>  
> -	ret = blk_queue_enter(q, flags);
> +	if (flags & BLK_MQ_REQ_PERSISTENT) {
> +		if (blk_queue_dying(q))
> +			ret = -ENODEV;
> +		alloc_data.cmd_flags |= REQ_PERSISTENT;
> +	} else
> +		ret = blk_queue_enter(q, flags);
>  	if (ret)
>  		return ERR_PTR(ret);
>  

I think that not calling blk_queue_enter() for persistent commands means
opening a giant can of worms. There is quite some code in the block
layer that assumes that neither .queue_rq() nor the request completion
code will be called if q_usage_counter == 0. Skipping the
blk_queue_enter() call for persistent commands breaks that assumption. I
think we need a better solution.

Thanks,

Bart.
Ming Lei May 1, 2020, 8:33 a.m. UTC | #2
On Thu, Apr 30, 2020 at 03:18:45PM +0200, Hannes Reinecke wrote:
> Some LLDDs implement event handling by sending a command to the
> firmware, which then will be completed once the firmware wants
> to register an event.
> So worst case a command is being sent to the firmware then the
> driver initializes, and will be returned once the driver unloads.
> To avoid these commands to block the queues during freezing or
> quiescing this patch implements support for 'persistent' commands,
> which will be excluded from blk_queue_enter() and blk_queue_exit()
> calls.

This way is quite dangerous from block layer viewpoint, and it should
have been done in driver/device specific way instead of polluting block
layer.


thanks, 
Ming
Hannes Reinecke May 2, 2020, 12:11 p.m. UTC | #3
On 5/1/20 6:59 AM, Bart Van Assche wrote:
> On 2020-04-30 06:18, Hannes Reinecke wrote:
>> Some LLDDs implement event handling by sending a command to the
>> firmware, which then will be completed once the firmware wants
>> to register an event.
>       ^^^^^^^^
>       report?
> 
>> So worst case a command is being sent to the firmware then the
>                                                          ^^^^
>                                                          when?
>> driver initializes, and will be returned once the driver unloads.
>> To avoid these commands to block the queues during freezing or
>> quiescing this patch implements support for 'persistent' commands,
>> which will be excluded from blk_queue_enter() and blk_queue_exit()
>> calls.
> 
> How is it prevented that the SCSI timeout handler is activated for
> persistent commands?
> 
>> Signed-off-by: Hannes Reinecke <hare@suse.de>
>> ---
>>   block/blk-mq.c            | 12 +++++++++---
>>   include/linux/blk-mq.h    |  2 ++
>>   include/linux/blk_types.h |  4 ++++
>>   3 files changed, 15 insertions(+), 3 deletions(-)
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 44482aaed11e..402cf104d183 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -402,9 +402,14 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
>>   {
>>   	struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
>>   	struct request *rq;
>> -	int ret;
>> +	int ret = 0;
>>   
>> -	ret = blk_queue_enter(q, flags);
>> +	if (flags & BLK_MQ_REQ_PERSISTENT) {
>> +		if (blk_queue_dying(q))
>> +			ret = -ENODEV;
>> +		alloc_data.cmd_flags |= REQ_PERSISTENT;
>> +	} else
>> +		ret = blk_queue_enter(q, flags);
>>   	if (ret)
>>   		return ERR_PTR(ret);
>>   
> 
> I think that not calling blk_queue_enter() for persistent commands means
> opening a giant can of worms. There is quite some code in the block
> layer that assumes that neither .queue_rq() nor the request completion
> code will be called if q_usage_counter == 0. Skipping the
> blk_queue_enter() call for persistent commands breaks that assumption. I
> think we need a better solution.
> 
Well, yeah, maybe.
My aim here is that _all_ I/O requiring a tag from the hardware will be 
tracked by the blocklayer tagset. Only that will give the block-layer 
accurate information about outstanding commands, such that the ongoing 
CPU hotplug discussion can make the correct decisions and implement 
functions really covering all outstanding I/O.
It also allows us to use the scsi_host_busy_iter() functions within the 
driver, and will get rid of the hand-crafted iterations the driver has 
to do now.

It worked reasonably well, until I encountered the infamous AEN 
commands, which actually require the opposite: _not_ to be tracked by 
the block layer at all, as the commands themselves are just placeholders
to be returned by the firmware once an event occurs.
(And yes, I _do_ think this is a quite dangerous operation, because I 
can't quite see how one could reliably return this command in case of a 
firmware crash ...)
(But anyhow, that's how the firmware is written and we have to live with 
it.)

So I implemented this approach, to have tags which are ignored by the 
block layer. But I have to admit that this approach relies on quite some 
assumptions (like these tags are never actually submitted to the 
blocklayer itself, are never started etc), none of which are spelled out 
clearly (yet).
An alternative approach would be to arbitrary decrease the tagset size 
by one (eg by shifting the tags by one), and use the free tag for AENs).
That would have to be coded within the driver, though.

If that's a solution which you like better I could give it a go.

Cheers,

Hannes
Hannes Reinecke May 2, 2020, 12:22 p.m. UTC | #4
On 5/1/20 10:33 AM, Ming Lei wrote:
> On Thu, Apr 30, 2020 at 03:18:45PM +0200, Hannes Reinecke wrote:
>> Some LLDDs implement event handling by sending a command to the
>> firmware, which then will be completed once the firmware wants
>> to register an event.
>> So worst case a command is being sent to the firmware then the
>> driver initializes, and will be returned once the driver unloads.
>> To avoid these commands to block the queues during freezing or
>> quiescing this patch implements support for 'persistent' commands,
>> which will be excluded from blk_queue_enter() and blk_queue_exit()
>> calls.
> 
> This way is quite dangerous from block layer viewpoint, and it should
> have been done in driver/device specific way instead of polluting block
> layer.
> 
As already outlined in the reply to Bart, I'll be rewriting that 
requiring the drivers to set aside a separate tag and decrease the 
tagspace by one.
That should work as well.

Cheers,

Hannes
Bart Van Assche May 2, 2020, 4:22 p.m. UTC | #5
On 2020-05-02 05:11, Hannes Reinecke wrote:
> On 5/1/20 6:59 AM, Bart Van Assche wrote:
>> On 2020-04-30 06:18, Hannes Reinecke wrote:
>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>> index 44482aaed11e..402cf104d183 100644
>>> --- a/block/blk-mq.c
>>> +++ b/block/blk-mq.c
>>> @@ -402,9 +402,14 @@ struct request *blk_mq_alloc_request(struct
>>> request_queue *q, unsigned int op,
>>>   {
>>>       struct blk_mq_alloc_data alloc_data = { .flags = flags,
>>> .cmd_flags = op };
>>>       struct request *rq;
>>> -    int ret;
>>> +    int ret = 0;
>>>   -    ret = blk_queue_enter(q, flags);
>>> +    if (flags & BLK_MQ_REQ_PERSISTENT) {
>>> +        if (blk_queue_dying(q))
>>> +            ret = -ENODEV;
>>> +        alloc_data.cmd_flags |= REQ_PERSISTENT;
>>> +    } else
>>> +        ret = blk_queue_enter(q, flags);
>>>       if (ret)
>>>           return ERR_PTR(ret);
>>>   
>>
>> I think that not calling blk_queue_enter() for persistent commands means
>> opening a giant can of worms. There is quite some code in the block
>> layer that assumes that neither .queue_rq() nor the request completion
>> code will be called if q_usage_counter == 0. Skipping the
>> blk_queue_enter() call for persistent commands breaks that assumption. I
>> think we need a better solution.
>>
> Well, yeah, maybe.
> My aim here is that _all_ I/O requiring a tag from the hardware will be
> tracked by the blocklayer tagset. Only that will give the block-layer
> accurate information about outstanding commands, such that the ongoing
> CPU hotplug discussion can make the correct decisions and implement
> functions really covering all outstanding I/O.
> It also allows us to use the scsi_host_busy_iter() functions within the
> driver, and will get rid of the hand-crafted iterations the driver has
> to do now.
> 
> It worked reasonably well, until I encountered the infamous AEN
> commands, which actually require the opposite: _not_ to be tracked by
> the block layer at all, as the commands themselves are just placeholders
> to be returned by the firmware once an event occurs.
> (And yes, I _do_ think this is a quite dangerous operation, because I
> can't quite see how one could reliably return this command in case of a
> firmware crash ...)
> (But anyhow, that's how the firmware is written and we have to live with
> it.)
> 
> So I implemented this approach, to have tags which are ignored by the
> block layer. But I have to admit that this approach relies on quite some
> assumptions (like these tags are never actually submitted to the
> blocklayer itself, are never started etc), none of which are spelled out
> clearly (yet).
> An alternative approach would be to arbitrary decrease the tagset size
> by one (eg by shifting the tags by one), and use the free tag for AENs).
> That would have to be coded within the driver, though.
> 
> If that's a solution which you like better I could give it a go.

How about dropping support for AEN commands entirely? As far as I know
such a command has never been standardized. Additionally, all SCSI core
code I'm familiar with supports unit attentions and does not rely on
asynchronous events to be reported immediately.

If dropping support for AEN commands is not an option, how about
aborting these commands before freezing a request queue?

Thanks,

Bart.
diff mbox series

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 44482aaed11e..402cf104d183 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -402,9 +402,14 @@  struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 {
 	struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
 	struct request *rq;
-	int ret;
+	int ret = 0;
 
-	ret = blk_queue_enter(q, flags);
+	if (flags & BLK_MQ_REQ_PERSISTENT) {
+		if (blk_queue_dying(q))
+			ret = -ENODEV;
+		alloc_data.cmd_flags |= REQ_PERSISTENT;
+	} else
+		ret = blk_queue_enter(q, flags);
 	if (ret)
 		return ERR_PTR(ret);
 
@@ -481,7 +486,8 @@  static void __blk_mq_free_request(struct request *rq)
 	if (sched_tag != -1)
 		blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
 	blk_mq_sched_restart(hctx);
-	blk_queue_exit(q);
+	if (!(rq->cmd_flags & REQ_PERSISTENT))
+		blk_queue_exit(q);
 }
 
 void blk_mq_free_request(struct request *rq)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c186dc25fc1c..a4b02196810c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -441,6 +441,8 @@  enum {
 	BLK_MQ_REQ_INTERNAL	= (__force blk_mq_req_flags_t)(1 << 2),
 	/* set RQF_PREEMPT */
 	BLK_MQ_REQ_PREEMPT	= (__force blk_mq_req_flags_t)(1 << 3),
+	/* mark request as persistent */
+	BLK_MQ_REQ_PERSISTENT	= (__force blk_mq_req_flags_t)(1 << 4),
 };
 
 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 70254ae11769..898e75e2e8b0 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -336,6 +336,9 @@  enum req_flag_bits {
 	/* command specific flags for REQ_OP_WRITE_ZEROES: */
 	__REQ_NOUNMAP,		/* do not free blocks when zeroing */
 
+	/* Persistent firmware command, ignore for q_usage_counter */
+	__REQ_PERSISTENT,
+
 	__REQ_HIPRI,
 
 	/* for driver use */
@@ -362,6 +365,7 @@  enum req_flag_bits {
 #define REQ_CGROUP_PUNT		(1ULL << __REQ_CGROUP_PUNT)
 
 #define REQ_NOUNMAP		(1ULL << __REQ_NOUNMAP)
+#define REQ_PERSISTENT		(1ULL << __REQ_PERSISTENT)
 #define REQ_HIPRI		(1ULL << __REQ_HIPRI)
 
 #define REQ_DRV			(1ULL << __REQ_DRV)