diff mbox series

block: introduce QUEUE_FLAG_POLL_CAP flag

Message ID 20210416080037.26335-1-jefflexu@linux.alibaba.com (mailing list archive)
State New
Headers show
Series block: introduce QUEUE_FLAG_POLL_CAP flag | expand

Commit Message

JeffleXu April 16, 2021, 8 a.m. UTC
Hi,
How about this patch to remove the extra poll_capable() method?

And the following 'dm: support IO polling for bio-based dm device' needs
following change.

```
+       /*
+        * Check for request-based device is remained to
+        * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
+        * For bio-based device, only set QUEUE_FLAG_POLL when all underlying
+        * devices supporting polling.
+        */
+       if (__table_type_bio_based(t->type)) {
+               if (dm_table_supports_poll(t)) {
+                       blk_queue_flag_set(QUEUE_FLAG_POLL_CAP, q);
+                       blk_queue_flag_set(QUEUE_FLAG_POLL, q);
+               }
+               else {
+                       blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
+                       blk_queue_flag_clear(QUEUE_FLAG_POLL_CAP, q);
+               }
+       }
```


Introduce QUEUE_FLAG_POLL_CAP flag, indicating if the device supports IO
polling or not. Thus both blk-mq and bio-based device could set this
flag at the initialization phase, and then only this flag needs to be
checked instead of rechecking if the device has the ability of IO
polling when enabling IO polling via sysfs.

For NVMe, the ability of IO polling may change after RESET, since
nvme.poll_queues module parameter may change. Thus the ability of IO
polling need to be rechecked after RESET.

Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
---
 block/blk-mq.c           |  5 +++--
 block/blk-sysfs.c        |  3 +--
 drivers/nvme/host/core.c |  2 ++
 include/linux/blk-mq.h   | 12 ++++++++++++
 include/linux/blkdev.h   |  2 ++
 5 files changed, 20 insertions(+), 4 deletions(-)

Comments

JeffleXu April 16, 2021, 8:42 a.m. UTC | #1
On 4/16/21 4:00 PM, Jeffle Xu wrote:
> Hi,
> How about this patch to remove the extra poll_capable() method?
> 
> And the following 'dm: support IO polling for bio-based dm device' needs
> following change.
> 
> ```
> +       /*
> +        * Check for request-based device is remained to
> +        * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
> +        * For bio-based device, only set QUEUE_FLAG_POLL when all underlying
> +        * devices supporting polling.
> +        */
> +       if (__table_type_bio_based(t->type)) {
> +               if (dm_table_supports_poll(t)) {
> +                       blk_queue_flag_set(QUEUE_FLAG_POLL_CAP, q);
> +                       blk_queue_flag_set(QUEUE_FLAG_POLL, q);
> +               }
> +               else {
> +                       blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
> +                       blk_queue_flag_clear(QUEUE_FLAG_POLL_CAP, q);
> +               }
> +       }
> ```
> 
> 
> Introduce QUEUE_FLAG_POLL_CAP flag, indicating if the device supports IO
> polling or not. Thus both blk-mq and bio-based device could set this
> flag at the initialization phase, and then only this flag needs to be
> checked instead of rechecking if the device has the ability of IO
> polling when enabling IO polling via sysfs.
> 
> For NVMe, the ability of IO polling may change after RESET, since
> nvme.poll_queues module parameter may change. Thus the ability of IO
> polling need to be rechecked after RESET.
> 
The defect of this approach is that all device drivers that may change
tag_set after initialization (e.g., NVMe RESET) need to update
QUEUE_FLAG_POLL_CAP. Previous this patch, tag_set is checked directly in
queue_poll_store, and thus device drivers don't need to care the
queue_flags.


> Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
> ---
>  block/blk-mq.c           |  5 +++--
>  block/blk-sysfs.c        |  3 +--
>  drivers/nvme/host/core.c |  2 ++
>  include/linux/blk-mq.h   | 12 ++++++++++++
>  include/linux/blkdev.h   |  2 ++
>  5 files changed, 20 insertions(+), 4 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 414f5d99d9de..55ef6b975169 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -3227,9 +3227,10 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>  	q->tag_set = set;
>  
>  	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
> -	if (set->nr_maps > HCTX_TYPE_POLL &&
> -	    set->map[HCTX_TYPE_POLL].nr_queues)
> +	if (blk_mq_poll_capable(set)) {
> +		blk_queue_flag_set(QUEUE_FLAG_POLL_CAP, q);
>  		blk_queue_flag_set(QUEUE_FLAG_POLL, q);
> +	}
>  
>  	q->sg_reserved_size = INT_MAX;
>  
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index db3268d41274..64f0ab84b606 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -426,8 +426,7 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
>  	unsigned long poll_on;
>  	ssize_t ret;
>  
> -	if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL ||
> -	    !q->tag_set->map[HCTX_TYPE_POLL].nr_queues)
> +	if(!blk_queue_poll_cap(q))
>  		return -EINVAL;
>  
>  	ret = queue_var_store(&poll_on, page, count);
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index bb7da34dd967..5344cc877b05 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -2210,6 +2210,8 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
>  	ns->lba_shift = id->lbaf[lbaf].ds;
>  	nvme_set_queue_limits(ns->ctrl, ns->queue);
>  
> +	blk_mq_check_poll(ns->disk->queue, ns->disk->queue->tag_set);
> +
>  	ret = nvme_configure_metadata(ns, id);
>  	if (ret)
>  		goto out_unfreeze;
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 2c473c9b8990..ee4c89c8bebc 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -618,4 +618,16 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio);
>  void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
>  		struct lock_class_key *key);
>  
> +static inline bool blk_mq_poll_capable(struct blk_mq_tag_set *set)
> +{
> +	return set->nr_maps > HCTX_TYPE_POLL &&
> +	       set->map[HCTX_TYPE_POLL].nr_queues;
> +}
> +


> +static inline void blk_mq_check_poll(struct request_queue *q,
> +				     struct blk_mq_tag_set *set)
> +{
> +	if (blk_mq_poll_capable(set))
> +		blk_queue_flag_set(QUEUE_FLAG_POLL_CAP, q);
> +}


Sorry it should be

> +static inline void blk_mq_check_poll(struct request_queue *q,
> +				     struct blk_mq_tag_set *set)
> +{
> +	if (blk_mq_poll_capable(set))
> +		blk_queue_flag_set(QUEUE_FLAG_POLL_CAP, q);
> +}
> +	else
> +		blk_queue_flag_clear(QUEUE_FLAG_POLL_CAP, q);
> +}


>  #endif
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 1e88116dc070..d192a106bf40 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -621,6 +621,7 @@ struct request_queue {
>  #define QUEUE_FLAG_RQ_ALLOC_TIME 27	/* record rq->alloc_time_ns */
>  #define QUEUE_FLAG_HCTX_ACTIVE	28	/* at least one blk-mq hctx is active */
>  #define QUEUE_FLAG_NOWAIT       29	/* device supports NOWAIT */
> +#define QUEUE_FLAG_POLL_CAP     30	/* device supports IO polling */
>  
>  #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
>  				 (1 << QUEUE_FLAG_SAME_COMP) |		\
> @@ -668,6 +669,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
>  #define blk_queue_registered(q)	test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
>  #define blk_queue_nowait(q)	test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags)
>  #define blk_queue_poll(q)	test_bit(QUEUE_FLAG_POLL, &(q)->queue_flags)
> +#define blk_queue_poll_cap(q)	test_bit(QUEUE_FLAG_POLL_CAP, &(q)->queue_flags)
>  
>  extern void blk_set_pm_only(struct request_queue *q);
>  extern void blk_clear_pm_only(struct request_queue *q);
>
Ming Lei April 16, 2021, 9:07 a.m. UTC | #2
On Fri, Apr 16, 2021 at 04:00:37PM +0800, Jeffle Xu wrote:
> Hi,
> How about this patch to remove the extra poll_capable() method?
> 
> And the following 'dm: support IO polling for bio-based dm device' needs
> following change.
> 
> ```
> +       /*
> +        * Check for request-based device is remained to
> +        * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
> +        * For bio-based device, only set QUEUE_FLAG_POLL when all underlying
> +        * devices supporting polling.
> +        */
> +       if (__table_type_bio_based(t->type)) {
> +               if (dm_table_supports_poll(t)) {
> +                       blk_queue_flag_set(QUEUE_FLAG_POLL_CAP, q);
> +                       blk_queue_flag_set(QUEUE_FLAG_POLL, q);
> +               }
> +               else {
> +                       blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
> +                       blk_queue_flag_clear(QUEUE_FLAG_POLL_CAP, q);
> +               }
> +       }
> ```

Frankly speaking, I don't see any value of using QUEUE_FLAG_POLL_CAP for
DM, and the result is basically subset of treating DM as always being capable
of polling.

Also underlying queue change(either limits or flag) won't be propagated
to DM/MD automatically. Strictly speaking it doesn't matter if all underlying
queues are capable of supporting polling at the exact time of 'write sysfs/poll',
cause any of them may change in future.

So why not start with the simplest approach(always capable of polling)
which does meet normal bio based polling requirement?



Thanks,
Ming
JeffleXu April 16, 2021, 10:20 a.m. UTC | #3
On 4/16/21 5:07 PM, Ming Lei wrote:
> On Fri, Apr 16, 2021 at 04:00:37PM +0800, Jeffle Xu wrote:
>> Hi,
>> How about this patch to remove the extra poll_capable() method?
>>
>> And the following 'dm: support IO polling for bio-based dm device' needs
>> following change.
>>
>> ```
>> +       /*
>> +        * Check for request-based device is remained to
>> +        * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
>> +        * For bio-based device, only set QUEUE_FLAG_POLL when all underlying
>> +        * devices supporting polling.
>> +        */
>> +       if (__table_type_bio_based(t->type)) {
>> +               if (dm_table_supports_poll(t)) {
>> +                       blk_queue_flag_set(QUEUE_FLAG_POLL_CAP, q);
>> +                       blk_queue_flag_set(QUEUE_FLAG_POLL, q);
>> +               }
>> +               else {
>> +                       blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
>> +                       blk_queue_flag_clear(QUEUE_FLAG_POLL_CAP, q);
>> +               }
>> +       }
>> ```
> 
> Frankly speaking, I don't see any value of using QUEUE_FLAG_POLL_CAP for
> DM, and the result is basically subset of treating DM as always being capable
> of polling.
> 
> Also underlying queue change(either limits or flag) won't be propagated
> to DM/MD automatically. Strictly speaking it doesn't matter if all underlying
> queues are capable of supporting polling at the exact time of 'write sysfs/poll',
> cause any of them may change in future.

Yes it is.

> 
> So why not start with the simplest approach(always capable of polling)
> which does meet normal bio based polling requirement?
> 

I agree if we have no better way. Though the handling of "sysfs/io_poll"
is somehow inconsistent between blk-mq and bio-based device then.
JeffleXu April 17, 2021, 2:06 p.m. UTC | #4
On 4/16/21 5:07 PM, Ming Lei wrote:
> On Fri, Apr 16, 2021 at 04:00:37PM +0800, Jeffle Xu wrote:
>> Hi,
>> How about this patch to remove the extra poll_capable() method?
>>
>> And the following 'dm: support IO polling for bio-based dm device' needs
>> following change.
>>
>> ```
>> +       /*
>> +        * Check for request-based device is remained to
>> +        * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
>> +        * For bio-based device, only set QUEUE_FLAG_POLL when all underlying
>> +        * devices supporting polling.
>> +        */
>> +       if (__table_type_bio_based(t->type)) {
>> +               if (dm_table_supports_poll(t)) {
>> +                       blk_queue_flag_set(QUEUE_FLAG_POLL_CAP, q);
>> +                       blk_queue_flag_set(QUEUE_FLAG_POLL, q);
>> +               }
>> +               else {
>> +                       blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
>> +                       blk_queue_flag_clear(QUEUE_FLAG_POLL_CAP, q);
>> +               }
>> +       }
>> ```
> 
> Frankly speaking, I don't see any value of using QUEUE_FLAG_POLL_CAP for
> DM, and the result is basically subset of treating DM as always being capable
> of polling.
> 
> Also underlying queue change(either limits or flag) won't be propagated
> to DM/MD automatically. Strictly speaking it doesn't matter if all underlying
> queues are capable of supporting polling at the exact time of 'write sysfs/poll',
> cause any of them may change in future.
> 
> So why not start with the simplest approach(always capable of polling)
> which does meet normal bio based polling requirement?
> 

I find one scenario where this issue may matter. Consider the scenario
where HIPRI bios are submitted to DM device though **all** underlying
devices has been disabled for polling. In this case, a **valid** cookie
(pid of current submitting process) is still returned. Then if @spin of
the following blk_poll() is true, blk_poll() will get stuck in dead loop
because blk_mq_poll() always returns 0, since previously submitted bios
are all enqueued into IRQ hw queue.

Maybe you need to re-remove the bio from the poll context if the
returned cookie is BLK_QC_T_NONE?


Something like:

-static blk_qc_t __submit_bio_noacct(struct bio *bio)
+static blk_qc_t __submit_bio_noacct_ctx(struct bio *bio, struct
io_context *ioc)
 {
 	struct bio_list bio_list_on_stack[2];
 	blk_qc_t ret = BLK_QC_T_NONE;
@@ -1047,7 +1163,15 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
 		bio_list_on_stack[1] = bio_list_on_stack[0];
 		bio_list_init(&bio_list_on_stack[0]);

		if (ioc && queue_is_mq(q) && (bio->bi_opf & REQ_HIPRI)) {
			bool queued = blk_bio_poll_prep_submit(ioc, bio);

			ret = __submit_bio(bio);
+			if (queued && !blk_qc_t_valid(ret))
				/* TODO:remove bio from poll_context */
				
				bio_set_private_data(bio, ret);
		} else {
			ret = __submit_bio(bio);
		}


Then if you'd like fix this in this way, the returned value of
.submit_bio() of DM/MD also needs to return BLK_QC_T_NONE now. Currently
.submit_bio() of DM actually returns the cookie of the last split bio
(to underlying mq deivice).
Ming Lei April 19, 2021, 2:21 a.m. UTC | #5
On Sat, Apr 17, 2021 at 10:06:53PM +0800, JeffleXu wrote:
> 
> 
> On 4/16/21 5:07 PM, Ming Lei wrote:
> > On Fri, Apr 16, 2021 at 04:00:37PM +0800, Jeffle Xu wrote:
> >> Hi,
> >> How about this patch to remove the extra poll_capable() method?
> >>
> >> And the following 'dm: support IO polling for bio-based dm device' needs
> >> following change.
> >>
> >> ```
> >> +       /*
> >> +        * Check for request-based device is remained to
> >> +        * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
> >> +        * For bio-based device, only set QUEUE_FLAG_POLL when all underlying
> >> +        * devices supporting polling.
> >> +        */
> >> +       if (__table_type_bio_based(t->type)) {
> >> +               if (dm_table_supports_poll(t)) {
> >> +                       blk_queue_flag_set(QUEUE_FLAG_POLL_CAP, q);
> >> +                       blk_queue_flag_set(QUEUE_FLAG_POLL, q);
> >> +               }
> >> +               else {
> >> +                       blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
> >> +                       blk_queue_flag_clear(QUEUE_FLAG_POLL_CAP, q);
> >> +               }
> >> +       }
> >> ```
> > 
> > Frankly speaking, I don't see any value of using QUEUE_FLAG_POLL_CAP for
> > DM, and the result is basically subset of treating DM as always being capable
> > of polling.
> > 
> > Also underlying queue change(either limits or flag) won't be propagated
> > to DM/MD automatically. Strictly speaking it doesn't matter if all underlying
> > queues are capable of supporting polling at the exact time of 'write sysfs/poll',
> > cause any of them may change in future.
> > 
> > So why not start with the simplest approach(always capable of polling)
> > which does meet normal bio based polling requirement?
> > 
> 
> I find one scenario where this issue may matter. Consider the scenario
> where HIPRI bios are submitted to DM device though **all** underlying
> devices has been disabled for polling. In this case, a **valid** cookie
> (pid of current submitting process) is still returned. Then if @spin of
> the following blk_poll() is true, blk_poll() will get stuck in dead loop
> because blk_mq_poll() always returns 0, since previously submitted bios
> are all enqueued into IRQ hw queue.
> 
> Maybe you need to re-remove the bio from the poll context if the
> returned cookie is BLK_QC_T_NONE?

It won't be one issue, see blk_bio_poll_preprocess() which is called
from submit_bio_checks(), so any bio's HIPRI will be cleared if the
queue doesn't support POLL, that code does cover underlying bios.

> 
> 
> Something like:
> 
> -static blk_qc_t __submit_bio_noacct(struct bio *bio)
> +static blk_qc_t __submit_bio_noacct_ctx(struct bio *bio, struct
> io_context *ioc)
>  {
>  	struct bio_list bio_list_on_stack[2];
>  	blk_qc_t ret = BLK_QC_T_NONE;
> @@ -1047,7 +1163,15 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>  		bio_list_on_stack[1] = bio_list_on_stack[0];
>  		bio_list_init(&bio_list_on_stack[0]);
> 
> 		if (ioc && queue_is_mq(q) && (bio->bi_opf & REQ_HIPRI)) {

REQ_HIPRI won't be set for underlying bios which queue doesn't support
poll, so this branch won't be reached. And the submission queue will
be empty, and blk_poll() for DM/MD(bio based queue) checks nothing, but
the polling won't be stopped until the iocb is completed. And this
handling is actually same with current polling on IRQ based queue.

> 			bool queued = blk_bio_poll_prep_submit(ioc, bio);
> 
> 			ret = __submit_bio(bio);
> +			if (queued && !blk_qc_t_valid(ret))
> 				/* TODO:remove bio from poll_context */
> 				
> 				bio_set_private_data(bio, ret);
> 		} else {
> 			ret = __submit_bio(bio);
> 		}
> 
> 
> Then if you'd like fix this in this way, the returned value of
> .submit_bio() of DM/MD also needs to return BLK_QC_T_NONE now. Currently
> .submit_bio() of DM actually returns the cookie of the last split bio
> (to underlying mq deivice).

I am a bit confused, this patch requires .submit_bio() of DM/MD(bio
based queue) to return either 0 or pid of the submission task.


Thanks,
Ming
JeffleXu April 19, 2021, 5:40 a.m. UTC | #6
On 4/19/21 10:21 AM, Ming Lei wrote:
> On Sat, Apr 17, 2021 at 10:06:53PM +0800, JeffleXu wrote:
>>
>>
>> On 4/16/21 5:07 PM, Ming Lei wrote:
>>> On Fri, Apr 16, 2021 at 04:00:37PM +0800, Jeffle Xu wrote:
>>>> Hi,
>>>> How about this patch to remove the extra poll_capable() method?
>>>>
>>>> And the following 'dm: support IO polling for bio-based dm device' needs
>>>> following change.
>>>>
>>>> ```
>>>> +       /*
>>>> +        * Check for request-based device is remained to
>>>> +        * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
>>>> +        * For bio-based device, only set QUEUE_FLAG_POLL when all underlying
>>>> +        * devices supporting polling.
>>>> +        */
>>>> +       if (__table_type_bio_based(t->type)) {
>>>> +               if (dm_table_supports_poll(t)) {
>>>> +                       blk_queue_flag_set(QUEUE_FLAG_POLL_CAP, q);
>>>> +                       blk_queue_flag_set(QUEUE_FLAG_POLL, q);
>>>> +               }
>>>> +               else {
>>>> +                       blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
>>>> +                       blk_queue_flag_clear(QUEUE_FLAG_POLL_CAP, q);
>>>> +               }
>>>> +       }
>>>> ```
>>>
>>> Frankly speaking, I don't see any value of using QUEUE_FLAG_POLL_CAP for
>>> DM, and the result is basically subset of treating DM as always being capable
>>> of polling.
>>>
>>> Also underlying queue change(either limits or flag) won't be propagated
>>> to DM/MD automatically. Strictly speaking it doesn't matter if all underlying
>>> queues are capable of supporting polling at the exact time of 'write sysfs/poll',
>>> cause any of them may change in future.
>>>
>>> So why not start with the simplest approach(always capable of polling)
>>> which does meet normal bio based polling requirement?
>>>
>>
>> I find one scenario where this issue may matter. Consider the scenario
>> where HIPRI bios are submitted to DM device though **all** underlying
>> devices has been disabled for polling. In this case, a **valid** cookie
>> (pid of current submitting process) is still returned. Then if @spin of
>> the following blk_poll() is true, blk_poll() will get stuck in dead loop
>> because blk_mq_poll() always returns 0, since previously submitted bios
>> are all enqueued into IRQ hw queue.
>>
>> Maybe you need to re-remove the bio from the poll context if the
>> returned cookie is BLK_QC_T_NONE?
> 
> It won't be one issue, see blk_bio_poll_preprocess() which is called
> from submit_bio_checks(), so any bio's HIPRI will be cleared if the
> queue doesn't support POLL, that code does cover underlying bios.

Sorry there may be some confusion in my description. Let's discuss in
the following scenario: MD/DM advertise QUEUE_FLAG_POLL, though **all**
underlying devices are without QUEUE_FLAG_POLL. This scenario is
possible, if you want to enable MD/DM's polling without checking the
capability of underlying devices.

In this case, it seems that REQ_HIPRI is kept for both MD/DM and
underlying blk-mq devices. I used to think that REQ_HIPRI will be
cleared for underlying blk-mq deivces, but now it seems that REQ_HIPRI
of bios submitted to underlying blk-mq deivces won't be cleared, since
submit_bio_checks() is only called in the entry of submit_bio(), not in
the while() loop of __submit_bio_noacct_ctx(). Though these underlying
blk-mq devices don't support IO polling at all, or they all have been
disabled for polling, REQ_HIPRI bios are finally submitted down.

Or do I miss something?


> 
>>
>>
>> Something like:
>>
>> -static blk_qc_t __submit_bio_noacct(struct bio *bio)
>> +static blk_qc_t __submit_bio_noacct_ctx(struct bio *bio, struct
>> io_context *ioc)
>>  {
>>  	struct bio_list bio_list_on_stack[2];
>>  	blk_qc_t ret = BLK_QC_T_NONE;
>> @@ -1047,7 +1163,15 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>>  		bio_list_on_stack[1] = bio_list_on_stack[0];
>>  		bio_list_init(&bio_list_on_stack[0]);
>>
>> 		if (ioc && queue_is_mq(q) && (bio->bi_opf & REQ_HIPRI)) {
> 
> REQ_HIPRI won't be set for underlying bios which queue doesn't support
> poll, so this branch won't be reached. 

Sorry I missed the '(bio->bi_opf & REQ_HIPRI)' condition here. Indeed
bio without REQ_HIPRI won't be enqueued into the poll_context.

> And the submission queue will
> be empty, and blk_poll() for DM/MD(bio based queue) checks nothing, but
> the polling won't be stopped until the iocb is completed. And this
> handling is actually same with current polling on IRQ based queue.
> 
>> 			bool queued = blk_bio_poll_prep_submit(ioc, bio);
>>
>> 			ret = __submit_bio(bio);
>> +			if (queued && !blk_qc_t_valid(ret))
>> 				/* TODO:remove bio from poll_context */
>> 				
>> 				bio_set_private_data(bio, ret);
>> 		} else {
>> 			ret = __submit_bio(bio);
>> 		}
>>
>>
>> Then if you'd like fix this in this way, the returned value of
>> .submit_bio() of DM/MD also needs to return BLK_QC_T_NONE now. Currently
>> .submit_bio() of DM actually returns the cookie of the last split bio
>> (to underlying mq deivice).
> 
> I am a bit confused, this patch requires .submit_bio() of DM/MD(bio
> based queue) to return either 0 or pid of the submission task.
> 
> 
> Thanks,
> Ming
>
Ming Lei April 19, 2021, 1:36 p.m. UTC | #7
On Mon, Apr 19, 2021 at 01:40:21PM +0800, JeffleXu wrote:
> 
> 
> On 4/19/21 10:21 AM, Ming Lei wrote:
> > On Sat, Apr 17, 2021 at 10:06:53PM +0800, JeffleXu wrote:
> >>
> >>
> >> On 4/16/21 5:07 PM, Ming Lei wrote:
> >>> On Fri, Apr 16, 2021 at 04:00:37PM +0800, Jeffle Xu wrote:
> >>>> Hi,
> >>>> How about this patch to remove the extra poll_capable() method?
> >>>>
> >>>> And the following 'dm: support IO polling for bio-based dm device' needs
> >>>> following change.
> >>>>
> >>>> ```
> >>>> +       /*
> >>>> +        * Check for request-based device is remained to
> >>>> +        * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
> >>>> +        * For bio-based device, only set QUEUE_FLAG_POLL when all underlying
> >>>> +        * devices supporting polling.
> >>>> +        */
> >>>> +       if (__table_type_bio_based(t->type)) {
> >>>> +               if (dm_table_supports_poll(t)) {
> >>>> +                       blk_queue_flag_set(QUEUE_FLAG_POLL_CAP, q);
> >>>> +                       blk_queue_flag_set(QUEUE_FLAG_POLL, q);
> >>>> +               }
> >>>> +               else {
> >>>> +                       blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
> >>>> +                       blk_queue_flag_clear(QUEUE_FLAG_POLL_CAP, q);
> >>>> +               }
> >>>> +       }
> >>>> ```
> >>>
> >>> Frankly speaking, I don't see any value of using QUEUE_FLAG_POLL_CAP for
> >>> DM, and the result is basically subset of treating DM as always being capable
> >>> of polling.
> >>>
> >>> Also underlying queue change(either limits or flag) won't be propagated
> >>> to DM/MD automatically. Strictly speaking it doesn't matter if all underlying
> >>> queues are capable of supporting polling at the exact time of 'write sysfs/poll',
> >>> cause any of them may change in future.
> >>>
> >>> So why not start with the simplest approach(always capable of polling)
> >>> which does meet normal bio based polling requirement?
> >>>
> >>
> >> I find one scenario where this issue may matter. Consider the scenario
> >> where HIPRI bios are submitted to DM device though **all** underlying
> >> devices has been disabled for polling. In this case, a **valid** cookie
> >> (pid of current submitting process) is still returned. Then if @spin of
> >> the following blk_poll() is true, blk_poll() will get stuck in dead loop
> >> because blk_mq_poll() always returns 0, since previously submitted bios
> >> are all enqueued into IRQ hw queue.
> >>
> >> Maybe you need to re-remove the bio from the poll context if the
> >> returned cookie is BLK_QC_T_NONE?
> > 
> > It won't be one issue, see blk_bio_poll_preprocess() which is called
> > from submit_bio_checks(), so any bio's HIPRI will be cleared if the
> > queue doesn't support POLL, that code does cover underlying bios.
> 
> Sorry there may be some confusion in my description. Let's discuss in
> the following scenario: MD/DM advertise QUEUE_FLAG_POLL, though **all**
> underlying devices are without QUEUE_FLAG_POLL. This scenario is
> possible, if you want to enable MD/DM's polling without checking the
> capability of underlying devices.
> 
> In this case, it seems that REQ_HIPRI is kept for both MD/DM and
> underlying blk-mq devices. I used to think that REQ_HIPRI will be
> cleared for underlying blk-mq deivces, but now it seems that REQ_HIPRI
> of bios submitted to underlying blk-mq deivces won't be cleared, since
> submit_bio_checks() is only called in the entry of submit_bio(), not in
> the while() loop of __submit_bio_noacct_ctx(). Though these underlying
> blk-mq devices don't support IO polling at all, or they all have been
> disabled for polling, REQ_HIPRI bios are finally submitted down.
> 
> Or do I miss something?

No matter the loop, the bios are actually submitted to the
current->bio_list via submit_bio_noacct() or submit_bio().
'grep -r submit_bio drivers/md' will show you the point.

Also it is a bug if one underlying bio is submitted without being checked.

You can observe it by the following bpftrace when you run io_uring on dm
disk:

#include <linux/blkdev.h>

kprobe:blk_mq_submit_bio
/strncmp(((struct bio *)arg0)->bi_bdev->bd_disk->disk_name, "nvme", 4) == 0/
{
	$b = (struct bio *)arg0;
	$hipri = $b->bi_opf & (1 << __REQ_HIPRI);

	printf("%s %d: %s %lu %lu high prio %d\n", comm, tid, $b->bi_bdev->bd_disk->disk_name,
		$b->bi_iter.bi_sector, $b->bi_iter.bi_size, $hipri);
}


> 
> 
> > 
> >>
> >>
> >> Something like:
> >>
> >> -static blk_qc_t __submit_bio_noacct(struct bio *bio)
> >> +static blk_qc_t __submit_bio_noacct_ctx(struct bio *bio, struct
> >> io_context *ioc)
> >>  {
> >>  	struct bio_list bio_list_on_stack[2];
> >>  	blk_qc_t ret = BLK_QC_T_NONE;
> >> @@ -1047,7 +1163,15 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
> >>  		bio_list_on_stack[1] = bio_list_on_stack[0];
> >>  		bio_list_init(&bio_list_on_stack[0]);
> >>
> >> 		if (ioc && queue_is_mq(q) && (bio->bi_opf & REQ_HIPRI)) {
> > 
> > REQ_HIPRI won't be set for underlying bios which queue doesn't support
> > poll, so this branch won't be reached. 
> 
> Sorry I missed the '(bio->bi_opf & REQ_HIPRI)' condition here. Indeed
> bio without REQ_HIPRI won't be enqueued into the poll_context.

Even though these bios are queued, blk_poll() still can handle them
easily by just ignoring queues which aren't POLL_TYPE. However, I still
think their HIPRI will be cleared.

Thanks,
Ming
JeffleXu April 20, 2021, 7:25 a.m. UTC | #8
On 4/19/21 9:36 PM, Ming Lei wrote:
> On Mon, Apr 19, 2021 at 01:40:21PM +0800, JeffleXu wrote:
>>
>>
>> On 4/19/21 10:21 AM, Ming Lei wrote:
>>> On Sat, Apr 17, 2021 at 10:06:53PM +0800, JeffleXu wrote:
>>>>
>>>>
>>>> On 4/16/21 5:07 PM, Ming Lei wrote:
>>>>> On Fri, Apr 16, 2021 at 04:00:37PM +0800, Jeffle Xu wrote:
>>>>>> Hi,
>>>>>> How about this patch to remove the extra poll_capable() method?
>>>>>>
>>>>>> And the following 'dm: support IO polling for bio-based dm device' needs
>>>>>> following change.
>>>>>>
>>>>>> ```
>>>>>> +       /*
>>>>>> +        * Check for request-based device is remained to
>>>>>> +        * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
>>>>>> +        * For bio-based device, only set QUEUE_FLAG_POLL when all underlying
>>>>>> +        * devices supporting polling.
>>>>>> +        */
>>>>>> +       if (__table_type_bio_based(t->type)) {
>>>>>> +               if (dm_table_supports_poll(t)) {
>>>>>> +                       blk_queue_flag_set(QUEUE_FLAG_POLL_CAP, q);
>>>>>> +                       blk_queue_flag_set(QUEUE_FLAG_POLL, q);
>>>>>> +               }
>>>>>> +               else {
>>>>>> +                       blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
>>>>>> +                       blk_queue_flag_clear(QUEUE_FLAG_POLL_CAP, q);
>>>>>> +               }
>>>>>> +       }
>>>>>> ```
>>>>>
>>>>> Frankly speaking, I don't see any value of using QUEUE_FLAG_POLL_CAP for
>>>>> DM, and the result is basically subset of treating DM as always being capable
>>>>> of polling.
>>>>>
>>>>> Also underlying queue change(either limits or flag) won't be propagated
>>>>> to DM/MD automatically. Strictly speaking it doesn't matter if all underlying
>>>>> queues are capable of supporting polling at the exact time of 'write sysfs/poll',
>>>>> cause any of them may change in future.
>>>>>
>>>>> So why not start with the simplest approach(always capable of polling)
>>>>> which does meet normal bio based polling requirement?
>>>>>
>>>>
>>>> I find one scenario where this issue may matter. Consider the scenario
>>>> where HIPRI bios are submitted to DM device though **all** underlying
>>>> devices has been disabled for polling. In this case, a **valid** cookie
>>>> (pid of current submitting process) is still returned. Then if @spin of
>>>> the following blk_poll() is true, blk_poll() will get stuck in dead loop
>>>> because blk_mq_poll() always returns 0, since previously submitted bios
>>>> are all enqueued into IRQ hw queue.
>>>>
>>>> Maybe you need to re-remove the bio from the poll context if the
>>>> returned cookie is BLK_QC_T_NONE?
>>>
>>> It won't be one issue, see blk_bio_poll_preprocess() which is called
>>> from submit_bio_checks(), so any bio's HIPRI will be cleared if the
>>> queue doesn't support POLL, that code does cover underlying bios.
>>
>> Sorry there may be some confusion in my description. Let's discuss in
>> the following scenario: MD/DM advertise QUEUE_FLAG_POLL, though **all**
>> underlying devices are without QUEUE_FLAG_POLL. This scenario is
>> possible, if you want to enable MD/DM's polling without checking the
>> capability of underlying devices.
>>
>> In this case, it seems that REQ_HIPRI is kept for both MD/DM and
>> underlying blk-mq devices. I used to think that REQ_HIPRI will be
>> cleared for underlying blk-mq deivces, but now it seems that REQ_HIPRI
>> of bios submitted to underlying blk-mq deivces won't be cleared, since
>> submit_bio_checks() is only called in the entry of submit_bio(), not in
>> the while() loop of __submit_bio_noacct_ctx(). Though these underlying
>> blk-mq devices don't support IO polling at all, or they all have been
>> disabled for polling, REQ_HIPRI bios are finally submitted down.
>>
>> Or do I miss something?
> 
> No matter the loop, the bios are actually submitted to the
> current->bio_list via submit_bio_noacct() or submit_bio().
> 'grep -r submit_bio drivers/md' will show you the point.

Oops. I forgot that. Thanks and sorry for the noise.

So if that's the case, it seems that patch 11/12 are not needed anymore.
diff mbox series

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 414f5d99d9de..55ef6b975169 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3227,9 +3227,10 @@  struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	q->tag_set = set;
 
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
-	if (set->nr_maps > HCTX_TYPE_POLL &&
-	    set->map[HCTX_TYPE_POLL].nr_queues)
+	if (blk_mq_poll_capable(set)) {
+		blk_queue_flag_set(QUEUE_FLAG_POLL_CAP, q);
 		blk_queue_flag_set(QUEUE_FLAG_POLL, q);
+	}
 
 	q->sg_reserved_size = INT_MAX;
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index db3268d41274..64f0ab84b606 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -426,8 +426,7 @@  static ssize_t queue_poll_store(struct request_queue *q, const char *page,
 	unsigned long poll_on;
 	ssize_t ret;
 
-	if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL ||
-	    !q->tag_set->map[HCTX_TYPE_POLL].nr_queues)
+	if(!blk_queue_poll_cap(q))
 		return -EINVAL;
 
 	ret = queue_var_store(&poll_on, page, count);
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index bb7da34dd967..5344cc877b05 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2210,6 +2210,8 @@  static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
 	ns->lba_shift = id->lbaf[lbaf].ds;
 	nvme_set_queue_limits(ns->ctrl, ns->queue);
 
+	blk_mq_check_poll(ns->disk->queue, ns->disk->queue->tag_set);
+
 	ret = nvme_configure_metadata(ns, id);
 	if (ret)
 		goto out_unfreeze;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2c473c9b8990..ee4c89c8bebc 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -618,4 +618,16 @@  blk_qc_t blk_mq_submit_bio(struct bio *bio);
 void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
 		struct lock_class_key *key);
 
+static inline bool blk_mq_poll_capable(struct blk_mq_tag_set *set)
+{
+	return set->nr_maps > HCTX_TYPE_POLL &&
+	       set->map[HCTX_TYPE_POLL].nr_queues;
+}
+
+static inline void blk_mq_check_poll(struct request_queue *q,
+				     struct blk_mq_tag_set *set)
+{
+	if (blk_mq_poll_capable(set))
+		blk_queue_flag_set(QUEUE_FLAG_POLL_CAP, q);
+}
 #endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1e88116dc070..d192a106bf40 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -621,6 +621,7 @@  struct request_queue {
 #define QUEUE_FLAG_RQ_ALLOC_TIME 27	/* record rq->alloc_time_ns */
 #define QUEUE_FLAG_HCTX_ACTIVE	28	/* at least one blk-mq hctx is active */
 #define QUEUE_FLAG_NOWAIT       29	/* device supports NOWAIT */
+#define QUEUE_FLAG_POLL_CAP     30	/* device supports IO polling */
 
 #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_SAME_COMP) |		\
@@ -668,6 +669,7 @@  bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_registered(q)	test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
 #define blk_queue_nowait(q)	test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags)
 #define blk_queue_poll(q)	test_bit(QUEUE_FLAG_POLL, &(q)->queue_flags)
+#define blk_queue_poll_cap(q)	test_bit(QUEUE_FLAG_POLL_CAP, &(q)->queue_flags)
 
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);