diff mbox series

[3/8] block: don't update BLK_FEAT_POLL in __blk_mq_update_nr_hw_queues

Message ID 20250107063120.1011593-4-hch@lst.de (mailing list archive)
State New
Headers show
Series [1/8] block: fix docs for freezing of queue limits updates | expand

Commit Message

Christoph Hellwig Jan. 7, 2025, 6:30 a.m. UTC
When __blk_mq_update_nr_hw_queues changes the number of tag sets, it
might have to disable poll queues.  Currently it does so by adjusting
the BLK_FEAT_POLL flag, which is a bit against the intent of features that
describe hardware / driver capabilities, but more importantly causes
nasty lock order problems with the broadly held freeze when updating the
number of hardware queues and the limits lock.  Fix this by leaving
BLK_FEAT_POLL alone, and instead check for the number of poll queues in
the bio submission and poll handlers.  While this adds extra work to the
fast path, the variables are in cache lines used by these operations
anyway, so it should be cheap enough.

Fixes: 8023e144f9d6 ("block: move the poll flag to queue_limits")
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-core.c | 17 ++++++++++++++---
 block/blk-mq.c   | 17 +----------------
 2 files changed, 15 insertions(+), 19 deletions(-)

Comments

Nilay Shroff Jan. 7, 2025, 6:57 a.m. UTC | #1
On 1/7/25 12:00 PM, Christoph Hellwig wrote:
> When __blk_mq_update_nr_hw_queues changes the number of tag sets, it
> might have to disable poll queues.  Currently it does so by adjusting
> the BLK_FEAT_POLL, which is a bit against the intent of features that
> describe hardware / driver capabilities, but more importantly causes
> nasty lock order problems with the broadly held freeze when updating the
> number of hardware queues and the limits lock.  Fix this by leaving
> BLK_FEAT_POLL alone, and instead check for the number of poll queues in
> the bio submission and poll handlers.  While this adds extra work to the
> fast path, the variables are in cache lines used by these operations
> anyway, so it should be cheap enough.
> 
> Fixes: 8023e144f9d6 ("block: move the poll flag to queue_limits")
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  block/blk-core.c | 17 ++++++++++++++---
>  block/blk-mq.c   | 17 +----------------
>  2 files changed, 15 insertions(+), 19 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 666efe8fa202..bd5bec843d37 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -753,6 +753,18 @@ static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q,
>  	return BLK_STS_OK;
>  }
>  
> +static bool bdev_can_poll(struct block_device *bdev)
> +{
> +	struct request_queue *q = bdev_get_queue(bdev);
> +
> +	if (!(q->limits.features & BLK_FEAT_POLL))
> +		return false;
> +
> +	if (queue_is_mq(q))
> +		return q->tag_set->map[HCTX_TYPE_POLL].nr_queues;
> +	return true;
> +}
> +
As discussed in another thread with Damien, don't we need to
move bdev_can_poll() to a header file? We also need to use this
function when reading the sysfs attribute "io_poll", no?

>  /**
>   * submit_bio_noacct - re-submit a bio to the block device layer for I/O
>   * @bio:  The bio describing the location in memory and on the device.
> @@ -805,8 +817,7 @@ void submit_bio_noacct(struct bio *bio)
>  		}
>  	}
>  
> -	if (!(q->limits.features & BLK_FEAT_POLL) &&
> -			(bio->bi_opf & REQ_POLLED)) {
> +	if ((bio->bi_opf & REQ_POLLED) && !bdev_can_poll(bdev)) {
>  		bio_clear_polled(bio);
>  		goto not_supported;
>  	}
> @@ -935,7 +946,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
>  		return 0;
>  
>  	q = bdev_get_queue(bdev);
> -	if (cookie == BLK_QC_T_NONE || !(q->limits.features & BLK_FEAT_POLL))
> +	if (cookie == BLK_QC_T_NONE || !bdev_can_poll(bdev))
>  		return 0;
>  
>  	blk_flush_plug(current->plug, false);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 2e6132f778fd..f795d81b6b38 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -4320,12 +4320,6 @@ void blk_mq_release(struct request_queue *q)
>  	blk_mq_sysfs_deinit(q);
>  }
>  
> -static bool blk_mq_can_poll(struct blk_mq_tag_set *set)
> -{
> -	return set->nr_maps > HCTX_TYPE_POLL &&
> -		set->map[HCTX_TYPE_POLL].nr_queues;
> -}
> -
>  struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
>  		struct queue_limits *lim, void *queuedata)
>  {
> @@ -4336,7 +4330,7 @@ struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
>  	if (!lim)
>  		lim = &default_lim;
>  	lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
> -	if (blk_mq_can_poll(set))
> +	if (set->nr_maps > HCTX_TYPE_POLL)
>  		lim->features |= BLK_FEAT_POLL;
>  
>  	q = blk_alloc_queue(lim, set->numa_node);
> @@ -5024,8 +5018,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
>  fallback:
>  	blk_mq_update_queue_map(set);
>  	list_for_each_entry(q, &set->tag_list, tag_set_list) {
> -		struct queue_limits lim;
> -
>  		blk_mq_realloc_hw_ctxs(set, q);
>  
>  		if (q->nr_hw_queues != set->nr_hw_queues) {
> @@ -5039,13 +5031,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
>  			set->nr_hw_queues = prev_nr_hw_queues;
>  			goto fallback;
>  		}
> -		lim = queue_limits_start_update(q);
> -		if (blk_mq_can_poll(set))
> -			lim.features |= BLK_FEAT_POLL;
> -		else
> -			lim.features &= ~BLK_FEAT_POLL;
> -		if (queue_limits_commit_update(q, &lim) < 0)
> -			pr_warn("updating the poll flag failed\n");
>  		blk_mq_map_swqueue(q);
>  	}
>
Christoph Hellwig Jan. 7, 2025, 8:21 a.m. UTC | #2
On Tue, Jan 07, 2025 at 12:27:35PM +0530, Nilay Shroff wrote:
> As discussed in another thread with Damien, shouldn't we need to 
> move bdev_can_poll() to header file?

Well, if it was needed I would have done it, otherwise the code wouldn't
compile, would it?

> We also need to use this 
> function while reading sysfs attribute "io-poll", no?  

This now reports polling support when the driver declared it but
later resized the number of queues so that no poll queues are left.
I think that is a fine tradeoff if you do that.
Nilay Shroff Jan. 7, 2025, 9:23 a.m. UTC | #3
On 1/7/25 1:51 PM, Christoph Hellwig wrote:
> On Tue, Jan 07, 2025 at 12:27:35PM +0530, Nilay Shroff wrote:
>> As discussed in another thread with Damien, shouldn't we need to 
>> move bdev_can_poll() to header file?
> 
> Well, if it was needed I would have done it, otherwise the code wouldn't
> compile, would it?
> 
I think there won't be a compile error, because if we look at the show function
for the "io_poll" attribute under sysfs, it simply evaluates the queue limits
feature flag BLK_FEAT_POLL and returns the value.

>> We also need to use this 
>> function while reading sysfs attribute "io-poll", no?  
> 
> This now reports polling support when the driver declared it but
> later resized the number of queues to have no queues left.  Which I
> think is a fine tradeoff if you do that.
> 
When I applied your patch on my system and accessed the io_poll attribute
of one of my NVMe disks, I saw that it returns 1, though I didn't configure
any poll queues for the disk. With this patch, as we're now always setting
BLK_FEAT_POLL (under blk_mq_alloc_queue()), it returns 1. So when I haven't
configured poll queues for the NVMe driver, shouldn't it return 0 when I access
/sys/block/nvmeXnY/queue/io_poll?

Thanks,
--Nilay
Christoph Hellwig Jan. 7, 2025, 1:51 p.m. UTC | #4
On Tue, Jan 07, 2025 at 02:53:40PM +0530, Nilay Shroff wrote:
> When I applied you patch on my system and access io_poll attribute
> of one of my nvme disk, I see it returns 1, though I didn't configure 
> poll queue for the disk. With this patch, as we're now always setting 
> BLK_FEAT_POLL (under blk_mq_alloc_queue()) it return 1. So when I haven't
> configured poll queue for NVMe driver, shouldn't it return 0 when I access 
> /sys/block/nvmeXnY/queue/io_poll ?  

While that was the case with the previous RFC series it should not be
the case with this version, as the nvme driver does not enable the
poll tag set map unless poll queues are enabled.  I also double checked
that I do not see it on any of my test setups.
Nilay Shroff Jan. 7, 2025, 5:55 p.m. UTC | #5
On 1/7/25 7:21 PM, Christoph Hellwig wrote:
> On Tue, Jan 07, 2025 at 02:53:40PM +0530, Nilay Shroff wrote:
>> When I applied you patch on my system and access io_poll attribute
>> of one of my nvme disk, I see it returns 1, though I didn't configure 
>> poll queue for the disk. With this patch, as we're now always setting 
>> BLK_FEAT_POLL (under blk_mq_alloc_queue()) it return 1. So when I haven't
>> configured poll queue for NVMe driver, shouldn't it return 0 when I access 
>> /sys/block/nvmeXnY/queue/io_poll ?  
> 
> While that was the case with the previous RFC series it should not be
> the case with this version, as the nvme driver does not enable the
> poll tag set map unless poll queues are enabled.  I also double checked
> that I do not see it on any of my test setups.
> 
OK, I had installed the previous RFC series and tested that.
On another note, with the latest patch series, assuming the NVMe driver reports
polling support when it's loaded, accessing io_poll under sysfs reports 1. This
is good. However, if I later resize the queues so that no poll queue is left,
reset the controller, and then access io_poll, it still reports 1. Is this
expected? Other than this, everything else looks fine.

Thanks,
--Nilay
diff mbox series

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index 666efe8fa202..bd5bec843d37 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -753,6 +753,18 @@  static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q,
 	return BLK_STS_OK;
 }
 
+static bool bdev_can_poll(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (!(q->limits.features & BLK_FEAT_POLL))
+		return false;
+
+	if (queue_is_mq(q))
+		return q->tag_set->map[HCTX_TYPE_POLL].nr_queues;
+	return true;
+}
+
 /**
  * submit_bio_noacct - re-submit a bio to the block device layer for I/O
  * @bio:  The bio describing the location in memory and on the device.
@@ -805,8 +817,7 @@  void submit_bio_noacct(struct bio *bio)
 		}
 	}
 
-	if (!(q->limits.features & BLK_FEAT_POLL) &&
-			(bio->bi_opf & REQ_POLLED)) {
+	if ((bio->bi_opf & REQ_POLLED) && !bdev_can_poll(bdev)) {
 		bio_clear_polled(bio);
 		goto not_supported;
 	}
@@ -935,7 +946,7 @@  int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
 		return 0;
 
 	q = bdev_get_queue(bdev);
-	if (cookie == BLK_QC_T_NONE || !(q->limits.features & BLK_FEAT_POLL))
+	if (cookie == BLK_QC_T_NONE || !bdev_can_poll(bdev))
 		return 0;
 
 	blk_flush_plug(current->plug, false);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2e6132f778fd..f795d81b6b38 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4320,12 +4320,6 @@  void blk_mq_release(struct request_queue *q)
 	blk_mq_sysfs_deinit(q);
 }
 
-static bool blk_mq_can_poll(struct blk_mq_tag_set *set)
-{
-	return set->nr_maps > HCTX_TYPE_POLL &&
-		set->map[HCTX_TYPE_POLL].nr_queues;
-}
-
 struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
 		struct queue_limits *lim, void *queuedata)
 {
@@ -4336,7 +4330,7 @@  struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
 	if (!lim)
 		lim = &default_lim;
 	lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
-	if (blk_mq_can_poll(set))
+	if (set->nr_maps > HCTX_TYPE_POLL)
 		lim->features |= BLK_FEAT_POLL;
 
 	q = blk_alloc_queue(lim, set->numa_node);
@@ -5024,8 +5018,6 @@  static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 fallback:
 	blk_mq_update_queue_map(set);
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
-		struct queue_limits lim;
-
 		blk_mq_realloc_hw_ctxs(set, q);
 
 		if (q->nr_hw_queues != set->nr_hw_queues) {
@@ -5039,13 +5031,6 @@  static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 			set->nr_hw_queues = prev_nr_hw_queues;
 			goto fallback;
 		}
-		lim = queue_limits_start_update(q);
-		if (blk_mq_can_poll(set))
-			lim.features |= BLK_FEAT_POLL;
-		else
-			lim.features &= ~BLK_FEAT_POLL;
-		if (queue_limits_commit_update(q, &lim) < 0)
-			pr_warn("updating the poll flag failed\n");
 		blk_mq_map_swqueue(q);
 	}