diff mbox series

[2/8] blk-mq: change the method of iterating busy tags of a request_queue

Message ID 1552640264-26101-3-git-send-email-jianchao.w.wang@oracle.com (mailing list archive)
State New, archived
Headers show
Series : blk-mq: use static_rqs to iterate busy tags | expand

Commit Message

jianchao.wang March 15, 2019, 8:57 a.m. UTC
tags->rqs[] will not be cleaned when the driver tag is freed, and
there is a window between getting a driver tag and writing to
tags->rqs[], so we may see a stale rq in tags->rqs[] which may have
been freed, as in the following case,
blk_mq_get_request         blk_mq_queue_tag_busy_iter
  -> blk_mq_get_tag
                             -> bt_for_each
                               -> bt_iter
                                 -> rq = tags->rqs[]
                                 -> rq->q
  -> blk_mq_rq_ctx_init
    -> data->hctx->tags->rqs[rq->tag] = rq;

To fix this, blk_mq_queue_tag_busy_iter is changed in this
patch to use tags->static_rqs[] instead of tags->rqs[]. We have
to identify whether an io scheduler is attached to decide
whether to use hctx->tags or hctx->sched_tags. And we will try
to get a non-zero q_usage_counter before that, so it is safe to
access them. Add an 'inflight' parameter to determine whether to
iterate in-flight requests or just busy tags. A correction here
is that part_in_flight should count the busy tags instead of rqs
that have got driver tags.

Moreover, get rid of the 'hctx' parameter in iter function as
we could get it from rq->mq_hctx and export this interface for
drivers.

Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
---
 block/blk-mq-tag.c     | 76 +++++++++++++++++++++++++++++++++-----------------
 block/blk-mq-tag.h     |  2 --
 block/blk-mq.c         | 31 ++++++++------------
 include/linux/blk-mq.h |  5 ++--
 4 files changed, 64 insertions(+), 50 deletions(-)

Comments

Keith Busch March 15, 2019, 4:16 p.m. UTC | #1
On Fri, Mar 15, 2019 at 01:57:38AM -0700, Jianchao Wang wrote:
> tags->rqs[] will not been cleaned when free driver tag and there
> is a window between get driver tag and write tags->rqs[], so we
> may see stale rq in tags->rqs[] which may have been freed, as
> following case,
> blk_mq_get_request         blk_mq_queue_tag_busy_iter
>   -> blk_mq_get_tag
>                              -> bt_for_each
>                                -> bt_iter
>                                  -> rq = taags->rqs[]
>                                  -> rq->q
>   -> blk_mq_rq_ctx_init
>     -> data->hctx->tags->rqs[rq->tag] = rq;
> 
> To fix this, the blk_mq_queue_tag_busy_iter is changed in this
> patch to use tags->static_rqs[] instead of tags->rqs[]. We have
> to identify whether there is a io scheduler attached to decide
> to use hctx->tags or hctx->sched_tags. And we will try to get a
> non-zero q_usage_counter before that, so it is safe to access
> them. Add 'inflight' parameter to determine to iterate in-flight
> requests or just busy tags. A correction here is that
> part_in_flight should count the busy tags instead of rqs that
> have got driver tags.
> 
> Moreover, get rid of the 'hctx' parameter in iter function as
> we could get it from rq->mq_hctx and export this interface for
> drivers.
> 
> Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
> ---
>  block/blk-mq-tag.c     | 76 +++++++++++++++++++++++++++++++++-----------------
>  block/blk-mq-tag.h     |  2 --
>  block/blk-mq.c         | 31 ++++++++------------
>  include/linux/blk-mq.h |  5 ++--
>  4 files changed, 64 insertions(+), 50 deletions(-)
> 
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index b792537..cdec2cd 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -216,26 +216,38 @@ struct bt_iter_data {
>  	busy_iter_fn *fn;
>  	void *data;
>  	bool reserved;
> +	bool inflight;
>  };
>  
>  static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
>  {
>  	struct bt_iter_data *iter_data = data;
>  	struct blk_mq_hw_ctx *hctx = iter_data->hctx;
> -	struct blk_mq_tags *tags = hctx->tags;
>  	bool reserved = iter_data->reserved;
> +	struct blk_mq_tags *tags;
>  	struct request *rq;
>  
> +	tags =  hctx->sched_tags ? hctx->sched_tags : hctx->tags;
> +
>  	if (!reserved)
>  		bitnr += tags->nr_reserved_tags;
> -	rq = tags->rqs[bitnr];
> +	/*
> +	 * Because tags->rqs[] will not been cleaned when free driver tag
> +	 * and there is a window between get driver tag and write tags->rqs[],
> +	 * so we may see stale rq in tags->rqs[] which may have been freed.
> +	 * Using static_rqs[] is safer.
> +	 */
> +	rq = tags->static_rqs[bitnr];
>  
>  	/*
> -	 * We can hit rq == NULL here, because the tagging functions
> -	 * test and set the bit before assigning ->rqs[].
> +	 * There is a small window between get tag and blk_mq_rq_ctx_init,
> +	 * so rq->q and rq->mq_hctx maybe different.
>  	 */
> -	if (rq && rq->q == hctx->queue)
> -		return iter_data->fn(hctx, rq, iter_data->data, reserved);
> +	if (rq && rq->q == hctx->queue &&
> +	    rq->mq_hctx == hctx &&
> +	    (!iter_data->inflight ||
> +	     blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT))
> +		return iter_data->fn(rq, iter_data->data, reserved);

There is still a window where the check may succeed, but the request is
being assigned to a completely different request_queue. The callback
then operates on a request it doesn't own.

Tag iteration from a driver can be safe only if the driver initiates a
freeze and quiesces all queues sharing the same tagset first. The nvme
driver does that, but I think there may need to be an additional grace
period to wait for the allocation's queue_enter to call queue_exit to
ensure the request is initialized through blk_mq_rq_ctx_init().
Ming Lei March 17, 2019, 6:50 a.m. UTC | #2
On Sat, Mar 16, 2019 at 12:15 AM Keith Busch <kbusch@kernel.org> wrote:
>
> On Fri, Mar 15, 2019 at 01:57:38AM -0700, Jianchao Wang wrote:
> > tags->rqs[] will not been cleaned when free driver tag and there
> > is a window between get driver tag and write tags->rqs[], so we
> > may see stale rq in tags->rqs[] which may have been freed, as
> > following case,
> > blk_mq_get_request         blk_mq_queue_tag_busy_iter
> >   -> blk_mq_get_tag
> >                              -> bt_for_each
> >                                -> bt_iter
> >                                  -> rq = taags->rqs[]
> >                                  -> rq->q
> >   -> blk_mq_rq_ctx_init
> >     -> data->hctx->tags->rqs[rq->tag] = rq;
> >
> > To fix this, the blk_mq_queue_tag_busy_iter is changed in this
> > patch to use tags->static_rqs[] instead of tags->rqs[]. We have
> > to identify whether there is a io scheduler attached to decide
> > to use hctx->tags or hctx->sched_tags. And we will try to get a
> > non-zero q_usage_counter before that, so it is safe to access
> > them. Add 'inflight' parameter to determine to iterate in-flight
> > requests or just busy tags. A correction here is that
> > part_in_flight should count the busy tags instead of rqs that
> > have got driver tags.
> >
> > Moreover, get rid of the 'hctx' parameter in iter function as
> > we could get it from rq->mq_hctx and export this interface for
> > drivers.
> >
> > Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
> > ---
> >  block/blk-mq-tag.c     | 76 +++++++++++++++++++++++++++++++++-----------------
> >  block/blk-mq-tag.h     |  2 --
> >  block/blk-mq.c         | 31 ++++++++------------
> >  include/linux/blk-mq.h |  5 ++--
> >  4 files changed, 64 insertions(+), 50 deletions(-)
> >
> > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> > index b792537..cdec2cd 100644
> > --- a/block/blk-mq-tag.c
> > +++ b/block/blk-mq-tag.c
> > @@ -216,26 +216,38 @@ struct bt_iter_data {
> >       busy_iter_fn *fn;
> >       void *data;
> >       bool reserved;
> > +     bool inflight;
> >  };
> >
> >  static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
> >  {
> >       struct bt_iter_data *iter_data = data;
> >       struct blk_mq_hw_ctx *hctx = iter_data->hctx;
> > -     struct blk_mq_tags *tags = hctx->tags;
> >       bool reserved = iter_data->reserved;
> > +     struct blk_mq_tags *tags;
> >       struct request *rq;
> >
> > +     tags =  hctx->sched_tags ? hctx->sched_tags : hctx->tags;
> > +
> >       if (!reserved)
> >               bitnr += tags->nr_reserved_tags;
> > -     rq = tags->rqs[bitnr];
> > +     /*
> > +      * Because tags->rqs[] will not been cleaned when free driver tag
> > +      * and there is a window between get driver tag and write tags->rqs[],
> > +      * so we may see stale rq in tags->rqs[] which may have been freed.
> > +      * Using static_rqs[] is safer.
> > +      */
> > +     rq = tags->static_rqs[bitnr];
> >
> >       /*
> > -      * We can hit rq == NULL here, because the tagging functions
> > -      * test and set the bit before assigning ->rqs[].
> > +      * There is a small window between get tag and blk_mq_rq_ctx_init,
> > +      * so rq->q and rq->mq_hctx maybe different.
> >        */
> > -     if (rq && rq->q == hctx->queue)
> > -             return iter_data->fn(hctx, rq, iter_data->data, reserved);
> > +     if (rq && rq->q == hctx->queue &&
> > +         rq->mq_hctx == hctx &&
> > +         (!iter_data->inflight ||
> > +          blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT))
> > +             return iter_data->fn(rq, iter_data->data, reserved);
>
> There is still a window where the check may succeed, but the request is
> being assigned to a completely different request_queue. The callback
> then operates on a request it doesn't own.

The situation above is only related to the 'none' io scheduler.

However, this patch doesn't change behavior for none, given that .rqs[tag] is
the same as .static_rqs[tag], so I guess your concern may not be related to
this patch.

>
> Tag iteration from a driver can be safe only if the driver initiates a
> freeze and quiesced all queues sharing the same tagset first. The nvme
> driver does that, but I think there may need to be an additional grace
> period to wait for the allocation's queue_enter to call queue_exit to
> ensure the request is initialized through blk_mq_rq_ctx_init().

blk_mq_queue_tag_busy_iter() is used by blk-mq core code only for
io accounting or timeout on the specified request_queue, and NVMe's
uses are actually over the whole driver tags.

Thanks,
Ming Lei
jianchao.wang March 18, 2019, 1:49 a.m. UTC | #3
Hi Keith

On 3/16/19 12:16 AM, Keith Busch wrote:
> There is still a window where the check may succeed, but the request is
> being assigned to a completely different request_queue. The callback
> then operates on a request it doesn't own.
> 
> Tag iteration from a driver can be safe only if the driver initiates a
> freeze and quiesced all queues sharing the same tagset first. The nvme
> driver does that, but I think there may need to be an additional grace
> period to wait for the allocation's queue_enter to call queue_exit to
> ensure the request is initialized through blk_mq_rq_ctx_init().

This patch is to avoid the use-after-free case described in the comment.
The helper interface is used in the following cases,
1. in-flight request accounting
   this case should not require so much accuracy

2. timeout check
   an extra reference will be held there to avoid the request being recycled

3. if it is used by a driver, it should be enough to quiesce the request_queue
   the driver usually wants to handle the in-flight ones; quiescing the
   request_queue could ensure all tasks quit the hctx_lock and no one issues
   requests any more.

Thanks
Jianchao
Keith Busch March 18, 2019, 3:53 p.m. UTC | #4
On Sat, Mar 16, 2019 at 11:50:27PM -0700, Ming Lei wrote:
> On Sat, Mar 16, 2019 at 12:15 AM Keith Busch <kbusch@kernel.org> wrote:
> > On Fri, Mar 15, 2019 at 01:57:38AM -0700, Jianchao Wang wrote:
> > > +     if (rq && rq->q == hctx->queue &&
> > > +         rq->mq_hctx == hctx &&
> > > +         (!iter_data->inflight ||
> > > +          blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT))
> > > +             return iter_data->fn(rq, iter_data->data, reserved);
> >
> > There is still a window where the check may succeed, but the request is
> > being assigned to a completely different request_queue. The callback
> > then operates on a request it doesn't own.
> 
> The situation above is only related with 'none' io scheduler.
> 
> However, this patch doesn't change behavior for none given .rq[tag] is same with
> .static_rq[tag], so I guess your concern may not be related with this patch.

Right, it's not new, but queue iteration wasn't exported for drivers
before this.
Sagi Grimberg March 20, 2019, 6:52 p.m. UTC | #5
On 3/15/19 1:57 AM, Jianchao Wang wrote:
> tags->rqs[] will not been cleaned when free driver tag and there
> is a window between get driver tag and write tags->rqs[], so we
> may see stale rq in tags->rqs[] which may have been freed, as
> following case,
> blk_mq_get_request         blk_mq_queue_tag_busy_iter
>    -> blk_mq_get_tag
>                               -> bt_for_each
>                                 -> bt_iter
>                                   -> rq = taags->rqs[]
>                                   -> rq->q
>    -> blk_mq_rq_ctx_init
>      -> data->hctx->tags->rqs[rq->tag] = rq;
> 
> To fix this, the blk_mq_queue_tag_busy_iter is changed in this
> patch to use tags->static_rqs[] instead of tags->rqs[]. We have
> to identify whether there is a io scheduler attached to decide
> to use hctx->tags or hctx->sched_tags. And we will try to get a
> non-zero q_usage_counter before that, so it is safe to access
> them. Add 'inflight' parameter to determine to iterate in-flight
> requests or just busy tags. A correction here is that
> part_in_flight should count the busy tags instead of rqs that
> have got driver tags.

IMO, instead of this parameter, add a wrapper like
blk_mq_queue_tag_inflight_iter() or keep the parameter out until
we actually have a user that calls it for busy and not inflight.

Other than that, I think that iterating over static_rqs is a good
solution to the problem described.
jianchao.wang March 21, 2019, 1:33 a.m. UTC | #6
Hi Sagi

Thanks for your kindly response.

On 3/21/19 2:52 AM, Sagi Grimberg wrote:
> 
> 
> On 3/15/19 1:57 AM, Jianchao Wang wrote:
>> tags->rqs[] will not been cleaned when free driver tag and there
>> is a window between get driver tag and write tags->rqs[], so we
>> may see stale rq in tags->rqs[] which may have been freed, as
>> following case,
>> blk_mq_get_request         blk_mq_queue_tag_busy_iter
>>    -> blk_mq_get_tag
>>                               -> bt_for_each
>>                                 -> bt_iter
>>                                   -> rq = taags->rqs[]
>>                                   -> rq->q
>>    -> blk_mq_rq_ctx_init
>>      -> data->hctx->tags->rqs[rq->tag] = rq;
>>
>> To fix this, the blk_mq_queue_tag_busy_iter is changed in this
>> patch to use tags->static_rqs[] instead of tags->rqs[]. We have
>> to identify whether there is a io scheduler attached to decide
>> to use hctx->tags or hctx->sched_tags. And we will try to get a
>> non-zero q_usage_counter before that, so it is safe to access
>> them. Add 'inflight' parameter to determine to iterate in-flight
>> requests or just busy tags. A correction here is that
>> part_in_flight should count the busy tags instead of rqs that
>> have got driver tags.
> 
> IMO, instead of this parameter, add a wrapper like
> blk_mq_queue_tag_inflight_iter() or keep the parameter out until
> we actually have a user that calls it for busy and not inflight.

Using a wrapper instead of exporting the parameter to user is indeed better.
I will change it in next version.

When the 'inflight' parameter is true, we iterate the in-flight requests;
otherwise, we iterate all the busy requests whether they are in-flight
or not. The latter would be useful for part_in_flight. Currently, we only
account the requests with driver tags, which is inconsistent between the
io-scheduler and non-io-scheduler cases.

> 
> Other than that, I think that iterating over static_rqs is a good
> solution to the problem described.
> 

Sincerely
Jianchao
diff mbox series

Patch

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index b792537..cdec2cd 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -216,26 +216,38 @@  struct bt_iter_data {
 	busy_iter_fn *fn;
 	void *data;
 	bool reserved;
+	bool inflight;
 };
 
 static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 {
 	struct bt_iter_data *iter_data = data;
 	struct blk_mq_hw_ctx *hctx = iter_data->hctx;
-	struct blk_mq_tags *tags = hctx->tags;
 	bool reserved = iter_data->reserved;
+	struct blk_mq_tags *tags;
 	struct request *rq;
 
+	tags =  hctx->sched_tags ? hctx->sched_tags : hctx->tags;
+
 	if (!reserved)
 		bitnr += tags->nr_reserved_tags;
-	rq = tags->rqs[bitnr];
+	/*
+	 * Because tags->rqs[] will not been cleaned when free driver tag
+	 * and there is a window between get driver tag and write tags->rqs[],
+	 * so we may see stale rq in tags->rqs[] which may have been freed.
+	 * Using static_rqs[] is safer.
+	 */
+	rq = tags->static_rqs[bitnr];
 
 	/*
-	 * We can hit rq == NULL here, because the tagging functions
-	 * test and set the bit before assigning ->rqs[].
+	 * There is a small window between get tag and blk_mq_rq_ctx_init,
+	 * so rq->q and rq->mq_hctx maybe different.
 	 */
-	if (rq && rq->q == hctx->queue)
-		return iter_data->fn(hctx, rq, iter_data->data, reserved);
+	if (rq && rq->q == hctx->queue &&
+	    rq->mq_hctx == hctx &&
+	    (!iter_data->inflight ||
+	     blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT))
+		return iter_data->fn(rq, iter_data->data, reserved);
 	return true;
 }
 
@@ -246,7 +258,7 @@  static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
  *		or the bitmap_tags member of struct blk_mq_tags.
  * @fn:		Pointer to the function that will be called for each request
  *		associated with @hctx that has been assigned a driver tag.
- *		@fn will be called as follows: @fn(@hctx, rq, @data, @reserved)
+ *		@fn will be called as follows: @fn(rq, @data, @reserved)
  *		where rq is a pointer to a request. Return true to continue
  *		iterating tags, false to stop.
  * @data:	Will be passed as third argument to @fn.
@@ -254,13 +266,14 @@  static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
  *		bitmap_tags member of struct blk_mq_tags.
  */
 static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
-			busy_iter_fn *fn, void *data, bool reserved)
+			busy_iter_fn *fn, void *data, bool reserved, bool inflight)
 {
 	struct bt_iter_data iter_data = {
 		.hctx = hctx,
 		.fn = fn,
 		.data = data,
 		.reserved = reserved,
+		.inflight = inflight,
 	};
 
 	sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
@@ -362,36 +375,33 @@  void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
 
 /**
- * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
+ * blk_mq_queue_tag_busy_iter - iterate over all busy tags
  * @q:		Request queue to examine.
- * @fn:		Pointer to the function that will be called for each request
- *		on @q. @fn will be called as follows: @fn(hctx, rq, @priv,
- *		reserved) where rq is a pointer to a request and hctx points
- *		to the hardware queue associated with the request. 'reserved'
- *		indicates whether or not @rq is a reserved request.
+ * @fn:		Pointer to the function that will be called for each
+ * 		in-flight request issued by @q. @fn will be called as
+ * 		follows:
+ * 		@fn(rq, @priv, reserved)
+ * 		rq is a pointer to a request.'reserved' indicates whether or
+ * 		not @rq is a reserved request.
  * @priv:	Will be passed as third argument to @fn.
- *
- * Note: if @q->tag_set is shared with other request queues then @fn will be
- * called for all requests on all queues that share that tag set and not only
- * for requests associated with @q.
  */
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
-		void *priv)
+		void *priv, bool inflight)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
 	/*
-	 * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
-	 * while the queue is frozen. So we can use q_usage_counter to avoid
-	 * racing with it.
+	 * Get a reference of the queue unless it has been zero. We use this
+	 * to avoid the race with the code that would modify the hctxs after
+	 * freeze and drain the queue, including updating nr_hw_queues, io
+	 * scheduler switching and queue clean up.
 	 */
 	if (!percpu_ref_tryget(&q->q_usage_counter))
 		return;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		struct blk_mq_tags *tags = hctx->tags;
-
+		struct blk_mq_tags *tags;
 		/*
 		 * If no software queues are currently mapped to this
 		 * hardware queue, there's nothing to check
@@ -399,12 +409,26 @@  void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
 		if (!blk_mq_hw_queue_mapped(hctx))
 			continue;
 
+		tags =  hctx->sched_tags ? hctx->sched_tags : hctx->tags;
+
 		if (tags->nr_reserved_tags)
-			bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
-		bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
+			bt_for_each(hctx, &tags->breserved_tags,
+				    fn, priv, true, inflight);
+		bt_for_each(hctx, &tags->bitmap_tags,
+			    fn, priv, false, inflight);
+		/*
+		 * flush_rq represents the rq with REQ_PREFLUSH and REQ_FUA
+		 * (if FUA is not supported by device) to be issued to
+		 * device. So we need to consider it when iterate inflight
+		 * rqs, but needn't to count it when iterate busy tags.
+		 */
+		if (inflight &&
+		    blk_mq_rq_state(hctx->fq->flush_rq) == MQ_RQ_IN_FLIGHT)
+			fn(hctx->fq->flush_rq, priv, false);
 	}
 	blk_queue_exit(q);
 }
+EXPORT_SYMBOL(blk_mq_queue_tag_busy_iter);
 
 static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
 		    bool round_robin, int node)
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0..5af7ff9 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -33,8 +33,6 @@  extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 					struct blk_mq_tags **tags,
 					unsigned int depth, bool can_grow);
 extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
-void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
-		void *priv);
 
 static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
 						 struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 07584a9..9144ec1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -93,8 +93,7 @@  struct mq_inflight {
 	unsigned int *inflight;
 };
 
-static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
-				  struct request *rq, void *priv,
+static bool blk_mq_check_inflight(struct request *rq, void *priv,
 				  bool reserved)
 {
 	struct mq_inflight *mi = priv;
@@ -114,13 +113,12 @@  unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
 	struct mq_inflight mi = { .part = part, .inflight = inflight, };
 
 	inflight[0] = inflight[1] = 0;
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi, false);
 
 	return inflight[0];
 }
 
-static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
-				     struct request *rq, void *priv,
+static bool blk_mq_check_inflight_rw(struct request *rq, void *priv,
 				     bool reserved)
 {
 	struct mq_inflight *mi = priv;
@@ -137,7 +135,7 @@  void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 	struct mq_inflight mi = { .part = part, .inflight = inflight, };
 
 	inflight[0] = inflight[1] = 0;
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi, false);
 }
 
 void blk_freeze_queue_start(struct request_queue *q)
@@ -813,28 +811,22 @@  struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
-static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
-			       void *priv, bool reserved)
+static bool blk_mq_rq_inflight(struct request *rq, void *priv, bool reserved)
 {
+	bool *busy = priv;
 	/*
 	 * If we find a request that is inflight and the queue matches,
 	 * we know the queue is busy. Return false to stop the iteration.
 	 */
-	if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
-		bool *busy = priv;
-
-		*busy = true;
-		return false;
-	}
-
-	return true;
+	*busy = true;
+	return false;
 }
 
 bool blk_mq_queue_inflight(struct request_queue *q)
 {
 	bool busy = false;
 
-	blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
+	blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy, true);
 	return busy;
 }
 EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
@@ -874,8 +866,7 @@  static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
 	return false;
 }
 
-static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
-		struct request *rq, void *priv, bool reserved)
+static bool blk_mq_check_expired(struct request *rq, void *priv, bool reserved)
 {
 	unsigned long *next = priv;
 
@@ -936,7 +927,7 @@  static void blk_mq_timeout_work(struct work_struct *work)
 	if (!percpu_ref_tryget(&q->q_usage_counter))
 		return;
 
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next, true);
 
 	if (next != 0) {
 		mod_timer(&q->timeout, next);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0e030f5..d6beeb5 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -132,8 +132,7 @@  typedef int (init_request_fn)(struct blk_mq_tag_set *set, struct request *,
 typedef void (exit_request_fn)(struct blk_mq_tag_set *set, struct request *,
 		unsigned int);
 
-typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
-		bool);
+typedef bool (busy_iter_fn)(struct request *, void *, bool);
 typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);
 typedef int (poll_fn)(struct blk_mq_hw_ctx *);
 typedef int (map_queues_fn)(struct blk_mq_tag_set *set);
@@ -322,6 +321,8 @@  bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 		busy_tag_iter_fn *fn, void *priv);
+void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
+		void *priv, bool inflight);
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_unfreeze_queue(struct request_queue *q);
 void blk_freeze_queue_start(struct request_queue *q);