
[3/3] blk-mq: unify hctx delay_work and run_work

Message ID 1491839696-24783-4-git-send-email-axboe@fb.com (mailing list archive)
State New, archived

Commit Message

Jens Axboe April 10, 2017, 3:54 p.m. UTC
The only difference between ->run_work and ->delay_work is that
the latter is used to defer running a queue. This is done by
marking the queue stopped and scheduling ->delay_work to run
sometime in the future. While the queue is stopped, direct runs
or runs through ->run_work will not run the queue.

If we combine the handlers, then we need to handle two things:

1) If a delayed/stopped run is scheduled, then we should not run
   the queue before that has been completed.
2) If a queue is delayed/stopped, the handler needs to restart
   the queue. Normally a run of a queue with the stopped bit set
   would be a no-op.

Case 1 is handled by modifying the deadline of a currently
pending queue run to the one set by the caller of
blk_mq_delay_queue(). Subsequent attempts to schedule a queue
run will find the work item already pending, and direct runs
will see a stopped queue as before.

Case 2 is handled by adding a new bit, BLK_MQ_S_START_ON_RUN,
that tells the work handler to clear the stopped state and run
the queue.
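
Condensed from the patch below, the delayed-run path becomes:

	blk_mq_stop_hw_queue(hctx);
	set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
					&hctx->run_work,
					msecs_to_jiffies(msecs));

and the work handler only lets a stopped queue through if
BLK_MQ_S_START_ON_RUN is set, clearing both bits before the run.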

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       |  4 +---
 block/blk-mq.c         | 34 ++++++++++++++++++++++------------
 include/linux/blk-mq.h |  3 +--
 3 files changed, 24 insertions(+), 17 deletions(-)

Comments

Bart Van Assche April 10, 2017, 4:21 p.m. UTC | #1
On Mon, 2017-04-10 at 09:54 -0600, Jens Axboe wrote:
> @@ -1281,27 +1280,39 @@ static void blk_mq_run_work_fn(struct work_struct *work)
>  	struct blk_mq_hw_ctx *hctx;
>  
>  	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
> -	__blk_mq_run_hw_queue(hctx);
> -}
>  
> -static void blk_mq_delay_work_fn(struct work_struct *work)
> -{
> -	struct blk_mq_hw_ctx *hctx;
> +	/*
> +	 * If we are stopped, don't run the queue. The exception is if
> +	 * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear
> +	 * the STOPPED bit and run it.
> +	 */
> +	if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) {
> +		if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state))
> +			return;
>  
> -	hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
> +		clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
> +		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
> +	}
>  
> -	if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
> -		__blk_mq_run_hw_queue(hctx);
> +	__blk_mq_run_hw_queue(hctx);
>  }
>  
> +
>  void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
>  {
>  	if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
>  		return;
>  
> +	/*
> +	 * Stop the hw queue, then modify currently delayed work.
> +	 * This should prevent us from running the queue prematurely.
> +	 * Mark the queue as auto-clearing STOPPED when it runs.
> +	 */
>  	blk_mq_stop_hw_queue(hctx);
> -	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
> -			&hctx->delay_work, msecs_to_jiffies(msecs));
> +	set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
> +	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
> +					&hctx->run_work,
> +					msecs_to_jiffies(msecs));
>  }
>  EXPORT_SYMBOL(blk_mq_delay_queue);

Hello Jens,

Is it possible for a block driver to call blk_mq_delay_queue() while
blk_mq_run_work_fn() is running? Can that result in BLK_MQ_S_STOPPED
and BLK_MQ_S_START_ON_RUN being checked by blk_mq_run_work_fn() after
blk_mq_delay_queue() has set BLK_MQ_S_STOPPED and before it has set
BLK_MQ_S_START_ON_RUN?
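
Concretely, the interleaving in question would look roughly like
this (a sketch of the ordering, not actual code):

	/*
	 * blk_mq_delay_queue()            blk_mq_run_work_fn()
	 *
	 * blk_mq_stop_hw_queue(hctx);
	 *   (sets BLK_MQ_S_STOPPED)
	 *                                 test_bit(STOPPED) -> true
	 *                                 test_bit(START_ON_RUN) -> false
	 *                                 return, queue not run
	 * set_bit(BLK_MQ_S_START_ON_RUN);
	 */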

Thanks,

Bart.
Jens Axboe April 10, 2017, 4:53 p.m. UTC | #2
On 04/10/2017 10:21 AM, Bart Van Assche wrote:
> On Mon, 2017-04-10 at 09:54 -0600, Jens Axboe wrote:
>> [ patch hunk snipped; see the full patch below ]
> 
> Hello Jens,
> 
> Is it possible for a block driver to call blk_mq_delay_queue() while
> blk_mq_run_work_fn() is running? Can that result in BLK_MQ_S_STOPPED
> and BLK_MQ_S_START_ON_RUN being checked by blk_mq_run_work_fn() after
> blk_mq_delay_queue() has set BLK_MQ_S_STOPPED and before it has set
> BLK_MQ_S_START_ON_RUN?

Yeah, I don't think it's bulletproof. I just looked at the in-kernel
users, and there's just one, nvme/fc.c. And it looks really buggy,
since it manually stops _all_ queues, then delays the one hw queue.
So we'll end up with potentially a bunch of stopped queues, and only
one getting restarted.
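
Roughly this pattern, as a condensed sketch rather than the
actual driver code:

	blk_mq_stop_hw_queues(q);         /* stops every hctx of q */
	blk_mq_delay_queue(hctx, msecs);  /* later restarts only this hctx */

Every other hctx stays stopped, with nothing scheduled to start
it again.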

I think for blk_mq_delay_queue(), what we really care about is "this
queue has to run again sometime in the future". If that happens
much sooner than asked for, that's OK; the caller will just delay
again if it needs to. For most cases, we'll be close.

Obviously we can't have ordering issues and end up in a bad state;
we have to prevent that.

I'll fiddle with this a bit more.
Jens Axboe April 10, 2017, 5:23 p.m. UTC | #3
On 04/10/2017 10:53 AM, Jens Axboe wrote:
> On 04/10/2017 10:21 AM, Bart Van Assche wrote:
>> On Mon, 2017-04-10 at 09:54 -0600, Jens Axboe wrote:
>>> [ patch hunk snipped; see the full patch below ]
>>
>> Hello Jens,
>>
>> Is it possible for a block driver to call blk_mq_delay_queue() while
>> blk_mq_run_work_fn() is running? Can that result in BLK_MQ_S_STOPPED
>> and BLK_MQ_S_START_ON_RUN being checked by blk_mq_run_work_fn() after
>> blk_mq_delay_queue() has set BLK_MQ_S_STOPPED and before it has set
>> BLK_MQ_S_START_ON_RUN?
> 
> Yeah, I don't think it's bulletproof. I just looked at the in-kernel
> users, and there's just one, nvme/fc.c. And it looks really buggy,
> since it manually stops _all_ queues, then delays the one hw queue.
> So we'll end up with potentially a bunch of stopped queues, and only
> one getting restarted.
> 
> I think for blk_mq_delay_queue(), what we really care about is "this
> queue has to run again sometime in the future". If that happens
> much sooner than asked for, that's OK; the caller will just delay
> again if it needs to. For most cases, we'll be close.
> 
> Obviously we can't have ordering issues and end up in a bad state;
> we have to prevent that.
> 
> I'll fiddle with this a bit more.

Spent a bit of time looking at the workqueue code. Looks like we're
guaranteed that we'll have at least one run of the handler after
calling kblockd_mod_delayed_work_on(). If the handler is currently
running, then we will successfully queue a new invocation. That's the
troublesome case. If it's not currently running, we just push the run
sometime into the future. In both cases, we know it'll run _after_
setting BLK_MQ_S_START_ON_RUN, which is the important part.
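
In other words, sketching the ordering that matters:

	set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
					&hctx->run_work,
					msecs_to_jiffies(msecs));
	/*
	 * If run_work was idle, this arms the timer; if the handler was
	 * mid-run, it queues one more invocation. Either way, the next
	 * run begins after the set_bit() above and will observe it.
	 */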

Hence I think the patch is fine as-is. Let me know if you disagree!

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index bffb8640346b..4f0104afa848 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -268,10 +268,8 @@  void blk_sync_queue(struct request_queue *q)
 		struct blk_mq_hw_ctx *hctx;
 		int i;
 
-		queue_for_each_hw_ctx(q, hctx, i) {
+		queue_for_each_hw_ctx(q, hctx, i)
 			cancel_delayed_work_sync(&hctx->run_work);
-			cancel_delayed_work_sync(&hctx->delay_work);
-		}
 	} else {
 		cancel_delayed_work_sync(&q->delay_work);
 	}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7afba6ab5a96..e97ed8e7f359 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1223,7 +1223,6 @@  EXPORT_SYMBOL(blk_mq_queue_stopped);
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
 	cancel_delayed_work(&hctx->run_work);
-	cancel_delayed_work(&hctx->delay_work);
 	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
 }
 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
@@ -1281,27 +1280,39 @@  static void blk_mq_run_work_fn(struct work_struct *work)
 	struct blk_mq_hw_ctx *hctx;
 
 	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
-	__blk_mq_run_hw_queue(hctx);
-}
 
-static void blk_mq_delay_work_fn(struct work_struct *work)
-{
-	struct blk_mq_hw_ctx *hctx;
+	/*
+	 * If we are stopped, don't run the queue. The exception is if
+	 * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear
+	 * the STOPPED bit and run it.
+	 */
+	if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) {
+		if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state))
+			return;
 
-	hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
+		clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
+		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+	}
 
-	if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
-		__blk_mq_run_hw_queue(hctx);
+	__blk_mq_run_hw_queue(hctx);
 }
 
+
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 {
 	if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
 		return;
 
+	/*
+	 * Stop the hw queue, then modify currently delayed work.
+	 * This should prevent us from running the queue prematurely.
+	 * Mark the queue as auto-clearing STOPPED when it runs.
+	 */
 	blk_mq_stop_hw_queue(hctx);
-	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
-			&hctx->delay_work, msecs_to_jiffies(msecs));
+	set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
+	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+					&hctx->run_work,
+					msecs_to_jiffies(msecs));
 }
 EXPORT_SYMBOL(blk_mq_delay_queue);
 
@@ -1886,7 +1897,6 @@  static int blk_mq_init_hctx(struct request_queue *q,
 		node = hctx->numa_node = set->numa_node;
 
 	INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
-	INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
 	spin_lock_init(&hctx->lock);
 	INIT_LIST_HEAD(&hctx->dispatch);
 	hctx->queue = q;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2b4573a9ccf4..7a114b7b943c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -51,8 +51,6 @@  struct blk_mq_hw_ctx {
 
 	atomic_t		nr_active;
 
-	struct delayed_work	delay_work;
-
 	struct hlist_node	cpuhp_dead;
 	struct kobject		kobj;
 
@@ -160,6 +158,7 @@  enum {
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
 	BLK_MQ_S_TAG_WAITING	= 3,
+	BLK_MQ_S_START_ON_RUN	= 4,
 
 	BLK_MQ_MAX_DEPTH	= 10240,