blk-mq: avoid repeatedly scheduling the same work to run hardware queue

Message ID 1573771884-38879-1-git-send-email-longli@linuxonhyperv.com (mailing list archive)
State New, archived
Series blk-mq: avoid repeatedly scheduling the same work to run hardware queue

Commit Message

Long Li Nov. 14, 2019, 10:51 p.m. UTC
From: Long Li <longli@microsoft.com>

The SCSI layer calls blk_mq_run_hw_queues() in scsi_end_request() for every
completed I/O. blk_mq_run_hw_queues() in turn schedules work to run the
hardware queues.

The actual work is queued with mod_delayed_work_on(), and this function turns
out to be expensive in locking and CPU usage when the I/O workload runs at a
high queue depth. Most of these calls are unnecessary, because the queue is
already scheduled to run and has not run yet.

This patch addresses the problem by not scheduling the work when it is
already scheduled.

Benchmark results:
The following tests were run on a RAM-backed virtual disk on Hyper-V, with 8
fio jobs doing 4k random read I/O. The numbers are IOPS.

queue_depth	pre-patch	after-patch	improvement
16		190k		190k		0%
64		235k		240k		2%
256		180k		256k		42%
1024		156k		250k		60%

Signed-off-by: Long Li <longli@microsoft.com>
---
 block/blk-mq.c         | 12 ++++++++++++
 include/linux/blk-mq.h |  1 +
 2 files changed, 13 insertions(+)
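
For readers less familiar with the hctx->state flags, the core of the change
is an "already queued" guard around the kblockd run work. Below is a minimal
sketch of the queueing side, using the names from the diff; it is illustrative
only, is meant to be read in the context of block/blk-mq.c (it does not build
standalone), and uses test_and_set_bit() as the atomic equivalent of the
separate test_bit()/set_bit() pair in the patch:

	/*
	 * Skip queueing the run work if it is already pending.
	 * test_and_set_bit() returns the previous value of the bit, so a
	 * nonzero return means another caller has already queued the work.
	 */
	if (test_and_set_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state))
		return;

	/* No work pending: queue an immediate (zero-delay) run. */
	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
				    &hctx->run_work, 0);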

Comments

Damien Le Moal Nov. 15, 2019, 2:13 a.m. UTC | #1
On 2019/11/15 7:51, longli@linuxonhyperv.com wrote:
> From: Long Li <longli@microsoft.com>
> 
> SCSI layer calls blk_mq_run_hw_queues() in scsi_end_request(), for every
> completed I/O. blk_mq_run_hw_queues() in turn schedules some works to run
> the hardware queues.
> 
> The actual work is queued by mod_delayed_work_on(), it turns out the cost of
> this function is high on locking and CPU usage, when the I/O workload has
> high queue depth. Most of these calls are not necessary since the queue is
> already scheduled to run, and has not run yet.
> 
> This patch tries to solve this problem by avoiding scheduling work when it's
> already scheduled.
> 
> Benchmark results:
> The following tests are run on a RAM backed virtual disk on Hyper-V, with 8
> FIO jobs with 4k random read I/O. The test numbers are for IOPS.
> 
> queue_depth	pre-patch	after-patch	improvement
> 16		190k		190k		0%
> 64		235k		240k		2%
> 256		180k		256k		42%
> 1024		156k		250k		60%
> 
> Signed-off-by: Long Li <longli@microsoft.com>
> ---
>  block/blk-mq.c         | 12 ++++++++++++
>  include/linux/blk-mq.h |  1 +
>  2 files changed, 13 insertions(+)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index ec791156e9cc..a882bd65167a 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1476,6 +1476,16 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
>  		put_cpu();
>  	}
>  
> +	/*
> +	 * Queue a work to run queue. If this is a non-delay run and the
> +	 * work is already scheduled, avoid scheduling the same work again.
> +	 */
> +	if (!msecs) {
> +		if (test_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state))
> +			return;

With this change, if the kblockd work is already scheduled with a delay,
then the current no-delay run request will incur a delay because
kblockd_mod_delayed_work_on() is not called, implying that
__queue_delayed_work() does not execute __queue_work() as mandated by
the 0 delay. The work is *not* started immediately.

While your results show improvements in IOPS at high queue depth, doesn't
this change degrade IOPS, and especially latency, at low queue depth?

> +		set_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state);
> +	}
> +
>  	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
>  				    msecs_to_jiffies(msecs));
>  }
> @@ -1561,6 +1571,7 @@ void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
>  	cancel_delayed_work(&hctx->run_work);
>  
>  	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
> +	clear_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state);
>  }
>  EXPORT_SYMBOL(blk_mq_stop_hw_queue);
>  
> @@ -1626,6 +1637,7 @@ static void blk_mq_run_work_fn(struct work_struct *work)
>  	struct blk_mq_hw_ctx *hctx;
>  
>  	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
> +	clear_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state);
>  
>  	/*
>  	 * If we are stopped, don't run the queue.
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 0bf056de5cc3..98269d3fd141 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -234,6 +234,7 @@ enum {
>  	BLK_MQ_S_STOPPED	= 0,
>  	BLK_MQ_S_TAG_ACTIVE	= 1,
>  	BLK_MQ_S_SCHED_RESTART	= 2,
> +	BLK_MQ_S_WORK_QUEUED	= 3,
>  
>  	BLK_MQ_MAX_DEPTH	= 10240,
>  
>
Long Li Nov. 15, 2019, 11:25 p.m. UTC | #2
>Subject: Re: [PATCH] blk-mq: avoid repeatedly scheduling the same work to
>run hardware queue
>
>On 2019/11/15 7:51, longli@linuxonhyperv.com wrote:
>> From: Long Li <longli@microsoft.com>
>>
>> SCSI layer calls blk_mq_run_hw_queues() in scsi_end_request(), for
>> every completed I/O. blk_mq_run_hw_queues() in turn schedules some
>> works to run the hardware queues.
>>
>> The actual work is queued by mod_delayed_work_on(), it turns out the
>> cost of this function is high on locking and CPU usage, when the I/O
>> workload has high queue depth. Most of these calls are not necessary
>> since the queue is already scheduled to run, and has not run yet.
>>
>> This patch tries to solve this problem by avoiding scheduling work
>> when it's already scheduled.
>>
>> Benchmark results:
>> The following tests are run on a RAM backed virtual disk on Hyper-V,
>> with 8 FIO jobs with 4k random read I/O. The test numbers are for IOPS.
>>
>> queue_depth	pre-patch	after-patch	improvement
>> 16		190k		190k		0%
>> 64		235k		240k		2%
>> 256		180k		256k		42%
>> 1024		156k		250k		60%
>>
>> Signed-off-by: Long Li <longli@microsoft.com>
>> ---
>>  block/blk-mq.c         | 12 ++++++++++++
>>  include/linux/blk-mq.h |  1 +
>>  2 files changed, 13 insertions(+)
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c index
>> ec791156e9cc..a882bd65167a 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -1476,6 +1476,16 @@ static void
>__blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
>>  		put_cpu();
>>  	}
>>
>> +	/*
>> +	 * Queue a work to run queue. If this is a non-delay run and the
>> +	 * work is already scheduled, avoid scheduling the same work again.
>> +	 */
>> +	if (!msecs) {
>> +		if (test_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state))
>> +			return;
>
>With this change, if the kblockd work is already scheduled with a delay, then
>the current no-delay run request will incur a delay because
>kblockd_mod_delayed_work_on() is not called, implying that
>__queue_delayed_work() does not execute __queue_work() as mandated
>by the 0 delay. The work is *not* started immediately.

BLK_MQ_S_WORK_QUEUED is not touched when the kblockd work is queued with a delay, so subsequent non-delayed runs will still be scheduled immediately.

But I think you have raised a valid point for delayed runs. Consider the following sequence of three calls in quick succession:

1. __blk_mq_delay_run_hw_queue() with no delay (this sets BLK_MQ_S_WORK_QUEUED)
2. __blk_mq_delay_run_hw_queue() with a delay (this modifies the kblockd work if it has not fired yet)
3. __blk_mq_delay_run_hw_queue() with no delay (this does nothing, since BLK_MQ_S_WORK_QUEUED is already set)

With this sequence, the queue may not run until the delayed work from step 2 fires. Ideally it should run immediately at step 3.

I will send v2 to address this.
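
One possible shape of such a fix, sketched purely for illustration (the
actual v2 may take a different approach, and races between concurrent callers
are ignored here for brevity): a delayed run also drops the "already queued"
hint when it re-arms the work, so that a later no-delay run is not absorbed
by the delayed timer.

	/* Illustrative sketch only, in the context of __blk_mq_delay_run_hw_queue(). */
	if (!msecs) {
		if (test_and_set_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state))
			return;
	} else {
		/*
		 * A delayed run pushes the pending work out, so clear the
		 * hint; a later no-delay run must still be able to pull the
		 * work forward to run immediately.
		 */
		clear_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state);
	}

	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
				    msecs_to_jiffies(msecs));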

>
>While your results show improvements of IOPS at high queue depth, doesn't
>this change degrade IOPS and especially latency at low queue depth ?

Here are additional tests done in Azure and Hyper-V with the patch:
NVMe showed little difference at all queue depths (Samsung P983 and P963).
SCSI showed little difference when queue depth < 64 (RAM disk and local VHD).


Long

>
>> +		set_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state);
>> +	}
>> +
>>  	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
>&hctx->run_work,
>>  				    msecs_to_jiffies(msecs));
>>  }
>> @@ -1561,6 +1571,7 @@ void blk_mq_stop_hw_queue(struct
>blk_mq_hw_ctx *hctx)
>>  	cancel_delayed_work(&hctx->run_work);
>>
>>  	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
>> +	clear_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state);
>>  }
>>  EXPORT_SYMBOL(blk_mq_stop_hw_queue);
>>
>> @@ -1626,6 +1637,7 @@ static void blk_mq_run_work_fn(struct
>work_struct *work)
>>  	struct blk_mq_hw_ctx *hctx;
>>
>>  	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
>> +	clear_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state);
>>
>>  	/*
>>  	 * If we are stopped, don't run the queue.
>> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index
>> 0bf056de5cc3..98269d3fd141 100644
>> --- a/include/linux/blk-mq.h
>> +++ b/include/linux/blk-mq.h
>> @@ -234,6 +234,7 @@ enum {
>>  	BLK_MQ_S_STOPPED	= 0,
>>  	BLK_MQ_S_TAG_ACTIVE	= 1,
>>  	BLK_MQ_S_SCHED_RESTART	= 2,
>> +	BLK_MQ_S_WORK_QUEUED	= 3,
>>
>>  	BLK_MQ_MAX_DEPTH	= 10240,
>>
>>
>
>
>--
>Damien Le Moal
>Western Digital Research

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ec791156e9cc..a882bd65167a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1476,6 +1476,16 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
 		put_cpu();
 	}
 
+	/*
+	 * Queue a work to run queue. If this is a non-delay run and the
+	 * work is already scheduled, avoid scheduling the same work again.
+	 */
+	if (!msecs) {
+		if (test_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state))
+			return;
+		set_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state);
+	}
+
 	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
 				    msecs_to_jiffies(msecs));
 }
@@ -1561,6 +1571,7 @@ void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 	cancel_delayed_work(&hctx->run_work);
 
 	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
+	clear_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state);
 }
 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
 
@@ -1626,6 +1637,7 @@ static void blk_mq_run_work_fn(struct work_struct *work)
 	struct blk_mq_hw_ctx *hctx;
 
 	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
+	clear_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state);
 
 	/*
 	 * If we are stopped, don't run the queue.
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0bf056de5cc3..98269d3fd141 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -234,6 +234,7 @@ enum {
 	BLK_MQ_S_STOPPED	= 0,
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
+	BLK_MQ_S_WORK_QUEUED	= 3,
 
 	BLK_MQ_MAX_DEPTH	= 10240,
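
Taken together, the three blk-mq.c hunks give the new flag a simple
lifecycle, condensed below purely as a reading aid (names are from the patch;
this is not a literal copy of the functions). Note that blk_mq_run_work_fn()
clears the bit before running the queue, presumably so that a run requested
while the queue is still being processed can queue a fresh work item instead
of being silently skipped.

	/* 1. Queueing a no-delay run: skip if the run work is already pending. */
	if (!msecs) {
		if (test_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state))
			return;
		set_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state);
	}
	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
				    msecs_to_jiffies(msecs));

	/* 2. blk_mq_run_work_fn(): clear the hint first, then check
	 *    BLK_MQ_S_STOPPED and run the hardware queue. */
	clear_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state);

	/* 3. blk_mq_stop_hw_queue(): the run work is cancelled, so drop the hint. */
	cancel_delayed_work(&hctx->run_work);
	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
	clear_bit(BLK_MQ_S_WORK_QUEUED, &hctx->state);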