
[3/4] blk-mq: implement hybrid poll mode for sync O_DIRECT

Message ID 1478034325-28232-4-git-send-email-axboe@fb.com (mailing list archive)
State New, archived

Commit Message

Jens Axboe Nov. 1, 2016, 9:05 p.m. UTC
This patch enables a hybrid polling mode. Instead of polling after IO
submission, we can induce an artificial delay, and then poll after that.
For example, if the IO is presumed to complete in 8 usecs from now, we
can sleep for 4 usecs, wake up, and then do our polling. This still puts
a sleep/wakeup cycle in the IO path, but instead of the wakeup happening
after the IO has completed, it'll happen before. With this hybrid
scheme, we can achieve big latency reductions while still using the same
(or less) amount of CPU.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 38 ++++++++++++++++++++++++++++++++++++++
 block/blk-sysfs.c      | 29 +++++++++++++++++++++++++++++
 block/blk.h            |  1 +
 include/linux/blkdev.h |  1 +
 4 files changed, 69 insertions(+)

Comments

Christoph Hellwig Nov. 2, 2016, 2:54 p.m. UTC | #1
On Tue, Nov 01, 2016 at 03:05:24PM -0600, Jens Axboe wrote:
> This patch enables a hybrid polling mode. Instead of polling after IO
> submission, we can induce an artificial delay, and then poll after that.
> For example, if the IO is presumed to complete in 8 usecs from now, we
> can sleep for 4 usecs, wake up, and then do our polling. This still puts
> a sleep/wakeup cycle in the IO path, but instead of the wakeup happening
> after the IO has completed, it'll happen before. With this hybrid
> scheme, we can achieve big latency reductions while still using the same
> (or less) amount of CPU.
> 
> Signed-off-by: Jens Axboe <axboe@fb.com>

This looks very nice.  I'll need to run some tests before giving the
formal ACK, though.
Ming Lei Nov. 3, 2016, 12:27 p.m. UTC | #2
On Wed, Nov 2, 2016 at 5:05 AM, Jens Axboe <axboe@fb.com> wrote:
> This patch enables a hybrid polling mode. Instead of polling after IO
> submission, we can induce an artificial delay, and then poll after that.
> For example, if the IO is presumed to complete in 8 usecs from now, we
> can sleep for 4 usecs, wake up, and then do our polling. This still puts

I guess in reality it isn't easy to figure out a perfect poll time:

- for one driver, different CPUs and different drives/disks may result in
different completion times

- for requests of different sizes, the completion time can differ too

Is there a way to figure out the poll time automatically?

> a sleep/wakeup cycle in the IO path, but instead of the wakeup happening
> after the IO has completed, it'll happen before. With this hybrid
> scheme, we can achieve big latency reductions while still using the same
> (or less) amount of CPU.
>
> Signed-off-by: Jens Axboe <axboe@fb.com>
> ---
>  block/blk-mq.c         | 38 ++++++++++++++++++++++++++++++++++++++
>  block/blk-sysfs.c      | 29 +++++++++++++++++++++++++++++
>  block/blk.h            |  1 +
>  include/linux/blkdev.h |  1 +
>  4 files changed, 69 insertions(+)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 4ef35588c299..caa55bec9411 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -302,6 +302,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
>         rq->rq_flags = 0;
>
>         clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
> +       clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
>         blk_mq_put_tag(hctx, ctx, tag);
>         blk_queue_exit(q);
>  }
> @@ -2352,11 +2353,48 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
>  }
>  EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
>
> +static void blk_mq_poll_hybrid_sleep(struct request_queue *q,
> +                                    struct request *rq)
> +{
> +       struct hrtimer_sleeper hs;
> +       ktime_t kt;
> +
> +       if (!q->poll_nsec || test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
> +               return;
> +
> +       set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
> +
> +       /*
> +        * This will be replaced with the stats tracking code, using
> +        * 'avg_completion_time / 2' as the pre-sleep target.
> +        */
> +       kt = ktime_set(0, q->poll_nsec);
> +
> +       hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> +       hrtimer_set_expires(&hs.timer, kt);
> +
> +       hrtimer_init_sleeper(&hs, current);
> +       do {
> +               if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
> +                       break;
> +               set_current_state(TASK_INTERRUPTIBLE);
> +               hrtimer_start_expires(&hs.timer, HRTIMER_MODE_REL);
> +               if (hs.task)
> +                       io_schedule();
> +               hrtimer_cancel(&hs.timer);
> +       } while (hs.task && !signal_pending(current));
> +
> +       __set_current_state(TASK_RUNNING);
> +       destroy_hrtimer_on_stack(&hs.timer);
> +}
> +
>  bool blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
>  {
>         struct request_queue *q = hctx->queue;
>         long state;
>
> +       blk_mq_poll_hybrid_sleep(q, rq);
> +
>         hctx->poll_considered++;
>
>         state = current->state;
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 5bb4648f434a..467b81c6713c 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -336,6 +336,28 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
>         return ret;
>  }
>
> +static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
> +{
> +       return queue_var_show(q->poll_nsec / 1000, page);
> +}
> +
> +static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
> +                               size_t count)
> +{
> +       unsigned long poll_usec;
> +       ssize_t ret;
> +
> +       if (!q->mq_ops || !q->mq_ops->poll)
> +               return -EINVAL;
> +
> +       ret = queue_var_store(&poll_usec, page, count);
> +       if (ret < 0)
> +               return ret;
> +
> +       q->poll_nsec = poll_usec * 1000;
> +       return ret;
> +}
> +
>  static ssize_t queue_poll_show(struct request_queue *q, char *page)
>  {
>         return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page);
> @@ -562,6 +584,12 @@ static struct queue_sysfs_entry queue_poll_entry = {
>         .store = queue_poll_store,
>  };
>
> +static struct queue_sysfs_entry queue_poll_delay_entry = {
> +       .attr = {.name = "io_poll_delay", .mode = S_IRUGO | S_IWUSR },
> +       .show = queue_poll_delay_show,
> +       .store = queue_poll_delay_store,
> +};
> +
>  static struct queue_sysfs_entry queue_wc_entry = {
>         .attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR },
>         .show = queue_wc_show,
> @@ -608,6 +636,7 @@ static struct attribute *default_attrs[] = {
>         &queue_wc_entry.attr,
>         &queue_dax_entry.attr,
>         &queue_stats_entry.attr,
> +       &queue_poll_delay_entry.attr,
>         NULL,
>  };
>
> diff --git a/block/blk.h b/block/blk.h
> index aa132dea598c..041185e5f129 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -111,6 +111,7 @@ void blk_account_io_done(struct request *req);
>  enum rq_atomic_flags {
>         REQ_ATOM_COMPLETE = 0,
>         REQ_ATOM_STARTED,
> +       REQ_ATOM_POLL_SLEPT,
>  };
>
>  /*
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index dcd8d6e8801f..6acd220dc3f3 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -502,6 +502,7 @@ struct request_queue {
>         unsigned int            request_fn_active;
>
>         unsigned int            rq_timeout;
> +       unsigned int            poll_nsec;
>         struct timer_list       timeout;
>         struct work_struct      timeout_work;
>         struct list_head        timeout_list;
> --
> 2.7.4
>
Jens Axboe Nov. 3, 2016, 1:41 p.m. UTC | #3
On 11/03/2016 06:27 AM, Ming Lei wrote:
> On Wed, Nov 2, 2016 at 5:05 AM, Jens Axboe <axboe@fb.com> wrote:
>> This patch enables a hybrid polling mode. Instead of polling after IO
>> submission, we can induce an artificial delay, and then poll after that.
>> For example, if the IO is presumed to complete in 8 usecs from now, we
>> can sleep for 4 usecs, wake up, and then do our polling. This still puts
>
> I guess in reality it isn't easy to figure out a perfect poll time:
>
> - for one driver, different CPUs and different drives/disks may result in
> different completion times
>
> - for requests of different sizes, the completion time can differ too
>
> Is there a way to figure out the poll time automatically?

Yes, it's not easy to figure out the perfect time; the point is to make a
guess that's a bit better than the current "let's poll the whole time"
approach. I suspect that for most real-world cases, you are going to be
polling for smallish IO of roughly the same size, hence the stats should
be useful.

But we could extend the tracking a bit and make it smarter.
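
A rough sketch of what that smarter, stats-driven guess might look like (the
completion_stats struct, its fields, and the min-based clamp below are
illustrative assumptions, not the actual stats tracking code referenced in
the patch comment):

#include <stdint.h>

struct completion_stats {
	uint64_t nr_samples;	/* polled completions observed so far */
	uint64_t total_ns;	/* sum of their completion times */
	uint64_t min_ns;	/* fastest completion seen */
};

/* Pre-sleep target; 0 means "no data yet, just busy poll". */
uint64_t hybrid_sleep_ns(const struct completion_stats *s)
{
	uint64_t sleep_ns;

	if (!s->nr_samples)
		return 0;

	/* Sleep for half the mean completion time, then start polling. */
	sleep_ns = (s->total_ns / s->nr_samples) / 2;

	/* Never sleep past the fastest completion we've seen. */
	if (sleep_ns > s->min_ns)
		sleep_ns = s->min_ns;

	return sleep_ns;
}
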
Bart Van Assche Nov. 3, 2016, 2:01 p.m. UTC | #4
On 11/01/2016 03:05 PM, Jens Axboe wrote:
> +static void blk_mq_poll_hybrid_sleep(struct request_queue *q,
> +				     struct request *rq)
> +{
> +	struct hrtimer_sleeper hs;
> +	ktime_t kt;
> +
> +	if (!q->poll_nsec || test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
> +		return;
> +
> +	set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
> +
> +	/*
> +	 * This will be replaced with the stats tracking code, using
> +	 * 'avg_completion_time / 2' as the pre-sleep target.
> +	 */
> +	kt = ktime_set(0, q->poll_nsec);
> +
> +	hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> +	hrtimer_set_expires(&hs.timer, kt);
> +
> +	hrtimer_init_sleeper(&hs, current);
> +	do {
> +		if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
> +			break;
> +		set_current_state(TASK_INTERRUPTIBLE);
> +		hrtimer_start_expires(&hs.timer, HRTIMER_MODE_REL);
> +		if (hs.task)
> +			io_schedule();
> +		hrtimer_cancel(&hs.timer);
> +	} while (hs.task && !signal_pending(current));
> +
> +	__set_current_state(TASK_RUNNING);
> +	destroy_hrtimer_on_stack(&hs.timer);
> +}

Hello Jens,

Will avg_completion_time/2 be a good choice for a polling interval if an 
application submits requests of varying sizes, e.g. 4 KB and 64 KB?

Can avg_completion_time be smaller than the context switch time? If so, 
do you think any other thread will be able to do any useful work after 
the timer has been started and before the timer has expired?

Thanks,

Bart.
Jens Axboe Nov. 3, 2016, 2:15 p.m. UTC | #5
On 11/03/2016 08:01 AM, Bart Van Assche wrote:
> On 11/01/2016 03:05 PM, Jens Axboe wrote:
>> +static void blk_mq_poll_hybrid_sleep(struct request_queue *q,
>> +                     struct request *rq)
>> +{
>> +    struct hrtimer_sleeper hs;
>> +    ktime_t kt;
>> +
>> +    if (!q->poll_nsec || test_bit(REQ_ATOM_POLL_SLEPT,
>> &rq->atomic_flags))
>> +        return;
>> +
>> +    set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
>> +
>> +    /*
>> +     * This will be replaced with the stats tracking code, using
>> +     * 'avg_completion_time / 2' as the pre-sleep target.
>> +     */
>> +    kt = ktime_set(0, q->poll_nsec);
>> +
>> +    hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>> +    hrtimer_set_expires(&hs.timer, kt);
>> +
>> +    hrtimer_init_sleeper(&hs, current);
>> +    do {
>> +        if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
>> +            break;
>> +        set_current_state(TASK_INTERRUPTIBLE);
>> +        hrtimer_start_expires(&hs.timer, HRTIMER_MODE_REL);
>> +        if (hs.task)
>> +            io_schedule();
>> +        hrtimer_cancel(&hs.timer);
>> +    } while (hs.task && !signal_pending(current));
>> +
>> +    __set_current_state(TASK_RUNNING);
>> +    destroy_hrtimer_on_stack(&hs.timer);
>> +}
>
> Hello Jens,
>
> Will avg_completion_time/2 be a good choice for a polling interval if an
> application submits requests of varying sizes, e.g. 4 KB and 64 KB?

Not necessarily. This is a first implementation to demonstrate what is
possible; we can definitely make it more clever. As mentioned in the
previous email, I believe most cases will be small(ish) IO of roughly
the same size, and it'll work fine for that.

One possible improvement would be to factor in the minimum times as
well; since we're assuming some level of predictability here,
incorporating the 'min' time could help. This is one of the devices
I used for testing:

# cat stats
read : samples=15199, mean=5185, min=5068, max=25616
write: samples=0, mean=0, min=-1, max=0

and when it looks like that, using mean/2 works well. If I do a
512b/64k 50/50 split, it looks like this:

# cat stats
read : samples=4896, mean=20996, min=5250, max=49721
write: samples=0, mean=0, min=-1, max=0

Using classic polling with this workload, I get 39900 IOPS at 100%
CPU load. With hybrid poll enabled, I get 37700 IOPS at 68% CPU. With
the default settings, hybrid polling ends up a bit slower than pure
polling (not unexpected), but at a higher level of efficiency, and that
holds even for this 512b/64k split case.

For comparison, without polling, we run at 25800 IOPS (at 39% CPU).

> Can avg_completion_time be smaller than the context switch time? If so,
> do you think any other thread will be able to do any useful work after
> the timer has been started and before the timer has expired?

There are definitely cases where we want to just keep busy polling. I
didn't add any minimum yet, but yes, you can imagine cases where we'd
blow our budget by doing the sleep up front. For those, we just want to
busy poll.
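
For reference, here is a minimal userspace sketch of how the pieces fit
together from the submission side. The device path, the 4 usec delay, and
the assumption that queue/io_poll is already enabled are illustrative, not
dictated by the patch; whether a plain sync O_DIRECT read polls on its own
or needs the explicit RWF_HIPRI hint depends on the kernel version:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_HIPRI
#define RWF_HIPRI	0x00000001	/* high priority: poll for completion */
#endif

int main(void)
{
	const char *delay = "/sys/block/nvme0n1/queue/io_poll_delay";
	struct iovec iov;
	void *buf;
	int fd;

	/* io_poll_delay takes microseconds; ask for a 4 usec pre-poll sleep. */
	fd = open(delay, O_WRONLY);
	if (fd >= 0) {
		if (write(fd, "4", 1) != 1)
			perror("write io_poll_delay");
		close(fd);
	}

	/* O_DIRECT needs an aligned buffer; read 4 KB from the device start. */
	if (posix_memalign(&buf, 4096, 4096))
		return 1;
	iov.iov_base = buf;
	iov.iov_len = 4096;

	fd = open("/dev/nvme0n1", O_RDONLY | O_DIRECT);
	if (fd < 0) {
		perror("open /dev/nvme0n1");
		return 1;
	}

	if (preadv2(fd, &iov, 1, 0, RWF_HIPRI) < 0)
		perror("preadv2");

	close(fd);
	free(buf);
	return 0;
}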

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4ef35588c299..caa55bec9411 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -302,6 +302,7 @@  static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 	rq->rq_flags = 0;
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
 	blk_mq_put_tag(hctx, ctx, tag);
 	blk_queue_exit(q);
 }
@@ -2352,11 +2353,48 @@  void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
+static void blk_mq_poll_hybrid_sleep(struct request_queue *q,
+				     struct request *rq)
+{
+	struct hrtimer_sleeper hs;
+	ktime_t kt;
+
+	if (!q->poll_nsec || test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+		return;
+
+	set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
+
+	/*
+	 * This will be replaced with the stats tracking code, using
+	 * 'avg_completion_time / 2' as the pre-sleep target.
+	 */
+	kt = ktime_set(0, q->poll_nsec);
+
+	hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_set_expires(&hs.timer, kt);
+
+	hrtimer_init_sleeper(&hs, current);
+	do {
+		if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
+			break;
+		set_current_state(TASK_INTERRUPTIBLE);
+		hrtimer_start_expires(&hs.timer, HRTIMER_MODE_REL);
+		if (hs.task)
+			io_schedule();
+		hrtimer_cancel(&hs.timer);
+	} while (hs.task && !signal_pending(current));
+
+	__set_current_state(TASK_RUNNING);
+	destroy_hrtimer_on_stack(&hs.timer);
+}
+
 bool blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
 	struct request_queue *q = hctx->queue;
 	long state;
 
+	blk_mq_poll_hybrid_sleep(q, rq);
+
 	hctx->poll_considered++;
 
 	state = current->state;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 5bb4648f434a..467b81c6713c 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -336,6 +336,28 @@  queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 	return ret;
 }
 
+static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->poll_nsec / 1000, page);
+}
+
+static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
+				size_t count)
+{
+	unsigned long poll_usec;
+	ssize_t ret;
+
+	if (!q->mq_ops || !q->mq_ops->poll)
+		return -EINVAL;
+
+	ret = queue_var_store(&poll_usec, page, count);
+	if (ret < 0)
+		return ret;
+
+	q->poll_nsec = poll_usec * 1000;
+	return ret;
+}
+
 static ssize_t queue_poll_show(struct request_queue *q, char *page)
 {
 	return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page);
@@ -562,6 +584,12 @@  static struct queue_sysfs_entry queue_poll_entry = {
 	.store = queue_poll_store,
 };
 
+static struct queue_sysfs_entry queue_poll_delay_entry = {
+	.attr = {.name = "io_poll_delay", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_poll_delay_show,
+	.store = queue_poll_delay_store,
+};
+
 static struct queue_sysfs_entry queue_wc_entry = {
 	.attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_wc_show,
@@ -608,6 +636,7 @@  static struct attribute *default_attrs[] = {
 	&queue_wc_entry.attr,
 	&queue_dax_entry.attr,
 	&queue_stats_entry.attr,
+	&queue_poll_delay_entry.attr,
 	NULL,
 };
 
diff --git a/block/blk.h b/block/blk.h
index aa132dea598c..041185e5f129 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -111,6 +111,7 @@  void blk_account_io_done(struct request *req);
 enum rq_atomic_flags {
 	REQ_ATOM_COMPLETE = 0,
 	REQ_ATOM_STARTED,
+	REQ_ATOM_POLL_SLEPT,
 };
 
 /*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index dcd8d6e8801f..6acd220dc3f3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -502,6 +502,7 @@  struct request_queue {
 	unsigned int		request_fn_active;
 
 	unsigned int		rq_timeout;
+	unsigned int		poll_nsec;
 	struct timer_list	timeout;
 	struct work_struct	timeout_work;
 	struct list_head	timeout_list;