[2/2] blk-stat: add a poll_size value to the request_queue struct

Message ID 1490494692-2416-3-git-send-email-sbates@raithlin.com (mailing list archive)
State New, archived

Commit Message

Stephen Bates March 26, 2017, 2:18 a.m. UTC
From: Stephen Bates <sbates@raithlin.com>

In order to bucket IO for the polling algorithm we use a sysfs entry
to set the filter value. It is signed and we will use that as follows:

 0   : No filtering. All IO is considered in stat generation
 > 0 : Filtering based on IO of exactly this size only.
 < 0 : Filtering based on IO less than or equal to -1 times this value.

Use this member to implement a new bucket function for the io polling
stats.

Signed-off-by: Stephen Bates <sbates@raithlin.com>
---
 block/blk-mq.c         | 17 +++++++++++++++--
 block/blk-sysfs.c      | 30 ++++++++++++++++++++++++++++++
 include/linux/blkdev.h |  1 +
 3 files changed, 46 insertions(+), 2 deletions(-)
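
[For illustration, a minimal userspace sketch of driving the new knob. The
device name is hypothetical; the attribute path follows from the io_poll_size
sysfs entry this patch adds.]

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/nvme0n1/queue/io_poll_size", "w");

	if (!f) {
		perror("io_poll_size");
		return 1;
	}

	/* negative value: generate poll stats from all IO of 4KB or less */
	fprintf(f, "%d\n", -4096);

	fclose(f);
	return 0;
}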

Comments

Sagi Grimberg March 28, 2017, 10:50 a.m. UTC | #1
On 26/03/17 05:18, sbates@raithlin.com wrote:
> From: Stephen Bates <sbates@raithlin.com>
>
> In order to bucket IO for the polling algorithm we use a sysfs entry
> to set the filter value. It is signed and we will use that as follows:
>
>  0   : No filtering. All IO is considered in stat generation
>  > 0 : Filtering based on IO of exactly this size only.
>  < 0 : Filtering based on IO less than or equal to -1 times this value.

I'd say that this is a fairly non-trivial semantic meaning to this...

Is there any use for the size exact match filter? If not then
I suggest we always make it (<=) in its semantics...

> Use this member to implement a new bucket function for the io polling
> stats.
>
> Signed-off-by: Stephen Bates <sbates@raithlin.com>
> ---
>  block/blk-mq.c         | 17 +++++++++++++++--
>  block/blk-sysfs.c      | 30 ++++++++++++++++++++++++++++++
>  include/linux/blkdev.h |  1 +
>  3 files changed, 46 insertions(+), 2 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 5ff66f2..5ea9481 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2327,6 +2327,18 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
>  	blk_mq_sysfs_register(q);
>  }
>
> +int blk_mq_poll_stats_bucket(const struct request *rq)
> +{
> +	int val = rq->q->poll_size;
> +
> +	if ((val == 0) ||
> +	    (val > 0 && blk_rq_bytes(rq) == val) ||
> +	    (val < 0 && blk_rq_bytes(rq) <= -val))
> +		return (int)rq_data_dir(rq);
> +
> +	return -1;
> +}
> +
>  struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>  						  struct request_queue *q)
>  {
> @@ -2338,7 +2350,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>  		goto err_exit;
>
>  	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
> -					     blk_stat_rq_ddir, 2, q);
> +					     blk_mq_poll_stats_bucket, 2, q);
>  	if (!q->poll_cb)
>  		goto err_exit;
>
> @@ -2387,9 +2399,10 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>  	q->nr_requests = set->queue_depth;
>
>  	/*
> -	 * Default to classic polling
> +	 * Default to classic polling. Default to considering all IO.
>  	 */
>  	q->poll_nsec = -1;
> +	q->poll_size = 0;
>
>  	if (set->ops->complete)
>  		blk_queue_softirq_done(q, set->ops->complete);
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index fa831cb..000d5db 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -394,6 +394,29 @@ static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
>  	return count;
>  }
>
> +static ssize_t queue_poll_size_show(struct request_queue *q, char *page)
> +{
> +	return sprintf(page, "%d\n", q->poll_size);
> +}
> +
> +static ssize_t queue_poll_size_store(struct request_queue *q, const char *page,
> +				     size_t count)
> +{
> +	int err, val;
> +
> +	if (!q->mq_ops || !q->mq_ops->poll)
> +		return -EINVAL;
> +
> +	err = kstrtoint(page, 10, &val);
> +	if (err < 0)
> +		return err;
> +
> +	q->poll_size = val;
> +
> +	return count;
> +}
> +
> +
>  static ssize_t queue_poll_show(struct request_queue *q, char *page)
>  {
>  	return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page);
> @@ -654,6 +677,12 @@ static struct queue_sysfs_entry queue_poll_entry = {
>  	.store = queue_poll_store,
>  };
>
> +static struct queue_sysfs_entry queue_poll_size_entry = {
> +	.attr = {.name = "io_poll_size", .mode = S_IRUGO | S_IWUSR },
> +	.show = queue_poll_size_show,
> +	.store = queue_poll_size_store,
> +};
> +
>  static struct queue_sysfs_entry queue_poll_delay_entry = {
>  	.attr = {.name = "io_poll_delay", .mode = S_IRUGO | S_IWUSR },
>  	.show = queue_poll_delay_show,
> @@ -710,6 +739,7 @@ static struct attribute *default_attrs[] = {
>  	&queue_dax_entry.attr,
>  	&queue_wb_lat_entry.attr,
>  	&queue_poll_delay_entry.attr,
> +	&queue_poll_size_entry.attr,
>  	NULL,
>  };
>
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 1a7dc42..7ff5388 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -517,6 +517,7 @@ struct request_queue {
>
>  	unsigned int		rq_timeout;
>  	int			poll_nsec;
> +	int			poll_size;
>
>  	struct blk_stat_callback	*poll_cb;
>  	struct blk_rq_stat	poll_stat[2];
>
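
[A minimal sketch of the simplified (<=)-only semantics Sagi suggests above,
assuming the filter collapses to a single non-negative byte threshold with 0
still meaning "no filtering". The names mirror the patch, but this variant is
illustrative, not part of the submission.]

int blk_mq_poll_stats_bucket(const struct request *rq)
{
	/* poll_size >= 0: upper bound in bytes, 0 disables filtering */
	int max = rq->q->poll_size;

	if (max == 0 || blk_rq_bytes(rq) <= max)
		return (int)rq_data_dir(rq);

	return -1;	/* outside the filter: skip this request */
}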
Stephen Bates March 28, 2017, 7:38 p.m. UTC | #2
>> In order to bucket IO for the polling algorithm we use a sysfs entry
>> to set the filter value. It is signed and we will use that as follows:
>>
>>  0   : No filtering. All IO is considered in stat generation
>>  > 0 : Filtering based on IO of exactly this size only.
>>  < 0 : Filtering based on IO less than or equal to -1 times this value.
>
> I'd say that this is a fairly non-trivial semantic meaning to this...
>
> Is there any use for the size exact match filter? If not then
> I suggest we always make it (<=) in its semantics...

Thanks for the review Sagi. I’d be OK going with <=0 as the exact match would normally be for minimal IO sizes (where <= and = are the same thing). I will see what other feedback I get and aim to do a respin soon…

Stephen
Jens Axboe March 28, 2017, 7:46 p.m. UTC | #3
On 03/28/2017 01:38 PM, Stephen Bates wrote:
>>>
>>>  In order to bucket IO for the polling algorithm we use a sysfs entry
>>>  to set the filter value. It is signed and we will use that as follows:
>>>
>>>   0   : No filtering. All IO is considered in stat generation
>>>   > 0 : Filtering based on IO of exactly this size only.
>>>   < 0 : Filtering based on IO less than or equal to -1 times this value.
>>
>> I'd say that this is a fairly non-trivial semantic meaning to this...
>>
>> Is there any use for the size exact match filter? If not then
>> I suggest we always make it (<=) in its semantics...
> 
> Thanks for the review Sagi. I’d be OK going with <=0 as the exact
> match would normally be for minimal IO sizes (where <= and = are the
> same thing). I will see what other feedback I get and aim to do a
> respin soon…

No tunables for this, please. There's absolutely no reason why we should
need it.
Stephen Bates March 28, 2017, 7:58 p.m. UTC | #4
>> Thanks for the review Sagi. I’d be OK going with <=0 as the exact
>> match would normally be for minimal IO sizes (where <= and = are the
>> same thing). I will see what other feedback I get and aim to do a
>> respin soon…
>
> No tunables for this, please. There's absolutely no reason why we should
> need it.

Jens – by this you mean you want to only bucket IO that are exactly the minimum block size supported by the underlying block device? I was envisioning we might want to relax that in certain cases (e.g. bucket 4KB and below going to a 512B device).

Stephen
Jens Axboe March 28, 2017, 8:38 p.m. UTC | #5
On 03/28/2017 01:58 PM, Stephen Bates wrote:
>>>
>>> Thanks for the review Sagi. I’d be OK going with <=0 as the exact
>>> match would normally be for minimal IO sizes (where <= and = are the
>>> same thing). I will see what other feedback I get and aim to do a
>>> respin soon…
>>
>> No tunables for this, please. There's absolutely no reason why we should
>> need it.
> 
> Jens – by this you mean you want to only bucket IO that are exactly
> the minimum block size supported by the underlying block device? I was
> envisioning we might want to relax that in certain cases (e.g. bucket
> 4KB and below going to a 512B device).

Sorry, the above was a bit terse. I think a much better solution would
be to create a number of buckets (per data direction) and do stats on
all of them. The buckets would cover a reasonable range of request
sizes. Then when you poll for a given request, we can base our timed
number on the data direction AND size of it. You can get pretty far with
a few buckets:

512b
4k
8k
16k
32k
64k
128k

and you could even have your time estimation function turn these into
something sane. Or just use a composite of buckets, if you wish.
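
[A hedged sketch of what Jens's multi-bucket scheme could look like:
log2-spaced size buckets interleaved per data direction, with out-of-range
sizes clamped into the first or last bucket. The bucket count and function
name are assumptions for illustration, not part of this patch.]

#define BLK_MQ_POLL_STATS_BKTS	16	/* 8 size classes x 2 directions */

static int blk_mq_poll_stats_bkt(const struct request *rq)
{
	int ddir, bytes, bucket;

	ddir = rq_data_dir(rq);		/* 0 = read, 1 = write */
	bytes = blk_rq_bytes(rq);

	/* ilog2(512) == 9, so 512B requests land in bucket ddir */
	bucket = ddir + 2 * (ilog2(bytes) - 9);

	if (bucket < 0)			/* sub-512B (or empty) requests */
		return ddir;
	if (bucket >= BLK_MQ_POLL_STATS_BKTS)
		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;

	return bucket;
}

[The poll-time estimation would then index its stats by this bucket function
rather than by data direction alone.]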
Stephen Bates March 28, 2017, 9:52 p.m. UTC | #6
[Retrying as my new setup secretly converted to html format without telling me. Apologies for the resend.]

>>> Thanks for the review Sagi. I’d be OK going with <=0 as the exact
>>> match would normally be for minimal IO sizes (where <= and = are the
>>> same thing). I will see what other feedback I get and aim to do a
>>> respin soon…
>>
>> No tunables for this, please. There's absolutely no reason why we should
>> need it.
>
> Jens – by this you mean you want to only bucket IO that are exactly
> the minimum block size supported by the underlying block device? I was
> envisioning we might want to relax that in certain cases (e.g. bucket
> 4KB and below going to a 512B device).

> Sorry, the above was a bit terse. I think a much better solution would
> be to create a number of buckets (per data direction) and do stats on
> all of them. The buckets would cover a reasonable range of request
> sizes. Then when you poll for a given request, we can base our timed
> number on the data direction AND size of it. You can get pretty far with
> a few buckets:
>
> 512b
> 4k
> 8k
> 16k
> 32k
> 64k
> 128k
>
> and you could even have your time estimation function turn these into
> something sane. Or just use a composite of buckets, if you wish.

I did go down this path initially but then moved away from it since we were focusing only on the smaller IO size. However I can definitely take a look at this again as I agree that it could be more useful in the long run.

I would like to keep my first patch in this series alive since I do think having the option to not bucket an IO is a useful thing to have.

I’ll take all the feedback to date and work on a v2. Thanks!

Stephen