[v3,2/2] blk-mq: Add a polling specific stats function

Message ID 1491567843-26190-3-git-send-email-sbates@raithlin.com (mailing list archive)
State New, archived

Commit Message

Stephen Bates April 7, 2017, 12:24 p.m. UTC
From: Stephen Bates <sbates@raithlin.com>

Rather than bucketing IO statistics based on direction only, we also
bucket based on the IO size. This leads to improved polling
performance. Update the bucket callback function and use it in the
polling latency estimation.

Signed-off-by: Stephen Bates <sbates@raithlin.com>
---
 block/blk-mq.c | 45 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 35 insertions(+), 10 deletions(-)

Comments

Omar Sandoval April 20, 2017, 8:07 p.m. UTC | #1
On Fri, Apr 07, 2017 at 06:24:03AM -0600, sbates@raithlin.com wrote:
> From: Stephen Bates <sbates@raithlin.com>
> 
> Rather than bucketing IO statistics based on direction only, we also
> bucket based on the IO size. This leads to improved polling
> performance. Update the bucket callback function and use it in the
> polling latency estimation.
> 
> Signed-off-by: Stephen Bates <sbates@raithlin.com>

Hey, Stephen, just taking a look at this now. Comments below.

> ---
>  block/blk-mq.c | 45 +++++++++++++++++++++++++++++++++++----------
>  1 file changed, 35 insertions(+), 10 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 061fc2c..5fd376b 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -42,6 +42,25 @@ static LIST_HEAD(all_q_list);
>  static void blk_mq_poll_stats_start(struct request_queue *q);
>  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
>  
> +/* Must be consistent with function below */
> +#define BLK_MQ_POLL_STATS_BKTS 16
> +static int blk_mq_poll_stats_bkt(const struct request *rq)
> +{
> +	int ddir, bytes, bucket;
> +
> +	ddir = blk_stat_rq_ddir(rq);

No need to call the wrapper function here, we can use rq_data_dir()
directly.

> +	bytes = blk_rq_bytes(rq);
> +
> +	bucket = ddir + 2*(ilog2(bytes) - 9);
> +
> +	if (bucket < 0)
> +		return -1;
> +	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
> +		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
> +
> +	return bucket;
> +}

Nitpicking here, but defining things in terms of the number of size
buckets seems more natural to me. How about something like this
(untested)? Note that this obviates the need for patch 1.

#define BLK_MQ_POLL_STATS_SIZE_BKTS 8
static unsigned int blk_mq_poll_stats_bkt(const struct request *rq)
{
	unsigned int size_bucket;

	size_bucket = clamp(ilog2(blk_rq_bytes(rq)) - 9, 0,
			    BLK_MQ_POLL_STATS_SIZE_BKTS - 1);
	return 2 * size_bucket + rq_data_dir(rq);
}
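
To make the mapping concrete (numbers worked out by hand from the function
above, not from a test run): 512 B requests land in size bucket 0,
everything of 64 KiB and up shares size bucket 7, and read/write are
interleaved:

	512 B  read/write -> 2 * 0 + 0/1 = bucket 0/1
	4 KiB  read/write -> 2 * 3 + 0/1 = bucket 6/7
	64 KiB read/write -> 2 * 7 + 0/1 = bucket 14/15 (and all larger sizes)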

>  /*
>   * Check if any of the ctx's have pending work in this hardware queue
>   */
> @@ -2245,7 +2264,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>  	q->mq_ops = set->ops;
>  
>  	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
> -					     blk_stat_rq_ddir, 2, q);
> +					     blk_mq_poll_stats_bkt,
> +					     BLK_MQ_POLL_STATS_BKTS, q);

With the above change, this would become 2 * BLK_MQ_POLL_STATS_SIZE_BKTS.
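
That is, something along these lines (an untested sketch):

	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
					     blk_mq_poll_stats_bkt,
					     2 * BLK_MQ_POLL_STATS_SIZE_BKTS,
					     q);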

Jens Axboe April 20, 2017, 8:16 p.m. UTC | #2
On 04/20/2017 02:07 PM, Omar Sandoval wrote:
> On Fri, Apr 07, 2017 at 06:24:03AM -0600, sbates@raithlin.com wrote:
>> From: Stephen Bates <sbates@raithlin.com>
>>
>> Rather than bucketing IO statistics based on direction only, we also
>> bucket based on the IO size. This leads to improved polling
>> performance. Update the bucket callback function and use it in the
>> polling latency estimation.
>>
>> Signed-off-by: Stephen Bates <sbates@raithlin.com>
> 
> Hey, Stephen, just taking a look at this now. Comments below.
> 
>> ---
>>  block/blk-mq.c | 45 +++++++++++++++++++++++++++++++++++----------
>>  1 file changed, 35 insertions(+), 10 deletions(-)
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 061fc2c..5fd376b 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -42,6 +42,25 @@ static LIST_HEAD(all_q_list);
>>  static void blk_mq_poll_stats_start(struct request_queue *q);
>>  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
>>  
>> +/* Must be consistent with function below */
>> +#define BLK_MQ_POLL_STATS_BKTS 16
>> +static int blk_mq_poll_stats_bkt(const struct request *rq)
>> +{
>> +	int ddir, bytes, bucket;
>> +
>> +	ddir = blk_stat_rq_ddir(rq);
> 
> No need to call the wrapper function here, we can use rq_data_dir()
> directly.
> 
>> +	bytes = blk_rq_bytes(rq);
>> +
>> +	bucket = ddir + 2*(ilog2(bytes) - 9);
>> +
>> +	if (bucket < 0)
>> +		return -1;
>> +	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
>> +		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
>> +
>> +	return bucket;
>> +}
> 
> Nitpicking here, but defining things in terms of the number of size
> buckets seems more natural to me. How about something like this
> (untested)? Note that this obviates the need for patch 1.
> 
> #define BLK_MQ_POLL_STATS_SIZE_BKTS 8
> static unsigned int blk_mq_poll_stats_bkt(const struct request *rq)
> {
> 	unsigned int size_bucket;
> 
> 	size_bucket = clamp(ilog2(blk_rq_bytes(rq)) - 9, 0,
> 			    BLK_MQ_POLL_STATS_SIZE_BKTS - 1);
> 	return 2 * size_bucket + rq_data_dir(rq);
> }

As I wrote in an earlier reply, it would be a lot cleaner to just have
the buckets be:

	buckets[2][BUCKETS_PER_RW];

and not have to do weird math based on both size and data direction.
Just have it return the bucket index based on size, and have the caller
do:

	bucket[rq_data_dir(rq)][bucket_index];
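
Roughly, an untested sketch of that alternative (names are illustrative):

#define BLK_MQ_POLL_STATS_BKTS_PER_RW 8
struct blk_rq_stat poll_stat[2][BLK_MQ_POLL_STATS_BKTS_PER_RW];

/* the callback reports only the size bucket... */
static unsigned int blk_mq_poll_stats_size_bkt(const struct request *rq)
{
	return clamp(ilog2(blk_rq_bytes(rq)) - 9, 0,
		     BLK_MQ_POLL_STATS_BKTS_PER_RW - 1);
}

/* ...and the caller folds in the data direction itself */
stat = &q->poll_stat[rq_data_dir(rq)][blk_mq_poll_stats_size_bkt(rq)];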

Omar Sandoval April 20, 2017, 8:20 p.m. UTC | #3
On Thu, Apr 20, 2017 at 02:16:04PM -0600, Jens Axboe wrote:
> On 04/20/2017 02:07 PM, Omar Sandoval wrote:
> > On Fri, Apr 07, 2017 at 06:24:03AM -0600, sbates@raithlin.com wrote:
> >> From: Stephen Bates <sbates@raithlin.com>
> >>
> >> Rather than bucketing IO statistics based on direction only, we also
> >> bucket based on the IO size. This leads to improved polling
> >> performance. Update the bucket callback function and use it in the
> >> polling latency estimation.
> >>
> >> Signed-off-by: Stephen Bates <sbates@raithlin.com>
> > 
> > Hey, Stephen, just taking a look at this now. Comments below.
> > 
> >> ---
> >>  block/blk-mq.c | 45 +++++++++++++++++++++++++++++++++++----------
> >>  1 file changed, 35 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/block/blk-mq.c b/block/blk-mq.c
> >> index 061fc2c..5fd376b 100644
> >> --- a/block/blk-mq.c
> >> +++ b/block/blk-mq.c
> >> @@ -42,6 +42,25 @@ static LIST_HEAD(all_q_list);
> >>  static void blk_mq_poll_stats_start(struct request_queue *q);
> >>  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
> >>  
> >> +/* Must be consistent with function below */
> >> +#define BLK_MQ_POLL_STATS_BKTS 16
> >> +static int blk_mq_poll_stats_bkt(const struct request *rq)
> >> +{
> >> +	int ddir, bytes, bucket;
> >> +
> >> +	ddir = blk_stat_rq_ddir(rq);
> > 
> > No need to call the wrapper function here, we can use rq_data_dir()
> > directly.
> > 
> >> +	bytes = blk_rq_bytes(rq);
> >> +
> >> +	bucket = ddir + 2*(ilog2(bytes) - 9);
> >> +
> >> +	if (bucket < 0)
> >> +		return -1;
> >> +	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
> >> +		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
> >> +
> >> +	return bucket;
> >> +}
> > 
> > Nitpicking here, but defining things in terms of the number of size
> > buckets seems more natural to me. How about something like this
> > (untested)? Note that this obviates the need for patch 1.
> > 
> > #define BLK_MQ_POLL_STATS_SIZE_BKTS 8
> > static unsigned int blk_mq_poll_stats_bkt(const struct request *rq)
> > {
> > 	unsigned int size_bucket;
> > 
> > 	size_bucket = clamp(ilog2(blk_rq_bytes(rq)) - 9, 0,
> > 			    BLK_MQ_POLL_STATS_SIZE_BKTS - 1);
> > 	return 2 * size_bucket + rq_data_dir(rq);
> > }
> 
> As I wrote in an earlier reply, it would be a lot cleaner to just have
> the buckets be:
> 
> 	buckets[2][BUCKETS_PER_RW];
> 
> and not have to do weird math based on both size and data direction.
> Just have it return the bucket index based on size, and have the caller
> do:
> 
> 	bucket[rq_data_dir(rq)][bucket_index];

This removes a lot of the flexibility of the interface. Kyber, for one,
has this stats callback:

static unsigned int rq_sched_domain(const struct request *rq)
{
	unsigned int op = rq->cmd_flags;

	if ((op & REQ_OP_MASK) == REQ_OP_READ)
		return KYBER_READ;
	else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
		return KYBER_SYNC_WRITE;
	else
		return KYBER_OTHER;
}

The buckets aren't subdivisions of read vs. write. We could shoehorn it
in your way if we really wanted to, but that's pointless.

Jens Axboe April 20, 2017, 8:22 p.m. UTC | #4
On 04/20/2017 02:20 PM, Omar Sandoval wrote:
> On Thu, Apr 20, 2017 at 02:16:04PM -0600, Jens Axboe wrote:
>> On 04/20/2017 02:07 PM, Omar Sandoval wrote:
>>> On Fri, Apr 07, 2017 at 06:24:03AM -0600, sbates@raithlin.com wrote:
>>>> From: Stephen Bates <sbates@raithlin.com>
>>>>
>>>> Rather than bucketing IO statistics based on direction only, we also
>>>> bucket based on the IO size. This leads to improved polling
>>>> performance. Update the bucket callback function and use it in the
>>>> polling latency estimation.
>>>>
>>>> Signed-off-by: Stephen Bates <sbates@raithlin.com>
>>>
>>> Hey, Stephen, just taking a look at this now. Comments below.
>>>
>>>> ---
>>>>  block/blk-mq.c | 45 +++++++++++++++++++++++++++++++++++----------
>>>>  1 file changed, 35 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>> index 061fc2c..5fd376b 100644
>>>> --- a/block/blk-mq.c
>>>> +++ b/block/blk-mq.c
>>>> @@ -42,6 +42,25 @@ static LIST_HEAD(all_q_list);
>>>>  static void blk_mq_poll_stats_start(struct request_queue *q);
>>>>  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
>>>>  
>>>> +/* Must be consistent with function below */
>>>> +#define BLK_MQ_POLL_STATS_BKTS 16
>>>> +static int blk_mq_poll_stats_bkt(const struct request *rq)
>>>> +{
>>>> +	int ddir, bytes, bucket;
>>>> +
>>>> +	ddir = blk_stat_rq_ddir(rq);
>>>
>>> No need to call the wrapper function here, we can use rq_data_dir()
>>> directly.
>>>
>>>> +	bytes = blk_rq_bytes(rq);
>>>> +
>>>> +	bucket = ddir + 2*(ilog2(bytes) - 9);
>>>> +
>>>> +	if (bucket < 0)
>>>> +		return -1;
>>>> +	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
>>>> +		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
>>>> +
>>>> +	return bucket;
>>>> +}
>>>
>>> Nitpicking here, but defining things in terms of the number of size
>>> buckets seems more natural to me. How about something like this
>>> (untested)? Note that this obviates the need for patch 1.
>>>
>>> #define BLK_MQ_POLL_STATS_SIZE_BKTS 8
>>> static unsigned int blk_mq_poll_stats_bkt(const struct request *rq)
>>> {
>>> 	unsigned int size_bucket;
>>>
>>> 	size_bucket = clamp(ilog2(blk_rq_bytes(rq)) - 9, 0,
>>> 			    BLK_MQ_POLL_STATS_SIZE_BKTS - 1);
>>> 	return 2 * size_bucket + rq_data_dir(rq);
>>> }
>>
>> As I wrote in an earlier reply, it would be a lot cleaner to just have
>> the buckets be:
>>
>> 	buckets[2][BUCKETS_PER_RW];
>>
>> and not have to do weird math based on both size and data direction.
>> Just have it return the bucket index based on size, and have the caller
>> do:
>>
>> 	bucket[rq_data_dir(rq)][bucket_index];
> 
> This removes a lot of the flexibility of the interface. Kyber, for one,
> has this stats callback:
> 
> static unsigned int rq_sched_domain(const struct request *rq)
> {
> 	unsigned int op = rq->cmd_flags;
> 
> 	if ((op & REQ_OP_MASK) == REQ_OP_READ)
> 		return KYBER_READ;
> 	else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
> 		return KYBER_SYNC_WRITE;
> 	else
> 		return KYBER_OTHER;
> }

Good point, I guess other users could have different methods of bucketization.
  
> The buckets aren't subdivisions of read vs. write. We could shoehorn it
> in your way if we really wanted to, but that's pointless.

Nah, let's just leave it as-is then, even though I don't think it's the
prettiest thing I've ever seen.

Stephen Bates April 20, 2017, 8:33 p.m. UTC | #5
> Nah, let's just leave it as-is then, even though I don't think it's the
> prettiest thing I've ever seen.

I did look at making the stats buckets in the request_queue struct based on dir and size. Something like:

-       struct blk_rq_stat      poll_stat[2];
+       struct blk_rq_stat      poll_stat[2][BLK_MQ_POLL_STATS_BKTS/2];

This actually did clean things up in some places, but because the callback still uses a linear array of buckets we end up with this:

-       if (cb->stat[READ].nr_samples)
-               q->poll_stat[READ] = cb->stat[READ];
-       if (cb->stat[WRITE].nr_samples)
-               q->poll_stat[WRITE] = cb->stat[WRITE];
+       for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
+               if (cb->stat[bucket].nr_samples)
+                       q->poll_stat[bucket%2][bucket/2] = cb->stat[bucket];

I tend to agree with Omar that keeping the buckets in a linear array is overall cleaner and more generalized.
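
For completeness, the lookup side in blk_mq_poll_nsecs() under that 2D
layout would have looked roughly like this (illustrative only):

	bucket = blk_mq_poll_stats_bkt(rq);
	if (bucket < 0)
		return ret;

	if (q->poll_stat[bucket % 2][bucket / 2].nr_samples)
		ret = (q->poll_stat[bucket % 2][bucket / 2].mean + 1) / 2;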

However, right now I am stuck, as I am seeing the kernel oops I reported before while testing my latest patchset [1]. I will try to find some time to bisect it, but it looks like it was introduced when support for mq schedulers was added (on or around bd166ef18).

Stephen

[1] http://marc.info/?l=linux-block&m=149156785215919&w=2

Jens Axboe April 20, 2017, 8:34 p.m. UTC | #6
On 04/20/2017 02:33 PM, Stephen Bates wrote:
> 
>> Nah, let's just leave it as-is then, even though I don't think it's the
>> prettiest thing I've ever seen.
> 
> I did look at making the stats buckets in the request_queue struct
> based on dir and size. Something like:
> 
> -       struct blk_rq_stat      poll_stat[2];
> +       struct blk_rq_stat      poll_stat[2][BLK_MQ_POLL_STATS_BKTS/2];
> 
> This actually did clean things up in some places, but because the
> callback still uses a linear array of buckets we end up with this:
> 
> -       if (cb->stat[READ].nr_samples)
> -               q->poll_stat[READ] = cb->stat[READ];
> -       if (cb->stat[WRITE].nr_samples)
> -               q->poll_stat[WRITE] = cb->stat[WRITE];
> +       for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
> +               if (cb->stat[bucket].nr_samples)
> +                       q->poll_stat[bucket%2][bucket/2] = cb->stat[bucket];
> 
> I tend to agree with Omar that keeping the buckets in a linear array
> is overall cleaner and more generalized.

I agree, it's fine as-is. We should queue it up for 4.12.

> However, right now I am stuck, as I am seeing the kernel oops I reported
> before while testing my latest patchset [1]. I will try to find some
> time to bisect it, but it looks like it was introduced when support
> for mq schedulers was added (on or around bd166ef18).

Just replied to that one, let me know if the suggestion works.

Stephen Bates April 20, 2017, 8:47 p.m. UTC | #7
> I agree, it's fine as-is. We should queue it up for 4.12.

Great. I will get something based on Omar’s latest comments asap.

>> However, right now I am stuck, as I am seeing the kernel oops I reported
>> before while testing my latest patchset [1]. I will try to find some
>> time to bisect it, but it looks like it was introduced when support
>> for mq schedulers was added (on or around bd166ef18).
>
> Just replied to that one, let me know if the suggestion works.

That suggestion worked. Do you want me to send a patch for it?

Stephen

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 061fc2c..5fd376b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -42,6 +42,25 @@  static LIST_HEAD(all_q_list);
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
+/* Must be consistent with function below */
+#define BLK_MQ_POLL_STATS_BKTS 16
+static int blk_mq_poll_stats_bkt(const struct request *rq)
+{
+	int ddir, bytes, bucket;
+
+	ddir = blk_stat_rq_ddir(rq);
+	bytes = blk_rq_bytes(rq);
+
+	bucket = ddir + 2*(ilog2(bytes) - 9);
+
+	if (bucket < 0)
+		return -1;
+	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
+		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
+
+	return bucket;
+}
+
 /*
  * Check if any of the ctx's have pending work in this hardware queue
  */
@@ -2245,7 +2264,8 @@  struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	q->mq_ops = set->ops;
 
 	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
-					     blk_stat_rq_ddir, 2, q);
+					     blk_mq_poll_stats_bkt,
+					     BLK_MQ_POLL_STATS_BKTS, q);
 	if (!q->poll_cb)
 		goto err_exit;
 
@@ -2663,11 +2683,12 @@  static void blk_mq_poll_stats_start(struct request_queue *q)
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
 {
 	struct request_queue *q = cb->data;
+	int bucket;
 
-	if (cb->stat[READ].nr_samples)
-		q->poll_stat[READ] = cb->stat[READ];
-	if (cb->stat[WRITE].nr_samples)
-		q->poll_stat[WRITE] = cb->stat[WRITE];
+	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
+		if (cb->stat[bucket].nr_samples)
+			q->poll_stat[bucket] = cb->stat[bucket];
+	}
 }
 
 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
@@ -2675,6 +2696,7 @@  static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
 				       struct request *rq)
 {
 	unsigned long ret = 0;
+	int bucket;
 
 	/*
 	 * If stats collection isn't on, don't sleep but turn it on for
@@ -2689,12 +2711,15 @@  static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
 	 * For instance, if the completion latencies are tight, we can
 	 * get closer than just half the mean. This is especially
 	 * important on devices where the completion latencies are longer
-	 * than ~10 usec.
+	 * than ~10 usec. We use the stats for the relevant IO size,
+	 * if available, which leads to better estimates.
 	 */
-	if (req_op(rq) == REQ_OP_READ && q->poll_stat[READ].nr_samples)
-		ret = (q->poll_stat[READ].mean + 1) / 2;
-	else if (req_op(rq) == REQ_OP_WRITE && q->poll_stat[WRITE].nr_samples)
-		ret = (q->poll_stat[WRITE].mean + 1) / 2;
+	bucket = blk_mq_poll_stats_bkt(rq);
+	if (bucket < 0)
+		return ret;
+
+	if (q->poll_stat[bucket].nr_samples)
+		ret = (q->poll_stat[bucket].mean + 1) / 2;
 
 	return ret;
 }
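
For reference, the bucket mapping produced by blk_mq_poll_stats_bkt() above
works out as follows (hand-computed from the function, not measured):

	<  512 B             -> -1 (no bucket; blk_mq_poll_nsecs() returns 0)
	   512 B  read/write ->  0 / 1
	   4 KiB  read/write ->  6 / 7
	>= 64 KiB read/write -> 14 / 15 (the top two buckets absorb larger IOs)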