Message ID | 1491567843-26190-3-git-send-email-sbates@raithlin.com (mailing list archive)
---|---
State | New, archived
On Fri, Apr 07, 2017 at 06:24:03AM -0600, sbates@raithlin.com wrote:
> From: Stephen Bates <sbates@raithlin.com>
>
> Rather than bucketing IO statistics based on direction only we also
> bucket based on the IO size. This leads to improved polling
> performance. Update the bucket callback function and use it in the
> polling latency estimation.
>
> Signed-off-by: Stephen Bates <sbates@raithlin.com>

Hey, Stephen, just taking a look at this now. Comments below.

> ---
>  block/blk-mq.c | 45 +++++++++++++++++++++++++++++++++++----------
>  1 file changed, 35 insertions(+), 10 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 061fc2c..5fd376b 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -42,6 +42,25 @@ static LIST_HEAD(all_q_list);
>  static void blk_mq_poll_stats_start(struct request_queue *q);
>  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
>
> +/* Must be consistent with function below */
> +#define BLK_MQ_POLL_STATS_BKTS 16
> +static int blk_mq_poll_stats_bkt(const struct request *rq)
> +{
> +        int ddir, bytes, bucket;
> +
> +        ddir = blk_stat_rq_ddir(rq);

No need to call the wrapper function here, we can use rq_data_dir()
directly.

> +        bytes = blk_rq_bytes(rq);
> +
> +        bucket = ddir + 2*(ilog2(bytes) - 9);
> +
> +        if (bucket < 0)
> +                return -1;
> +        else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
> +                return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
> +
> +        return bucket;
> +}

Nitpicking here, but defining things in terms of the number of size
buckets seems more natural to me. How about something like this
(untested)? Note that this obviates the need for patch 1.

#define BLK_MQ_POLL_STATS_SIZE_BKTS 8
static unsigned int blk_mq_poll_stats_bkt(const struct request *rq)
{
        unsigned int size_bucket;

        size_bucket = clamp(ilog2(blk_rq_bytes(rq)) - 9, 0,
                            BLK_MQ_POLL_STATS_SIZE_BKTS - 1);
        return 2 * size_bucket + rq_data_dir(rq);
}

> /*
>  * Check if any of the ctx's have pending work in this hardware queue
>  */
> @@ -2245,7 +2264,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>         q->mq_ops = set->ops;
>
>         q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
> -                                            blk_stat_rq_ddir, 2, q);
> +                                            blk_mq_poll_stats_bkt,
> +                                            BLK_MQ_POLL_STATS_BKTS, q);

With the above change, this would become 2 * BLK_MQ_POLL_STATS_SIZE_BKTS.
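For readers following the arithmetic, here is a small standalone userspace sketch (not kernel code) of the size/direction mapping being discussed. The kernel's ilog2(), clamp() and rq_data_dir() are stood in for by local helpers, and the request is reduced to a (bytes, direction) pair, so all names here are illustrative. It prints which bucket a few common request sizes would land in under the clamp-based variant proposed above.

#include <stdio.h>

#define BLK_MQ_POLL_STATS_SIZE_BKTS 8

/* Integer log2 for non-zero values, mirroring the kernel's ilog2(). */
static int ilog2_u32(unsigned int v)
{
        int l = -1;

        while (v) {
                v >>= 1;
                l++;
        }
        return l;
}

/* Local stand-in for the kernel's clamp(). */
static int clamp_int(int val, int lo, int hi)
{
        return val < lo ? lo : (val > hi ? hi : val);
}

/* dir is 0 for read, 1 for write, standing in for rq_data_dir(rq). */
static unsigned int poll_stats_bkt(unsigned int bytes, int dir)
{
        int size_bucket = clamp_int(ilog2_u32(bytes) - 9, 0,
                                    BLK_MQ_POLL_STATS_SIZE_BKTS - 1);

        return 2 * size_bucket + dir;
}

int main(void)
{
        unsigned int sizes[] = { 512, 4096, 65536, 1048576 };

        for (int i = 0; i < 4; i++)
                printf("%7u bytes: read -> bucket %2u, write -> bucket %2u\n",
                       sizes[i], poll_stats_bkt(sizes[i], 0),
                       poll_stats_bkt(sizes[i], 1));
        return 0;
}

With eight size buckets starting at 512 bytes, everything of 64 KiB and above is clamped into the last read/write pair, and anything below 512 bytes shares the first pair.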
On 04/20/2017 02:07 PM, Omar Sandoval wrote:
> On Fri, Apr 07, 2017 at 06:24:03AM -0600, sbates@raithlin.com wrote:
>> From: Stephen Bates <sbates@raithlin.com>
>>
>> Rather than bucketing IO statistics based on direction only we also
>> bucket based on the IO size. This leads to improved polling
>> performance. Update the bucket callback function and use it in the
>> polling latency estimation.
>>
>> Signed-off-by: Stephen Bates <sbates@raithlin.com>
>
> Hey, Stephen, just taking a look at this now. Comments below.
>
>> ---
>>  block/blk-mq.c | 45 +++++++++++++++++++++++++++++++++++----------
>>  1 file changed, 35 insertions(+), 10 deletions(-)
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 061fc2c..5fd376b 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -42,6 +42,25 @@ static LIST_HEAD(all_q_list);
>>  static void blk_mq_poll_stats_start(struct request_queue *q);
>>  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
>>
>> +/* Must be consistent with function below */
>> +#define BLK_MQ_POLL_STATS_BKTS 16
>> +static int blk_mq_poll_stats_bkt(const struct request *rq)
>> +{
>> +        int ddir, bytes, bucket;
>> +
>> +        ddir = blk_stat_rq_ddir(rq);
>
> No need to call the wrapper function here, we can use rq_data_dir()
> directly.
>
>> +        bytes = blk_rq_bytes(rq);
>> +
>> +        bucket = ddir + 2*(ilog2(bytes) - 9);
>> +
>> +        if (bucket < 0)
>> +                return -1;
>> +        else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
>> +                return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
>> +
>> +        return bucket;
>> +}
>
> Nitpicking here, but defining things in terms of the number of size
> buckets seems more natural to me. How about something like this
> (untested)? Note that this obviates the need for patch 1.
>
> #define BLK_MQ_POLL_STATS_SIZE_BKTS 8
> static unsigned int blk_mq_poll_stats_bkt(const struct request *rq)
> {
>         unsigned int size_bucket;
>
>         size_bucket = clamp(ilog2(blk_rq_bytes(rq)) - 9, 0,
>                             BLK_MQ_POLL_STATS_SIZE_BKTS - 1);
>         return 2 * size_bucket + rq_data_dir(rq);
> }

As I wrote in an earlier reply, it would be a lot cleaner to just have
the buckets be:

        buckets[2][BUCKETS_PER_RW];

and not have to do weird math based on both size and data direction.
Just have it return the bucket index based on size, and have the caller
do:

        bucket[rq_data_dir(rq)][bucket_index];
On Thu, Apr 20, 2017 at 02:16:04PM -0600, Jens Axboe wrote:
> On 04/20/2017 02:07 PM, Omar Sandoval wrote:
> > On Fri, Apr 07, 2017 at 06:24:03AM -0600, sbates@raithlin.com wrote:
> >> From: Stephen Bates <sbates@raithlin.com>
> >>
> >> Rather than bucketing IO statistics based on direction only we also
> >> bucket based on the IO size. This leads to improved polling
> >> performance. Update the bucket callback function and use it in the
> >> polling latency estimation.
> >>
> >> Signed-off-by: Stephen Bates <sbates@raithlin.com>
> >
> > Hey, Stephen, just taking a look at this now. Comments below.
> >
> >> ---
> >>  block/blk-mq.c | 45 +++++++++++++++++++++++++++++++++++----------
> >>  1 file changed, 35 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/block/blk-mq.c b/block/blk-mq.c
> >> index 061fc2c..5fd376b 100644
> >> --- a/block/blk-mq.c
> >> +++ b/block/blk-mq.c
> >> @@ -42,6 +42,25 @@ static LIST_HEAD(all_q_list);
> >>  static void blk_mq_poll_stats_start(struct request_queue *q);
> >>  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
> >>
> >> +/* Must be consistent with function below */
> >> +#define BLK_MQ_POLL_STATS_BKTS 16
> >> +static int blk_mq_poll_stats_bkt(const struct request *rq)
> >> +{
> >> +        int ddir, bytes, bucket;
> >> +
> >> +        ddir = blk_stat_rq_ddir(rq);
> >
> > No need to call the wrapper function here, we can use rq_data_dir()
> > directly.
> >
> >> +        bytes = blk_rq_bytes(rq);
> >> +
> >> +        bucket = ddir + 2*(ilog2(bytes) - 9);
> >> +
> >> +        if (bucket < 0)
> >> +                return -1;
> >> +        else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
> >> +                return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
> >> +
> >> +        return bucket;
> >> +}
> >
> > Nitpicking here, but defining things in terms of the number of size
> > buckets seems more natural to me. How about something like this
> > (untested)? Note that this obviates the need for patch 1.
> >
> > #define BLK_MQ_POLL_STATS_SIZE_BKTS 8
> > static unsigned int blk_mq_poll_stats_bkt(const struct request *rq)
> > {
> >         unsigned int size_bucket;
> >
> >         size_bucket = clamp(ilog2(blk_rq_bytes(rq)) - 9, 0,
> >                             BLK_MQ_POLL_STATS_SIZE_BKTS - 1);
> >         return 2 * size_bucket + rq_data_dir(rq);
> > }
>
> As I wrote in an earlier reply, it would be a lot cleaner to just have
> the buckets be:
>
>         buckets[2][BUCKETS_PER_RW];
>
> and not have to do weird math based on both size and data direction.
> Just have it return the bucket index based on size, and have the caller
> do:
>
>         bucket[rq_data_dir(rq)][bucket_index];

This removes a lot of the flexibility of the interface. Kyber, for one,
has this stats callback:

static unsigned int rq_sched_domain(const struct request *rq)
{
        unsigned int op = rq->cmd_flags;

        if ((op & REQ_OP_MASK) == REQ_OP_READ)
                return KYBER_READ;
        else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
                return KYBER_SYNC_WRITE;
        else
                return KYBER_OTHER;
}

The buckets aren't subdivisions of read vs. write. We could shoehorn it
in your way if we really wanted to, but that's pointless.
On 04/20/2017 02:20 PM, Omar Sandoval wrote:
> On Thu, Apr 20, 2017 at 02:16:04PM -0600, Jens Axboe wrote:
>> On 04/20/2017 02:07 PM, Omar Sandoval wrote:
>>> On Fri, Apr 07, 2017 at 06:24:03AM -0600, sbates@raithlin.com wrote:
>>>> From: Stephen Bates <sbates@raithlin.com>
>>>>
>>>> Rather than bucketing IO statistics based on direction only we also
>>>> bucket based on the IO size. This leads to improved polling
>>>> performance. Update the bucket callback function and use it in the
>>>> polling latency estimation.
>>>>
>>>> Signed-off-by: Stephen Bates <sbates@raithlin.com>
>>>
>>> Hey, Stephen, just taking a look at this now. Comments below.
>>>
>>>> ---
>>>>  block/blk-mq.c | 45 +++++++++++++++++++++++++++++++++++----------
>>>>  1 file changed, 35 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>> index 061fc2c..5fd376b 100644
>>>> --- a/block/blk-mq.c
>>>> +++ b/block/blk-mq.c
>>>> @@ -42,6 +42,25 @@ static LIST_HEAD(all_q_list);
>>>>  static void blk_mq_poll_stats_start(struct request_queue *q);
>>>>  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
>>>>
>>>> +/* Must be consistent with function below */
>>>> +#define BLK_MQ_POLL_STATS_BKTS 16
>>>> +static int blk_mq_poll_stats_bkt(const struct request *rq)
>>>> +{
>>>> +        int ddir, bytes, bucket;
>>>> +
>>>> +        ddir = blk_stat_rq_ddir(rq);
>>>
>>> No need to call the wrapper function here, we can use rq_data_dir()
>>> directly.
>>>
>>>> +        bytes = blk_rq_bytes(rq);
>>>> +
>>>> +        bucket = ddir + 2*(ilog2(bytes) - 9);
>>>> +
>>>> +        if (bucket < 0)
>>>> +                return -1;
>>>> +        else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
>>>> +                return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
>>>> +
>>>> +        return bucket;
>>>> +}
>>>
>>> Nitpicking here, but defining things in terms of the number of size
>>> buckets seems more natural to me. How about something like this
>>> (untested)? Note that this obviates the need for patch 1.
>>>
>>> #define BLK_MQ_POLL_STATS_SIZE_BKTS 8
>>> static unsigned int blk_mq_poll_stats_bkt(const struct request *rq)
>>> {
>>>         unsigned int size_bucket;
>>>
>>>         size_bucket = clamp(ilog2(blk_rq_bytes(rq)) - 9, 0,
>>>                             BLK_MQ_POLL_STATS_SIZE_BKTS - 1);
>>>         return 2 * size_bucket + rq_data_dir(rq);
>>> }
>>
>> As I wrote in an earlier reply, it would be a lot cleaner to just have
>> the buckets be:
>>
>>         buckets[2][BUCKETS_PER_RW];
>>
>> and not have to do weird math based on both size and data direction.
>> Just have it return the bucket index based on size, and have the caller
>> do:
>>
>>         bucket[rq_data_dir(rq)][bucket_index];
>
> This removes a lot of the flexibility of the interface. Kyber, for one,
> has this stats callback:
>
> static unsigned int rq_sched_domain(const struct request *rq)
> {
>         unsigned int op = rq->cmd_flags;
>
>         if ((op & REQ_OP_MASK) == REQ_OP_READ)
>                 return KYBER_READ;
>         else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
>                 return KYBER_SYNC_WRITE;
>         else
>                 return KYBER_OTHER;
> }

Good point, I guess other users could have different methods of
bucketization.

> The buckets aren't subdivisions of read vs. write. We could shoehorn it
> in your way if we really wanted to, but that's pointless.

Nah, let's just leave it as-is then, even though I don't think it's the
prettiest thing I've ever seen.
> Nah, let's just leave it as-is then, even though I don't think it's the
> prettiest thing I've ever seen.

I did look at making the stats buckets in the request_queue struct
based on dir and size. Something like:

-        struct blk_rq_stat        poll_stat[2];
+        struct blk_rq_stat        poll_stat[2][BLK_MQ_POLL_STATS_BKTS/2];

This actually did clean things up in some places, but because the
callback still uses a linear array of buckets we do get this:

-        if (cb->stat[READ].nr_samples)
-                q->poll_stat[READ] = cb->stat[READ];
-        if (cb->stat[WRITE].nr_samples)
-                q->poll_stat[WRITE] = cb->stat[WRITE];
+        for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
+                if (cb->stat[bucket].nr_samples)
+                        q->poll_stat[bucket%2][bucket/2] = cb->stat[bucket];

I tend to agree with Omar that keeping the buckets in a linear array
is overall cleaner and more generalized.

However right now I am stuck as I am seeing the kernel oops I reported
before in testing of my latest patchset [1]. I will try and find some
time to bisect that but it looks like it was introduced when the
support for mq schedulers was added (on or around bd166ef18).

Stephen

[1] http://marc.info/?l=linux-block&m=149156785215919&w=2
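As a quick sanity check on the index math in the snippet above (a throwaway userspace sketch, not kernel code), the linear layout bucket = dir + 2 * size_index is exactly inverted by the bucket % 2 and bucket / 2 split used in the loop:

#include <assert.h>
#include <stdio.h>

#define BLK_MQ_POLL_STATS_BKTS 16

int main(void)
{
        /*
         * The linear layout packs buckets as dir + 2 * size_index, so the
         * direction and size index can be recovered with % 2 and / 2.
         */
        for (int bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
                int dir = bucket % 2;           /* 0 = read, 1 = write */
                int size_idx = bucket / 2;

                assert(dir + 2 * size_idx == bucket);
                printf("linear bucket %2d -> poll_stat[%d][%d]\n",
                       bucket, dir, size_idx);
        }
        return 0;
}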
On 04/20/2017 02:33 PM, Stephen Bates wrote:
>
>> Nah, let's just leave it as-is then, even though I don't think it's the
>> prettiest thing I've ever seen.
>
> I did look at making the stats buckets in the request_queue struct
> based on dir and size. Something like:
>
> -        struct blk_rq_stat        poll_stat[2];
> +        struct blk_rq_stat        poll_stat[2][BLK_MQ_POLL_STATS_BKTS/2];
>
> This actually did clean things up in some places, but because the
> callback still uses a linear array of buckets we do get this:
>
> -        if (cb->stat[READ].nr_samples)
> -                q->poll_stat[READ] = cb->stat[READ];
> -        if (cb->stat[WRITE].nr_samples)
> -                q->poll_stat[WRITE] = cb->stat[WRITE];
> +        for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
> +                if (cb->stat[bucket].nr_samples)
> +                        q->poll_stat[bucket%2][bucket/2] = cb->stat[bucket];
>
> I tend to agree with Omar that keeping the buckets in a linear array
> is overall cleaner and more generalized.

I agree, it's fine as-is. We should queue it up for 4.12.

> However right now I am stuck as I am seeing the kernel oops I reported
> before in testing of my latest patchset [1]. I will try and find some
> time to bisect that but it looks like it was introduced when the
> support for mq schedulers was added (on or around bd166ef18).

Just replied to that one, let me know if the suggestion works.
> I agree, it's fine as-is. We should queue it up for 4.12.

Great. I will get something based on Omar’s latest comments asap.

>> However right now I am stuck as I am seeing the kernel oops I reported
>> before in testing of my latest patchset [1]. I will try and find some
>> time to bisect that but it looks like it was introduced when the
>> support for mq schedulers was added (on or around bd166ef18).
>
> Just replied to that one, let me know if the suggestion works.

That suggestion worked. Do you want me to send a patch for it?

Stephen
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 061fc2c..5fd376b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -42,6 +42,25 @@ static LIST_HEAD(all_q_list);
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
+/* Must be consistent with function below */
+#define BLK_MQ_POLL_STATS_BKTS 16
+static int blk_mq_poll_stats_bkt(const struct request *rq)
+{
+        int ddir, bytes, bucket;
+
+        ddir = blk_stat_rq_ddir(rq);
+        bytes = blk_rq_bytes(rq);
+
+        bucket = ddir + 2*(ilog2(bytes) - 9);
+
+        if (bucket < 0)
+                return -1;
+        else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
+                return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
+
+        return bucket;
+}
+
 /*
  * Check if any of the ctx's have pending work in this hardware queue
  */
@@ -2245,7 +2264,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
         q->mq_ops = set->ops;
 
         q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
-                                             blk_stat_rq_ddir, 2, q);
+                                             blk_mq_poll_stats_bkt,
+                                             BLK_MQ_POLL_STATS_BKTS, q);
         if (!q->poll_cb)
                 goto err_exit;
 
@@ -2663,11 +2683,12 @@ static void blk_mq_poll_stats_start(struct request_queue *q)
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
 {
         struct request_queue *q = cb->data;
+        int bucket;
 
-        if (cb->stat[READ].nr_samples)
-                q->poll_stat[READ] = cb->stat[READ];
-        if (cb->stat[WRITE].nr_samples)
-                q->poll_stat[WRITE] = cb->stat[WRITE];
+        for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
+                if (cb->stat[bucket].nr_samples)
+                        q->poll_stat[bucket] = cb->stat[bucket];
+        }
 }
 
 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
@@ -2675,6 +2696,7 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
                                        struct request *rq)
 {
         unsigned long ret = 0;
+        int bucket;
 
         /*
          * If stats collection isn't on, don't sleep but turn it on for
@@ -2689,12 +2711,15 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
          * For instance, if the completion latencies are tight, we can
          * get closer than just half the mean. This is especially
          * important on devices where the completion latencies are longer
-         * than ~10 usec.
+         * than ~10 usec. We do use the stats for the relevant IO size
+         * if available which does lead to better estimates.
          */
-        if (req_op(rq) == REQ_OP_READ && q->poll_stat[READ].nr_samples)
-                ret = (q->poll_stat[READ].mean + 1) / 2;
-        else if (req_op(rq) == REQ_OP_WRITE && q->poll_stat[WRITE].nr_samples)
-                ret = (q->poll_stat[WRITE].mean + 1) / 2;
+        bucket = blk_mq_poll_stats_bkt(rq);
+        if (bucket < 0)
+                return ret;
+
+        if (q->poll_stat[bucket].nr_samples)
+                ret = (q->poll_stat[bucket].mean + 1) / 2;
 
         return ret;
 }
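To make the estimation step concrete, here is a hedged userspace sketch (not the kernel implementation; the simplified stat struct and helper names are made up for illustration) of the idea behind the patch: completion-latency means are kept per (direction, size) bucket, and blk_mq_poll_nsecs()-style code suggests sleeping for half the mean of the matching bucket, or not at all when that bucket has no samples yet.

#include <stdio.h>

#define BLK_MQ_POLL_STATS_BKTS 16

/* Simplified stand-in for struct blk_rq_stat: just a mean and a count. */
struct rq_stat {
        unsigned long long mean;        /* mean completion latency, in ns */
        unsigned int nr_samples;
};

/* Integer log2 for non-zero values, mirroring the kernel's ilog2(). */
static int ilog2_u32(unsigned int v)
{
        int l = -1;

        while (v) {
                v >>= 1;
                l++;
        }
        return l;
}

/* Same bucket math as the patch: dir is 0 for read, 1 for write. */
static int poll_stats_bkt(int dir, unsigned int bytes)
{
        int bucket = dir + 2 * (ilog2_u32(bytes) - 9);

        if (bucket < 0)
                return -1;
        if (bucket >= BLK_MQ_POLL_STATS_BKTS)
                return dir + BLK_MQ_POLL_STATS_BKTS - 2;
        return bucket;
}

/* Half the bucket's mean latency, or 0 when there is no data for it yet. */
static unsigned long long poll_nsecs(const struct rq_stat *stat,
                                     int dir, unsigned int bytes)
{
        int bucket = poll_stats_bkt(dir, bytes);

        if (bucket < 0 || !stat[bucket].nr_samples)
                return 0;
        return (stat[bucket].mean + 1) / 2;
}

int main(void)
{
        struct rq_stat stat[BLK_MQ_POLL_STATS_BKTS] = { 0 };

        /* Pretend we have samples for 4 KiB reads: bucket 0 + 2 * (12 - 9) = 6. */
        stat[6].mean = 80000;   /* 80 us mean completion latency */
        stat[6].nr_samples = 128;

        printf("4 KiB read : sleep %llu ns\n", poll_nsecs(stat, 0, 4096));
        printf("4 KiB write: sleep %llu ns\n", poll_nsecs(stat, 1, 4096));
        return 0;
}

Note how the 4 KiB write gets no estimate even though 4 KiB reads have plenty of samples; that separation is what the per-direction, per-size buckets add over the previous read/write-only scheme.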