| Message ID | dd30f4d94aa19956ad4500b1177741fd071ec37f.1558791181.git.asml.silence@gmail.com (mailing list archive) |
| --- | --- |
| State | New, archived |
| Series | [1/1] blk-mq: Fix disabled hybrid polling |
On Sat, May 25, 2019 at 04:42:11PM +0300, Pavel Begunkov (Silence) wrote:
> From: Pavel Begunkov <asml.silence@gmail.com>
>
> Commit 4bc6339a583cec650b05 ("block: move blk_stat_add() to
> __blk_mq_end_request()") moved blk_stat_add(), so now it's called after
> blk_update_request(), which zeroes rq->__data_len. Without length,
> blk_stat_add() can't calculate stat bucket and returns error,
> effectively disabling hybrid polling.

I don't see how this patch fixes this problem, am I missing something?
The timing issue seems orthogonal.

> __blk_mq_end_request() is a right place to call blk_stat_add(), as it's
> guaranteed to be called for each request. Yet, calculating time there
> won't provide sufficient accuracy/precision for finer tuned hybrid
> polling, because a path from __blk_mq_complete_request() to
> __blk_mq_end_request() adds unpredictable overhead.
>
> Add io_end_time_ns field in struct request, save time as soon as
> possible (at __blk_mq_complete_request()) and reuse later.
>
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>  block/blk-mq.c         | 13 ++++++++++---
>  block/blk-stat.c       |  4 ++--
>  block/blk-stat.h       |  2 +-
>  include/linux/blkdev.h | 11 +++++++++++
>  4 files changed, 24 insertions(+), 6 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 32b8ad3d341b..8f6b6bfe0ccb 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -330,6 +330,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
>  	else
>  		rq->start_time_ns = 0;
>  	rq->io_start_time_ns = 0;
> +	rq->io_end_time_ns = 0;
>  	rq->nr_phys_segments = 0;
>  #if defined(CONFIG_BLK_DEV_INTEGRITY)
>  	rq->nr_integrity_segments = 0;
> @@ -532,14 +533,17 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request);
>
>  inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
>  {
> -	u64 now = 0;
> +	u64 now = rq->io_end_time_ns;

Kyber expects the timestamp passed in to kyber_complete_request() to
include the software overhead. iostat should probably include the
software overhead, too. So, we probably won't be able to avoid calling
ktime_get() twice, once for I/O time and once for the end-to-end time.

> -	if (blk_mq_need_time_stamp(rq))
> +	/* called directly bypassing __blk_mq_complete_request */
> +	if (blk_mq_need_time_stamp(rq) && !now) {
>  		now = ktime_get_ns();
> +		rq->io_end_time_ns = now;
> +	}
>
>  	if (rq->rq_flags & RQF_STATS) {
>  		blk_mq_poll_stats_start(rq->q);
> -		blk_stat_add(rq, now);
> +		blk_stat_add(rq);
>  	}
>
>  	if (rq->internal_tag != -1)
> @@ -579,6 +583,9 @@ static void __blk_mq_complete_request(struct request *rq)
>  	bool shared = false;
>  	int cpu;
>
> +	if (blk_mq_need_time_stamp(rq))
> +		rq->io_end_time_ns = ktime_get_ns();
> +
>  	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
>  	/*
>  	 * Most of single queue controllers, there is only one irq vector
> diff --git a/block/blk-stat.c b/block/blk-stat.c
> index 940f15d600f8..9b9b30927ea8 100644
> --- a/block/blk-stat.c
> +++ b/block/blk-stat.c
> @@ -48,7 +48,7 @@ void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value)
>  	stat->nr_samples++;
>  }
>
> -void blk_stat_add(struct request *rq, u64 now)
> +void blk_stat_add(struct request *rq)
>  {
>  	struct request_queue *q = rq->q;
>  	struct blk_stat_callback *cb;
> @@ -56,7 +56,7 @@ void blk_stat_add(struct request *rq, u64 now)
>  	int bucket;
>  	u64 value;
>
> -	value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
> +	value = blk_rq_io_time(rq);
>
>  	blk_throtl_stat_add(rq, value);
>
> diff --git a/block/blk-stat.h b/block/blk-stat.h
> index 17b47a86eefb..2653818cee36 100644
> --- a/block/blk-stat.h
> +++ b/block/blk-stat.h
> @@ -65,7 +65,7 @@ struct blk_stat_callback {
>  struct blk_queue_stats *blk_alloc_queue_stats(void);
>  void blk_free_queue_stats(struct blk_queue_stats *);
>
> -void blk_stat_add(struct request *rq, u64 now);
> +void blk_stat_add(struct request *rq);
>
>  /* record time/size info in request but not add a callback */
>  void blk_stat_enable_accounting(struct request_queue *q);
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 592669bcc536..2a8d4b68d707 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -198,6 +198,9 @@ struct request {
>  	u64 start_time_ns;
>  	/* Time that I/O was submitted to the device. */
>  	u64 io_start_time_ns;
> +	/* Time that I/O was reported completed by the device. */
> +	u64 io_end_time_ns;
> +
>
>  #ifdef CONFIG_BLK_WBT
>  	unsigned short wbt_flags;
> @@ -385,6 +388,14 @@ static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
>
>  #endif /* CONFIG_BLK_DEV_ZONED */
>
> +static inline u64 blk_rq_io_time(struct request *rq)
> +{
> +	u64 start = rq->io_start_time_ns;
> +	u64 end = rq->io_end_time_ns;
> +
> +	return (end - start) ? end - start : 0;

I think you meant:

	return end >= start ? end - start : 0;

> +}
> +
>  struct request_queue {
>  	/*
>  	 * Together with queue_head for cacheline sharing
> --
> 2.21.0
>
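[For context on the bucket failure described in the commit message above: blk_stat_add() files each completed request into a per-direction, per-size poll-stat bucket derived from the request's data length, so a request whose ->__data_len has already been zeroed by blk_update_request() can no longer be classified. The sketch below illustrates that failure mode; the constant, the helper name, and the exact arithmetic are approximations of the 5.1-era blk_mq_poll_stats_bkt() logic, not a verbatim copy.]

#include <stdint.h>

#define BLK_MQ_POLL_STATS_BKTS 16	/* 8 size buckets per direction (assumed) */

/* ilog2-style helper: index of the highest set bit, -1 when the value is 0 */
static int ilog2_approx(uint64_t v)
{
	int log = -1;

	while (v) {
		log++;
		v >>= 1;
	}
	return log;
}

/*
 * Rough model of the poll-stat bucket lookup: the bucket index is derived
 * from the request's remaining data length.  Once blk_update_request()
 * has zeroed rq->__data_len, the length seen here is 0, the index goes
 * negative, and the sample is rejected.
 */
static int poll_stats_bucket(int data_dir, uint64_t data_len_bytes)
{
	int bucket = data_dir + 2 * (ilog2_approx(data_len_bytes) - 9);

	if (bucket < 0)
		return -1;			/* no bucket: blk_stat_add() drops the sample */
	if (bucket >= BLK_MQ_POLL_STATS_BKTS)
		return BLK_MQ_POLL_STATS_BKTS - 1;
	return bucket;
}

[In this model a 4 KiB read lands in bucket 6 and its completion time is recorded, while a zero-length request yields -1, which is the "returns error" case the commit message refers to: the sample is dropped and the adaptive statistics never accumulate.]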
On 5/28/2019 9:37 PM, Omar Sandoval wrote:
> On Sat, May 25, 2019 at 04:42:11PM +0300, Pavel Begunkov (Silence) wrote:
>> From: Pavel Begunkov <asml.silence@gmail.com>
>>
>> Commit 4bc6339a583cec650b05 ("block: move blk_stat_add() to
>> __blk_mq_end_request()") moved blk_stat_add(), so now it's called after
>> blk_update_request(), which zeroes rq->__data_len. Without length,
>> blk_stat_add() can't calculate stat bucket and returns error,
>> effectively disabling hybrid polling.
>
> I don't see how this patch fixes this problem, am I missing something?
> The timing issue seems orthogonal.

You're right, it got completely mixed up. It's rather an addition to a
patch from Hou Tao. But considering your comment below, it would be
better just to revert part of the changes.

>
>> __blk_mq_end_request() is a right place to call blk_stat_add(), as it's
>> guaranteed to be called for each request. Yet, calculating time there
>> won't provide sufficient accuracy/precision for finer tuned hybrid
>> polling, because a path from __blk_mq_complete_request() to
>> __blk_mq_end_request() adds unpredictable overhead.
>>
>> Add io_end_time_ns field in struct request, save time as soon as
>> possible (at __blk_mq_complete_request()) and reuse later.
>>
>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>> ---
>>  block/blk-mq.c         | 13 ++++++++++---
>>  block/blk-stat.c       |  4 ++--
>>  block/blk-stat.h       |  2 +-
>>  include/linux/blkdev.h | 11 +++++++++++
>>  4 files changed, 24 insertions(+), 6 deletions(-)
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 32b8ad3d341b..8f6b6bfe0ccb 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -330,6 +330,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
>>  	else
>>  		rq->start_time_ns = 0;
>>  	rq->io_start_time_ns = 0;
>> +	rq->io_end_time_ns = 0;
>>  	rq->nr_phys_segments = 0;
>>  #if defined(CONFIG_BLK_DEV_INTEGRITY)
>>  	rq->nr_integrity_segments = 0;
>> @@ -532,14 +533,17 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request);
>>
>>  inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
>>  {
>> -	u64 now = 0;
>> +	u64 now = rq->io_end_time_ns;
>
> Kyber expects the timestamp passed in to kyber_complete_request() to
> include the software overhead. iostat should probably include the
> software overhead, too. So, we probably won't be able to avoid calling
> ktime_get() twice, once for I/O time and once for the end-to-end time.

I asked that in another thread, and it seems we can't reuse this.

>
>> -	if (blk_mq_need_time_stamp(rq))
>> +	/* called directly bypassing __blk_mq_complete_request */
>> +	if (blk_mq_need_time_stamp(rq) && !now) {
>>  		now = ktime_get_ns();
>> +		rq->io_end_time_ns = now;
>> +	}
>>
>>  	if (rq->rq_flags & RQF_STATS) {
>>  		blk_mq_poll_stats_start(rq->q);
>> -		blk_stat_add(rq, now);
>> +		blk_stat_add(rq);
>>  	}
>>
>>  	if (rq->internal_tag != -1)
>> @@ -579,6 +583,9 @@ static void __blk_mq_complete_request(struct request *rq)
>>  	bool shared = false;
>>  	int cpu;
>>
>> +	if (blk_mq_need_time_stamp(rq))
>> +		rq->io_end_time_ns = ktime_get_ns();
>> +
>>  	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
>>  	/*
>>  	 * Most of single queue controllers, there is only one irq vector
>> diff --git a/block/blk-stat.c b/block/blk-stat.c
>> index 940f15d600f8..9b9b30927ea8 100644
>> --- a/block/blk-stat.c
>> +++ b/block/blk-stat.c
>> @@ -48,7 +48,7 @@ void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value)
>>  	stat->nr_samples++;
>>  }
>>
>> -void blk_stat_add(struct request *rq, u64 now)
>> +void blk_stat_add(struct request *rq)
>>  {
>>  	struct request_queue *q = rq->q;
>>  	struct blk_stat_callback *cb;
>> @@ -56,7 +56,7 @@ void blk_stat_add(struct request *rq, u64 now)
>>  	int bucket;
>>  	u64 value;
>>
>> -	value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
>> +	value = blk_rq_io_time(rq);
>>
>>  	blk_throtl_stat_add(rq, value);
>>
>> diff --git a/block/blk-stat.h b/block/blk-stat.h
>> index 17b47a86eefb..2653818cee36 100644
>> --- a/block/blk-stat.h
>> +++ b/block/blk-stat.h
>> @@ -65,7 +65,7 @@ struct blk_stat_callback {
>>  struct blk_queue_stats *blk_alloc_queue_stats(void);
>>  void blk_free_queue_stats(struct blk_queue_stats *);
>>
>> -void blk_stat_add(struct request *rq, u64 now);
>> +void blk_stat_add(struct request *rq);
>>
>>  /* record time/size info in request but not add a callback */
>>  void blk_stat_enable_accounting(struct request_queue *q);
>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>> index 592669bcc536..2a8d4b68d707 100644
>> --- a/include/linux/blkdev.h
>> +++ b/include/linux/blkdev.h
>> @@ -198,6 +198,9 @@ struct request {
>>  	u64 start_time_ns;
>>  	/* Time that I/O was submitted to the device. */
>>  	u64 io_start_time_ns;
>> +	/* Time that I/O was reported completed by the device. */
>> +	u64 io_end_time_ns;
>> +
>>
>>  #ifdef CONFIG_BLK_WBT
>>  	unsigned short wbt_flags;
>> @@ -385,6 +388,14 @@ static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
>>
>>  #endif /* CONFIG_BLK_DEV_ZONED */
>>
>> +static inline u64 blk_rq_io_time(struct request *rq)
>> +{
>> +	u64 start = rq->io_start_time_ns;
>> +	u64 end = rq->io_end_time_ns;
>> +
>> +	return (end - start) ? end - start : 0;
>
> I think you meant:
>
> 	return end >= start ? end - start : 0;
>
Sure, thanks

>> +}
>> +
>>  struct request_queue {
>>  	/*
>>  	 * Together with queue_head for cacheline sharing
>> --
>> 2.21.0
>>
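[Related background on why dropped samples "effectively disable" hybrid polling, which both mails take for granted: in adaptive mode the block layer sleeps for roughly half of the mean completion time observed for the request's bucket before it starts busy-polling, so if no samples ever reach the bucket the computed sleep time stays zero and the hybrid sleep is skipped. The sketch below is a simplified model of that decision; the struct and field names are hypothetical stand-ins rather than the kernel's actual types.]

#include <stdint.h>

/* Hypothetical stand-ins for the per-queue poll statistics. */
struct poll_stat {
	uint64_t mean_ns;	/* mean device completion time for this bucket */
	uint64_t nr_samples;
};

struct queue_stats {
	int64_t poll_nsec;	/* >0: fixed sleep requested by the user, 0: adaptive */
	struct poll_stat stat[16];
};

/*
 * Simplified model of the adaptive hybrid-polling decision: sleep for
 * about half of the observed mean completion time before busy-polling.
 * If blk_stat_add() could never file a sample (bucket < 0 because the
 * request length was already zeroed), nr_samples stays 0, this returns
 * 0, and the caller never sleeps before polling.
 */
static uint64_t hybrid_poll_sleep_ns(const struct queue_stats *qs, int bucket)
{
	if (qs->poll_nsec > 0)
		return (uint64_t)qs->poll_nsec;	/* user-fixed sleep time */

	if (bucket < 0 || !qs->stat[bucket].nr_samples)
		return 0;			/* nothing learned yet: skip the hybrid sleep */

	return (qs->stat[bucket].mean_ns + 1) / 2;
}

[In the reported bug every sample is rejected at the bucket step, so nr_samples never grows, the helper keeps returning 0, and submitters simply busy-poll as if hybrid polling were turned off.]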
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 32b8ad3d341b..8f6b6bfe0ccb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -330,6 +330,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	else
 		rq->start_time_ns = 0;
 	rq->io_start_time_ns = 0;
+	rq->io_end_time_ns = 0;
 	rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	rq->nr_integrity_segments = 0;
@@ -532,14 +533,17 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 {
-	u64 now = 0;
+	u64 now = rq->io_end_time_ns;
 
-	if (blk_mq_need_time_stamp(rq))
+	/* called directly bypassing __blk_mq_complete_request */
+	if (blk_mq_need_time_stamp(rq) && !now) {
 		now = ktime_get_ns();
+		rq->io_end_time_ns = now;
+	}
 
 	if (rq->rq_flags & RQF_STATS) {
 		blk_mq_poll_stats_start(rq->q);
-		blk_stat_add(rq, now);
+		blk_stat_add(rq);
 	}
 
 	if (rq->internal_tag != -1)
@@ -579,6 +583,9 @@ static void __blk_mq_complete_request(struct request *rq)
 	bool shared = false;
 	int cpu;
 
+	if (blk_mq_need_time_stamp(rq))
+		rq->io_end_time_ns = ktime_get_ns();
+
 	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
 	/*
 	 * Most of single queue controllers, there is only one irq vector
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 940f15d600f8..9b9b30927ea8 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -48,7 +48,7 @@ void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value)
 	stat->nr_samples++;
 }
 
-void blk_stat_add(struct request *rq, u64 now)
+void blk_stat_add(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 	struct blk_stat_callback *cb;
@@ -56,7 +56,7 @@ void blk_stat_add(struct request *rq, u64 now)
 	int bucket;
 	u64 value;
 
-	value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
+	value = blk_rq_io_time(rq);
 
 	blk_throtl_stat_add(rq, value);
 
diff --git a/block/blk-stat.h b/block/blk-stat.h
index 17b47a86eefb..2653818cee36 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -65,7 +65,7 @@ struct blk_stat_callback {
 struct blk_queue_stats *blk_alloc_queue_stats(void);
 void blk_free_queue_stats(struct blk_queue_stats *);
 
-void blk_stat_add(struct request *rq, u64 now);
+void blk_stat_add(struct request *rq);
 
 /* record time/size info in request but not add a callback */
 void blk_stat_enable_accounting(struct request_queue *q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 592669bcc536..2a8d4b68d707 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -198,6 +198,9 @@ struct request {
 	u64 start_time_ns;
 	/* Time that I/O was submitted to the device. */
 	u64 io_start_time_ns;
+	/* Time that I/O was reported completed by the device. */
+	u64 io_end_time_ns;
+
 
 #ifdef CONFIG_BLK_WBT
 	unsigned short wbt_flags;
@@ -385,6 +388,14 @@ static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
 
 #endif /* CONFIG_BLK_DEV_ZONED */
 
+static inline u64 blk_rq_io_time(struct request *rq)
+{
+	u64 start = rq->io_start_time_ns;
+	u64 end = rq->io_end_time_ns;
+
+	return (end - start) ? end - start : 0;
+}
+
 struct request_queue {
 	/*
 	 * Together with queue_head for cacheline sharing