diff mbox series

[1/1] blk-mq: Fix disabled hybrid polling

Message ID dd30f4d94aa19956ad4500b1177741fd071ec37f.1558791181.git.asml.silence@gmail.com (mailing list archive)
State New, archived
Headers show
Series [1/1] blk-mq: Fix disabled hybrid polling | expand

Commit Message

Pavel Begunkov May 25, 2019, 1:42 p.m. UTC
From: Pavel Begunkov <asml.silence@gmail.com>

Commit 4bc6339a583cec650b05 ("block: move blk_stat_add() to
__blk_mq_end_request()") moved blk_stat_add(), so now it's called after
blk_update_request(), which zeroes rq->__data_len. Without length,
blk_stat_add() can't calculate stat bucket and returns error,
effectively disabling hybrid polling.

__blk_mq_end_request() is the right place to call blk_stat_add(), as it's
guaranteed to be called for each request. Yet, calculating the time there
won't provide sufficient accuracy/precision for finely tuned hybrid
polling, because the path from __blk_mq_complete_request() to
__blk_mq_end_request() adds unpredictable overhead.

Add an io_end_time_ns field to struct request, save the time as soon as
possible (at __blk_mq_complete_request()) and reuse it later.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 block/blk-mq.c         | 13 ++++++++++---
 block/blk-stat.c       |  4 ++--
 block/blk-stat.h       |  2 +-
 include/linux/blkdev.h | 11 +++++++++++
 4 files changed, 24 insertions(+), 6 deletions(-)

Comments

Omar Sandoval May 28, 2019, 6:37 p.m. UTC | #1
On Sat, May 25, 2019 at 04:42:11PM +0300, Pavel Begunkov (Silence) wrote:
> From: Pavel Begunkov <asml.silence@gmail.com>
> 
> Commit 4bc6339a583cec650b05 ("block: move blk_stat_add() to
> __blk_mq_end_request()") moved blk_stat_add(), so now it's called after
> blk_update_request(), which zeroes rq->__data_len. Without length,
> blk_stat_add() can't calculate stat bucket and returns error,
> effectively disabling hybrid polling.

I don't see how this patch fixes this problem, am I missing something?
The timing issue seems orthogonal.

> __blk_mq_end_request() is a right place to call blk_stat_add(), as it's
> guaranteed to be called for each request. Yet, calculating time there
> won't provide sufficient accuracy/precision for finer tuned hybrid
> polling, because a path from __blk_mq_complete_request() to
> __blk_mq_end_request() adds unpredictable overhead.
> 
> Add io_end_time_ns field in struct request, save time as soon as
> possible (at __blk_mq_complete_request()) and reuse later.
> 
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>  block/blk-mq.c         | 13 ++++++++++---
>  block/blk-stat.c       |  4 ++--
>  block/blk-stat.h       |  2 +-
>  include/linux/blkdev.h | 11 +++++++++++
>  4 files changed, 24 insertions(+), 6 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 32b8ad3d341b..8f6b6bfe0ccb 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -330,6 +330,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
>  	else
>  		rq->start_time_ns = 0;
>  	rq->io_start_time_ns = 0;
> +	rq->io_end_time_ns = 0;
>  	rq->nr_phys_segments = 0;
>  #if defined(CONFIG_BLK_DEV_INTEGRITY)
>  	rq->nr_integrity_segments = 0;
> @@ -532,14 +533,17 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request);
>  
>  inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
>  {
> -	u64 now = 0;
> +	u64 now = rq->io_end_time_ns;

Kyber expects the timestamp passed in to kyber_complete_request() to
include the software overhead. iostat should probably include the
software overhead, too. So, we probably won't be able to avoid calling
ktime_get() twice: once for the I/O time and once for the end-to-end time.

> -	if (blk_mq_need_time_stamp(rq))
> +	/* called directly bypassing __blk_mq_complete_request */
> +	if (blk_mq_need_time_stamp(rq) && !now) {
>  		now = ktime_get_ns();
> +		rq->io_end_time_ns = now;
> +	}
>  
>  	if (rq->rq_flags & RQF_STATS) {
>  		blk_mq_poll_stats_start(rq->q);
> -		blk_stat_add(rq, now);
> +		blk_stat_add(rq);
>  	}
>  
>  	if (rq->internal_tag != -1)
> @@ -579,6 +583,9 @@ static void __blk_mq_complete_request(struct request *rq)
>  	bool shared = false;
>  	int cpu;
>  
> +	if (blk_mq_need_time_stamp(rq))
> +		rq->io_end_time_ns = ktime_get_ns();
> +
>  	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
>  	/*
>  	 * Most of single queue controllers, there is only one irq vector
> diff --git a/block/blk-stat.c b/block/blk-stat.c
> index 940f15d600f8..9b9b30927ea8 100644
> --- a/block/blk-stat.c
> +++ b/block/blk-stat.c
> @@ -48,7 +48,7 @@ void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value)
>  	stat->nr_samples++;
>  }
>  
> -void blk_stat_add(struct request *rq, u64 now)
> +void blk_stat_add(struct request *rq)
>  {
>  	struct request_queue *q = rq->q;
>  	struct blk_stat_callback *cb;
> @@ -56,7 +56,7 @@ void blk_stat_add(struct request *rq, u64 now)
>  	int bucket;
>  	u64 value;
>  
> -	value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
> +	value = blk_rq_io_time(rq);
>  
>  	blk_throtl_stat_add(rq, value);
>  
> diff --git a/block/blk-stat.h b/block/blk-stat.h
> index 17b47a86eefb..2653818cee36 100644
> --- a/block/blk-stat.h
> +++ b/block/blk-stat.h
> @@ -65,7 +65,7 @@ struct blk_stat_callback {
>  struct blk_queue_stats *blk_alloc_queue_stats(void);
>  void blk_free_queue_stats(struct blk_queue_stats *);
>  
> -void blk_stat_add(struct request *rq, u64 now);
> +void blk_stat_add(struct request *rq);
>  
>  /* record time/size info in request but not add a callback */
>  void blk_stat_enable_accounting(struct request_queue *q);
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 592669bcc536..2a8d4b68d707 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -198,6 +198,9 @@ struct request {
>  	u64 start_time_ns;
>  	/* Time that I/O was submitted to the device. */
>  	u64 io_start_time_ns;
> +	/* Time that I/O was reported completed by the device. */
> +	u64 io_end_time_ns;
> +
>  
>  #ifdef CONFIG_BLK_WBT
>  	unsigned short wbt_flags;
> @@ -385,6 +388,14 @@ static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
>  
>  #endif /* CONFIG_BLK_DEV_ZONED */
>  
> +static inline u64 blk_rq_io_time(struct request *rq)
> +{
> +	u64 start = rq->io_start_time_ns;
> +	u64 end = rq->io_end_time_ns;
> +
> +	return (end - start) ? end - start : 0;

I think you meant:

	return end >= start ? end - start : 0;

> +}
> +
>  struct request_queue {
>  	/*
>  	 * Together with queue_head for cacheline sharing
> -- 
> 2.21.0
>
Pavel Begunkov May 30, 2019, 9:19 a.m. UTC | #2
On 5/28/2019 9:37 PM, Omar Sandoval wrote:
> On Sat, May 25, 2019 at 04:42:11PM +0300, Pavel Begunkov (Silence) wrote:
>> From: Pavel Begunkov <asml.silence@gmail.com>
>>
>> Commit 4bc6339a583cec650b05 ("block: move blk_stat_add() to
>> __blk_mq_end_request()") moved blk_stat_add(), so now it's called after
>> blk_update_request(), which zeroes rq->__data_len. Without length,
>> blk_stat_add() can't calculate stat bucket and returns error,
>> effectively disabling hybrid polling.
> 
> I don't see how this patch fixes this problem, am I missing something?
> The timing issue seems orthogonal.
You're right, it got completely mixed up. It's rather an addition to a
patch from Hou Tao. But considering your comment below, it would be
better just to revert part of the changes.


> 
>> __blk_mq_end_request() is a right place to call blk_stat_add(), as it's
>> guaranteed to be called for each request. Yet, calculating time there
>> won't provide sufficient accuracy/precision for finer tuned hybrid
>> polling, because a path from __blk_mq_complete_request() to
>> __blk_mq_end_request() adds unpredictable overhead.
>>
>> Add io_end_time_ns field in struct request, save time as soon as
>> possible (at __blk_mq_complete_request()) and reuse later.
>>
>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>> ---
>>  block/blk-mq.c         | 13 ++++++++++---
>>  block/blk-stat.c       |  4 ++--
>>  block/blk-stat.h       |  2 +-
>>  include/linux/blkdev.h | 11 +++++++++++
>>  4 files changed, 24 insertions(+), 6 deletions(-)
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 32b8ad3d341b..8f6b6bfe0ccb 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -330,6 +330,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
>>  	else
>>  		rq->start_time_ns = 0;
>>  	rq->io_start_time_ns = 0;
>> +	rq->io_end_time_ns = 0;
>>  	rq->nr_phys_segments = 0;
>>  #if defined(CONFIG_BLK_DEV_INTEGRITY)
>>  	rq->nr_integrity_segments = 0;
>> @@ -532,14 +533,17 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request);
>>  
>>  inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
>>  {
>> -	u64 now = 0;
>> +	u64 now = rq->io_end_time_ns;
> 
> Kyber expects the timestamp passed in to kyber_complete_request() to
> include the software overhead. iostat should probably include the
> software overhead, too. So, we probably won't be able to avoid calling
> ktime_get() twice, once for I/O time and one for the end-to-end time.

I asked that in another thread, and it seems we can't reuse this.



> 
>> -	if (blk_mq_need_time_stamp(rq))
>> +	/* called directly bypassing __blk_mq_complete_request */
>> +	if (blk_mq_need_time_stamp(rq) && !now) {
>>  		now = ktime_get_ns();
>> +		rq->io_end_time_ns = now;
>> +	}
>>  
>>  	if (rq->rq_flags & RQF_STATS) {
>>  		blk_mq_poll_stats_start(rq->q);
>> -		blk_stat_add(rq, now);
>> +		blk_stat_add(rq);
>>  	}
>>  
>>  	if (rq->internal_tag != -1)
>> @@ -579,6 +583,9 @@ static void __blk_mq_complete_request(struct request *rq)
>>  	bool shared = false;
>>  	int cpu;
>>  
>> +	if (blk_mq_need_time_stamp(rq))
>> +		rq->io_end_time_ns = ktime_get_ns();
>> +
>>  	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
>>  	/*
>>  	 * Most of single queue controllers, there is only one irq vector
>> diff --git a/block/blk-stat.c b/block/blk-stat.c
>> index 940f15d600f8..9b9b30927ea8 100644
>> --- a/block/blk-stat.c
>> +++ b/block/blk-stat.c
>> @@ -48,7 +48,7 @@ void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value)
>>  	stat->nr_samples++;
>>  }
>>  
>> -void blk_stat_add(struct request *rq, u64 now)
>> +void blk_stat_add(struct request *rq)
>>  {
>>  	struct request_queue *q = rq->q;
>>  	struct blk_stat_callback *cb;
>> @@ -56,7 +56,7 @@ void blk_stat_add(struct request *rq, u64 now)
>>  	int bucket;
>>  	u64 value;
>>  
>> -	value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
>> +	value = blk_rq_io_time(rq);
>>  
>>  	blk_throtl_stat_add(rq, value);
>>  
>> diff --git a/block/blk-stat.h b/block/blk-stat.h
>> index 17b47a86eefb..2653818cee36 100644
>> --- a/block/blk-stat.h
>> +++ b/block/blk-stat.h
>> @@ -65,7 +65,7 @@ struct blk_stat_callback {
>>  struct blk_queue_stats *blk_alloc_queue_stats(void);
>>  void blk_free_queue_stats(struct blk_queue_stats *);
>>  
>> -void blk_stat_add(struct request *rq, u64 now);
>> +void blk_stat_add(struct request *rq);
>>  
>>  /* record time/size info in request but not add a callback */
>>  void blk_stat_enable_accounting(struct request_queue *q);
>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>> index 592669bcc536..2a8d4b68d707 100644
>> --- a/include/linux/blkdev.h
>> +++ b/include/linux/blkdev.h
>> @@ -198,6 +198,9 @@ struct request {
>>  	u64 start_time_ns;
>>  	/* Time that I/O was submitted to the device. */
>>  	u64 io_start_time_ns;
>> +	/* Time that I/O was reported completed by the device. */
>> +	u64 io_end_time_ns;
>> +
>>  
>>  #ifdef CONFIG_BLK_WBT
>>  	unsigned short wbt_flags;
>> @@ -385,6 +388,14 @@ static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
>>  
>>  #endif /* CONFIG_BLK_DEV_ZONED */
>>  
>> +static inline u64 blk_rq_io_time(struct request *rq)
>> +{
>> +	u64 start = rq->io_start_time_ns;
>> +	u64 end = rq->io_end_time_ns;
>> +
>> +	return (end - start) ? end - start : 0;
> 
> I think you meant:
> 
> 	return end >= start ? end - start : 0;
> 
Sure, thanks



>> +}
>> +
>>  struct request_queue {
>>  	/*
>>  	 * Together with queue_head for cacheline sharing
>> -- 
>> 2.21.0
>>
diff mbox series

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 32b8ad3d341b..8f6b6bfe0ccb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -330,6 +330,7 @@  static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	else
 		rq->start_time_ns = 0;
 	rq->io_start_time_ns = 0;
+	rq->io_end_time_ns = 0;
 	rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	rq->nr_integrity_segments = 0;
@@ -532,14 +533,17 @@  EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 {
-	u64 now = 0;
+	u64 now = rq->io_end_time_ns;
 
-	if (blk_mq_need_time_stamp(rq))
+	/* called directly bypassing __blk_mq_complete_request */
+	if (blk_mq_need_time_stamp(rq) && !now) {
 		now = ktime_get_ns();
+		rq->io_end_time_ns = now;
+	}
 
 	if (rq->rq_flags & RQF_STATS) {
 		blk_mq_poll_stats_start(rq->q);
-		blk_stat_add(rq, now);
+		blk_stat_add(rq);
 	}
 
 	if (rq->internal_tag != -1)
@@ -579,6 +583,9 @@  static void __blk_mq_complete_request(struct request *rq)
 	bool shared = false;
 	int cpu;
 
+	if (blk_mq_need_time_stamp(rq))
+		rq->io_end_time_ns = ktime_get_ns();
+
 	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
 	/*
 	 * Most of single queue controllers, there is only one irq vector
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 940f15d600f8..9b9b30927ea8 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -48,7 +48,7 @@  void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value)
 	stat->nr_samples++;
 }
 
-void blk_stat_add(struct request *rq, u64 now)
+void blk_stat_add(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 	struct blk_stat_callback *cb;
@@ -56,7 +56,7 @@  void blk_stat_add(struct request *rq, u64 now)
 	int bucket;
 	u64 value;
 
-	value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
+	value = blk_rq_io_time(rq);
 
 	blk_throtl_stat_add(rq, value);
 
diff --git a/block/blk-stat.h b/block/blk-stat.h
index 17b47a86eefb..2653818cee36 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -65,7 +65,7 @@  struct blk_stat_callback {
 struct blk_queue_stats *blk_alloc_queue_stats(void);
 void blk_free_queue_stats(struct blk_queue_stats *);
 
-void blk_stat_add(struct request *rq, u64 now);
+void blk_stat_add(struct request *rq);
 
 /* record time/size info in request but not add a callback */
 void blk_stat_enable_accounting(struct request_queue *q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 592669bcc536..2a8d4b68d707 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -198,6 +198,9 @@  struct request {
 	u64 start_time_ns;
 	/* Time that I/O was submitted to the device. */
 	u64 io_start_time_ns;
+	/* Time that I/O was reported completed by the device. */
+	u64 io_end_time_ns;
+
 
 #ifdef CONFIG_BLK_WBT
 	unsigned short wbt_flags;
@@ -385,6 +388,14 @@  static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
 
 #endif /* CONFIG_BLK_DEV_ZONED */
 
+static inline u64 blk_rq_io_time(struct request *rq)
+{
+	u64 start = rq->io_start_time_ns;
+	u64 end = rq->io_end_time_ns;
+
+	return (end - start) ? end - start : 0;
+}
+
 struct request_queue {
 	/*
 	 * Together with queue_head for cacheline sharing