
[1/2] block: make rq sector size accessible for block stats

Message ID 20190521075904.135060-2-houtao1@huawei.com (mailing list archive)
State New, archived

Commit Message

Hou Tao May 21, 2019, 7:59 a.m. UTC
Currently rq->data_len is decreased by partial completion or zeroed
by completion, so by the time blk_stat_add() is invoked, data_len is
zero and there will never be any samples in poll_cb, because
blk_mq_poll_stats_bkt() returns -1 when data_len is zero.

We could move blk_stat_add() back to __blk_mq_complete_request(),
but that would defeat the effort of calling ktime_get_ns() only once.
Instead, we can reuse the throtl_size field for both block stats and
block throttling, and adjust the logic in blk_mq_poll_stats_bkt()
accordingly.

Fixes: 4bc6339a583c ("block: move blk_stat_add() to __blk_mq_end_request()")
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 block/blk-mq.c         | 11 +++++------
 block/blk-throttle.c   |  3 ++-
 include/linux/blkdev.h | 15 ++++++++++++---
 3 files changed, 19 insertions(+), 10 deletions(-)
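
For illustration, here is a hedged standalone C sketch of the bucketing
arithmetic (ilog2_approx() is a stand-in for the kernel's ilog2()): once
data_len has been zeroed by completion, the old byte-based formula can
never yield a valid bucket, while the saved sector count restores the
intended mapping.

#include <stdio.h>

/* Rough stand-in for the kernel's ilog2(): floor(log2(n)); returns -1
 * for n == 0, mirroring fls(0) - 1. */
static int ilog2_approx(unsigned long n)
{
	int log = -1;

	while (n) {
		n >>= 1;
		log++;
	}
	return log;
}

int main(void)
{
	int ddir = 0;	/* read */

	/* Old formula with data_len already zeroed at completion time:
	 * ddir + 2 * (ilog2(bytes) - 9) = -20, i.e. rejected as -1. */
	printf("old, bytes=0:   %d\n", ddir + 2 * (ilog2_approx(0) - 9));

	/* New formula with the saved sector count of a 4KiB read:
	 * ddir + 2 * ilog2(8) = 6, matching what the old formula gave
	 * for bytes = 4096 while the request was still in flight. */
	printf("new, sectors=8: %d\n", ddir + 2 * ilog2_approx(8));
	return 0;
}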

Comments

Pavel Begunkov May 25, 2019, 11:46 a.m. UTC | #1
That's the same bug I posted a patch for about 3+ weeks ago; no answer
yet, however:
https://www.spinics.net/lists/linux-block/msg40049.html

I think putting the time sampling in __blk_mq_end_request() won't give
sufficient precision for more sophisticated hybrid polling, because the
path __blk_mq_complete_request() -> __blk_mq_end_request() adds a lot of
overhead (redirecting the rq to another CPU, DMA unmapping, etc.). That's
the case for the patchset mentioned here:
https://www.spinics.net/lists/linux-block/msg40044.html

It works OK for 50-80us requests (e.g. SSD reads), but is not good
enough for 20us and less (e.g. SSD writes). I think it would be better
to save the time in advance and use it later.
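
To make the "save the time in advance" idea concrete, a rough kernel-style
sketch (not the actual patch; the io_end_time_ns field and the exact hook
points are assumptions):

void blk_mq_complete_request(struct request *rq)
{
	/* Sample the completion time as early as possible, before the
	 * request is bounced to the submitting CPU and DMA-unmapped. */
	if (rq->rq_flags & RQF_STATS)
		rq->io_end_time_ns = ktime_get_ns();

	/* ...the usual completion work follows: IPI redirect, unmap... */
	__blk_mq_complete_request(rq);
}

inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
	/* Consume the saved timestamp instead of reading the clock here,
	 * after all the completion overhead has already been paid. */
	if (rq->rq_flags & RQF_STATS)
		blk_stat_add(rq, rq->io_end_time_ns);

	/* ...rest of the normal end-of-request path... */
}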

By the way, it seems that __blk_mq_end_request() is really the better
place to record stats, as not all drivers use __blk_mq_complete_request(),
even though they don't implement blk_poll().
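
For context, a minimal sketch of the two completion styles this refers to
(the foo_*/bar_* driver callbacks are hypothetical):

/* Style 1: complete via the IRQ/poll path; the rq may be bounced to the
 * submitting CPU before __blk_mq_end_request() eventually runs. */
static void foo_handle_completion(struct request *rq)
{
	blk_mq_complete_request(rq);	/* -> __blk_mq_complete_request() */
}

/* Style 2: end the request directly; __blk_mq_complete_request() is never
 * involved, so stats recorded only there would miss these drivers. */
static void bar_handle_completion(struct request *rq, blk_status_t status)
{
	blk_mq_end_request(rq, status);	/* -> __blk_mq_end_request() */
}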


On 21/05/2019 10:59, Hou Tao wrote:
> [...]
Pavel Begunkov May 25, 2019, 1:52 p.m. UTC | #2
I mean this way:
https://lore.kernel.org/linux-block/dd30f4d94aa19956ad4500b1177741fd071ec37f.1558791181.git.asml.silence@gmail.com/T/#u


On 25/05/2019 14:46, Pavel Begunkov wrote:
> That's the same bug I posted a patch for about 3+ weeks ago; no answer
> yet, however:
> https://www.spinics.net/lists/linux-block/msg40049.html
> [...]
Hou Tao May 27, 2019, 11:39 a.m. UTC | #3
Hi,

On 2019/5/25 19:46, Pavel Begunkov wrote:
> That's the same bug I posted a patch for about 3+ weeks ago; no answer
> yet, however:
> https://www.spinics.net/lists/linux-block/msg40049.html
> 
Sorry for missing that.

> I think putting the time sampling in __blk_mq_end_request() won't give
> sufficient precision for more sophisticated hybrid polling, because the
> path __blk_mq_complete_request() -> __blk_mq_end_request() adds a lot of
> overhead (redirecting the rq to another CPU, DMA unmapping, etc.). That's
> the case for the patchset mentioned here:
> https://www.spinics.net/lists/linux-block/msg40044.html
> 
> It works OK for 50-80us requests (e.g. SSD reads), but is not good
> enough for 20us and less (e.g. SSD writes). I think it would be better
> to save the time in advance and use it later.
> 
Yes, getting the end time from blk_mq_complete_request() will be more accurate.

> By the way, it seems that __blk_mq_end_request() is really the better
> place to record stats, as not all drivers use __blk_mq_complete_request(),
> even though they don't implement blk_poll().
> 
And __blk_mq_complete_request() may be invoked multiple times (although
that's not possible for NVMe devices), but __blk_mq_end_request() is only
invoked once.

Regards,
Tao

Pavel Begunkov May 28, 2019, 6:25 p.m. UTC | #4
On 27/05/2019 14:39, Hou Tao wrote:
> Hi,
> 
> On 2019/5/25 19:46, Pavel Begunkov wrote:
>> That's the same bug I posted a patch for about 3+ weeks ago; no answer
>> yet, however:
>> https://www.spinics.net/lists/linux-block/msg40049.html
>>
> Sorry for missing that.
Anyway I haven't fixed it properly there.

> 
>> I think putting the time sampling in __blk_mq_end_request() won't give
>> sufficient precision for more sophisticated hybrid polling, because the
>> path __blk_mq_complete_request() -> __blk_mq_end_request() adds a lot of
>> overhead (redirecting the rq to another CPU, DMA unmapping, etc.). That's
>> the case for the patchset mentioned here:
>> https://www.spinics.net/lists/linux-block/msg40044.html
>>
>> It works OK for 50-80us requests (e.g. SSD reads), but is not good
>> enough for 20us and less (e.g. SSD writes). I think it would be better
>> to save the time in advance and use it later.
>>
> Yes, getting the end time from blk_mq_complete_request() will be more accurate.
What bothers me: if we sample the time there, could we use it for
throttling? I'm not sure exactly what it expects.

> 
>> By the way, it seems that __blk_mq_end_request() is really the better
>> place to record stats, as not all drivers use __blk_mq_complete_request(),
>> even though they don't implement blk_poll().
>>
> And __blk_mq_complete_request() may be invoked multiple times (although
> that's not possible for NVMe devices), but __blk_mq_end_request() is only
> invoked once.
I doubt it; it would be rather strange to *complete* a request multiple
times. I need to check, but so far I have only seen drivers either use it
once or call blk_mq_end_request() directly.

Pavel Begunkov Sept. 15, 2019, 4:52 p.m. UTC | #5
Hi,

Works well for me.
Tested-by: Pavel Begunkov <asml.silence@gmail.com>

Yet, take a look at the comment below.

By the way,
Fixes: 795fe54c2a828099e ("block: move blk_stat_add() to __blk_mq_end_request()")


On 21/05/2019 10:59, Hou Tao wrote:
> +++ b/include/linux/blkdev.h
> @@ -202,9 +202,12 @@ struct request {
>  #ifdef CONFIG_BLK_WBT
>  	unsigned short wbt_flags;
>  #endif
> -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
> -	unsigned short throtl_size;
> -#endif
> +	/*
> +	 * rq sectors used for blk stats. It has the same value as
> +	 * blk_rq_sectors(rq), except that it is never zeroed by
> +	 * completion.
> +	 */
> +	unsigned short stats_sectors;
It can handle at most 32 MiB. Can a request's data be larger than that?
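
For reference, a quick standalone check of that limit:

#include <limits.h>
#include <stdio.h>

/* stats_sectors is an unsigned short, so at most USHRT_MAX = 65535
 * sectors of 512 bytes each -- one sector shy of 32 MiB. */
int main(void)
{
	unsigned long max_bytes = (unsigned long)USHRT_MAX * 512;

	printf("%lu bytes; 32 MiB = %lu bytes\n", max_bytes, 32UL << 20);
	/* -> 33553920 bytes; 32 MiB = 33554432 bytes */
	return 0;
}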


Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 08a6248d8536..4d1462172f0f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -44,12 +44,12 @@  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
 static int blk_mq_poll_stats_bkt(const struct request *rq)
 {
-	int ddir, bytes, bucket;
+	int ddir, sectors, bucket;
 
 	ddir = rq_data_dir(rq);
-	bytes = blk_rq_bytes(rq);
+	sectors = blk_rq_stats_sectors(rq);
 
-	bucket = ddir + 2*(ilog2(bytes) - 9);
+	bucket = ddir + 2 * ilog2(sectors);
 
 	if (bucket < 0)
 		return -1;
@@ -329,6 +329,7 @@  static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	else
 		rq->start_time_ns = 0;
 	rq->io_start_time_ns = 0;
+	rq->stats_sectors = 0;
 	rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	rq->nr_integrity_segments = 0;
@@ -678,9 +679,7 @@  void blk_mq_start_request(struct request *rq)
 
 	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
 		rq->io_start_time_ns = ktime_get_ns();
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-		rq->throtl_size = blk_rq_sectors(rq);
-#endif
+		rq->stats_sectors = blk_rq_sectors(rq);
 		rq->rq_flags |= RQF_STATS;
 		rq_qos_issue(q, rq);
 	}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 1b97a73d2fb1..88459a4ac704 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2249,7 +2249,8 @@  void blk_throtl_stat_add(struct request *rq, u64 time_ns)
 	struct request_queue *q = rq->q;
 	struct throtl_data *td = q->td;
 
-	throtl_track_latency(td, rq->throtl_size, req_op(rq), time_ns >> 10);
+	throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq),
+			     time_ns >> 10);
 }
 
 void blk_throtl_bio_endio(struct bio *bio)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1aafeb923e7b..68a0841d3554 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -202,9 +202,12 @@  struct request {
 #ifdef CONFIG_BLK_WBT
 	unsigned short wbt_flags;
 #endif
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-	unsigned short throtl_size;
-#endif
+	/*
+	 * rq sectors used for blk stats. It has the same value as
+	 * blk_rq_sectors(rq), except that it is never zeroed by
+	 * completion.
+	 */
+	unsigned short stats_sectors;
 
 	/*
 	 * Number of scatter-gather DMA addr+len pairs after
@@ -892,6 +895,7 @@  static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
  * blk_rq_err_bytes()		: bytes left till the next error boundary
  * blk_rq_sectors()		: sectors left in the entire request
  * blk_rq_cur_sectors()		: sectors left in the current segment
+ * blk_rq_stats_sectors()	: sectors of the entire request used for stats
  */
 static inline sector_t blk_rq_pos(const struct request *rq)
 {
@@ -920,6 +924,11 @@  static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
 	return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
 }
 
+static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
+{
+	return rq->stats_sectors;
+}
+
 #ifdef CONFIG_BLK_DEV_ZONED
 static inline unsigned int blk_rq_zone_no(struct request *rq)
 {