[RFC,4/5] block: add a statistic table for io latency
diff mbox series

Message ID 20200708075819.4531-5-guoqing.jiang@cloud.ionos.com
State New
Headers show
Series
  • block: add two statistic tables
Related show

Commit Message

Guoqing Jiang July 8, 2020, 7:58 a.m. UTC
Usually, we get the status of block device by cat stat file,
but we can only know the total time with that file. And we
would like to know more accurate statistic, such as each
latency range, which helps people to diagnose if there is
issue about the hardware.

Also a new config option is introduced to control if people
want to know the additional statistics or not, and we also
use the option for io sector in next patch.

Signed-off-by: Florian-Ewald Mueller <florian-ewald.mueller@cloud.ionos.com>
Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
---
 block/Kconfig             |  8 ++++++++
 block/blk-core.c          | 35 +++++++++++++++++++++++++++++++++++
 block/genhd.c             | 26 ++++++++++++++++++++++++++
 include/linux/part_stat.h |  7 +++++++
 4 files changed, 76 insertions(+)

Comments

Ming Lei July 8, 2020, 1:29 p.m. UTC | #1
On Wed, Jul 08, 2020 at 09:58:18AM +0200, Guoqing Jiang wrote:
> Usually, we get the status of block device by cat stat file,
> but we can only know the total time with that file. And we
> would like to know more accurate statistic, such as each
> latency range, which helps people to diagnose if there is
> issue about the hardware.
> 
> Also a new config option is introduced to control if people
> want to know the additional statistics or not, and we also
> use the option for io sector in next patch.
> 
> Signed-off-by: Florian-Ewald Mueller <florian-ewald.mueller@cloud.ionos.com>
> Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
> ---
>  block/Kconfig             |  8 ++++++++
>  block/blk-core.c          | 35 +++++++++++++++++++++++++++++++++++
>  block/genhd.c             | 26 ++++++++++++++++++++++++++
>  include/linux/part_stat.h |  7 +++++++
>  4 files changed, 76 insertions(+)
> 
> diff --git a/block/Kconfig b/block/Kconfig
> index 9357d7302398..dba71feaa85b 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -175,6 +175,14 @@ config BLK_DEBUG_FS
>  	Unless you are building a kernel for a tiny system, you should
>  	say Y here.
>  
> +config BLK_ADDITIONAL_DISKSTAT
> +	bool "Block layer additional diskstat"
> +	default n
> +	help
> +	Enabling this option adds io latency statistics for each block device.
> +
> +	If unsure, say N.
> +
>  config BLK_DEBUG_FS_ZONED
>         bool
>         default BLK_DEBUG_FS && BLK_DEV_ZONED
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 0e806a8c62fb..7a129c8f1b23 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1411,6 +1411,39 @@ static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
>  	}
>  }
>  
> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
> +/*
> + * Either account additional stat for request if req is not NULL or account for bio.
> + */
> +static void blk_additional_latency(struct hd_struct *part, const int sgrp,
> +				   struct request *req, unsigned long start_ns)
> +{
> +	unsigned int idx;
> +	unsigned long duration, now = ktime_get_ns();
> +
> +	if (req)
> +		duration = (now - req->start_time_ns) / NSEC_PER_MSEC;
> +	else
> +		duration = (now - start_ns) / NSEC_PER_MSEC;
> +
> +	duration /= HZ_TO_MSEC_NUM;
> +	if (likely(duration > 0)) {
> +		idx = ilog2(duration);
> +		if (idx > ADD_STAT_NUM - 1)
> +			idx = ADD_STAT_NUM - 1;
> +	} else
> +		idx = 0;
> +	part_stat_inc(part, latency_table[idx][sgrp]);
> +
> +}
> +#else
> +static void blk_additional_latency(struct hd_struct *part, const int sgrp,
> +				   struct request *req, unsigned long start_jiffies)
> +
> +{
> +}
> +#endif
> +
>  static void blk_account_io_completion(struct request *req, unsigned int bytes)
>  {
>  	if (req->part && blk_do_io_stat(req)) {
> @@ -1440,6 +1473,7 @@ void blk_account_io_done(struct request *req, u64 now)
>  		part = req->part;
>  
>  		update_io_ticks(part, jiffies, true);
> +		blk_additional_latency(part, sgrp, req, 0);
>  		part_stat_inc(part, ios[sgrp]);
>  		part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
>  		part_stat_unlock();
> @@ -1489,6 +1523,7 @@ void disk_end_io_acct(struct gendisk *disk, unsigned int op,
>  
>  	part_stat_lock();
>  	update_io_ticks(part, now, true);
> +	blk_additional_latency(part, sgrp, NULL, start_time);
>  	part_stat_add(part, nsecs[sgrp], duration);
>  	part_stat_local_dec(part, in_flight[op_is_write(op)]);
>  	part_stat_unlock();
> diff --git a/block/genhd.c b/block/genhd.c
> index 60ae4e1b4d38..a33937a74fb1 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -1420,6 +1420,29 @@ static struct device_attribute dev_attr_fail_timeout =
>  	__ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
>  #endif
>  
> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
> +static ssize_t io_latency_show(struct device *dev, struct device_attribute *attr, char *buf)
> +{
> +	struct hd_struct *p = dev_to_part(dev);
> +	size_t count = 0;
> +	int i, sgrp;
> +
> +	for (i = 0; i < ADD_STAT_NUM; i++) {
> +		count += scnprintf(buf + count, PAGE_SIZE - count, "%5d ms: ",
> +				   (1 << i) * HZ_TO_MSEC_NUM);
> +		for (sgrp = 0; sgrp < NR_STAT_GROUPS; sgrp++)
> +			count += scnprintf(buf + count, PAGE_SIZE - count, "%lu ",
> +					   part_stat_read(p, latency_table[i][sgrp]));
> +		count += scnprintf(buf + count, PAGE_SIZE - count, "\n");
> +	}
> +
> +	return count;
> +}
> +
> +static struct device_attribute dev_attr_io_latency =
> +	__ATTR(io_latency, 0444, io_latency_show, NULL);
> +#endif
> +
>  static struct attribute *disk_attrs[] = {
>  	&dev_attr_range.attr,
>  	&dev_attr_ext_range.attr,
> @@ -1438,6 +1461,9 @@ static struct attribute *disk_attrs[] = {
>  #endif
>  #ifdef CONFIG_FAIL_IO_TIMEOUT
>  	&dev_attr_fail_timeout.attr,
> +#endif
> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
> +	&dev_attr_io_latency.attr,
>  #endif
>  	NULL
>  };
> diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
> index 24125778ef3e..fe3def8c69d7 100644
> --- a/include/linux/part_stat.h
> +++ b/include/linux/part_stat.h
> @@ -9,6 +9,13 @@ struct disk_stats {
>  	unsigned long sectors[NR_STAT_GROUPS];
>  	unsigned long ios[NR_STAT_GROUPS];
>  	unsigned long merges[NR_STAT_GROUPS];
> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
> +/*
> + * We measure latency (ms) for 1, 2, ..., 1024 and >=1024.
> + */
> +#define ADD_STAT_NUM	12
> +	unsigned long latency_table[ADD_STAT_NUM][NR_STAT_GROUPS];
> +#endif
>  	unsigned long io_ticks;
>  	local_t in_flight[2];
>  };
> -- 
> 2.17.1
> 

Hi Guoqing,

I believe it isn't hard to write a ebpf based script(bcc or bpftrace) to
collect this kind of performance data, so looks not necessary to do it
in kernel.

Thanks,
Ming
Guoqing Jiang July 8, 2020, 2:02 p.m. UTC | #2
On 7/8/20 3:29 PM, Ming Lei wrote:
> On Wed, Jul 08, 2020 at 09:58:18AM +0200, Guoqing Jiang wrote:
>> Usually, we get the status of block device by cat stat file,
>> but we can only know the total time with that file. And we
>> would like to know more accurate statistic, such as each
>> latency range, which helps people to diagnose if there is
>> issue about the hardware.
>>
>> Also a new config option is introduced to control if people
>> want to know the additional statistics or not, and we also
>> use the option for io sector in next patch.
>>
>> Signed-off-by: Florian-Ewald Mueller <florian-ewald.mueller@cloud.ionos.com>
>> Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
>> ---
>>   block/Kconfig             |  8 ++++++++
>>   block/blk-core.c          | 35 +++++++++++++++++++++++++++++++++++
>>   block/genhd.c             | 26 ++++++++++++++++++++++++++
>>   include/linux/part_stat.h |  7 +++++++
>>   4 files changed, 76 insertions(+)
>>
>> diff --git a/block/Kconfig b/block/Kconfig
>> index 9357d7302398..dba71feaa85b 100644
>> --- a/block/Kconfig
>> +++ b/block/Kconfig
>> @@ -175,6 +175,14 @@ config BLK_DEBUG_FS
>>   	Unless you are building a kernel for a tiny system, you should
>>   	say Y here.
>>   
>> +config BLK_ADDITIONAL_DISKSTAT
>> +	bool "Block layer additional diskstat"
>> +	default n
>> +	help
>> +	Enabling this option adds io latency statistics for each block device.
>> +
>> +	If unsure, say N.
>> +
>>   config BLK_DEBUG_FS_ZONED
>>          bool
>>          default BLK_DEBUG_FS && BLK_DEV_ZONED
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 0e806a8c62fb..7a129c8f1b23 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -1411,6 +1411,39 @@ static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
>>   	}
>>   }
>>   
>> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
>> +/*
>> + * Either account additional stat for request if req is not NULL or account for bio.
>> + */
>> +static void blk_additional_latency(struct hd_struct *part, const int sgrp,
>> +				   struct request *req, unsigned long start_ns)
>> +{
>> +	unsigned int idx;
>> +	unsigned long duration, now = ktime_get_ns();
>> +
>> +	if (req)
>> +		duration = (now - req->start_time_ns) / NSEC_PER_MSEC;
>> +	else
>> +		duration = (now - start_ns) / NSEC_PER_MSEC;
>> +
>> +	duration /= HZ_TO_MSEC_NUM;
>> +	if (likely(duration > 0)) {
>> +		idx = ilog2(duration);
>> +		if (idx > ADD_STAT_NUM - 1)
>> +			idx = ADD_STAT_NUM - 1;
>> +	} else
>> +		idx = 0;
>> +	part_stat_inc(part, latency_table[idx][sgrp]);
>> +
>> +}
>> +#else
>> +static void blk_additional_latency(struct hd_struct *part, const int sgrp,
>> +				   struct request *req, unsigned long start_jiffies)
>> +
>> +{
>> +}
>> +#endif
>> +
>>   static void blk_account_io_completion(struct request *req, unsigned int bytes)
>>   {
>>   	if (req->part && blk_do_io_stat(req)) {
>> @@ -1440,6 +1473,7 @@ void blk_account_io_done(struct request *req, u64 now)
>>   		part = req->part;
>>   
>>   		update_io_ticks(part, jiffies, true);
>> +		blk_additional_latency(part, sgrp, req, 0);
>>   		part_stat_inc(part, ios[sgrp]);
>>   		part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
>>   		part_stat_unlock();
>> @@ -1489,6 +1523,7 @@ void disk_end_io_acct(struct gendisk *disk, unsigned int op,
>>   
>>   	part_stat_lock();
>>   	update_io_ticks(part, now, true);
>> +	blk_additional_latency(part, sgrp, NULL, start_time);
>>   	part_stat_add(part, nsecs[sgrp], duration);
>>   	part_stat_local_dec(part, in_flight[op_is_write(op)]);
>>   	part_stat_unlock();
>> diff --git a/block/genhd.c b/block/genhd.c
>> index 60ae4e1b4d38..a33937a74fb1 100644
>> --- a/block/genhd.c
>> +++ b/block/genhd.c
>> @@ -1420,6 +1420,29 @@ static struct device_attribute dev_attr_fail_timeout =
>>   	__ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
>>   #endif
>>   
>> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
>> +static ssize_t io_latency_show(struct device *dev, struct device_attribute *attr, char *buf)
>> +{
>> +	struct hd_struct *p = dev_to_part(dev);
>> +	size_t count = 0;
>> +	int i, sgrp;
>> +
>> +	for (i = 0; i < ADD_STAT_NUM; i++) {
>> +		count += scnprintf(buf + count, PAGE_SIZE - count, "%5d ms: ",
>> +				   (1 << i) * HZ_TO_MSEC_NUM);
>> +		for (sgrp = 0; sgrp < NR_STAT_GROUPS; sgrp++)
>> +			count += scnprintf(buf + count, PAGE_SIZE - count, "%lu ",
>> +					   part_stat_read(p, latency_table[i][sgrp]));
>> +		count += scnprintf(buf + count, PAGE_SIZE - count, "\n");
>> +	}
>> +
>> +	return count;
>> +}
>> +
>> +static struct device_attribute dev_attr_io_latency =
>> +	__ATTR(io_latency, 0444, io_latency_show, NULL);
>> +#endif
>> +
>>   static struct attribute *disk_attrs[] = {
>>   	&dev_attr_range.attr,
>>   	&dev_attr_ext_range.attr,
>> @@ -1438,6 +1461,9 @@ static struct attribute *disk_attrs[] = {
>>   #endif
>>   #ifdef CONFIG_FAIL_IO_TIMEOUT
>>   	&dev_attr_fail_timeout.attr,
>> +#endif
>> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
>> +	&dev_attr_io_latency.attr,
>>   #endif
>>   	NULL
>>   };
>> diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
>> index 24125778ef3e..fe3def8c69d7 100644
>> --- a/include/linux/part_stat.h
>> +++ b/include/linux/part_stat.h
>> @@ -9,6 +9,13 @@ struct disk_stats {
>>   	unsigned long sectors[NR_STAT_GROUPS];
>>   	unsigned long ios[NR_STAT_GROUPS];
>>   	unsigned long merges[NR_STAT_GROUPS];
>> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
>> +/*
>> + * We measure latency (ms) for 1, 2, ..., 1024 and >=1024.
>> + */
>> +#define ADD_STAT_NUM	12
>> +	unsigned long latency_table[ADD_STAT_NUM][NR_STAT_GROUPS];
>> +#endif
>>   	unsigned long io_ticks;
>>   	local_t in_flight[2];
>>   };
>> -- 
>> 2.17.1
>>
> Hi Guoqing,
>
> I believe it isn't hard to write a ebpf based script(bcc or bpftrace) to
> collect this kind of performance data, so looks not necessary to do it
> in kernel.

Hi Ming,

Sorry, I don't know well about bcc or bpftrace, but I assume they need to
read the latency value from somewhere inside kernel. Could you point
how can I get the latency value? Thanks in advance!

Thanks,
Guoqing
Guoqing Jiang July 8, 2020, 2:06 p.m. UTC | #3
On 7/8/20 4:02 PM, Guoqing Jiang wrote:
>> Hi Guoqing,
>>
>> I believe it isn't hard to write a ebpf based script(bcc or bpftrace) to
>> collect this kind of performance data, so looks not necessary to do it
>> in kernel.
>
> Hi Ming,
>
> Sorry, I don't know well about bcc or bpftrace, but I assume they need to
> read the latency value from somewhere inside kernel. Could you point
> how can I get the latency value? Thanks in advance!

Hmm, I suppose biolatency is suitable for track latency, will look into it.

Thanks,
Guoqing
Guoqing Jiang July 9, 2020, 6:48 p.m. UTC | #4
Hi Ming,

On 7/8/20 4:06 PM, Guoqing Jiang wrote:
> On 7/8/20 4:02 PM, Guoqing Jiang wrote:
>>> Hi Guoqing,
>>>
>>> I believe it isn't hard to write a ebpf based script(bcc or 
>>> bpftrace) to
>>> collect this kind of performance data, so looks not necessary to do it
>>> in kernel.
>>
>> Hi Ming,
>>
>> Sorry, I don't know well about bcc or bpftrace, but I assume they 
>> need to
>> read the latency value from somewhere inside kernel. Could you point
>> how can I get the latency value? Thanks in advance!
>
> Hmm, I suppose biolatency is suitable for track latency, will look 
> into it.

I think biolatency can't trace data if it is not running, also seems no 
place
inside kernel have recorded such information for ebpf to read, correct me
if my understanding is wrong.

And as cloud provider,we would like to know data when necessary instead
of collect data by keep script running because it is expensive than just 
read
node IMHO.

Thanks,
Guoqing
Ming Lei July 10, 2020, 12:53 a.m. UTC | #5
Hi Guoqing,

On Thu, Jul 09, 2020 at 08:48:08PM +0200, Guoqing Jiang wrote:
> Hi Ming,
> 
> On 7/8/20 4:06 PM, Guoqing Jiang wrote:
> > On 7/8/20 4:02 PM, Guoqing Jiang wrote:
> > > > Hi Guoqing,
> > > > 
> > > > I believe it isn't hard to write a ebpf based script(bcc or
> > > > bpftrace) to
> > > > collect this kind of performance data, so looks not necessary to do it
> > > > in kernel.
> > > 
> > > Hi Ming,
> > > 
> > > Sorry, I don't know well about bcc or bpftrace, but I assume they
> > > need to
> > > read the latency value from somewhere inside kernel. Could you point
> > > how can I get the latency value? Thanks in advance!
> > 
> > Hmm, I suppose biolatency is suitable for track latency, will look into
> > it.
> 
> I think biolatency can't trace data if it is not running,

Yeah, the ebpf prog is only injected when the trace is started.

> also seems no
> place
> inside kernel have recorded such information for ebpf to read, correct me
> if my understanding is wrong.

Just record the info by starting the bcc script in case you need that, is there
anything wrong with this usage? Always doing such stuff in kernel isn't fair for
users which don't care or need this info.

> 
> And as cloud provider,we would like to know data when necessary instead
> of collect data by keep script running because it is expensive than just
> read
> node IMHO.

It shouldn't be expensive. It might be a bit slow to inject the ebpf prog because
the code has to be verified, however once it is put inside kernel, it should have
been efficient enough. The kernel side prog only updates & stores the latency
summery data into bpf map, and the stored summery data can be read out anytime
by userspace.

Could you explain a bit why it is expensive? such as biolatency


Thanks, 
Ming
Guoqing Jiang July 10, 2020, 8:55 a.m. UTC | #6
Hi Ming,

On 7/10/20 2:53 AM, Ming Lei wrote:
> Hi Guoqing,
>
> On Thu, Jul 09, 2020 at 08:48:08PM +0200, Guoqing Jiang wrote:
>> Hi Ming,
>>
>> On 7/8/20 4:06 PM, Guoqing Jiang wrote:
>>> On 7/8/20 4:02 PM, Guoqing Jiang wrote:
>>>>> Hi Guoqing,
>>>>>
>>>>> I believe it isn't hard to write a ebpf based script(bcc or
>>>>> bpftrace) to
>>>>> collect this kind of performance data, so looks not necessary to do it
>>>>> in kernel.
>>>> Hi Ming,
>>>>
>>>> Sorry, I don't know well about bcc or bpftrace, but I assume they
>>>> need to
>>>> read the latency value from somewhere inside kernel. Could you point
>>>> how can I get the latency value? Thanks in advance!
>>> Hmm, I suppose biolatency is suitable for track latency, will look into
>>> it.
>> I think biolatency can't trace data if it is not running,
> Yeah, the ebpf prog is only injected when the trace is started.
>
>> also seems no
>> place
>> inside kernel have recorded such information for ebpf to read, correct me
>> if my understanding is wrong.
> Just record the info by starting the bcc script in case you need that, is there
> anything wrong with this usage? Always doing such stuff in kernel isn't fair for
> users which don't care or need this info.

That is why we add a Kconfig option and set it to N by default. And I 
suppose
with modern cpu, the cost with several more instructions would not be that
expensive even the option is enabled, just my $0.02.

>> And as cloud provider,we would like to know data when necessary instead
>> of collect data by keep script running because it is expensive than just
>> read
>> node IMHO.
> It shouldn't be expensive. It might be a bit slow to inject the ebpf prog because
> the code has to be verified, however once it is put inside kernel, it should have
> been efficient enough. The kernel side prog only updates & stores the latency
> summery data into bpf map, and the stored summery data can be read out anytime
> by userspace.
>
> Could you explain a bit why it is expensive? such as biolatency

I thought I am compare read a sys node + extra instructions in kernel with
launch a specific process for monitoring which need to occupy more
resources (memory) and context switch. And for biolatency, it calls the
bpf_ktime_get_ns to calculate latency for each IO which I assume the
ktime_get_ns will be triggered finally, and it is not cheap as you said.

Of course, we will keep the change in our own tree if it doesn't make sense
to others.

Thanks,
Guoqing
Ming Lei July 10, 2020, 10 a.m. UTC | #7
On Fri, Jul 10, 2020 at 10:55:24AM +0200, Guoqing Jiang wrote:
> Hi Ming,
> 
> On 7/10/20 2:53 AM, Ming Lei wrote:
> > Hi Guoqing,
> > 
> > On Thu, Jul 09, 2020 at 08:48:08PM +0200, Guoqing Jiang wrote:
> > > Hi Ming,
> > > 
> > > On 7/8/20 4:06 PM, Guoqing Jiang wrote:
> > > > On 7/8/20 4:02 PM, Guoqing Jiang wrote:
> > > > > > Hi Guoqing,
> > > > > > 
> > > > > > I believe it isn't hard to write a ebpf based script(bcc or
> > > > > > bpftrace) to
> > > > > > collect this kind of performance data, so looks not necessary to do it
> > > > > > in kernel.
> > > > > Hi Ming,
> > > > > 
> > > > > Sorry, I don't know well about bcc or bpftrace, but I assume they
> > > > > need to
> > > > > read the latency value from somewhere inside kernel. Could you point
> > > > > how can I get the latency value? Thanks in advance!
> > > > Hmm, I suppose biolatency is suitable for track latency, will look into
> > > > it.
> > > I think biolatency can't trace data if it is not running,
> > Yeah, the ebpf prog is only injected when the trace is started.
> > 
> > > also seems no
> > > place
> > > inside kernel have recorded such information for ebpf to read, correct me
> > > if my understanding is wrong.
> > Just record the info by starting the bcc script in case you need that, is there
> > anything wrong with this usage? Always doing such stuff in kernel isn't fair for
> > users which don't care or need this info.
> 
> That is why we add a Kconfig option and set it to N by default. And I
> suppose
> with modern cpu, the cost with several more instructions would not be that
> expensive even the option is enabled, just my $0.02.
> 
> > > And as cloud provider,we would like to know data when necessary instead
> > > of collect data by keep script running because it is expensive than just
> > > read
> > > node IMHO.
> > It shouldn't be expensive. It might be a bit slow to inject the ebpf prog because
> > the code has to be verified, however once it is put inside kernel, it should have
> > been efficient enough. The kernel side prog only updates & stores the latency
> > summery data into bpf map, and the stored summery data can be read out anytime
> > by userspace.
> > 
> > Could you explain a bit why it is expensive? such as biolatency
> 
> I thought I am compare read a sys node + extra instructions in kernel with
> launch a specific process for monitoring which need to occupy more
> resources (memory) and context switch. And for biolatency, it calls the
> bpf_ktime_get_ns to calculate latency for each IO which I assume the
> ktime_get_ns will be triggered finally, and it is not cheap as you said.

You can replace one read of timestamp with rq->start_time_ns too, just
like what this patch does. You can write your bcc/bfptrace script,
which is quite easy to start. Once you learn its power, maybe you will love
it.


Thanks,
Ming
Guoqing Jiang July 10, 2020, 10:29 a.m. UTC | #8
On 7/10/20 12:00 PM, Ming Lei wrote:
> On Fri, Jul 10, 2020 at 10:55:24AM +0200, Guoqing Jiang wrote:
>> Hi Ming,
>>
>> On 7/10/20 2:53 AM, Ming Lei wrote:
>>> Hi Guoqing,
>>>
>>> On Thu, Jul 09, 2020 at 08:48:08PM +0200, Guoqing Jiang wrote:
>>>> Hi Ming,
>>>>
>>>> On 7/8/20 4:06 PM, Guoqing Jiang wrote:
>>>>> On 7/8/20 4:02 PM, Guoqing Jiang wrote:
>>>>>>> Hi Guoqing,
>>>>>>>
>>>>>>> I believe it isn't hard to write a ebpf based script(bcc or
>>>>>>> bpftrace) to
>>>>>>> collect this kind of performance data, so looks not necessary to do it
>>>>>>> in kernel.
>>>>>> Hi Ming,
>>>>>>
>>>>>> Sorry, I don't know well about bcc or bpftrace, but I assume they
>>>>>> need to
>>>>>> read the latency value from somewhere inside kernel. Could you point
>>>>>> how can I get the latency value? Thanks in advance!
>>>>> Hmm, I suppose biolatency is suitable for track latency, will look into
>>>>> it.
>>>> I think biolatency can't trace data if it is not running,
>>> Yeah, the ebpf prog is only injected when the trace is started.
>>>
>>>> also seems no
>>>> place
>>>> inside kernel have recorded such information for ebpf to read, correct me
>>>> if my understanding is wrong.
>>> Just record the info by starting the bcc script in case you need that, is there
>>> anything wrong with this usage? Always doing such stuff in kernel isn't fair for
>>> users which don't care or need this info.
>> That is why we add a Kconfig option and set it to N by default. And I
>> suppose
>> with modern cpu, the cost with several more instructions would not be that
>> expensive even the option is enabled, just my $0.02.
>>
>>>> And as cloud provider,we would like to know data when necessary instead
>>>> of collect data by keep script running because it is expensive than just
>>>> read
>>>> node IMHO.
>>> It shouldn't be expensive. It might be a bit slow to inject the ebpf prog because
>>> the code has to be verified, however once it is put inside kernel, it should have
>>> been efficient enough. The kernel side prog only updates & stores the latency
>>> summery data into bpf map, and the stored summery data can be read out anytime
>>> by userspace.
>>>
>>> Could you explain a bit why it is expensive? such as biolatency
>> I thought I am compare read a sys node + extra instructions in kernel with
>> launch a specific process for monitoring which need to occupy more
>> resources (memory) and context switch. And for biolatency, it calls the
>> bpf_ktime_get_ns to calculate latency for each IO which I assume the
>> ktime_get_ns will be triggered finally, and it is not cheap as you said.
> You can replace one read of timestamp with rq->start_time_ns too, just
> like what this patch does. You can write your bcc/bfptrace script,
> which is quite easy to start. Once you learn its power, maybe you will love
> it.

Yes, I definitely need to learn more about it :-). But even with the 
change,
I still believe read a node is cheaper than a script.

And seems biolatency can't trace bio based driver per below, and with
collect data in tree we can trace all block drivers.

# load BPF program
b = BPF(text=bpf_text)
if args.queued:
     b.attach_kprobe(event="blk_account_io_start", 
fn_name="trace_req_start")
else:
     b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
     b.attach_kprobe(event="blk_mq_start_request", 
fn_name="trace_req_start")
b.attach_kprobe(event="blk_account_io_completion",
     fn_name="trace_req_completion")

Could it possible to extend it support trace both request and bio? Otherwise
we have to run another script to trace md raid.

Thanks,
Guoqing
Jens Axboe July 10, 2020, 2:04 p.m. UTC | #9
On 7/10/20 2:55 AM, Guoqing Jiang wrote:
> Hi Ming,
> 
> On 7/10/20 2:53 AM, Ming Lei wrote:
>> Hi Guoqing,
>>
>> On Thu, Jul 09, 2020 at 08:48:08PM +0200, Guoqing Jiang wrote:
>>> Hi Ming,
>>>
>>> On 7/8/20 4:06 PM, Guoqing Jiang wrote:
>>>> On 7/8/20 4:02 PM, Guoqing Jiang wrote:
>>>>>> Hi Guoqing,
>>>>>>
>>>>>> I believe it isn't hard to write a ebpf based script(bcc or
>>>>>> bpftrace) to
>>>>>> collect this kind of performance data, so looks not necessary to do it
>>>>>> in kernel.
>>>>> Hi Ming,
>>>>>
>>>>> Sorry, I don't know well about bcc or bpftrace, but I assume they
>>>>> need to
>>>>> read the latency value from somewhere inside kernel. Could you point
>>>>> how can I get the latency value? Thanks in advance!
>>>> Hmm, I suppose biolatency is suitable for track latency, will look into
>>>> it.
>>> I think biolatency can't trace data if it is not running,
>> Yeah, the ebpf prog is only injected when the trace is started.
>>
>>> also seems no
>>> place
>>> inside kernel have recorded such information for ebpf to read, correct me
>>> if my understanding is wrong.
>> Just record the info by starting the bcc script in case you need that, is there
>> anything wrong with this usage? Always doing such stuff in kernel isn't fair for
>> users which don't care or need this info.
> 
> That is why we add a Kconfig option and set it to N by default. And I 
> suppose
> with modern cpu, the cost with several more instructions would not be that
> expensive even the option is enabled, just my $0.02.

Never justify it with a Kconfig option, that doesn't help anything at
all. Distros then enable it, and all users are stuck with this overhead.
The ktime_get() is definitely extra overhead.

FWIW, I agree with Ming here in that this can easily be done from
userspace. And if that's the case, then I don't see why everybody should
carry this extra burden.
Ming Lei July 11, 2020, 1:32 a.m. UTC | #10
On Fri, Jul 10, 2020 at 12:29:28PM +0200, Guoqing Jiang wrote:
> On 7/10/20 12:00 PM, Ming Lei wrote:
> > On Fri, Jul 10, 2020 at 10:55:24AM +0200, Guoqing Jiang wrote:
> > > Hi Ming,
> > > 
> > > On 7/10/20 2:53 AM, Ming Lei wrote:
> > > > Hi Guoqing,
> > > > 
> > > > On Thu, Jul 09, 2020 at 08:48:08PM +0200, Guoqing Jiang wrote:
> > > > > Hi Ming,
> > > > > 
> > > > > On 7/8/20 4:06 PM, Guoqing Jiang wrote:
> > > > > > On 7/8/20 4:02 PM, Guoqing Jiang wrote:
> > > > > > > > Hi Guoqing,
> > > > > > > > 
> > > > > > > > I believe it isn't hard to write a ebpf based script(bcc or
> > > > > > > > bpftrace) to
> > > > > > > > collect this kind of performance data, so looks not necessary to do it
> > > > > > > > in kernel.
> > > > > > > Hi Ming,
> > > > > > > 
> > > > > > > Sorry, I don't know well about bcc or bpftrace, but I assume they
> > > > > > > need to
> > > > > > > read the latency value from somewhere inside kernel. Could you point
> > > > > > > how can I get the latency value? Thanks in advance!
> > > > > > Hmm, I suppose biolatency is suitable for track latency, will look into
> > > > > > it.
> > > > > I think biolatency can't trace data if it is not running,
> > > > Yeah, the ebpf prog is only injected when the trace is started.
> > > > 
> > > > > also seems no
> > > > > place
> > > > > inside kernel have recorded such information for ebpf to read, correct me
> > > > > if my understanding is wrong.
> > > > Just record the info by starting the bcc script in case you need that, is there
> > > > anything wrong with this usage? Always doing such stuff in kernel isn't fair for
> > > > users which don't care or need this info.
> > > That is why we add a Kconfig option and set it to N by default. And I
> > > suppose
> > > with modern cpu, the cost with several more instructions would not be that
> > > expensive even the option is enabled, just my $0.02.
> > > 
> > > > > And as cloud provider,we would like to know data when necessary instead
> > > > > of collect data by keep script running because it is expensive than just
> > > > > read
> > > > > node IMHO.
> > > > It shouldn't be expensive. It might be a bit slow to inject the ebpf prog because
> > > > the code has to be verified, however once it is put inside kernel, it should have
> > > > been efficient enough. The kernel side prog only updates & stores the latency
> > > > summery data into bpf map, and the stored summery data can be read out anytime
> > > > by userspace.
> > > > 
> > > > Could you explain a bit why it is expensive? such as biolatency
> > > I thought I am compare read a sys node + extra instructions in kernel with
> > > launch a specific process for monitoring which need to occupy more
> > > resources (memory) and context switch. And for biolatency, it calls the
> > > bpf_ktime_get_ns to calculate latency for each IO which I assume the
> > > ktime_get_ns will be triggered finally, and it is not cheap as you said.
> > You can replace one read of timestamp with rq->start_time_ns too, just
> > like what this patch does. You can write your bcc/bfptrace script,
> > which is quite easy to start. Once you learn its power, maybe you will love
> > it.
> 
> Yes, I definitely need to learn more about it :-). But even with the change,
> I still believe read a node is cheaper than a script.
> 
> And seems biolatency can't trace bio based driver per below, and with
> collect data in tree we can trace all block drivers.
> 
> # load BPF program
> b = BPF(text=bpf_text)
> if args.queued:
>     b.attach_kprobe(event="blk_account_io_start", fn_name="trace_req_start")
> else:
>     b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
>     b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
> b.attach_kprobe(event="blk_account_io_completion",
>     fn_name="trace_req_completion")
> 
> Could it possible to extend it support trace both request and bio? Otherwise
> we have to run another script to trace md raid.

It is pretty easy to extend support bio, just add kprobe on submit_bio
and bio_endio().

thanks,
Ming
Guoqing Jiang July 12, 2020, 8:39 p.m. UTC | #11
On 7/11/20 3:32 AM, Ming Lei wrote:
> On Fri, Jul 10, 2020 at 12:29:28PM +0200, Guoqing Jiang wrote:
>> On 7/10/20 12:00 PM, Ming Lei wrote:
>>> On Fri, Jul 10, 2020 at 10:55:24AM +0200, Guoqing Jiang wrote:
>>>> Hi Ming,
>>>>
>>>> On 7/10/20 2:53 AM, Ming Lei wrote:
>>>>> Hi Guoqing,
>>>>>
>>>>> On Thu, Jul 09, 2020 at 08:48:08PM +0200, Guoqing Jiang wrote:
>>>>>> Hi Ming,
>>>>>>
>>>>>> On 7/8/20 4:06 PM, Guoqing Jiang wrote:
>>>>>>> On 7/8/20 4:02 PM, Guoqing Jiang wrote:
>>>>>>>>> Hi Guoqing,
>>>>>>>>>
>>>>>>>>> I believe it isn't hard to write a ebpf based script(bcc or
>>>>>>>>> bpftrace) to
>>>>>>>>> collect this kind of performance data, so looks not necessary to do it
>>>>>>>>> in kernel.
>>>>>>>> Hi Ming,
>>>>>>>>
>>>>>>>> Sorry, I don't know well about bcc or bpftrace, but I assume they
>>>>>>>> need to
>>>>>>>> read the latency value from somewhere inside kernel. Could you point
>>>>>>>> how can I get the latency value? Thanks in advance!
>>>>>>> Hmm, I suppose biolatency is suitable for track latency, will look into
>>>>>>> it.
>>>>>> I think biolatency can't trace data if it is not running,
>>>>> Yeah, the ebpf prog is only injected when the trace is started.
>>>>>
>>>>>> also seems no
>>>>>> place
>>>>>> inside kernel have recorded such information for ebpf to read, correct me
>>>>>> if my understanding is wrong.
>>>>> Just record the info by starting the bcc script in case you need that, is there
>>>>> anything wrong with this usage? Always doing such stuff in kernel isn't fair for
>>>>> users which don't care or need this info.
>>>> That is why we add a Kconfig option and set it to N by default. And I
>>>> suppose
>>>> with modern cpu, the cost with several more instructions would not be that
>>>> expensive even the option is enabled, just my $0.02.
>>>>
>>>>>> And as cloud provider,we would like to know data when necessary instead
>>>>>> of collect data by keep script running because it is expensive than just
>>>>>> read
>>>>>> node IMHO.
>>>>> It shouldn't be expensive. It might be a bit slow to inject the ebpf prog because
>>>>> the code has to be verified, however once it is put inside kernel, it should have
>>>>> been efficient enough. The kernel side prog only updates & stores the latency
>>>>> summery data into bpf map, and the stored summery data can be read out anytime
>>>>> by userspace.
>>>>>
>>>>> Could you explain a bit why it is expensive? such as biolatency
>>>> I thought I am compare read a sys node + extra instructions in kernel with
>>>> launch a specific process for monitoring which need to occupy more
>>>> resources (memory) and context switch. And for biolatency, it calls the
>>>> bpf_ktime_get_ns to calculate latency for each IO which I assume the
>>>> ktime_get_ns will be triggered finally, and it is not cheap as you said.
>>> You can replace one read of timestamp with rq->start_time_ns too, just
>>> like what this patch does. You can write your bcc/bfptrace script,
>>> which is quite easy to start. Once you learn its power, maybe you will love
>>> it.
>> Yes, I definitely need to learn more about it :-). But even with the change,
>> I still believe read a node is cheaper than a script.
>>
>> And seems biolatency can't trace bio based driver per below, and with
>> collect data in tree we can trace all block drivers.
>>
>> # load BPF program
>> b = BPF(text=bpf_text)
>> if args.queued:
>>      b.attach_kprobe(event="blk_account_io_start", fn_name="trace_req_start")
>> else:
>>      b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
>>      b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
>> b.attach_kprobe(event="blk_account_io_completion",
>>      fn_name="trace_req_completion")
>>
>> Could it possible to extend it support trace both request and bio? Otherwise
>> we have to run another script to trace md raid.
> It is pretty easy to extend support bio, just add kprobe on submit_bio
> and bio_endio().
>

The thing is that we don't like the cost of ebpf based solution. And FWIW,
two years ago, we had compared about ebpf solution and record those
data in kernel.

A. in-kernel monitor: 1~5% performance drop
B. ebpf monitor: 10~15% performance drop

Note, we even copied each bio in approach A, which means the performance
could be more better since we don't clone bio now.

And I think the major concern is about the additional Kconfig option, since
Jens doesn't like it, so I guess no need to make the change it in upstream
kernel.

Thanks,
Guoqing
Jens Axboe July 12, 2020, 8:44 p.m. UTC | #12
On 7/12/20 2:39 PM, Guoqing Jiang wrote:
> On 7/11/20 3:32 AM, Ming Lei wrote:
>> On Fri, Jul 10, 2020 at 12:29:28PM +0200, Guoqing Jiang wrote:
>>> On 7/10/20 12:00 PM, Ming Lei wrote:
>>>> On Fri, Jul 10, 2020 at 10:55:24AM +0200, Guoqing Jiang wrote:
>>>>> Hi Ming,
>>>>>
>>>>> On 7/10/20 2:53 AM, Ming Lei wrote:
>>>>>> Hi Guoqing,
>>>>>>
>>>>>> On Thu, Jul 09, 2020 at 08:48:08PM +0200, Guoqing Jiang wrote:
>>>>>>> Hi Ming,
>>>>>>>
>>>>>>> On 7/8/20 4:06 PM, Guoqing Jiang wrote:
>>>>>>>> On 7/8/20 4:02 PM, Guoqing Jiang wrote:
>>>>>>>>>> Hi Guoqing,
>>>>>>>>>>
>>>>>>>>>> I believe it isn't hard to write a ebpf based script(bcc or
>>>>>>>>>> bpftrace) to
>>>>>>>>>> collect this kind of performance data, so looks not necessary to do it
>>>>>>>>>> in kernel.
>>>>>>>>> Hi Ming,
>>>>>>>>>
>>>>>>>>> Sorry, I don't know well about bcc or bpftrace, but I assume they
>>>>>>>>> need to
>>>>>>>>> read the latency value from somewhere inside kernel. Could you point
>>>>>>>>> how can I get the latency value? Thanks in advance!
>>>>>>>> Hmm, I suppose biolatency is suitable for track latency, will look into
>>>>>>>> it.
>>>>>>> I think biolatency can't trace data if it is not running,
>>>>>> Yeah, the ebpf prog is only injected when the trace is started.
>>>>>>
>>>>>>> also seems no
>>>>>>> place
>>>>>>> inside kernel have recorded such information for ebpf to read, correct me
>>>>>>> if my understanding is wrong.
>>>>>> Just record the info by starting the bcc script in case you need that, is there
>>>>>> anything wrong with this usage? Always doing such stuff in kernel isn't fair for
>>>>>> users which don't care or need this info.
>>>>> That is why we add a Kconfig option and set it to N by default. And I
>>>>> suppose
>>>>> with modern cpu, the cost with several more instructions would not be that
>>>>> expensive even the option is enabled, just my $0.02.
>>>>>
>>>>>>> And as cloud provider,we would like to know data when necessary instead
>>>>>>> of collect data by keep script running because it is expensive than just
>>>>>>> read
>>>>>>> node IMHO.
>>>>>> It shouldn't be expensive. It might be a bit slow to inject the ebpf prog because
>>>>>> the code has to be verified, however once it is put inside kernel, it should have
>>>>>> been efficient enough. The kernel side prog only updates & stores the latency
>>>>>> summery data into bpf map, and the stored summery data can be read out anytime
>>>>>> by userspace.
>>>>>>
>>>>>> Could you explain a bit why it is expensive? such as biolatency
>>>>> I thought I am compare read a sys node + extra instructions in kernel with
>>>>> launch a specific process for monitoring which need to occupy more
>>>>> resources (memory) and context switch. And for biolatency, it calls the
>>>>> bpf_ktime_get_ns to calculate latency for each IO which I assume the
>>>>> ktime_get_ns will be triggered finally, and it is not cheap as you said.
>>>> You can replace one read of timestamp with rq->start_time_ns too, just
>>>> like what this patch does. You can write your bcc/bfptrace script,
>>>> which is quite easy to start. Once you learn its power, maybe you will love
>>>> it.
>>> Yes, I definitely need to learn more about it :-). But even with the change,
>>> I still believe read a node is cheaper than a script.
>>>
>>> And seems biolatency can't trace bio based driver per below, and with
>>> collect data in tree we can trace all block drivers.
>>>
>>> # load BPF program
>>> b = BPF(text=bpf_text)
>>> if args.queued:
>>>      b.attach_kprobe(event="blk_account_io_start", fn_name="trace_req_start")
>>> else:
>>>      b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
>>>      b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
>>> b.attach_kprobe(event="blk_account_io_completion",
>>>      fn_name="trace_req_completion")
>>>
>>> Could it possible to extend it support trace both request and bio? Otherwise
>>> we have to run another script to trace md raid.
>> It is pretty easy to extend support bio, just add kprobe on submit_bio
>> and bio_endio().
>>
> 
> The thing is that we don't like the cost of ebpf based solution. And FWIW,
> two years ago, we had compared about ebpf solution and record those
> data in kernel.
> 
> A. in-kernel monitor: 1~5% performance drop
> B. ebpf monitor: 10~15% performance drop
> 
> Note, we even copied each bio in approach A, which means the performance
> could be more better since we don't clone bio now.
> 
> And I think the major concern is about the additional Kconfig option, since
> Jens doesn't like it, so I guess no need to make the change it in upstream
> kernel.

No, my main concern is trying to justify it with having a Kconfig option
to turn it off. Fact is, distros will likely turn it on, and then
everybody gets that overhead. There's a temptation to hide features like
that behind a Kconfig option with this exact justification, and it just
doesn't work like that in practice.

I might be amenable to the change if:

1) it doesn't add any (real) overhead to the normal fast path
2) probably needs to be opt-in. not via kconfig, but as a sysfs
   attribute. Like we have 'iostats' today.

Something I've mentioned in the past has been a blktrace flight recorder
mode, but that's largely been superseded by having bpf available. But
the point is that something like blktrace generally doesn't add ANY
overhead at all if blktrace isn't being run. Your solution ends up
collecting stats all the time, regardless of whether or not anyone is
actually looking at the data. That's a bad idea, and I'd be much happier
with a solution that only adds overhead when actually used, not all the
time.
Guoqing Jiang July 12, 2020, 9:04 p.m. UTC | #13
On 7/12/20 10:44 PM, Jens Axboe wrote:
> On 7/12/20 2:39 PM, Guoqing Jiang wrote:
>> On 7/11/20 3:32 AM, Ming Lei wrote:
>>> On Fri, Jul 10, 2020 at 12:29:28PM +0200, Guoqing Jiang wrote:
>>>> On 7/10/20 12:00 PM, Ming Lei wrote:
>>>>> On Fri, Jul 10, 2020 at 10:55:24AM +0200, Guoqing Jiang wrote:
>>>>>> Hi Ming,
>>>>>>
>>>>>> On 7/10/20 2:53 AM, Ming Lei wrote:
>>>>>>> Hi Guoqing,
>>>>>>>
>>>>>>> On Thu, Jul 09, 2020 at 08:48:08PM +0200, Guoqing Jiang wrote:
>>>>>>>> Hi Ming,
>>>>>>>>
>>>>>>>> On 7/8/20 4:06 PM, Guoqing Jiang wrote:
>>>>>>>>> On 7/8/20 4:02 PM, Guoqing Jiang wrote:
>>>>>>>>>>> Hi Guoqing,
>>>>>>>>>>>
>>>>>>>>>>> I believe it isn't hard to write a ebpf based script(bcc or
>>>>>>>>>>> bpftrace) to
>>>>>>>>>>> collect this kind of performance data, so looks not necessary to do it
>>>>>>>>>>> in kernel.
>>>>>>>>>> Hi Ming,
>>>>>>>>>>
>>>>>>>>>> Sorry, I don't know well about bcc or bpftrace, but I assume they
>>>>>>>>>> need to
>>>>>>>>>> read the latency value from somewhere inside kernel. Could you point
>>>>>>>>>> how can I get the latency value? Thanks in advance!
>>>>>>>>> Hmm, I suppose biolatency is suitable for track latency, will look into
>>>>>>>>> it.
>>>>>>>> I think biolatency can't trace data if it is not running,
>>>>>>> Yeah, the ebpf prog is only injected when the trace is started.
>>>>>>>
>>>>>>>> also seems no
>>>>>>>> place
>>>>>>>> inside kernel have recorded such information for ebpf to read, correct me
>>>>>>>> if my understanding is wrong.
>>>>>>> Just record the info by starting the bcc script in case you need that, is there
>>>>>>> anything wrong with this usage? Always doing such stuff in kernel isn't fair for
>>>>>>> users which don't care or need this info.
>>>>>> That is why we add a Kconfig option and set it to N by default. And I
>>>>>> suppose
>>>>>> with modern cpu, the cost with several more instructions would not be that
>>>>>> expensive even the option is enabled, just my $0.02.
>>>>>>
>>>>>>>> And as cloud provider,we would like to know data when necessary instead
>>>>>>>> of collect data by keep script running because it is expensive than just
>>>>>>>> read
>>>>>>>> node IMHO.
>>>>>>> It shouldn't be expensive. It might be a bit slow to inject the ebpf prog because
>>>>>>> the code has to be verified, however once it is put inside kernel, it should have
>>>>>>> been efficient enough. The kernel side prog only updates & stores the latency
>>>>>>> summery data into bpf map, and the stored summery data can be read out anytime
>>>>>>> by userspace.
>>>>>>>
>>>>>>> Could you explain a bit why it is expensive? such as biolatency
>>>>>> I thought I am compare read a sys node + extra instructions in kernel with
>>>>>> launch a specific process for monitoring which need to occupy more
>>>>>> resources (memory) and context switch. And for biolatency, it calls the
>>>>>> bpf_ktime_get_ns to calculate latency for each IO which I assume the
>>>>>> ktime_get_ns will be triggered finally, and it is not cheap as you said.
>>>>> You can replace one read of timestamp with rq->start_time_ns too, just
>>>>> like what this patch does. You can write your bcc/bfptrace script,
>>>>> which is quite easy to start. Once you learn its power, maybe you will love
>>>>> it.
>>>> Yes, I definitely need to learn more about it :-). But even with the change,
>>>> I still believe read a node is cheaper than a script.
>>>>
>>>> And seems biolatency can't trace bio based driver per below, and with
>>>> collect data in tree we can trace all block drivers.
>>>>
>>>> # load BPF program
>>>> b = BPF(text=bpf_text)
>>>> if args.queued:
>>>>       b.attach_kprobe(event="blk_account_io_start", fn_name="trace_req_start")
>>>> else:
>>>>       b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
>>>>       b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
>>>> b.attach_kprobe(event="blk_account_io_completion",
>>>>       fn_name="trace_req_completion")
>>>>
>>>> Could it possible to extend it support trace both request and bio? Otherwise
>>>> we have to run another script to trace md raid.
>>> It is pretty easy to extend support bio, just add kprobe on submit_bio
>>> and bio_endio().
>>>
>> The thing is that we don't like the cost of ebpf based solution. And FWIW,
>> two years ago, we had compared about ebpf solution and record those
>> data in kernel.
>>
>> A. in-kernel monitor: 1~5% performance drop
>> B. ebpf monitor: 10~15% performance drop
>>
>> Note, we even copied each bio in approach A, which means the performance
>> could be more better since we don't clone bio now.
>>
>> And I think the major concern is about the additional Kconfig option, since
>> Jens doesn't like it, so I guess no need to make the change it in upstream
>> kernel.
> No, my main concern is trying to justify it with having a Kconfig option
> to turn it off. Fact is, distros will likely turn it on, and then
> everybody gets that overhead. There's a temptation to hide features like
> that behind a Kconfig option with this exact justification, and it just
> doesn't work like that in practice.

Good to know that, :-).

> I might be amenable to the change if:
>
> 1) it doesn't add any (real) overhead to the normal fast path

It is possible to remove ktime_get_ns from current change, but I admit
we can't avoid small overhead (several instructions per each IO I think),
and it is not fair to people who doesn't care about those data.

> 2) probably needs to be opt-in. not via kconfig, but as a sysfs
>     attribute. Like we have 'iostats' today.

Thanks for the suggestion, will investigate a better way.

> Something I've mentioned in the past has been a blktrace flight recorder
> mode, but that's largely been superseded by having bpf available. But
> the point is that something like blktrace generally doesn't add ANY
> overhead at all if blktrace isn't being run. Your solution ends up
> collecting stats all the time, regardless of whether or not anyone is
> actually looking at the data. That's a bad idea, and I'd be much happier
> with a solution that only adds overhead when actually used, not all the
> time.

Appreciate for your input which make me know your concern better.

Thanks,
Guoqing

Patch
diff mbox series

diff --git a/block/Kconfig b/block/Kconfig
index 9357d7302398..dba71feaa85b 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -175,6 +175,14 @@  config BLK_DEBUG_FS
 	Unless you are building a kernel for a tiny system, you should
 	say Y here.
 
+config BLK_ADDITIONAL_DISKSTAT
+	bool "Block layer additional diskstat"
+	default n
+	help
+	Enabling this option adds io latency statistics for each block device.
+
+	If unsure, say N.
+
 config BLK_DEBUG_FS_ZONED
        bool
        default BLK_DEBUG_FS && BLK_DEV_ZONED
diff --git a/block/blk-core.c b/block/blk-core.c
index 0e806a8c62fb..7a129c8f1b23 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1411,6 +1411,39 @@  static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
 	}
 }
 
+#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
+/*
+ * Either account additional stat for request if req is not NULL or account for bio.
+ */
+static void blk_additional_latency(struct hd_struct *part, const int sgrp,
+				   struct request *req, unsigned long start_ns)
+{
+	unsigned int idx;
+	unsigned long duration, now = ktime_get_ns();
+
+	if (req)
+		duration = (now - req->start_time_ns) / NSEC_PER_MSEC;
+	else
+		duration = (now - start_ns) / NSEC_PER_MSEC;
+
+	duration /= HZ_TO_MSEC_NUM;
+	if (likely(duration > 0)) {
+		idx = ilog2(duration);
+		if (idx > ADD_STAT_NUM - 1)
+			idx = ADD_STAT_NUM - 1;
+	} else
+		idx = 0;
+	part_stat_inc(part, latency_table[idx][sgrp]);
+
+}
+#else
+static void blk_additional_latency(struct hd_struct *part, const int sgrp,
+				   struct request *req, unsigned long start_jiffies)
+
+{
+}
+#endif
+
 static void blk_account_io_completion(struct request *req, unsigned int bytes)
 {
 	if (req->part && blk_do_io_stat(req)) {
@@ -1440,6 +1473,7 @@  void blk_account_io_done(struct request *req, u64 now)
 		part = req->part;
 
 		update_io_ticks(part, jiffies, true);
+		blk_additional_latency(part, sgrp, req, 0);
 		part_stat_inc(part, ios[sgrp]);
 		part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
 		part_stat_unlock();
@@ -1489,6 +1523,7 @@  void disk_end_io_acct(struct gendisk *disk, unsigned int op,
 
 	part_stat_lock();
 	update_io_ticks(part, now, true);
+	blk_additional_latency(part, sgrp, NULL, start_time);
 	part_stat_add(part, nsecs[sgrp], duration);
 	part_stat_local_dec(part, in_flight[op_is_write(op)]);
 	part_stat_unlock();
diff --git a/block/genhd.c b/block/genhd.c
index 60ae4e1b4d38..a33937a74fb1 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1420,6 +1420,29 @@  static struct device_attribute dev_attr_fail_timeout =
 	__ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
 #endif
 
+#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
+static ssize_t io_latency_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	size_t count = 0;
+	int i, sgrp;
+
+	for (i = 0; i < ADD_STAT_NUM; i++) {
+		count += scnprintf(buf + count, PAGE_SIZE - count, "%5d ms: ",
+				   (1 << i) * HZ_TO_MSEC_NUM);
+		for (sgrp = 0; sgrp < NR_STAT_GROUPS; sgrp++)
+			count += scnprintf(buf + count, PAGE_SIZE - count, "%lu ",
+					   part_stat_read(p, latency_table[i][sgrp]));
+		count += scnprintf(buf + count, PAGE_SIZE - count, "\n");
+	}
+
+	return count;
+}
+
+static struct device_attribute dev_attr_io_latency =
+	__ATTR(io_latency, 0444, io_latency_show, NULL);
+#endif
+
 static struct attribute *disk_attrs[] = {
 	&dev_attr_range.attr,
 	&dev_attr_ext_range.attr,
@@ -1438,6 +1461,9 @@  static struct attribute *disk_attrs[] = {
 #endif
 #ifdef CONFIG_FAIL_IO_TIMEOUT
 	&dev_attr_fail_timeout.attr,
+#endif
+#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
+	&dev_attr_io_latency.attr,
 #endif
 	NULL
 };
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index 24125778ef3e..fe3def8c69d7 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -9,6 +9,13 @@  struct disk_stats {
 	unsigned long sectors[NR_STAT_GROUPS];
 	unsigned long ios[NR_STAT_GROUPS];
 	unsigned long merges[NR_STAT_GROUPS];
+#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
+/*
+ * We measure latency (ms) for 1, 2, ..., 1024 and >=1024.
+ */
+#define ADD_STAT_NUM	12
+	unsigned long latency_table[ADD_STAT_NUM][NR_STAT_GROUPS];
+#endif
 	unsigned long io_ticks;
 	local_t in_flight[2];
 };