diff mbox series

[RFC,V2,1/4] block: add a statistic table for io latency

Message ID 20200713211321.21123-2-guoqing.jiang@cloud.ionos.com (mailing list archive)
State New, archived
Headers show
Series block: add two statistic tables | expand

Commit Message

Guoqing Jiang July 13, 2020, 9:13 p.m. UTC
Usually, we get the status of block device by cat stat file,
but we can only know the total time with that file. And we
would like to know more accurate statistic, such as each
latency range, which helps people to diagnose if there is
issue about the hardware.

Also a new config option is introduced to control if people
want to know the additional statistics or not, and we use
the option for io sector in next patch.

Signed-off-by: Florian-Ewald Mueller <florian-ewald.mueller@cloud.ionos.com>
Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
---
 block/Kconfig             |  8 ++++++++
 block/blk-core.c          | 34 ++++++++++++++++++++++++++++++++++
 block/genhd.c             | 26 ++++++++++++++++++++++++++
 include/linux/part_stat.h |  7 +++++++
 4 files changed, 75 insertions(+)

Comments

Johannes Thumshirn July 14, 2020, 7:43 a.m. UTC | #1
On 13/07/2020 23:14, Guoqing Jiang wrote:
> diff --git a/block/blk-core.c b/block/blk-core.c
> index d9d632639bd1..036eb04782de 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1411,6 +1411,34 @@ static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
>  	}
>  }
>  
> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
> +/*
> + * Either account additional stat for request if req is not NULL or account for bio.
> + */
> +static void blk_additional_latency(struct hd_struct *part, const int sgrp,
> +				   struct request *req, unsigned long start_jiffies)
> +{
> +	unsigned int idx;
> +	unsigned long duration, now = READ_ONCE(jiffies);
> +
> +	if (req)
> +		duration = jiffies_to_nsecs(now) - req->start_time_ns;
> +	else
> +		duration = jiffies_to_nsecs(now - start_jiffies);
> +
> +	duration /= NSEC_PER_MSEC;
> +	duration /= HZ_TO_MSEC_NUM;
> +	if (likely(duration > 0)) {
> +		idx = ilog2(duration);
> +		if (idx > ADD_STAT_NUM - 1)
> +			idx = ADD_STAT_NUM - 1;
> +	} else
> +		idx = 0;
> +	part_stat_inc(part, latency_table[idx][sgrp]);
> +
> +}
> +#endif
> +
>  static void blk_account_io_completion(struct request *req, unsigned int bytes)
>  {
>  	if (req->part && blk_do_io_stat(req)) {
> @@ -1440,6 +1468,9 @@ void blk_account_io_done(struct request *req, u64 now)
>  		part = req->part;
>  
>  		update_io_ticks(part, jiffies, true);
> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
> +		blk_additional_latency(part, sgrp, req, 0);
> +#endif

Not commenting on the general idea here but only the code. The above introduces quite a
lot of ifdefs in code. Please at least move the #ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
into the function body of blk_additional_latency() so you don't need any ifdefs at the
call sites.
Guoqing Jiang July 14, 2020, 8:25 a.m. UTC | #2
On 7/14/20 9:43 AM, Johannes Thumshirn wrote:
> On 13/07/2020 23:14, Guoqing Jiang wrote:
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index d9d632639bd1..036eb04782de 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -1411,6 +1411,34 @@ static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
>>   	}
>>   }
>>   
>> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
>> +/*
>> + * Either account additional stat for request if req is not NULL or account for bio.
>> + */
>> +static void blk_additional_latency(struct hd_struct *part, const int sgrp,
>> +				   struct request *req, unsigned long start_jiffies)
>> +{
>> +	unsigned int idx;
>> +	unsigned long duration, now = READ_ONCE(jiffies);
>> +
>> +	if (req)
>> +		duration = jiffies_to_nsecs(now) - req->start_time_ns;
>> +	else
>> +		duration = jiffies_to_nsecs(now - start_jiffies);
>> +
>> +	duration /= NSEC_PER_MSEC;
>> +	duration /= HZ_TO_MSEC_NUM;
>> +	if (likely(duration > 0)) {
>> +		idx = ilog2(duration);
>> +		if (idx > ADD_STAT_NUM - 1)
>> +			idx = ADD_STAT_NUM - 1;
>> +	} else
>> +		idx = 0;
>> +	part_stat_inc(part, latency_table[idx][sgrp]);
>> +
>> +}
>> +#endif
>> +
>>   static void blk_account_io_completion(struct request *req, unsigned int bytes)
>>   {
>>   	if (req->part && blk_do_io_stat(req)) {
>> @@ -1440,6 +1468,9 @@ void blk_account_io_done(struct request *req, u64 now)
>>   		part = req->part;
>>   
>>   		update_io_ticks(part, jiffies, true);
>> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
>> +		blk_additional_latency(part, sgrp, req, 0);
>> +#endif
> Not commenting on the general idea here but only the code. The above introduces quite a
> lot of ifdefs in code. Please at least move the #ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
> into the function body of blk_additional_latency() so you don't need any ifdefs at the
> call sites.

Sure, will do it, thanks for your suggestion.

Thanks,
Guoqing
Danil Kipnis Aug. 11, 2020, 7:22 a.m. UTC | #3
On Mon, Jul 13, 2020 at 11:13 PM Guoqing Jiang
<guoqing.jiang@cloud.ionos.com> wrote:
>
> Usually, we get the status of block device by cat stat file,
> but we can only know the total time with that file. And we
> would like to know more accurate statistic, such as each
> latency range, which helps people to diagnose if there is
> issue about the hardware.
>
> Also a new config option is introduced to control if people
> want to know the additional statistics or not, and we use
> the option for io sector in next patch.
>
> Signed-off-by: Florian-Ewald Mueller <florian-ewald.mueller@cloud.ionos.com>
> Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
> ---
>  block/Kconfig             |  8 ++++++++
>  block/blk-core.c          | 34 ++++++++++++++++++++++++++++++++++
>  block/genhd.c             | 26 ++++++++++++++++++++++++++
>  include/linux/part_stat.h |  7 +++++++
>  4 files changed, 75 insertions(+)
>
> diff --git a/block/Kconfig b/block/Kconfig
> index bbad5e8bbffe..360f63111e2d 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -176,6 +176,14 @@ config BLK_DEBUG_FS
>         Unless you are building a kernel for a tiny system, you should
>         say Y here.
>
> +config BLK_ADDITIONAL_DISKSTAT
> +       bool "Block layer additional diskstat"
> +       default n
> +       help
> +       Enabling this option adds io latency statistics for each block device.
> +
> +       If unsure, say N.
> +
>  config BLK_DEBUG_FS_ZONED
>         bool
>         default BLK_DEBUG_FS && BLK_DEV_ZONED
> diff --git a/block/blk-core.c b/block/blk-core.c
> index d9d632639bd1..036eb04782de 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1411,6 +1411,34 @@ static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
>         }
>  }
>
> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
> +/*
> + * Either account additional stat for request if req is not NULL or account for bio.
> + */
> +static void blk_additional_latency(struct hd_struct *part, const int sgrp,
> +                                  struct request *req, unsigned long start_jiffies)
> +{
> +       unsigned int idx;
> +       unsigned long duration, now = READ_ONCE(jiffies);
> +
> +       if (req)
> +               duration = jiffies_to_nsecs(now) - req->start_time_ns;
> +       else
> +               duration = jiffies_to_nsecs(now - start_jiffies);
> +
> +       duration /= NSEC_PER_MSEC;
> +       duration /= HZ_TO_MSEC_NUM;
> +       if (likely(duration > 0)) {
> +               idx = ilog2(duration);
> +               if (idx > ADD_STAT_NUM - 1)
> +                       idx = ADD_STAT_NUM - 1;
> +       } else
> +               idx = 0;
> +       part_stat_inc(part, latency_table[idx][sgrp]);
> +
> +}
> +#endif
> +
>  static void blk_account_io_completion(struct request *req, unsigned int bytes)
>  {
>         if (req->part && blk_do_io_stat(req)) {
> @@ -1440,6 +1468,9 @@ void blk_account_io_done(struct request *req, u64 now)
>                 part = req->part;
>
>                 update_io_ticks(part, jiffies, true);
> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
> +               blk_additional_latency(part, sgrp, req, 0);
> +#endif
>                 part_stat_inc(part, ios[sgrp]);
>                 part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
>                 part_stat_unlock();
> @@ -1488,6 +1519,9 @@ void disk_end_io_acct(struct gendisk *disk, unsigned int op,
>
>         part_stat_lock();
>         update_io_ticks(part, now, true);
> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
> +       blk_additional_latency(part, sgrp, NULL, start_time);
> +#endif
>         part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
>         part_stat_local_dec(part, in_flight[op_is_write(op)]);
>         part_stat_unlock();
> diff --git a/block/genhd.c b/block/genhd.c
> index c42a49f2f537..f5d2f110fb34 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -1420,6 +1420,29 @@ static struct device_attribute dev_attr_fail_timeout =
>         __ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
>  #endif
>
> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
> +static ssize_t io_latency_show(struct device *dev, struct device_attribute *attr, char *buf)
> +{
> +       struct hd_struct *p = dev_to_part(dev);
> +       size_t count = 0;
> +       int i, sgrp;
> +
> +       for (i = 0; i < ADD_STAT_NUM; i++) {
> +               count += scnprintf(buf + count, PAGE_SIZE - count, "%5d ms: ",
> +                                  (1 << i) * HZ_TO_MSEC_NUM);
> +               for (sgrp = 0; sgrp < NR_STAT_GROUPS; sgrp++)
> +                       count += scnprintf(buf + count, PAGE_SIZE - count, "%lu ",
> +                                          part_stat_read(p, latency_table[i][sgrp]));
> +               count += scnprintf(buf + count, PAGE_SIZE - count, "\n");
> +       }
> +
> +       return count;
> +}
> +
> +static struct device_attribute dev_attr_io_latency =
> +       __ATTR(io_latency, 0444, io_latency_show, NULL);
> +#endif
> +
>  static struct attribute *disk_attrs[] = {
>         &dev_attr_range.attr,
>         &dev_attr_ext_range.attr,
> @@ -1438,6 +1461,9 @@ static struct attribute *disk_attrs[] = {
>  #endif
>  #ifdef CONFIG_FAIL_IO_TIMEOUT
>         &dev_attr_fail_timeout.attr,
> +#endif
> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
> +       &dev_attr_io_latency.attr,
>  #endif
>         NULL
>  };
> diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
> index 24125778ef3e..fe3def8c69d7 100644
> --- a/include/linux/part_stat.h
> +++ b/include/linux/part_stat.h
> @@ -9,6 +9,13 @@ struct disk_stats {
>         unsigned long sectors[NR_STAT_GROUPS];
>         unsigned long ios[NR_STAT_GROUPS];
>         unsigned long merges[NR_STAT_GROUPS];
> +#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
> +/*
> + * We measure latency (ms) for 1, 2, ..., 1024 and >=1024.
> + */
> +#define ADD_STAT_NUM   12
> +       unsigned long latency_table[ADD_STAT_NUM][NR_STAT_GROUPS];
> +#endif
>         unsigned long io_ticks;
>         local_t in_flight[2];
>  };
> --
> 2.17.1
>
Hi,

This feature is very useful to analyse io performance in a cluster of
Linux machines. For example an io is generated in the block layer of a
VM, enters the block layer of the host, passes through a couple of
block devices, is then sent over a network to a number of remote
machines, enters the block layer there, crosses yet another couple of
block devices and finally gets submitted to the disks. Then
confirmations travel all the way back to the block layer of the host
and at some point bio_endio is called in the vm.

- Hey folks, a lower accumulated io latency would be nice.
- NP, where do we start?
- ...
- Ping?
diff mbox series

Patch

diff --git a/block/Kconfig b/block/Kconfig
index bbad5e8bbffe..360f63111e2d 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -176,6 +176,14 @@  config BLK_DEBUG_FS
 	Unless you are building a kernel for a tiny system, you should
 	say Y here.
 
+config BLK_ADDITIONAL_DISKSTAT
+	bool "Block layer additional diskstat"
+	default n
+	help
+	Enabling this option adds io latency statistics for each block device.
+
+	If unsure, say N.
+
 config BLK_DEBUG_FS_ZONED
        bool
        default BLK_DEBUG_FS && BLK_DEV_ZONED
diff --git a/block/blk-core.c b/block/blk-core.c
index d9d632639bd1..036eb04782de 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1411,6 +1411,34 @@  static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
 	}
 }
 
+#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
+/*
+ * Either account additional stat for request if req is not NULL or account for bio.
+ */
+static void blk_additional_latency(struct hd_struct *part, const int sgrp,
+				   struct request *req, unsigned long start_jiffies)
+{
+	unsigned int idx;
+	unsigned long duration, now = READ_ONCE(jiffies);
+
+	if (req)
+		duration = jiffies_to_nsecs(now) - req->start_time_ns;
+	else
+		duration = jiffies_to_nsecs(now - start_jiffies);
+
+	duration /= NSEC_PER_MSEC;
+	duration /= HZ_TO_MSEC_NUM;
+	if (likely(duration > 0)) {
+		idx = ilog2(duration);
+		if (idx > ADD_STAT_NUM - 1)
+			idx = ADD_STAT_NUM - 1;
+	} else
+		idx = 0;
+	part_stat_inc(part, latency_table[idx][sgrp]);
+
+}
+#endif
+
 static void blk_account_io_completion(struct request *req, unsigned int bytes)
 {
 	if (req->part && blk_do_io_stat(req)) {
@@ -1440,6 +1468,9 @@  void blk_account_io_done(struct request *req, u64 now)
 		part = req->part;
 
 		update_io_ticks(part, jiffies, true);
+#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
+		blk_additional_latency(part, sgrp, req, 0);
+#endif
 		part_stat_inc(part, ios[sgrp]);
 		part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
 		part_stat_unlock();
@@ -1488,6 +1519,9 @@  void disk_end_io_acct(struct gendisk *disk, unsigned int op,
 
 	part_stat_lock();
 	update_io_ticks(part, now, true);
+#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
+	blk_additional_latency(part, sgrp, NULL, start_time);
+#endif
 	part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
 	part_stat_local_dec(part, in_flight[op_is_write(op)]);
 	part_stat_unlock();
diff --git a/block/genhd.c b/block/genhd.c
index c42a49f2f537..f5d2f110fb34 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1420,6 +1420,29 @@  static struct device_attribute dev_attr_fail_timeout =
 	__ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
 #endif
 
+#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
+static ssize_t io_latency_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	size_t count = 0;
+	int i, sgrp;
+
+	for (i = 0; i < ADD_STAT_NUM; i++) {
+		count += scnprintf(buf + count, PAGE_SIZE - count, "%5d ms: ",
+				   (1 << i) * HZ_TO_MSEC_NUM);
+		for (sgrp = 0; sgrp < NR_STAT_GROUPS; sgrp++)
+			count += scnprintf(buf + count, PAGE_SIZE - count, "%lu ",
+					   part_stat_read(p, latency_table[i][sgrp]));
+		count += scnprintf(buf + count, PAGE_SIZE - count, "\n");
+	}
+
+	return count;
+}
+
+static struct device_attribute dev_attr_io_latency =
+	__ATTR(io_latency, 0444, io_latency_show, NULL);
+#endif
+
 static struct attribute *disk_attrs[] = {
 	&dev_attr_range.attr,
 	&dev_attr_ext_range.attr,
@@ -1438,6 +1461,9 @@  static struct attribute *disk_attrs[] = {
 #endif
 #ifdef CONFIG_FAIL_IO_TIMEOUT
 	&dev_attr_fail_timeout.attr,
+#endif
+#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
+	&dev_attr_io_latency.attr,
 #endif
 	NULL
 };
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index 24125778ef3e..fe3def8c69d7 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -9,6 +9,13 @@  struct disk_stats {
 	unsigned long sectors[NR_STAT_GROUPS];
 	unsigned long ios[NR_STAT_GROUPS];
 	unsigned long merges[NR_STAT_GROUPS];
+#ifdef CONFIG_BLK_ADDITIONAL_DISKSTAT
+/*
+ * We measure latency (ms) for 1, 2, ..., 1024 and >=1024.
+ */
+#define ADD_STAT_NUM	12
+	unsigned long latency_table[ADD_STAT_NUM][NR_STAT_GROUPS];
+#endif
 	unsigned long io_ticks;
 	local_t in_flight[2];
 };