Message ID | 20200910022026.632617-1-pizhenwei@bytedance.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | block: add io_error stat for block device | expand |
Hi, Jens How do you think about error stat of a block device? On 9/10/20 10:20 AM, zhenwei pi wrote: > Currently if hitting block req error, block layer only prints error > log with a rate limitation. Then agent has to parse kernel log to > record what happens. > > In this patch, add read/write/discard/flush stat counter to record > io errors. > > Signed-off-by: zhenwei pi <pizhenwei@bytedance.com> > --- > block/blk-core.c | 14 +++++++++++--- > block/genhd.c | 19 +++++++++++++++++++ > include/linux/part_stat.h | 1 + > 3 files changed, 31 insertions(+), 3 deletions(-) > > diff --git a/block/blk-core.c b/block/blk-core.c > index 10c08ac50697..8f1424835700 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -1558,9 +1558,17 @@ bool blk_update_request(struct request *req, blk_status_t error, > req->q->integrity.profile->complete_fn(req, nr_bytes); > #endif > > - if (unlikely(error && !blk_rq_is_passthrough(req) && > - !(req->rq_flags & RQF_QUIET))) > - print_req_error(req, error, __func__); > + if (unlikely(error && !blk_rq_is_passthrough(req))) { > + if (op_is_flush(req_op(req))) > + part_stat_inc(&req->rq_disk->part0, > + io_errors[STAT_FLUSH]); > + else > + part_stat_inc(&req->rq_disk->part0, > + io_errors[op_stat_group(req_op(req))]); > + > + if (!(req->rq_flags & RQF_QUIET)) > + print_req_error(req, error, __func__); > + } > > blk_account_io_completion(req, nr_bytes); > > diff --git a/block/genhd.c b/block/genhd.c > index 99c64641c314..852035095485 100644 > --- a/block/genhd.c > +++ b/block/genhd.c > @@ -104,6 +104,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) > stat->sectors[group] += ptr->sectors[group]; > stat->ios[group] += ptr->ios[group]; > stat->merges[group] += ptr->merges[group]; > + stat->io_errors[group] += ptr->io_errors[group]; > } > > stat->io_ticks += ptr->io_ticks; > @@ -1374,6 +1375,22 @@ static ssize_t disk_discard_alignment_show(struct device *dev, > return sprintf(buf, "%d\n", 
queue_discard_alignment(disk->queue)); > } > > +static ssize_t io_error_show(struct device *dev, > + struct device_attribute *attr, char *buf) > +{ > + struct hd_struct *p = dev_to_part(dev); > + struct disk_stats stat; > + > + part_stat_read_all(p, &stat); > + > + return sprintf(buf, > + "%8lu %8lu %8lu %8lu\n", > + stat.io_errors[STAT_READ], > + stat.io_errors[STAT_WRITE], > + stat.io_errors[STAT_DISCARD], > + stat.io_errors[STAT_FLUSH]); > +} > + > static DEVICE_ATTR(range, 0444, disk_range_show, NULL); > static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); > static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); > @@ -1386,6 +1403,7 @@ static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); > static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); > static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); > static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); > +static DEVICE_ATTR(io_error, 0444, io_error_show, NULL); > > #ifdef CONFIG_FAIL_MAKE_REQUEST > ssize_t part_fail_show(struct device *dev, > @@ -1437,6 +1455,7 @@ static struct attribute *disk_attrs[] = { > #ifdef CONFIG_FAIL_IO_TIMEOUT > &dev_attr_fail_timeout.attr, > #endif > + &dev_attr_io_error.attr, > NULL > }; > > diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h > index 24125778ef3e..4fe3836d2308 100644 > --- a/include/linux/part_stat.h > +++ b/include/linux/part_stat.h > @@ -9,6 +9,7 @@ struct disk_stats { > unsigned long sectors[NR_STAT_GROUPS]; > unsigned long ios[NR_STAT_GROUPS]; > unsigned long merges[NR_STAT_GROUPS]; > + unsigned long io_errors[NR_STAT_GROUPS]; > unsigned long io_ticks; > local_t in_flight[2]; > }; >
Hi, Jens What do you think about this, adding io error stat for block devices is reasonable? On 9/10/20 10:20 AM, zhenwei pi wrote: > Currently if hitting block req error, block layer only prints error > log with a rate limitation. Then agent has to parse kernel log to > record what happens. > > In this patch, add read/write/discard/flush stat counter to record > io errors. > > Signed-off-by: zhenwei pi <pizhenwei@bytedance.com> > --- > block/blk-core.c | 14 +++++++++++--- > block/genhd.c | 19 +++++++++++++++++++ > include/linux/part_stat.h | 1 + > 3 files changed, 31 insertions(+), 3 deletions(-) > > diff --git a/block/blk-core.c b/block/blk-core.c > index 10c08ac50697..8f1424835700 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -1558,9 +1558,17 @@ bool blk_update_request(struct request *req, blk_status_t error, > req->q->integrity.profile->complete_fn(req, nr_bytes); > #endif > > - if (unlikely(error && !blk_rq_is_passthrough(req) && > - !(req->rq_flags & RQF_QUIET))) > - print_req_error(req, error, __func__); > + if (unlikely(error && !blk_rq_is_passthrough(req))) { > + if (op_is_flush(req_op(req))) > + part_stat_inc(&req->rq_disk->part0, > + io_errors[STAT_FLUSH]); > + else > + part_stat_inc(&req->rq_disk->part0, > + io_errors[op_stat_group(req_op(req))]); > + > + if (!(req->rq_flags & RQF_QUIET)) > + print_req_error(req, error, __func__); > + } > > blk_account_io_completion(req, nr_bytes); > > diff --git a/block/genhd.c b/block/genhd.c > index 99c64641c314..852035095485 100644 > --- a/block/genhd.c > +++ b/block/genhd.c > @@ -104,6 +104,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) > stat->sectors[group] += ptr->sectors[group]; > stat->ios[group] += ptr->ios[group]; > stat->merges[group] += ptr->merges[group]; > + stat->io_errors[group] += ptr->io_errors[group]; > } > > stat->io_ticks += ptr->io_ticks; > @@ -1374,6 +1375,22 @@ static ssize_t disk_discard_alignment_show(struct device *dev, > return 
sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); > } > > +static ssize_t io_error_show(struct device *dev, > + struct device_attribute *attr, char *buf) > +{ > + struct hd_struct *p = dev_to_part(dev); > + struct disk_stats stat; > + > + part_stat_read_all(p, &stat); > + > + return sprintf(buf, > + "%8lu %8lu %8lu %8lu\n", > + stat.io_errors[STAT_READ], > + stat.io_errors[STAT_WRITE], > + stat.io_errors[STAT_DISCARD], > + stat.io_errors[STAT_FLUSH]); > +} > + > static DEVICE_ATTR(range, 0444, disk_range_show, NULL); > static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); > static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); > @@ -1386,6 +1403,7 @@ static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); > static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); > static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); > static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); > +static DEVICE_ATTR(io_error, 0444, io_error_show, NULL); > > #ifdef CONFIG_FAIL_MAKE_REQUEST > ssize_t part_fail_show(struct device *dev, > @@ -1437,6 +1455,7 @@ static struct attribute *disk_attrs[] = { > #ifdef CONFIG_FAIL_IO_TIMEOUT > &dev_attr_fail_timeout.attr, > #endif > + &dev_attr_io_error.attr, > NULL > }; > > diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h > index 24125778ef3e..4fe3836d2308 100644 > --- a/include/linux/part_stat.h > +++ b/include/linux/part_stat.h > @@ -9,6 +9,7 @@ struct disk_stats { > unsigned long sectors[NR_STAT_GROUPS]; > unsigned long ios[NR_STAT_GROUPS]; > unsigned long merges[NR_STAT_GROUPS]; > + unsigned long io_errors[NR_STAT_GROUPS]; > unsigned long io_ticks; > local_t in_flight[2]; > }; >
diff --git a/block/blk-core.c b/block/blk-core.c index 10c08ac50697..8f1424835700 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1558,9 +1558,17 @@ bool blk_update_request(struct request *req, blk_status_t error, req->q->integrity.profile->complete_fn(req, nr_bytes); #endif - if (unlikely(error && !blk_rq_is_passthrough(req) && - !(req->rq_flags & RQF_QUIET))) - print_req_error(req, error, __func__); + if (unlikely(error && !blk_rq_is_passthrough(req))) { + if (op_is_flush(req_op(req))) + part_stat_inc(&req->rq_disk->part0, + io_errors[STAT_FLUSH]); + else + part_stat_inc(&req->rq_disk->part0, + io_errors[op_stat_group(req_op(req))]); + + if (!(req->rq_flags & RQF_QUIET)) + print_req_error(req, error, __func__); + } blk_account_io_completion(req, nr_bytes); diff --git a/block/genhd.c b/block/genhd.c index 99c64641c314..852035095485 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -104,6 +104,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) stat->sectors[group] += ptr->sectors[group]; stat->ios[group] += ptr->ios[group]; stat->merges[group] += ptr->merges[group]; + stat->io_errors[group] += ptr->io_errors[group]; } stat->io_ticks += ptr->io_ticks; @@ -1374,6 +1375,22 @@ static ssize_t disk_discard_alignment_show(struct device *dev, return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); } +static ssize_t io_error_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct disk_stats stat; + + part_stat_read_all(p, &stat); + + return sprintf(buf, + "%8lu %8lu %8lu %8lu\n", + stat.io_errors[STAT_READ], + stat.io_errors[STAT_WRITE], + stat.io_errors[STAT_DISCARD], + stat.io_errors[STAT_FLUSH]); +} + static DEVICE_ATTR(range, 0444, disk_range_show, NULL); static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); @@ -1386,6 +1403,7 @@ static DEVICE_ATTR(capability, 0444, 
disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); +static DEVICE_ATTR(io_error, 0444, io_error_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, @@ -1437,6 +1455,7 @@ static struct attribute *disk_attrs[] = { #ifdef CONFIG_FAIL_IO_TIMEOUT &dev_attr_fail_timeout.attr, #endif + &dev_attr_io_error.attr, NULL }; diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index 24125778ef3e..4fe3836d2308 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -9,6 +9,7 @@ struct disk_stats { unsigned long sectors[NR_STAT_GROUPS]; unsigned long ios[NR_STAT_GROUPS]; unsigned long merges[NR_STAT_GROUPS]; + unsigned long io_errors[NR_STAT_GROUPS]; unsigned long io_ticks; local_t in_flight[2]; };
Currently, when a block request fails, the block layer only prints a rate-limited error message to the kernel log, so a monitoring agent has to parse the kernel log to record what happened. This patch adds read/write/discard/flush stat counters that record I/O errors. Signed-off-by: zhenwei pi <pizhenwei@bytedance.com> --- block/blk-core.c | 14 +++++++++++--- block/genhd.c | 19 +++++++++++++++++++ include/linux/part_stat.h | 1 + 3 files changed, 31 insertions(+), 3 deletions(-)