Message ID | 20230427085612.1346752-4-linan666@huaweicloud.com (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
Series | md: bugfix of writing raid sysfs | expand |
Hi, 在 2023/04/27 16:56, linan666@huaweicloud.com 写道: > From: Li Nan <linan122@huawei.com> > > max_corr_read_errors should not be a negative number. Change it to > unsigned int where it is used. > > Fixes: 1e50915fe0bb ("raid: improve MD/raid10 handling of correctable read errors.") > Signed-off-by: Li Nan <linan122@huawei.com> > --- > drivers/md/md.c | 2 +- > drivers/md/raid10.c | 4 ++-- > 2 files changed, 3 insertions(+), 3 deletions(-) > > diff --git a/drivers/md/md.c b/drivers/md/md.c > index faffbd042925..a365ed122960 100644 > --- a/drivers/md/md.c > +++ b/drivers/md/md.c > @@ -4484,7 +4484,7 @@ __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_stor > > static ssize_t > max_corrected_read_errors_show(struct mddev *mddev, char *page) { > - return sprintf(page, "%d\n", > + return sprintf(page, "%u\n", > atomic_read(&mddev->max_corr_read_errors)); > } > > diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c > index 4fcfcb350d2b..28cdb2ae0e91 100644 > --- a/drivers/md/raid10.c > +++ b/drivers/md/raid10.c > @@ -2727,7 +2727,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 > int sect = 0; /* Offset from r10_bio->sector */ > int sectors = r10_bio->sectors; > struct md_rdev *rdev; > - int max_read_errors = atomic_read(&mddev->max_corr_read_errors); > + unsigned int max_read_errors = atomic_read(&mddev->max_corr_read_errors); This line exceeds 80 columns. 
> int d = r10_bio->devs[r10_bio->read_slot].devnum; > > /* still own a reference to this rdev, so it cannot > @@ -2743,7 +2743,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 > check_decay_read_errors(mddev, rdev); > atomic_inc(&rdev->read_errors); > if (atomic_read(&rdev->read_errors) > max_read_errors) { > - pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n", > + pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %u:max %u]\n", > mdname(mddev), rdev->bdev, > atomic_read(&rdev->read_errors), max_read_errors); > pr_notice("md/raid10:%s: %pg: Failing raid device\n", > This is not critical, but I think it's better to do some cleanup to fold the above code into check_decay_read_errors(), and rename it to check_read_errors(): diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7135cfaf75db..633aabfea452 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2636,18 +2636,17 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) * since the last recorded read error. * */ -static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) +static bool check_read_errors(struct mddev *mddev, struct md_rdev *rdev) { - long cur_time_mon; + time64_t cur_time_mon = ktime_get_seconds(); unsigned long hours_since_last; - unsigned int read_errors = atomic_read(&rdev->read_errors); - - cur_time_mon = ktime_get_seconds(); + unsigned int read_errors; + unsigned int max_read_errors; if (rdev->last_read_error == 0) { /* first time we've seen a read error */ rdev->last_read_error = cur_time_mon; - return; + goto increase; } hours_since_last = (long)(cur_time_mon - @@ -2660,10 +2659,26 @@ static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) * just set read errors to 0. We do this to avoid * overflowing the shift of read_errors by hours_since_last. 
*/ + read_errors = atomic_read(&rdev->read_errors); if (hours_since_last >= 8 * sizeof(read_errors)) atomic_set(&rdev->read_errors, 0); else atomic_set(&rdev->read_errors, read_errors >> hours_since_last); + +increase: + max_read_errors = atomic_read(&mddev->max_corr_read_errors); + read_errors = atomic_inc_return(&rdev->read_errors); + if (read_errors > max_read_errors) { + pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %u:max %u]\n", + mdname(mddev), rdev->bdev, + read_errors, max_read_errors); + pr_notice("md/raid10:%s: %pg: Failing raid device\n", + mdname(mddev), rdev->bdev); + md_error(mddev, rdev); + return true; + } + + return false; } static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, @@ -2703,7 +2718,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 int sect = 0; /* Offset from r10_bio->sector */ int sectors = r10_bio->sectors; struct md_rdev *rdev; - int max_read_errors = atomic_read(&mddev->max_corr_read_errors); int d = r10_bio->devs[r10_bio->read_slot].devnum; /* still own a reference to this rdev, so it cannot @@ -2716,15 +2730,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 more fix_read_error() attempts */ return; - check_decay_read_errors(mddev, rdev); - atomic_inc(&rdev->read_errors); - if (atomic_read(&rdev->read_errors) > max_read_errors) { - pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n", - mdname(mddev), rdev->bdev, - atomic_read(&rdev->read_errors), max_read_errors); - pr_notice("md/raid10:%s: %pg: Failing raid device\n", - mdname(mddev), rdev->bdev); - md_error(mddev, rdev); + if (check_read_errors(mddev, rdev)) { r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; return; } Thanks, Kuai
在 2023/4/27 18:51, Yu Kuai 写道: > This is not critical, but I think it's better do some cleanup to fold > above code into check_decay_read_errors(), and rename it to > check_read_error(): > It seems like a good idea. v2 will include this cleanup.
diff --git a/drivers/md/md.c b/drivers/md/md.c index faffbd042925..a365ed122960 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4484,7 +4484,7 @@ __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_stor static ssize_t max_corrected_read_errors_show(struct mddev *mddev, char *page) { - return sprintf(page, "%d\n", + return sprintf(page, "%u\n", atomic_read(&mddev->max_corr_read_errors)); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 4fcfcb350d2b..28cdb2ae0e91 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2727,7 +2727,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 int sect = 0; /* Offset from r10_bio->sector */ int sectors = r10_bio->sectors; struct md_rdev *rdev; - int max_read_errors = atomic_read(&mddev->max_corr_read_errors); + unsigned int max_read_errors = atomic_read(&mddev->max_corr_read_errors); int d = r10_bio->devs[r10_bio->read_slot].devnum; /* still own a reference to this rdev, so it cannot @@ -2743,7 +2743,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 check_decay_read_errors(mddev, rdev); atomic_inc(&rdev->read_errors); if (atomic_read(&rdev->read_errors) > max_read_errors) { - pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n", + pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %u:max %u]\n", mdname(mddev), rdev->bdev, atomic_read(&rdev->read_errors), max_read_errors); pr_notice("md/raid10:%s: %pg: Failing raid device\n",