Message ID | Pine.LNX.4.64.0907100947320.2450@hs20-bc2-1.build.redhat.com (mailing list archive) |
---|---|
State | Superseded, archived |
Delegated to: | Alasdair Kergon |
Headers | show |
On 07/10/09 09:49, Mikulas Patocka wrote: > patch to hold back bios --- untested (and not quite optimal because it > scans the "failures" list in fixed intervals), but it shows the approach. > > --- > drivers/md/dm-raid1.c | 10 +++++++--- > drivers/md/dm-region-hash.c | 6 +----- > include/linux/dm-region-hash.h | 3 +-- > 3 files changed, 9 insertions(+), 10 deletions(-) > > Index: linux-2.6.31-rc2-devel/drivers/md/dm-raid1.c > =================================================================== > --- linux-2.6.31-rc2-devel.orig/drivers/md/dm-raid1.c 2009-07-10 14:48:19.000000000 +0200 > +++ linux-2.6.31-rc2-devel/drivers/md/dm-raid1.c 2009-07-10 15:46:11.000000000 +0200 > @@ -535,11 +535,11 @@ static void write_callback(unsigned long > else > uptodate = 1; > > - if (unlikely(!uptodate)) { > + if (unlikely(!uptodate) || !errors_handled(ms)) { > DMERR("All replicated volumes dead, failing I/O"); > /* None of the writes succeeded, fail the I/O. */ > ret = -EIO; > - } else if (errors_handled(ms)) { > + } else { > /* > * Need to raise event. 
Since raising > * events can block, we need to do it in > @@ -687,8 +687,12 @@ static void do_failures(struct mirror_se > if (!ms->log_failure) { > while ((bio = bio_list_pop(failures))) { > ms->in_sync = 0; > - dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); > + dm_rh_mark_nosync(ms->rh, bio); > + spin_lock_irq(&ms->lock); > + bio_list_add(&ms->failures, bio); > + spin_unlock_irq(&ms->lock); > } > + delayed_wake(ms); > return; > } > > Index: linux-2.6.31-rc2-devel/drivers/md/dm-region-hash.c > =================================================================== > --- linux-2.6.31-rc2-devel.orig/drivers/md/dm-region-hash.c 2009-07-10 14:54:07.000000000 +0200 > +++ linux-2.6.31-rc2-devel/drivers/md/dm-region-hash.c 2009-07-10 15:45:07.000000000 +0200 > @@ -392,8 +392,6 @@ static void complete_resync_work(struct > /* dm_rh_mark_nosync > * @ms > * @bio > - * @done > - * @error > * > * The bio was written on some mirror(s) but failed on other mirror(s). > * We can successfully endio the bio but should avoid the region being > @@ -401,8 +399,7 @@ static void complete_resync_work(struct > * > * This function is _not_ safe in interrupt context! > */ > -void dm_rh_mark_nosync(struct dm_region_hash *rh, > - struct bio *bio, unsigned done, int error) > +void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) > { > unsigned long flags; > struct dm_dirty_log *log = rh->log; > @@ -439,7 +436,6 @@ void dm_rh_mark_nosync(struct dm_region_ > BUG_ON(!list_empty(&reg->list)); > spin_unlock_irqrestore(&rh->region_lock, flags); > > - bio_endio(bio, error); How are bios queued in ms->failures processed later? It seems that the bios stay in ms->failures forever, and the upper layer cannot receive "success" for those bios. Don't we need a mechanism to block/unblock write bios to fix this issue? 
> if (recovering) > complete_resync_work(reg, 0); > } > Index: linux-2.6.31-rc2-devel/include/linux/dm-region-hash.h > =================================================================== > --- linux-2.6.31-rc2-devel.orig/include/linux/dm-region-hash.h 2009-07-10 15:45:26.000000000 +0200 > +++ linux-2.6.31-rc2-devel/include/linux/dm-region-hash.h 2009-07-10 15:45:36.000000000 +0200 > @@ -78,8 +78,7 @@ void dm_rh_dec(struct dm_region_hash *rh > /* Delay bios on regions. */ > void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio); > > -void dm_rh_mark_nosync(struct dm_region_hash *rh, > - struct bio *bio, unsigned done, int error); > +void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio); > > /* > * Region recovery control. -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
Takahiro Yasui [tyasui@redhat.com] wrote: > On 07/10/09 09:49, Mikulas Patocka wrote: > > patch to hold back bios --- untested (and not quite optimal because it > > scans the "failures" list in fixed intervals), but it shows the approach. > > > > --- > > drivers/md/dm-raid1.c | 10 +++++++--- > > drivers/md/dm-region-hash.c | 6 +----- > > include/linux/dm-region-hash.h | 3 +-- > > 3 files changed, 9 insertions(+), 10 deletions(-) > > > > Index: linux-2.6.31-rc2-devel/drivers/md/dm-raid1.c > > =================================================================== > > --- linux-2.6.31-rc2-devel.orig/drivers/md/dm-raid1.c 2009-07-10 14:48:19.000000000 +0200 > > +++ linux-2.6.31-rc2-devel/drivers/md/dm-raid1.c 2009-07-10 15:46:11.000000000 +0200 > > @@ -535,11 +535,11 @@ static void write_callback(unsigned long > > else > > uptodate = 1; > > > > - if (unlikely(!uptodate)) { > > + if (unlikely(!uptodate) || !errors_handled(ms)) { > > DMERR("All replicated volumes dead, failing I/O"); > > /* None of the writes succeeded, fail the I/O. */ > > ret = -EIO; > > - } else if (errors_handled(ms)) { > > + } else { > > /* > > * Need to raise event. 
Since raising > > * events can block, we need to do it in > > @@ -687,8 +687,12 @@ static void do_failures(struct mirror_se > > if (!ms->log_failure) { > > while ((bio = bio_list_pop(failures))) { > > ms->in_sync = 0; > > - dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); > > + dm_rh_mark_nosync(ms->rh, bio); > > + spin_lock_irq(&ms->lock); > > + bio_list_add(&ms->failures, bio); > > + spin_unlock_irq(&ms->lock); > > } > > + delayed_wake(ms); > > return; > > } > > > > Index: linux-2.6.31-rc2-devel/drivers/md/dm-region-hash.c > > =================================================================== > > --- linux-2.6.31-rc2-devel.orig/drivers/md/dm-region-hash.c 2009-07-10 14:54:07.000000000 +0200 > > +++ linux-2.6.31-rc2-devel/drivers/md/dm-region-hash.c 2009-07-10 15:45:07.000000000 +0200 > > @@ -392,8 +392,6 @@ static void complete_resync_work(struct > > /* dm_rh_mark_nosync > > * @ms > > * @bio > > - * @done > > - * @error > > * > > * The bio was written on some mirror(s) but failed on other mirror(s). > > * We can successfully endio the bio but should avoid the region being > > @@ -401,8 +399,7 @@ static void complete_resync_work(struct > > * > > * This function is _not_ safe in interrupt context! > > */ > > -void dm_rh_mark_nosync(struct dm_region_hash *rh, > > - struct bio *bio, unsigned done, int error) > > +void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) > > { > > unsigned long flags; > > struct dm_dirty_log *log = rh->log; > > @@ -439,7 +436,6 @@ void dm_rh_mark_nosync(struct dm_region_ > > BUG_ON(!list_empty(&reg->list)); > > spin_unlock_irqrestore(&rh->region_lock, flags); > > > > - bio_endio(bio, error); > > How do bios queued in ms->failures are processed later? It seems that > the bios stay in ms->failures forever, and the upper layer can not > receive "success" for those bios. Don't we need a mechanism to block/unblock > write bios to fix this issue? 
A user-level program, dmeventd, may reconfigure the mirror, which results in the queued I/Os being submitted after the reconfiguration. -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
malahal@us.ibm.com wrote: > Takahiro Yasui [tyasui@redhat.com] wrote: >> On 07/10/09 09:49, Mikulas Patocka wrote: >>> patch to hold back bios --- untested (and not quite optimal because it >>> scans the "failures" list in fixed intervals), but it shows the approach. >>> >>> --- >>> drivers/md/dm-raid1.c | 10 +++++++--- >>> drivers/md/dm-region-hash.c | 6 +----- >>> include/linux/dm-region-hash.h | 3 +-- >>> 3 files changed, 9 insertions(+), 10 deletions(-) >>> >>> Index: linux-2.6.31-rc2-devel/drivers/md/dm-raid1.c >>> =================================================================== >>> --- linux-2.6.31-rc2-devel.orig/drivers/md/dm-raid1.c 2009-07-10 14:48:19.000000000 +0200 >>> +++ linux-2.6.31-rc2-devel/drivers/md/dm-raid1.c 2009-07-10 15:46:11.000000000 +0200 >>> @@ -535,11 +535,11 @@ static void write_callback(unsigned long >>> else >>> uptodate = 1; >>> >>> - if (unlikely(!uptodate)) { >>> + if (unlikely(!uptodate) || !errors_handled(ms)) { >>> DMERR("All replicated volumes dead, failing I/O"); >>> /* None of the writes succeeded, fail the I/O. */ >>> ret = -EIO; >>> - } else if (errors_handled(ms)) { >>> + } else { >>> /* >>> * Need to raise event. 
Since raising >>> * events can block, we need to do it in >>> @@ -687,8 +687,12 @@ static void do_failures(struct mirror_se >>> if (!ms->log_failure) { >>> while ((bio = bio_list_pop(failures))) { >>> ms->in_sync = 0; >>> - dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); >>> + dm_rh_mark_nosync(ms->rh, bio); >>> + spin_lock_irq(&ms->lock); >>> + bio_list_add(&ms->failures, bio); >>> + spin_unlock_irq(&ms->lock); >>> } >>> + delayed_wake(ms); >>> return; >>> } >>> >>> Index: linux-2.6.31-rc2-devel/drivers/md/dm-region-hash.c >>> =================================================================== >>> --- linux-2.6.31-rc2-devel.orig/drivers/md/dm-region-hash.c 2009-07-10 14:54:07.000000000 +0200 >>> +++ linux-2.6.31-rc2-devel/drivers/md/dm-region-hash.c 2009-07-10 15:45:07.000000000 +0200 >>> @@ -392,8 +392,6 @@ static void complete_resync_work(struct >>> /* dm_rh_mark_nosync >>> * @ms >>> * @bio >>> - * @done >>> - * @error >>> * >>> * The bio was written on some mirror(s) but failed on other mirror(s). >>> * We can successfully endio the bio but should avoid the region being >>> @@ -401,8 +399,7 @@ static void complete_resync_work(struct >>> * >>> * This function is _not_ safe in interrupt context! >>> */ >>> -void dm_rh_mark_nosync(struct dm_region_hash *rh, >>> - struct bio *bio, unsigned done, int error) >>> +void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) >>> { >>> unsigned long flags; >>> struct dm_dirty_log *log = rh->log; >>> @@ -439,7 +436,6 @@ void dm_rh_mark_nosync(struct dm_region_ >>> BUG_ON(!list_empty(&reg->list)); >>> spin_unlock_irqrestore(&rh->region_lock, flags); >>> >>> - bio_endio(bio, error); >> How do bios queued in ms->failures are processed later? It seems that >> the bios stay in ms->failures forever, and the upper layer can not >> receive "success" for those bios. Don't we need a mechanism to block/unblock >> write bios to fix this issue? 
> > A user level program, dmeventd, may reconfigure the mirror resulting in > submitting the queued I/O's after the reconfiguration. In that case, don't we need code to release bios when the mirror is destroyed? In addition, not only bios for failed write I/Os but also bios sent to out-of-sync regions need to be blocked. Bios sent to out-of-sync regions are processed by generic_make_request() and would return "success" to the upper layer without dm-raid1 noticing it. Thanks, Taka -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
On Fri, 10 Jul 2009, Takahiro Yasui wrote: > On 07/10/09 09:49, Mikulas Patocka wrote: > > patch to hold back bios --- untested (and not quite optimal because it > > scans the "failures" list in fixed intervals), but it shows the approach. > > > > --- > > drivers/md/dm-raid1.c | 10 +++++++--- > > drivers/md/dm-region-hash.c | 6 +----- > > include/linux/dm-region-hash.h | 3 +-- > > 3 files changed, 9 insertions(+), 10 deletions(-) > > > > Index: linux-2.6.31-rc2-devel/drivers/md/dm-raid1.c > > =================================================================== > > --- linux-2.6.31-rc2-devel.orig/drivers/md/dm-raid1.c 2009-07-10 14:48:19.000000000 +0200 > > +++ linux-2.6.31-rc2-devel/drivers/md/dm-raid1.c 2009-07-10 15:46:11.000000000 +0200 > > @@ -535,11 +535,11 @@ static void write_callback(unsigned long > > else > > uptodate = 1; > > > > - if (unlikely(!uptodate)) { > > + if (unlikely(!uptodate) || !errors_handled(ms)) { > > DMERR("All replicated volumes dead, failing I/O"); > > /* None of the writes succeeded, fail the I/O. */ > > ret = -EIO; > > - } else if (errors_handled(ms)) { > > + } else { > > /* > > * Need to raise event. 
Since raising > > * events can block, we need to do it in > > @@ -687,8 +687,12 @@ static void do_failures(struct mirror_se > > if (!ms->log_failure) { > > while ((bio = bio_list_pop(failures))) { > > ms->in_sync = 0; > > - dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); > > + dm_rh_mark_nosync(ms->rh, bio); > > + spin_lock_irq(&ms->lock); > > + bio_list_add(&ms->failures, bio); > > + spin_unlock_irq(&ms->lock); > > } > > + delayed_wake(ms); > > return; > > } > > > > Index: linux-2.6.31-rc2-devel/drivers/md/dm-region-hash.c > > =================================================================== > > --- linux-2.6.31-rc2-devel.orig/drivers/md/dm-region-hash.c 2009-07-10 14:54:07.000000000 +0200 > > +++ linux-2.6.31-rc2-devel/drivers/md/dm-region-hash.c 2009-07-10 15:45:07.000000000 +0200 > > @@ -392,8 +392,6 @@ static void complete_resync_work(struct > > /* dm_rh_mark_nosync > > * @ms > > * @bio > > - * @done > > - * @error > > * > > * The bio was written on some mirror(s) but failed on other mirror(s). > > * We can successfully endio the bio but should avoid the region being > > @@ -401,8 +399,7 @@ static void complete_resync_work(struct > > * > > * This function is _not_ safe in interrupt context! > > */ > > -void dm_rh_mark_nosync(struct dm_region_hash *rh, > > - struct bio *bio, unsigned done, int error) > > +void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) > > { > > unsigned long flags; > > struct dm_dirty_log *log = rh->log; > > @@ -439,7 +436,6 @@ void dm_rh_mark_nosync(struct dm_region_ > > BUG_ON(!list_empty(&reg->list)); > > spin_unlock_irqrestore(&rh->region_lock, flags); > > > > - bio_endio(bio, error); > > How do bios queued in ms->failures are processed later? It seems that > the bios stay in ms->failures forever, and the upper layer can not > receive "success" for those bios. Don't we need a mechanism to block/unblock > write bios to fix this issue? They are resubmitted with DM_ENDIO_REQUEUE on noflush suspend. 
My patch has a bug in that they aren't --- but I will provide a better patch, one that also avoids this periodic polling of the ms->failures queue. Mikulas -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
Mikulas Patocka [mpatocka@redhat.com] wrote: > > > How do bios queued in ms->failures are processed later? It seems that > > the bios stay in ms->failures forever, and the upper layer can not > > receive "success" for those bios. Don't we need a mechanism to block/unblock > > write bios to fix this issue? > > They are resubmitted with DM_ENDIO_REQUEUE on noflush suspend. My patch > has a bug that they aren't --- but I will provide a better patch, also > without this periodic polling of ms->failures queue. Trying to verify this patch. Mikulas, did you provide a better patch yet? Does this patch work at all? I would like to verify if this patch works with devices that fail temporarily. I will plan on using dm-flakey devices for testing purposes. Thanks, Malahal. -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
On Sun, 30 Aug 2009, malahal@us.ibm.com wrote: > Mikulas Patocka [mpatocka@redhat.com] wrote: > > > > > How do bios queued in ms->failures are processed later? It seems that > > > the bios stay in ms->failures forever, and the upper layer can not > > > receive "success" for those bios. Don't we need a mechanism to block/unblock > > > write bios to fix this issue? > > > > They are resubmitted with DM_ENDIO_REQUEUE on noflush suspend. My patch > > has a bug that they aren't --- but I will provide a better patch, also > > without this periodic polling of ms->failures queue. > > Trying to verify this patch. Mikulas, did you provide a better patch > yet? Does this patch work at all? > > I would like to verify if this patch works with devices that fail > temporarily. I will plan on using dm-flakey devices for testing purposes. > > Thanks, Malahal. Hi I uploaded patches for this bug at: http://people.redhat.com/mpatocka/patches/kernel/mirror-race/ But note! They were never tried. When I wanted to try them, I found out that dmeventd is totally nonworking in upstream code (it doesn't pass "handle_errors" argument and has some crashes and signal errors), so I didn't test them with dmeventd. Dmeventd must be fixed first, then we can work on this bug. Mikulas -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
Hi Mikulas, On 08/31/09 17:39, Mikulas Patocka wrote: > I uploaded patches for this bug at: > http://people.redhat.com/mpatocka/patches/kernel/mirror-race/ > > But note! They were never tried. When I wanted to try them, I found out > that dmeventd is totally nonworking in upstream code (it doesn't pass > "handle_errors" argument and has some crashes and signal errors), so I > didn't test them with dmeventd. Dmeventd must be fixed first, then we can > work on this bug. Thank you for posting the patch set. I took a quick look at your patches and have several comments. - The "handle_errors" flag won't be passed to dm-raid1 by dmeventd. lvm commands (e.g. vgchange) or dmsetup pass "handle_errors" when a new mirror mapping is created. - As I mentioned before, bios which are sent to out-of-sync regions also need to be blocked, because bios to out-of-sync regions are processed by generic_make_request() and would return "success" to the upper layer without dm-raid1 noticing it. This might cause data corruption. https://www.redhat.com/archives/dm-devel/2009-July/msg00118.html - The modification of write_callback() looks a little confusing to me. Do all bios need to be blocked? When all legs return an error, the bio should be returned with -EIO to the upper layer without being blocked, as the original code does using the uptodate flag. I would appreciate your comments. Thanks, Taka
Index: linux-2.6.31-rc2-devel/drivers/md/dm-raid1.c =================================================================== --- linux-2.6.31-rc2-devel.orig/drivers/md/dm-raid1.c 2009-07-10 14:48:19.000000000 +0200 +++ linux-2.6.31-rc2-devel/drivers/md/dm-raid1.c 2009-07-10 15:46:11.000000000 +0200 @@ -535,11 +535,11 @@ static void write_callback(unsigned long else uptodate = 1; - if (unlikely(!uptodate)) { + if (unlikely(!uptodate) || !errors_handled(ms)) { DMERR("All replicated volumes dead, failing I/O"); /* None of the writes succeeded, fail the I/O. */ ret = -EIO; - } else if (errors_handled(ms)) { + } else { /* * Need to raise event. Since raising * events can block, we need to do it in @@ -687,8 +687,12 @@ static void do_failures(struct mirror_se if (!ms->log_failure) { while ((bio = bio_list_pop(failures))) { ms->in_sync = 0; - dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); + dm_rh_mark_nosync(ms->rh, bio); + spin_lock_irq(&ms->lock); + bio_list_add(&ms->failures, bio); + spin_unlock_irq(&ms->lock); } + delayed_wake(ms); return; } Index: linux-2.6.31-rc2-devel/drivers/md/dm-region-hash.c =================================================================== --- linux-2.6.31-rc2-devel.orig/drivers/md/dm-region-hash.c 2009-07-10 14:54:07.000000000 +0200 +++ linux-2.6.31-rc2-devel/drivers/md/dm-region-hash.c 2009-07-10 15:45:07.000000000 +0200 @@ -392,8 +392,6 @@ static void complete_resync_work(struct /* dm_rh_mark_nosync * @ms * @bio - * @done - * @error * * The bio was written on some mirror(s) but failed on other mirror(s). * We can successfully endio the bio but should avoid the region being @@ -401,8 +399,7 @@ static void complete_resync_work(struct * * This function is _not_ safe in interrupt context! 
 */ -void dm_rh_mark_nosync(struct dm_region_hash *rh, - struct bio *bio, unsigned done, int error) +void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) { unsigned long flags; struct dm_dirty_log *log = rh->log; @@ -439,7 +436,6 @@ void dm_rh_mark_nosync(struct dm_region_ BUG_ON(!list_empty(&reg->list)); spin_unlock_irqrestore(&rh->region_lock, flags); - bio_endio(bio, error); if (recovering) complete_resync_work(reg, 0); } Index: linux-2.6.31-rc2-devel/include/linux/dm-region-hash.h =================================================================== --- linux-2.6.31-rc2-devel.orig/include/linux/dm-region-hash.h 2009-07-10 15:45:26.000000000 +0200 +++ linux-2.6.31-rc2-devel/include/linux/dm-region-hash.h 2009-07-10 15:45:36.000000000 +0200 @@ -78,8 +78,7 @@ void dm_rh_dec(struct dm_region_hash *rh /* Delay bios on regions. */ void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio); -void dm_rh_mark_nosync(struct dm_region_hash *rh, - struct bio *bio, unsigned done, int error); +void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio); /* * Region recovery control.