Message ID | 161319521620.422860.17802896302850828411.stgit@magnolia (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
Series | xfs_repair: set needsrepair when dirtying filesystems | expand |
On Fri, Feb 12, 2021 at 09:46:56PM -0800, Darrick J. Wong wrote: > From: Darrick J. Wong <djwong@kernel.org> > > Add an error injection knob so that we can simulate system failure after > a certain number of disk writes. This knob is being added so that we > can check repair's behavior after an arbitrary number of tests. > > Set LIBXFS_DEBUG_WRITE_CRASH={ddev,logdev,rtdev}=nn in the environment > to make libxfs SIGKILL itself after nn writes to the data, log, or rt > devices. Note that this only applies to xfs_buf writes and zero_range. > > Signed-off-by: Darrick J. Wong <djwong@kernel.org> > --- > libxfs/init.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++--- > libxfs/libxfs_io.h | 19 +++++++++++++++ > libxfs/rdwr.c | 6 ++++- > 3 files changed, 88 insertions(+), 5 deletions(-) > > > diff --git a/libxfs/init.c b/libxfs/init.c > index 8a8ce3c4..1ec83791 100644 > --- a/libxfs/init.c > +++ b/libxfs/init.c ... > @@ -614,6 +634,46 @@ libxfs_buftarg_init( > dev_t logdev, > dev_t rtdev) > { > + char *p = getenv("LIBXFS_DEBUG_WRITE_CRASH"); > + unsigned long dfail = 0, lfail = 0, rfail = 0; Was there a reason for using an environment variable now rather than the original command line option? > + > + /* Simulate utility crash after a certain number of writes. 
*/ > + while (p && *p) { > + char *val; > + > + switch (getsubopt(&p, wf_opts, &val)) { > + case WF_DATA: > + if (!val) { > + fprintf(stderr, > + _("ddev write fail requires a parameter\n")); > + exit(1); > + } > + dfail = strtoul(val, NULL, 0); > + break; > + case WF_LOG: > + if (!val) { > + fprintf(stderr, > + _("logdev write fail requires a parameter\n")); > + exit(1); > + } > + lfail = strtoul(val, NULL, 0); > + break; > + case WF_RT: > + if (!val) { > + fprintf(stderr, > + _("rtdev write fail requires a parameter\n")); > + exit(1); > + } > + rfail = strtoul(val, NULL, 0); > + break; > + default: > + fprintf(stderr, _("unknown write fail type %s\n"), > + val); > + exit(1); > + break; > + } > + } > + > if (mp->m_ddev_targp) { > /* should already have all buftargs initialised */ > if (mp->m_ddev_targp->bt_bdev != dev || ... > diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h > index c80e2d59..85485257 100644 > --- a/libxfs/libxfs_io.h > +++ b/libxfs/libxfs_io.h ... > @@ -30,6 +32,23 @@ struct xfs_buftarg { > #define XFS_BUFTARG_LOST_WRITE (1 << 0) > /* A dirty buffer failed the write verifier. */ > #define XFS_BUFTARG_CORRUPT_WRITE (1 << 1) > +/* Simulate failure after a certain number of writes. */ > +#define XFS_BUFTARG_INJECT_WRITE_FAIL (1 << 2) > + > +/* Simulate the system crashing after a write. */ > +static inline void > +xfs_buftarg_trip_write( > + struct xfs_buftarg *btp) > +{ > + if (!(btp->flags & XFS_BUFTARG_INJECT_WRITE_FAIL)) > + return; > + > + pthread_mutex_lock(&btp->lock); > + btp->writes_left--; > + if (!btp->writes_left) > + kill(getpid(), SIGKILL); Can we just exit()? (Same questions for the next patch..) 
Brian > + pthread_mutex_unlock(&btp->lock); > +} > > extern void libxfs_buftarg_init(struct xfs_mount *mp, dev_t ddev, > dev_t logdev, dev_t rtdev); > diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c > index ca272387..fd456d6b 100644 > --- a/libxfs/rdwr.c > +++ b/libxfs/rdwr.c > @@ -74,8 +74,10 @@ libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len) > /* try to use special zeroing methods, fall back to writes if needed */ > len_bytes = LIBXFS_BBTOOFF64(len); > error = platform_zero_range(fd, start_offset, len_bytes); > - if (!error) > + if (!error) { > + xfs_buftarg_trip_write(btp); > return 0; > + } > > zsize = min(BDSTRAT_SIZE, BBTOB(len)); > if ((z = memalign(libxfs_device_alignment(), zsize)) == NULL) { > @@ -105,6 +107,7 @@ libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len) > progname, __FUNCTION__); > exit(1); > } > + xfs_buftarg_trip_write(btp); > offset += bytes; > } > free(z); > @@ -860,6 +863,7 @@ libxfs_bwrite( > } else { > bp->b_flags |= LIBXFS_B_UPTODATE; > bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_UNCHECKED); > + xfs_buftarg_trip_write(bp->b_target); > } > return bp->b_error; > } >
On Tue, Feb 16, 2021 at 06:56:45AM -0500, Brian Foster wrote: > On Fri, Feb 12, 2021 at 09:46:56PM -0800, Darrick J. Wong wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > > > Add an error injection knob so that we can simulate system failure after > > a certain number of disk writes. This knob is being added so that we > > can check repair's behavior after an arbitrary number of tests. > > > > Set LIBXFS_DEBUG_WRITE_CRASH={ddev,logdev,rtdev}=nn in the environment > > to make libxfs SIGKILL itself after nn writes to the data, log, or rt > > devices. Note that this only applies to xfs_buf writes and zero_range. > > > > Signed-off-by: Darrick J. Wong <djwong@kernel.org> > > --- > > libxfs/init.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++--- > > libxfs/libxfs_io.h | 19 +++++++++++++++ > > libxfs/rdwr.c | 6 ++++- > > 3 files changed, 88 insertions(+), 5 deletions(-) > > > > > > diff --git a/libxfs/init.c b/libxfs/init.c > > index 8a8ce3c4..1ec83791 100644 > > --- a/libxfs/init.c > > +++ b/libxfs/init.c > ... > > @@ -614,6 +634,46 @@ libxfs_buftarg_init( > > dev_t logdev, > > dev_t rtdev) > > { > > + char *p = getenv("LIBXFS_DEBUG_WRITE_CRASH"); > > + unsigned long dfail = 0, lfail = 0, rfail = 0; > > Was there a reason for using an environment variable now rather than the > original command line option? Well, you said you wanted a generic write error injection hook for libxfs, and this is the simplest way to add that, given that libraries don't have a direct means to parse argc and argv. I mean... this /could/ take the form of an exposed library function that xfs utilities could opt into their own getopt loops, but that's even /more/ infrastructure code that I'd have to write. OTOH there's already precedent for magic environment variables to enable libxfs debug hooks. > > + > > + /* Simulate utility crash after a certain number of writes. 
*/ > > + while (p && *p) { > > + char *val; > > + > > + switch (getsubopt(&p, wf_opts, &val)) { > > + case WF_DATA: > > + if (!val) { > > + fprintf(stderr, > > + _("ddev write fail requires a parameter\n")); > > + exit(1); > > + } > > + dfail = strtoul(val, NULL, 0); > > + break; > > + case WF_LOG: > > + if (!val) { > > + fprintf(stderr, > > + _("logdev write fail requires a parameter\n")); > > + exit(1); > > + } > > + lfail = strtoul(val, NULL, 0); > > + break; > > + case WF_RT: > > + if (!val) { > > + fprintf(stderr, > > + _("rtdev write fail requires a parameter\n")); > > + exit(1); > > + } > > + rfail = strtoul(val, NULL, 0); > > + break; > > + default: > > + fprintf(stderr, _("unknown write fail type %s\n"), > > + val); > > + exit(1); > > + break; > > + } > > + } > > + > > if (mp->m_ddev_targp) { > > /* should already have all buftargs initialised */ > > if (mp->m_ddev_targp->bt_bdev != dev || > ... > > diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h > > index c80e2d59..85485257 100644 > > --- a/libxfs/libxfs_io.h > > +++ b/libxfs/libxfs_io.h > ... > > @@ -30,6 +32,23 @@ struct xfs_buftarg { > > #define XFS_BUFTARG_LOST_WRITE (1 << 0) > > /* A dirty buffer failed the write verifier. */ > > #define XFS_BUFTARG_CORRUPT_WRITE (1 << 1) > > +/* Simulate failure after a certain number of writes. */ > > +#define XFS_BUFTARG_INJECT_WRITE_FAIL (1 << 2) > > + > > +/* Simulate the system crashing after a write. */ > > +static inline void > > +xfs_buftarg_trip_write( > > + struct xfs_buftarg *btp) > > +{ > > + if (!(btp->flags & XFS_BUFTARG_INJECT_WRITE_FAIL)) > > + return; > > + > > + pthread_mutex_lock(&btp->lock); > > + btp->writes_left--; > > + if (!btp->writes_left) > > + kill(getpid(), SIGKILL); > > Can we just exit()? > > (Same questions for the next patch..) The goal of this generic write error injection framework is to simulate total system crashes immediately after a write. 
SIGKILL and exit are not the same, because atexit handlers don't run if the process forcibly kills itself. --D > > Brian > > > + pthread_mutex_unlock(&btp->lock); > > +} > > > > extern void libxfs_buftarg_init(struct xfs_mount *mp, dev_t ddev, > > dev_t logdev, dev_t rtdev); > > diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c > > index ca272387..fd456d6b 100644 > > --- a/libxfs/rdwr.c > > +++ b/libxfs/rdwr.c > > @@ -74,8 +74,10 @@ libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len) > > /* try to use special zeroing methods, fall back to writes if needed */ > > len_bytes = LIBXFS_BBTOOFF64(len); > > error = platform_zero_range(fd, start_offset, len_bytes); > > - if (!error) > > + if (!error) { > > + xfs_buftarg_trip_write(btp); > > return 0; > > + } > > > > zsize = min(BDSTRAT_SIZE, BBTOB(len)); > > if ((z = memalign(libxfs_device_alignment(), zsize)) == NULL) { > > @@ -105,6 +107,7 @@ libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len) > > progname, __FUNCTION__); > > exit(1); > > } > > + xfs_buftarg_trip_write(btp); > > offset += bytes; > > } > > free(z); > > @@ -860,6 +863,7 @@ libxfs_bwrite( > > } else { > > bp->b_flags |= LIBXFS_B_UPTODATE; > > bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_UNCHECKED); > > + xfs_buftarg_trip_write(bp->b_target); > > } > > return bp->b_error; > > } > > >
On Wed, Feb 17, 2021 at 08:36:20PM -0800, Darrick J. Wong wrote: > On Tue, Feb 16, 2021 at 06:56:45AM -0500, Brian Foster wrote: > > On Fri, Feb 12, 2021 at 09:46:56PM -0800, Darrick J. Wong wrote: > > > From: Darrick J. Wong <djwong@kernel.org> > > > > > > Add an error injection knob so that we can simulate system failure after > > > a certain number of disk writes. This knob is being added so that we > > > can check repair's behavior after an arbitrary number of tests. > > > > > > Set LIBXFS_DEBUG_WRITE_CRASH={ddev,logdev,rtdev}=nn in the environment > > > to make libxfs SIGKILL itself after nn writes to the data, log, or rt > > > devices. Note that this only applies to xfs_buf writes and zero_range. > > > > > > Signed-off-by: Darrick J. Wong <djwong@kernel.org> > > > --- > > > libxfs/init.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++--- > > > libxfs/libxfs_io.h | 19 +++++++++++++++ > > > libxfs/rdwr.c | 6 ++++- > > > 3 files changed, 88 insertions(+), 5 deletions(-) > > > > > > > > > diff --git a/libxfs/init.c b/libxfs/init.c > > > index 8a8ce3c4..1ec83791 100644 > > > --- a/libxfs/init.c > > > +++ b/libxfs/init.c > > ... > > > @@ -614,6 +634,46 @@ libxfs_buftarg_init( > > > dev_t logdev, > > > dev_t rtdev) > > > { > > > + char *p = getenv("LIBXFS_DEBUG_WRITE_CRASH"); > > > + unsigned long dfail = 0, lfail = 0, rfail = 0; > > > > Was there a reason for using an environment variable now rather than the > > original command line option? > > Well, you said you wanted a generic write error injection hook for > libxfs, and this is the simplest way to add that, given that libraries > don't have a direct means to parse argc and argv. > I think you're misinterpreting my previous feedback. ;) I thought the injection mechanism was too closely tied to an implementation detail (i.e. "fail after updating needsrepair bit") of the application. 
Instead, I preferred a more generic mechanism (the "fail after so many I/Os," "fail after phase N" approaches in these patches) that covers the original use case. That broadens the potential test coverage and usefulness of the mechanism. > I mean... this /could/ take the form of an exposed library function that > xfs utilities could opt into their own getopt loops, but that's even > /more/ infrastructure code that I'd have to write. > In this case I was just curious why the interface was changed from the previous approach. ISTM it didn't necessarily have to, but I'm not concerned about it either way. ... > > > diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h > > > index c80e2d59..85485257 100644 > > > --- a/libxfs/libxfs_io.h > > > +++ b/libxfs/libxfs_io.h > > ... > > > @@ -30,6 +32,23 @@ struct xfs_buftarg { > > > #define XFS_BUFTARG_LOST_WRITE (1 << 0) > > > /* A dirty buffer failed the write verifier. */ > > > #define XFS_BUFTARG_CORRUPT_WRITE (1 << 1) > > > +/* Simulate failure after a certain number of writes. */ > > > +#define XFS_BUFTARG_INJECT_WRITE_FAIL (1 << 2) > > > + > > > +/* Simulate the system crashing after a write. */ > > > +static inline void > > > +xfs_buftarg_trip_write( > > > + struct xfs_buftarg *btp) > > > +{ > > > + if (!(btp->flags & XFS_BUFTARG_INJECT_WRITE_FAIL)) > > > + return; > > > + > > > + pthread_mutex_lock(&btp->lock); > > > + btp->writes_left--; > > > + if (!btp->writes_left) > > > + kill(getpid(), SIGKILL); > > > > Can we just exit()? > > > > (Same questions for the next patch..) > > The goal of this generic write error injection framework is to simulate > total system crashes immediately after a write. > > SIGKILL and exit are not the same, because atexit handlers don't run if > the process forcibly kills itself. > Can you document this somewhere please? 
Brian > --D > > > > > Brian > > > > > + pthread_mutex_unlock(&btp->lock); > > > +} > > > > > > extern void libxfs_buftarg_init(struct xfs_mount *mp, dev_t ddev, > > > dev_t logdev, dev_t rtdev); > > > diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c > > > index ca272387..fd456d6b 100644 > > > --- a/libxfs/rdwr.c > > > +++ b/libxfs/rdwr.c > > > @@ -74,8 +74,10 @@ libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len) > > > /* try to use special zeroing methods, fall back to writes if needed */ > > > len_bytes = LIBXFS_BBTOOFF64(len); > > > error = platform_zero_range(fd, start_offset, len_bytes); > > > - if (!error) > > > + if (!error) { > > > + xfs_buftarg_trip_write(btp); > > > return 0; > > > + } > > > > > > zsize = min(BDSTRAT_SIZE, BBTOB(len)); > > > if ((z = memalign(libxfs_device_alignment(), zsize)) == NULL) { > > > @@ -105,6 +107,7 @@ libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len) > > > progname, __FUNCTION__); > > > exit(1); > > > } > > > + xfs_buftarg_trip_write(btp); > > > offset += bytes; > > > } > > > free(z); > > > @@ -860,6 +863,7 @@ libxfs_bwrite( > > > } else { > > > bp->b_flags |= LIBXFS_B_UPTODATE; > > > bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_UNCHECKED); > > > + xfs_buftarg_trip_write(bp->b_target); > > > } > > > return bp->b_error; > > > } > > > > > >
On Thu, Feb 18, 2021 at 08:02:17AM -0500, Brian Foster wrote: > On Wed, Feb 17, 2021 at 08:36:20PM -0800, Darrick J. Wong wrote: > > On Tue, Feb 16, 2021 at 06:56:45AM -0500, Brian Foster wrote: > > > On Fri, Feb 12, 2021 at 09:46:56PM -0800, Darrick J. Wong wrote: > > > > From: Darrick J. Wong <djwong@kernel.org> > > > > > > > > Add an error injection knob so that we can simulate system failure after > > > > a certain number of disk writes. This knob is being added so that we > > > > can check repair's behavior after an arbitrary number of tests. > > > > > > > > Set LIBXFS_DEBUG_WRITE_CRASH={ddev,logdev,rtdev}=nn in the environment > > > > to make libxfs SIGKILL itself after nn writes to the data, log, or rt > > > > devices. Note that this only applies to xfs_buf writes and zero_range. > > > > > > > > Signed-off-by: Darrick J. Wong <djwong@kernel.org> > > > > --- > > > > libxfs/init.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++--- > > > > libxfs/libxfs_io.h | 19 +++++++++++++++ > > > > libxfs/rdwr.c | 6 ++++- > > > > 3 files changed, 88 insertions(+), 5 deletions(-) > > > > > > > > > > > > diff --git a/libxfs/init.c b/libxfs/init.c > > > > index 8a8ce3c4..1ec83791 100644 > > > > --- a/libxfs/init.c > > > > +++ b/libxfs/init.c > > > ... > > > > @@ -614,6 +634,46 @@ libxfs_buftarg_init( > > > > dev_t logdev, > > > > dev_t rtdev) > > > > { > > > > + char *p = getenv("LIBXFS_DEBUG_WRITE_CRASH"); > > > > + unsigned long dfail = 0, lfail = 0, rfail = 0; > > > > > > Was there a reason for using an environment variable now rather than the > > > original command line option? > > > > Well, you said you wanted a generic write error injection hook for > > libxfs, and this is the simplest way to add that, given that libraries > > don't have a direct means to parse argc and argv. > > > > I think you're misinterpreting my previous feedback. ;) I thought the > injection mechanism was too closely tied to an implementation detail > (i.e. 
"fail after updating needsrepair bit") of the application. > Instead, I preferred a more generic mechanism (the "fail after so many > I/Os," "fail after phase N" approaches in these patches) that covers the > original use case. That broadens the potential test coverage and > usefulness of the mechanism. Agreed. Admittedly, the test case I wrote for it that kills repair after NR writes (where NR steadily increases) has opened my eyes to how ... stunningly awful xfs_repair (and e2fsck) can be. (As in, you can *very easily* snatch death from the jaws of victory if all you wanted was to fix a minor bitflip somewhere /and/ repair dies...) > > I mean... this /could/ take the form of an exposed library function that > > xfs utilities could opt into their own getopt loops, but that's even > > /more/ infrastructure code that I'd have to write. > > > > In this case I was just curious why the interface was changed from the > previous approach. ISTM it didn't necessarily have to, but I'm not > concerned about it either way. <nod> > ... > > > > diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h > > > > index c80e2d59..85485257 100644 > > > > --- a/libxfs/libxfs_io.h > > > > +++ b/libxfs/libxfs_io.h > > > ... > > > > @@ -30,6 +32,23 @@ struct xfs_buftarg { > > > > #define XFS_BUFTARG_LOST_WRITE (1 << 0) > > > > /* A dirty buffer failed the write verifier. */ > > > > #define XFS_BUFTARG_CORRUPT_WRITE (1 << 1) > > > > +/* Simulate failure after a certain number of writes. */ > > > > +#define XFS_BUFTARG_INJECT_WRITE_FAIL (1 << 2) > > > > + > > > > +/* Simulate the system crashing after a write. */ > > > > +static inline void > > > > +xfs_buftarg_trip_write( > > > > + struct xfs_buftarg *btp) > > > > +{ > > > > + if (!(btp->flags & XFS_BUFTARG_INJECT_WRITE_FAIL)) > > > > + return; > > > > + > > > > + pthread_mutex_lock(&btp->lock); > > > > + btp->writes_left--; > > > > + if (!btp->writes_left) > > > > + kill(getpid(), SIGKILL); > > > > > > Can we just exit()? 
> > > > > > (Same questions for the next patch..) > > > > The goal of this generic write error injection framework is to simulate > > total system crashes immediately after a write. > > > > SIGKILL and exit are not the same, because atexit handlers don't run if > > the process forcibly kills itself. > > > > Can you document this somewhere please? Will do. --D > Brian > > > --D > > > > > > > > Brian > > > > > > > + pthread_mutex_unlock(&btp->lock); > > > > +} > > > > > > > > extern void libxfs_buftarg_init(struct xfs_mount *mp, dev_t ddev, > > > > dev_t logdev, dev_t rtdev); > > > > diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c > > > > index ca272387..fd456d6b 100644 > > > > --- a/libxfs/rdwr.c > > > > +++ b/libxfs/rdwr.c > > > > @@ -74,8 +74,10 @@ libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len) > > > > /* try to use special zeroing methods, fall back to writes if needed */ > > > > len_bytes = LIBXFS_BBTOOFF64(len); > > > > error = platform_zero_range(fd, start_offset, len_bytes); > > > > - if (!error) > > > > + if (!error) { > > > > + xfs_buftarg_trip_write(btp); > > > > return 0; > > > > + } > > > > > > > > zsize = min(BDSTRAT_SIZE, BBTOB(len)); > > > > if ((z = memalign(libxfs_device_alignment(), zsize)) == NULL) { > > > > @@ -105,6 +107,7 @@ libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len) > > > > progname, __FUNCTION__); > > > > exit(1); > > > > } > > > > + xfs_buftarg_trip_write(btp); > > > > offset += bytes; > > > > } > > > > free(z); > > > > @@ -860,6 +863,7 @@ libxfs_bwrite( > > > > } else { > > > > bp->b_flags |= LIBXFS_B_UPTODATE; > > > > bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_UNCHECKED); > > > > + xfs_buftarg_trip_write(bp->b_target); > > > > } > > > > return bp->b_error; > > > > } > > > > > > > > > >
diff --git a/libxfs/init.c b/libxfs/init.c
index 8a8ce3c4..1ec83791 100644
--- a/libxfs/init.c
+++ b/libxfs/init.c
@@ -590,7 +590,8 @@ libxfs_initialize_perag(
 static struct xfs_buftarg *
 libxfs_buftarg_alloc(
 	struct xfs_mount	*mp,
-	dev_t			dev)
+	dev_t			dev,
+	unsigned long		write_fails)
 {
 	struct xfs_buftarg	*btp;
 
@@ -603,10 +604,29 @@ libxfs_buftarg_alloc(
 	btp->bt_mount = mp;
 	btp->bt_bdev = dev;
 	btp->flags = 0;
+	if (write_fails) {
+		btp->writes_left = write_fails;
+		btp->flags |= XFS_BUFTARG_INJECT_WRITE_FAIL;
+	}
+	pthread_mutex_init(&btp->lock, NULL);
 
 	return btp;
 }
 
+enum libxfs_write_failure_nums {
+	WF_DATA = 0,
+	WF_LOG,
+	WF_RT,
+	WF_MAX_OPTS,
+};
+
+static char *wf_opts[] = {
+	[WF_DATA]		= "ddev",
+	[WF_LOG]		= "logdev",
+	[WF_RT]			= "rtdev",
+	[WF_MAX_OPTS]		= NULL,
+};
+
 void
 libxfs_buftarg_init(
 	struct xfs_mount	*mp,
@@ -614,6 +634,46 @@ libxfs_buftarg_init(
 	dev_t			logdev,
 	dev_t			rtdev)
 {
+	char			*p = getenv("LIBXFS_DEBUG_WRITE_CRASH");
+	unsigned long		dfail = 0, lfail = 0, rfail = 0;
+
+	/* Simulate utility crash after a certain number of writes. */
+	while (p && *p) {
+		char *val;
+
+		switch (getsubopt(&p, wf_opts, &val)) {
+		case WF_DATA:
+			if (!val) {
+				fprintf(stderr,
+		_("ddev write fail requires a parameter\n"));
+				exit(1);
+			}
+			dfail = strtoul(val, NULL, 0);
+			break;
+		case WF_LOG:
+			if (!val) {
+				fprintf(stderr,
+		_("logdev write fail requires a parameter\n"));
+				exit(1);
+			}
+			lfail = strtoul(val, NULL, 0);
+			break;
+		case WF_RT:
+			if (!val) {
+				fprintf(stderr,
+		_("rtdev write fail requires a parameter\n"));
+				exit(1);
+			}
+			rfail = strtoul(val, NULL, 0);
+			break;
+		default:
+			fprintf(stderr, _("unknown write fail type %s\n"),
+					val);
+			exit(1);
+			break;
+		}
+	}
+
 	if (mp->m_ddev_targp) {
 		/* should already have all buftargs initialised */
 		if (mp->m_ddev_targp->bt_bdev != dev ||
@@ -647,12 +707,12 @@ libxfs_buftarg_init(
 		return;
 	}
 
-	mp->m_ddev_targp = libxfs_buftarg_alloc(mp, dev);
+	mp->m_ddev_targp = libxfs_buftarg_alloc(mp, dev, dfail);
 	if (!logdev || logdev == dev)
 		mp->m_logdev_targp = mp->m_ddev_targp;
 	else
-		mp->m_logdev_targp = libxfs_buftarg_alloc(mp, logdev);
-	mp->m_rtdev_targp = libxfs_buftarg_alloc(mp, rtdev);
+		mp->m_logdev_targp = libxfs_buftarg_alloc(mp, logdev, lfail);
+	mp->m_rtdev_targp = libxfs_buftarg_alloc(mp, rtdev, rfail);
 }
 
 /*
diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h
index c80e2d59..85485257 100644
--- a/libxfs/libxfs_io.h
+++ b/libxfs/libxfs_io.h
@@ -22,6 +22,8 @@ struct xfs_perag;
  */
 struct xfs_buftarg {
 	struct xfs_mount	*bt_mount;
+	pthread_mutex_t		lock;
+	unsigned long		writes_left;
 	dev_t			bt_bdev;
 	unsigned int		flags;
 };
@@ -30,6 +32,23 @@ struct xfs_buftarg {
 #define XFS_BUFTARG_LOST_WRITE		(1 << 0)
 /* A dirty buffer failed the write verifier. */
 #define XFS_BUFTARG_CORRUPT_WRITE	(1 << 1)
+/* Simulate failure after a certain number of writes. */
+#define XFS_BUFTARG_INJECT_WRITE_FAIL	(1 << 2)
+
+/* Simulate the system crashing after a write. */
+static inline void
+xfs_buftarg_trip_write(
+	struct xfs_buftarg	*btp)
+{
+	if (!(btp->flags & XFS_BUFTARG_INJECT_WRITE_FAIL))
+		return;
+
+	pthread_mutex_lock(&btp->lock);
+	btp->writes_left--;
+	if (!btp->writes_left)
+		kill(getpid(), SIGKILL);
+	pthread_mutex_unlock(&btp->lock);
+}
 
 extern void	libxfs_buftarg_init(struct xfs_mount *mp, dev_t ddev,
 				    dev_t logdev, dev_t rtdev);
diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c
index ca272387..fd456d6b 100644
--- a/libxfs/rdwr.c
+++ b/libxfs/rdwr.c
@@ -74,8 +74,10 @@ libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
 	/* try to use special zeroing methods, fall back to writes if needed */
 	len_bytes = LIBXFS_BBTOOFF64(len);
 	error = platform_zero_range(fd, start_offset, len_bytes);
-	if (!error)
+	if (!error) {
+		xfs_buftarg_trip_write(btp);
 		return 0;
+	}
 
 	zsize = min(BDSTRAT_SIZE, BBTOB(len));
 	if ((z = memalign(libxfs_device_alignment(), zsize)) == NULL) {
@@ -105,6 +107,7 @@ libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
 				progname, __FUNCTION__);
 			exit(1);
 		}
+		xfs_buftarg_trip_write(btp);
 		offset += bytes;
 	}
 	free(z);
@@ -860,6 +863,7 @@ libxfs_bwrite(
 	} else {
 		bp->b_flags |= LIBXFS_B_UPTODATE;
 		bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_UNCHECKED);
+		xfs_buftarg_trip_write(bp->b_target);
 	}
 	return bp->b_error;
 }