diff mbox series

[4/8] libxfs: enable tools to check that metadata updates have been committed

Message ID 158216292664.601264.186457838279269618.stgit@magnolia (mailing list archive)
State Deferred, archived
Headers show
Series xfsprogs: actually check that writes succeeded | expand

Commit Message

Darrick J. Wong Feb. 20, 2020, 1:42 a.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Add a new function that will ensure that everything we changed has
landed on stable media, and report the results.  Subsequent commits will
teach the individual programs to report when things go wrong.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 include/xfs_mount.h |    3 +++
 libxfs/init.c       |   43 +++++++++++++++++++++++++++++++++++++++++++
 libxfs/libxfs_io.h  |    2 ++
 libxfs/rdwr.c       |   27 +++++++++++++++++++++++++--
 4 files changed, 73 insertions(+), 2 deletions(-)

Comments

Brian Foster Feb. 20, 2020, 2:06 p.m. UTC | #1
On Wed, Feb 19, 2020 at 05:42:06PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Add a new function that will ensure that everything we changed has
> landed on stable media, and report the results.  Subsequent commits will
> teach the individual programs to report when things go wrong.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  include/xfs_mount.h |    3 +++
>  libxfs/init.c       |   43 +++++++++++++++++++++++++++++++++++++++++++
>  libxfs/libxfs_io.h  |    2 ++
>  libxfs/rdwr.c       |   27 +++++++++++++++++++++++++--
>  4 files changed, 73 insertions(+), 2 deletions(-)
> 
> 
> diff --git a/include/xfs_mount.h b/include/xfs_mount.h
> index 29b3cc1b..c80aaf69 100644
> --- a/include/xfs_mount.h
> +++ b/include/xfs_mount.h
> @@ -187,4 +187,7 @@ extern xfs_mount_t	*libxfs_mount (xfs_mount_t *, xfs_sb_t *,
>  extern void	libxfs_umount (xfs_mount_t *);
>  extern void	libxfs_rtmount_destroy (xfs_mount_t *);
>  
> +void libxfs_flush_devices(struct xfs_mount *mp, int *datadev, int *logdev,
> +		int *rtdev);
> +
>  #endif	/* __XFS_MOUNT_H__ */
> diff --git a/libxfs/init.c b/libxfs/init.c
> index a0d4b7f4..d1d3f4df 100644
> --- a/libxfs/init.c
> +++ b/libxfs/init.c
> @@ -569,6 +569,8 @@ libxfs_buftarg_alloc(
>  	}
>  	btp->bt_mount = mp;
>  	btp->dev = dev;
> +	btp->lost_writes = false;
> +
>  	return btp;
>  }
>  
> @@ -791,6 +793,47 @@ libxfs_rtmount_destroy(xfs_mount_t *mp)
>  	mp->m_rsumip = mp->m_rbmip = NULL;
>  }
>  
> +static inline int
> +libxfs_flush_buftarg(
> +	struct xfs_buftarg	*btp)
> +{
> +	if (btp->lost_writes)
> +		return -ENOTRECOVERABLE;

I'm curious why we'd want to skip the flush just because some writes
happened to fail..? I suppose the fs might be borked, but it seems a
little strange to at least not try the flush, particularly since we
might still flush any of the other two possible devices.

> +
> +	return libxfs_blkdev_issue_flush(btp);
> +}
> +
> +/*
> + * Purge the buffer cache to write all dirty buffers to disk and free all
> + * incore buffers.  Buffers that cannot be written will cause the lost_writes
> + * flag to be set in the buftarg.  If there were no lost writes, flush the
> + * device to make sure the writes made it to stable storage.
> + *
> + * For each device, the return code will be set to -ENOTRECOVERABLE if we
> + * couldn't write something to disk; or the results of the block device flush
> + * operation.

Why not -EIO?

Brian

> + */
> +void
> +libxfs_flush_devices(
> +	struct xfs_mount	*mp,
> +	int			*datadev,
> +	int			*logdev,
> +	int			*rtdev)
> +{
> +	*datadev = *logdev = *rtdev = 0;
> +
> +	libxfs_bcache_purge();
> +
> +	if (mp->m_ddev_targp)
> +		*datadev = libxfs_flush_buftarg(mp->m_ddev_targp);
> +
> +	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
> +		*logdev = libxfs_flush_buftarg(mp->m_logdev_targp);
> +
> +	if (mp->m_rtdev_targp)
> +		*rtdev = libxfs_flush_buftarg(mp->m_rtdev_targp);
> +}
> +
>  /*
>   * Release any resource obtained during a mount.
>   */
> diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h
> index 579df52b..fc0fd060 100644
> --- a/libxfs/libxfs_io.h
> +++ b/libxfs/libxfs_io.h
> @@ -23,10 +23,12 @@ struct xfs_perag;
>  struct xfs_buftarg {
>  	struct xfs_mount	*bt_mount;
>  	dev_t			dev;
> +	bool			lost_writes;
>  };
>  
>  extern void	libxfs_buftarg_init(struct xfs_mount *mp, dev_t ddev,
>  				    dev_t logdev, dev_t rtdev);
> +int libxfs_blkdev_issue_flush(struct xfs_buftarg *btp);
>  
>  #define LIBXFS_BBTOOFF64(bbs)	(((xfs_off_t)(bbs)) << BBSHIFT)
>  
> diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c
> index 8b47d438..92e497f9 100644
> --- a/libxfs/rdwr.c
> +++ b/libxfs/rdwr.c
> @@ -17,6 +17,7 @@
>  #include "xfs_inode_fork.h"
>  #include "xfs_inode.h"
>  #include "xfs_trans.h"
> +#include "libfrog/platform.h"
>  
>  #include "libxfs.h"		/* for LIBXFS_EXIT_ON_FAILURE */
>  
> @@ -1227,9 +1228,11 @@ libxfs_brelse(
>  
>  	if (!bp)
>  		return;
> -	if (bp->b_flags & LIBXFS_B_DIRTY)
> +	if (bp->b_flags & LIBXFS_B_DIRTY) {
>  		fprintf(stderr,
>  			"releasing dirty buffer to free list!\n");
> +		bp->b_target->lost_writes = true;
> +	}
>  
>  	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
>  	list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
> @@ -1248,9 +1251,11 @@ libxfs_bulkrelse(
>  		return 0 ;
>  
>  	list_for_each_entry(bp, list, b_node.cn_mru) {
> -		if (bp->b_flags & LIBXFS_B_DIRTY)
> +		if (bp->b_flags & LIBXFS_B_DIRTY) {
>  			fprintf(stderr,
>  				"releasing dirty buffer (bulk) to free list!\n");
> +			bp->b_target->lost_writes = true;
> +		}
>  		count++;
>  	}
>  
> @@ -1479,6 +1484,24 @@ libxfs_irele(
>  	kmem_cache_free(xfs_inode_zone, ip);
>  }
>  
> +/*
> + * Flush everything dirty in the kernel and disk write caches to stable media.
> + * Returns 0 for success or a negative error code.
> + */
> +int
> +libxfs_blkdev_issue_flush(
> +	struct xfs_buftarg	*btp)
> +{
> +	int			fd, ret;
> +
> +	if (btp->dev == 0)
> +		return 0;
> +
> +	fd = libxfs_device_to_fd(btp->dev);
> +	ret = platform_flush_device(fd, btp->dev);
> +	return ret ? -errno : 0;
> +}
> +
>  /*
>   * Write out a buffer list synchronously.
>   *
>
Darrick J. Wong Feb. 20, 2020, 4:46 p.m. UTC | #2
On Thu, Feb 20, 2020 at 09:06:12AM -0500, Brian Foster wrote:
> On Wed, Feb 19, 2020 at 05:42:06PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Add a new function that will ensure that everything we changed has
> > landed on stable media, and report the results.  Subsequent commits will
> > teach the individual programs to report when things go wrong.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  include/xfs_mount.h |    3 +++
> >  libxfs/init.c       |   43 +++++++++++++++++++++++++++++++++++++++++++
> >  libxfs/libxfs_io.h  |    2 ++
> >  libxfs/rdwr.c       |   27 +++++++++++++++++++++++++--
> >  4 files changed, 73 insertions(+), 2 deletions(-)
> > 
> > 
> > diff --git a/include/xfs_mount.h b/include/xfs_mount.h
> > index 29b3cc1b..c80aaf69 100644
> > --- a/include/xfs_mount.h
> > +++ b/include/xfs_mount.h
> > @@ -187,4 +187,7 @@ extern xfs_mount_t	*libxfs_mount (xfs_mount_t *, xfs_sb_t *,
> >  extern void	libxfs_umount (xfs_mount_t *);
> >  extern void	libxfs_rtmount_destroy (xfs_mount_t *);
> >  
> > +void libxfs_flush_devices(struct xfs_mount *mp, int *datadev, int *logdev,
> > +		int *rtdev);
> > +
> >  #endif	/* __XFS_MOUNT_H__ */
> > diff --git a/libxfs/init.c b/libxfs/init.c
> > index a0d4b7f4..d1d3f4df 100644
> > --- a/libxfs/init.c
> > +++ b/libxfs/init.c
> > @@ -569,6 +569,8 @@ libxfs_buftarg_alloc(
> >  	}
> >  	btp->bt_mount = mp;
> >  	btp->dev = dev;
> > +	btp->lost_writes = false;
> > +
> >  	return btp;
> >  }
> >  
> > @@ -791,6 +793,47 @@ libxfs_rtmount_destroy(xfs_mount_t *mp)
> >  	mp->m_rsumip = mp->m_rbmip = NULL;
> >  }
> >  
> > +static inline int
> > +libxfs_flush_buftarg(
> > +	struct xfs_buftarg	*btp)
> > +{
> > +	if (btp->lost_writes)
> > +		return -ENOTRECOVERABLE;
> 
> I'm curious why we'd want to skip the flush just because some writes
> happened to fail..? I suppose the fs might be borked, but it seems a
> little strange to at least not try the flush, particularly since we
> might still flush any of the other two possible devices.

My thinking here was that if the write verifiers (or the pwrite() calls
themselves) failed then there's no point in telling the disk to flush
its write cache since we already know it's not in sync with the buffer
cache.

> > +
> > +	return libxfs_blkdev_issue_flush(btp);
> > +}
> > +
> > +/*
> > + * Purge the buffer cache to write all dirty buffers to disk and free all
> > + * incore buffers.  Buffers that cannot be written will cause the lost_writes
> > + * flag to be set in the buftarg.  If there were no lost writes, flush the
> > + * device to make sure the writes made it to stable storage.
> > + *
> > + * For each device, the return code will be set to -ENOTRECOVERABLE if we
> > + * couldn't write something to disk; or the results of the block device flush
> > + * operation.
> 
> Why not -EIO?

Originally I thought it might be useful to be able to distinguish
between "dirty buffers never even made it out of the buffer cache" vs.
"dirty buffers were sent to the disk but the disk sent back media
errors", though in the end the userspace tools don't make any
distinction.

That said, looking at this again, maybe I should track write verifier
failure separately so that we can return EFSCORRUPTED for that?

--D

> 
> Brian
> 
> > + */
> > +void
> > +libxfs_flush_devices(
> > +	struct xfs_mount	*mp,
> > +	int			*datadev,
> > +	int			*logdev,
> > +	int			*rtdev)
> > +{
> > +	*datadev = *logdev = *rtdev = 0;
> > +
> > +	libxfs_bcache_purge();
> > +
> > +	if (mp->m_ddev_targp)
> > +		*datadev = libxfs_flush_buftarg(mp->m_ddev_targp);
> > +
> > +	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
> > +		*logdev = libxfs_flush_buftarg(mp->m_logdev_targp);
> > +
> > +	if (mp->m_rtdev_targp)
> > +		*rtdev = libxfs_flush_buftarg(mp->m_rtdev_targp);
> > +}
> > +
> >  /*
> >   * Release any resource obtained during a mount.
> >   */
> > diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h
> > index 579df52b..fc0fd060 100644
> > --- a/libxfs/libxfs_io.h
> > +++ b/libxfs/libxfs_io.h
> > @@ -23,10 +23,12 @@ struct xfs_perag;
> >  struct xfs_buftarg {
> >  	struct xfs_mount	*bt_mount;
> >  	dev_t			dev;
> > +	bool			lost_writes;
> >  };
> >  
> >  extern void	libxfs_buftarg_init(struct xfs_mount *mp, dev_t ddev,
> >  				    dev_t logdev, dev_t rtdev);
> > +int libxfs_blkdev_issue_flush(struct xfs_buftarg *btp);
> >  
> >  #define LIBXFS_BBTOOFF64(bbs)	(((xfs_off_t)(bbs)) << BBSHIFT)
> >  
> > diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c
> > index 8b47d438..92e497f9 100644
> > --- a/libxfs/rdwr.c
> > +++ b/libxfs/rdwr.c
> > @@ -17,6 +17,7 @@
> >  #include "xfs_inode_fork.h"
> >  #include "xfs_inode.h"
> >  #include "xfs_trans.h"
> > +#include "libfrog/platform.h"
> >  
> >  #include "libxfs.h"		/* for LIBXFS_EXIT_ON_FAILURE */
> >  
> > @@ -1227,9 +1228,11 @@ libxfs_brelse(
> >  
> >  	if (!bp)
> >  		return;
> > -	if (bp->b_flags & LIBXFS_B_DIRTY)
> > +	if (bp->b_flags & LIBXFS_B_DIRTY) {
> >  		fprintf(stderr,
> >  			"releasing dirty buffer to free list!\n");
> > +		bp->b_target->lost_writes = true;
> > +	}
> >  
> >  	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
> >  	list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
> > @@ -1248,9 +1251,11 @@ libxfs_bulkrelse(
> >  		return 0 ;
> >  
> >  	list_for_each_entry(bp, list, b_node.cn_mru) {
> > -		if (bp->b_flags & LIBXFS_B_DIRTY)
> > +		if (bp->b_flags & LIBXFS_B_DIRTY) {
> >  			fprintf(stderr,
> >  				"releasing dirty buffer (bulk) to free list!\n");
> > +			bp->b_target->lost_writes = true;
> > +		}
> >  		count++;
> >  	}
> >  
> > @@ -1479,6 +1484,24 @@ libxfs_irele(
> >  	kmem_cache_free(xfs_inode_zone, ip);
> >  }
> >  
> > +/*
> > + * Flush everything dirty in the kernel and disk write caches to stable media.
> > + * Returns 0 for success or a negative error code.
> > + */
> > +int
> > +libxfs_blkdev_issue_flush(
> > +	struct xfs_buftarg	*btp)
> > +{
> > +	int			fd, ret;
> > +
> > +	if (btp->dev == 0)
> > +		return 0;
> > +
> > +	fd = libxfs_device_to_fd(btp->dev);
> > +	ret = platform_flush_device(fd, btp->dev);
> > +	return ret ? -errno : 0;
> > +}
> > +
> >  /*
> >   * Write out a buffer list synchronously.
> >   *
> > 
>
Brian Foster Feb. 20, 2020, 5:58 p.m. UTC | #3
On Thu, Feb 20, 2020 at 08:46:38AM -0800, Darrick J. Wong wrote:
> On Thu, Feb 20, 2020 at 09:06:12AM -0500, Brian Foster wrote:
> > On Wed, Feb 19, 2020 at 05:42:06PM -0800, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Add a new function that will ensure that everything we changed has
> > > landed on stable media, and report the results.  Subsequent commits will
> > > teach the individual programs to report when things go wrong.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > >  include/xfs_mount.h |    3 +++
> > >  libxfs/init.c       |   43 +++++++++++++++++++++++++++++++++++++++++++
> > >  libxfs/libxfs_io.h  |    2 ++
> > >  libxfs/rdwr.c       |   27 +++++++++++++++++++++++++--
> > >  4 files changed, 73 insertions(+), 2 deletions(-)
> > > 
> > > 
> > > diff --git a/include/xfs_mount.h b/include/xfs_mount.h
> > > index 29b3cc1b..c80aaf69 100644
> > > --- a/include/xfs_mount.h
> > > +++ b/include/xfs_mount.h
> > > @@ -187,4 +187,7 @@ extern xfs_mount_t	*libxfs_mount (xfs_mount_t *, xfs_sb_t *,
> > >  extern void	libxfs_umount (xfs_mount_t *);
> > >  extern void	libxfs_rtmount_destroy (xfs_mount_t *);
> > >  
> > > +void libxfs_flush_devices(struct xfs_mount *mp, int *datadev, int *logdev,
> > > +		int *rtdev);
> > > +
> > >  #endif	/* __XFS_MOUNT_H__ */
> > > diff --git a/libxfs/init.c b/libxfs/init.c
> > > index a0d4b7f4..d1d3f4df 100644
> > > --- a/libxfs/init.c
> > > +++ b/libxfs/init.c
> > > @@ -569,6 +569,8 @@ libxfs_buftarg_alloc(
> > >  	}
> > >  	btp->bt_mount = mp;
> > >  	btp->dev = dev;
> > > +	btp->lost_writes = false;
> > > +
> > >  	return btp;
> > >  }
> > >  
> > > @@ -791,6 +793,47 @@ libxfs_rtmount_destroy(xfs_mount_t *mp)
> > >  	mp->m_rsumip = mp->m_rbmip = NULL;
> > >  }
> > >  
> > > +static inline int
> > > +libxfs_flush_buftarg(
> > > +	struct xfs_buftarg	*btp)
> > > +{
> > > +	if (btp->lost_writes)
> > > +		return -ENOTRECOVERABLE;
> > 
> > I'm curious why we'd want to skip the flush just because some writes
> > happened to fail..? I suppose the fs might be borked, but it seems a
> > little strange to at least not try the flush, particularly since we
> > might still flush any of the other two possible devices.
> 
> My thinking here was that if the write verifiers (or the pwrite() calls
> themselves) failed then there's no point in telling the disk to flush
> its write cache since we already know it's not in sync with the buffer
> cache.
> 

I suppose, but it seems there is some value in flushing what we did
write.. That's effectively historical behavior (since we ignored
errors), right?

> > > +
> > > +	return libxfs_blkdev_issue_flush(btp);
> > > +}
> > > +
> > > +/*
> > > + * Purge the buffer cache to write all dirty buffers to disk and free all
> > > + * incore buffers.  Buffers that cannot be written will cause the lost_writes
> > > + * flag to be set in the buftarg.  If there were no lost writes, flush the
> > > + * device to make sure the writes made it to stable storage.
> > > + *
> > > + * For each device, the return code will be set to -ENOTRECOVERABLE if we
> > > + * couldn't write something to disk; or the results of the block device flush
> > > + * operation.
> > 
> > Why not -EIO?
> 
> Originally I thought it might be useful to be able to distinguish
> between "dirty buffers never even made it out of the buffer cache" vs.
> "dirty buffers were sent to the disk but the disk sent back media
> errors", though in the end the userspace tools don't make any
> distinction.
> 
> That said, looking at this again, maybe I should track write verifier
> failure separately so that we can return EFSCORRUPTED for that?
> 

It's not clear to me that anything application level would care much
about verifier failure vs. I/O failure, but I've no objection to doing
something like that either.

Brian

> --D
> 
> > 
> > Brian
> > 
> > > + */
> > > +void
> > > +libxfs_flush_devices(
> > > +	struct xfs_mount	*mp,
> > > +	int			*datadev,
> > > +	int			*logdev,
> > > +	int			*rtdev)
> > > +{
> > > +	*datadev = *logdev = *rtdev = 0;
> > > +
> > > +	libxfs_bcache_purge();
> > > +
> > > +	if (mp->m_ddev_targp)
> > > +		*datadev = libxfs_flush_buftarg(mp->m_ddev_targp);
> > > +
> > > +	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
> > > +		*logdev = libxfs_flush_buftarg(mp->m_logdev_targp);
> > > +
> > > +	if (mp->m_rtdev_targp)
> > > +		*rtdev = libxfs_flush_buftarg(mp->m_rtdev_targp);
> > > +}
> > > +
> > >  /*
> > >   * Release any resource obtained during a mount.
> > >   */
> > > diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h
> > > index 579df52b..fc0fd060 100644
> > > --- a/libxfs/libxfs_io.h
> > > +++ b/libxfs/libxfs_io.h
> > > @@ -23,10 +23,12 @@ struct xfs_perag;
> > >  struct xfs_buftarg {
> > >  	struct xfs_mount	*bt_mount;
> > >  	dev_t			dev;
> > > +	bool			lost_writes;
> > >  };
> > >  
> > >  extern void	libxfs_buftarg_init(struct xfs_mount *mp, dev_t ddev,
> > >  				    dev_t logdev, dev_t rtdev);
> > > +int libxfs_blkdev_issue_flush(struct xfs_buftarg *btp);
> > >  
> > >  #define LIBXFS_BBTOOFF64(bbs)	(((xfs_off_t)(bbs)) << BBSHIFT)
> > >  
> > > diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c
> > > index 8b47d438..92e497f9 100644
> > > --- a/libxfs/rdwr.c
> > > +++ b/libxfs/rdwr.c
> > > @@ -17,6 +17,7 @@
> > >  #include "xfs_inode_fork.h"
> > >  #include "xfs_inode.h"
> > >  #include "xfs_trans.h"
> > > +#include "libfrog/platform.h"
> > >  
> > >  #include "libxfs.h"		/* for LIBXFS_EXIT_ON_FAILURE */
> > >  
> > > @@ -1227,9 +1228,11 @@ libxfs_brelse(
> > >  
> > >  	if (!bp)
> > >  		return;
> > > -	if (bp->b_flags & LIBXFS_B_DIRTY)
> > > +	if (bp->b_flags & LIBXFS_B_DIRTY) {
> > >  		fprintf(stderr,
> > >  			"releasing dirty buffer to free list!\n");
> > > +		bp->b_target->lost_writes = true;
> > > +	}
> > >  
> > >  	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
> > >  	list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
> > > @@ -1248,9 +1251,11 @@ libxfs_bulkrelse(
> > >  		return 0 ;
> > >  
> > >  	list_for_each_entry(bp, list, b_node.cn_mru) {
> > > -		if (bp->b_flags & LIBXFS_B_DIRTY)
> > > +		if (bp->b_flags & LIBXFS_B_DIRTY) {
> > >  			fprintf(stderr,
> > >  				"releasing dirty buffer (bulk) to free list!\n");
> > > +			bp->b_target->lost_writes = true;
> > > +		}
> > >  		count++;
> > >  	}
> > >  
> > > @@ -1479,6 +1484,24 @@ libxfs_irele(
> > >  	kmem_cache_free(xfs_inode_zone, ip);
> > >  }
> > >  
> > > +/*
> > > + * Flush everything dirty in the kernel and disk write caches to stable media.
> > > + * Returns 0 for success or a negative error code.
> > > + */
> > > +int
> > > +libxfs_blkdev_issue_flush(
> > > +	struct xfs_buftarg	*btp)
> > > +{
> > > +	int			fd, ret;
> > > +
> > > +	if (btp->dev == 0)
> > > +		return 0;
> > > +
> > > +	fd = libxfs_device_to_fd(btp->dev);
> > > +	ret = platform_flush_device(fd, btp->dev);
> > > +	return ret ? -errno : 0;
> > > +}
> > > +
> > >  /*
> > >   * Write out a buffer list synchronously.
> > >   *
> > > 
> > 
>
Darrick J. Wong Feb. 20, 2020, 6:26 p.m. UTC | #4
On Thu, Feb 20, 2020 at 12:58:50PM -0500, Brian Foster wrote:
> On Thu, Feb 20, 2020 at 08:46:38AM -0800, Darrick J. Wong wrote:
> > On Thu, Feb 20, 2020 at 09:06:12AM -0500, Brian Foster wrote:
> > > On Wed, Feb 19, 2020 at 05:42:06PM -0800, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Add a new function that will ensure that everything we changed has
> > > > landed on stable media, and report the results.  Subsequent commits will
> > > > teach the individual programs to report when things go wrong.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
> > > >  include/xfs_mount.h |    3 +++
> > > >  libxfs/init.c       |   43 +++++++++++++++++++++++++++++++++++++++++++
> > > >  libxfs/libxfs_io.h  |    2 ++
> > > >  libxfs/rdwr.c       |   27 +++++++++++++++++++++++++--
> > > >  4 files changed, 73 insertions(+), 2 deletions(-)
> > > > 
> > > > 
> > > > diff --git a/include/xfs_mount.h b/include/xfs_mount.h
> > > > index 29b3cc1b..c80aaf69 100644
> > > > --- a/include/xfs_mount.h
> > > > +++ b/include/xfs_mount.h
> > > > @@ -187,4 +187,7 @@ extern xfs_mount_t	*libxfs_mount (xfs_mount_t *, xfs_sb_t *,
> > > >  extern void	libxfs_umount (xfs_mount_t *);
> > > >  extern void	libxfs_rtmount_destroy (xfs_mount_t *);
> > > >  
> > > > +void libxfs_flush_devices(struct xfs_mount *mp, int *datadev, int *logdev,
> > > > +		int *rtdev);
> > > > +
> > > >  #endif	/* __XFS_MOUNT_H__ */
> > > > diff --git a/libxfs/init.c b/libxfs/init.c
> > > > index a0d4b7f4..d1d3f4df 100644
> > > > --- a/libxfs/init.c
> > > > +++ b/libxfs/init.c
> > > > @@ -569,6 +569,8 @@ libxfs_buftarg_alloc(
> > > >  	}
> > > >  	btp->bt_mount = mp;
> > > >  	btp->dev = dev;
> > > > +	btp->lost_writes = false;
> > > > +
> > > >  	return btp;
> > > >  }
> > > >  
> > > > @@ -791,6 +793,47 @@ libxfs_rtmount_destroy(xfs_mount_t *mp)
> > > >  	mp->m_rsumip = mp->m_rbmip = NULL;
> > > >  }
> > > >  
> > > > +static inline int
> > > > +libxfs_flush_buftarg(
> > > > +	struct xfs_buftarg	*btp)
> > > > +{
> > > > +	if (btp->lost_writes)
> > > > +		return -ENOTRECOVERABLE;
> > > 
> > > I'm curious why we'd want to skip the flush just because some writes
> > > happened to fail..? I suppose the fs might be borked, but it seems a
> > > little strange to at least not try the flush, particularly since we
> > > might still flush any of the other two possible devices.
> > 
> > My thinking here was that if the write verifiers (or the pwrite() calls
> > themselves) failed then there's no point in telling the disk to flush
> > its write cache since we already know it's not in sync with the buffer
> > cache.
> > 
> 
> I suppose, but it seems there is some value in flushing what we did
> write.. That's effectively historical behavior (since we ignored
> errors), right?

It's the historical behavior, yes.  I don't think it makes much sense,
but OTOH I'm not opposed to restoring that.

> > > > +
> > > > +	return libxfs_blkdev_issue_flush(btp);
> > > > +}
> > > > +
> > > > +/*
> > > > + * Purge the buffer cache to write all dirty buffers to disk and free all
> > > > + * incore buffers.  Buffers that cannot be written will cause the lost_writes
> > > > + * flag to be set in the buftarg.  If there were no lost writes, flush the
> > > > + * device to make sure the writes made it to stable storage.
> > > > + *
> > > > + * For each device, the return code will be set to -ENOTRECOVERABLE if we
> > > > + * couldn't write something to disk; or the results of the block device flush
> > > > + * operation.
> > > 
> > > Why not -EIO?
> > 
> > Originally I thought it might be useful to be able to distinguish
> > between "dirty buffers never even made it out of the buffer cache" vs.
> > "dirty buffers were sent to the disk but the disk sent back media
> > errors", though in the end the userspace tools don't make any
> > distinction.
> > 
> > That said, looking at this again, maybe I should track write verifier
> > failure separately so that we can return EFSCORRUPTED for that?
> > 
> 
> It's not clear to me that anything application level would care much
> about verifier failure vs. I/O failure, but I've no objection to doing
> something like that either.

Yeah.  The single usecase I can think of is where repair trips over a
write verifier and we should make it really obvious to the sysadmin that
repair is buggy and needs either (a) an upgrade or (b) a complaint filed
on linux-xfs.

--D

> Brian
> 
> > --D
> > 
> > > 
> > > Brian
> > > 
> > > > + */
> > > > +void
> > > > +libxfs_flush_devices(
> > > > +	struct xfs_mount	*mp,
> > > > +	int			*datadev,
> > > > +	int			*logdev,
> > > > +	int			*rtdev)
> > > > +{
> > > > +	*datadev = *logdev = *rtdev = 0;
> > > > +
> > > > +	libxfs_bcache_purge();
> > > > +
> > > > +	if (mp->m_ddev_targp)
> > > > +		*datadev = libxfs_flush_buftarg(mp->m_ddev_targp);
> > > > +
> > > > +	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
> > > > +		*logdev = libxfs_flush_buftarg(mp->m_logdev_targp);
> > > > +
> > > > +	if (mp->m_rtdev_targp)
> > > > +		*rtdev = libxfs_flush_buftarg(mp->m_rtdev_targp);
> > > > +}
> > > > +
> > > >  /*
> > > >   * Release any resource obtained during a mount.
> > > >   */
> > > > diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h
> > > > index 579df52b..fc0fd060 100644
> > > > --- a/libxfs/libxfs_io.h
> > > > +++ b/libxfs/libxfs_io.h
> > > > @@ -23,10 +23,12 @@ struct xfs_perag;
> > > >  struct xfs_buftarg {
> > > >  	struct xfs_mount	*bt_mount;
> > > >  	dev_t			dev;
> > > > +	bool			lost_writes;
> > > >  };
> > > >  
> > > >  extern void	libxfs_buftarg_init(struct xfs_mount *mp, dev_t ddev,
> > > >  				    dev_t logdev, dev_t rtdev);
> > > > +int libxfs_blkdev_issue_flush(struct xfs_buftarg *btp);
> > > >  
> > > >  #define LIBXFS_BBTOOFF64(bbs)	(((xfs_off_t)(bbs)) << BBSHIFT)
> > > >  
> > > > diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c
> > > > index 8b47d438..92e497f9 100644
> > > > --- a/libxfs/rdwr.c
> > > > +++ b/libxfs/rdwr.c
> > > > @@ -17,6 +17,7 @@
> > > >  #include "xfs_inode_fork.h"
> > > >  #include "xfs_inode.h"
> > > >  #include "xfs_trans.h"
> > > > +#include "libfrog/platform.h"
> > > >  
> > > >  #include "libxfs.h"		/* for LIBXFS_EXIT_ON_FAILURE */
> > > >  
> > > > @@ -1227,9 +1228,11 @@ libxfs_brelse(
> > > >  
> > > >  	if (!bp)
> > > >  		return;
> > > > -	if (bp->b_flags & LIBXFS_B_DIRTY)
> > > > +	if (bp->b_flags & LIBXFS_B_DIRTY) {
> > > >  		fprintf(stderr,
> > > >  			"releasing dirty buffer to free list!\n");
> > > > +		bp->b_target->lost_writes = true;
> > > > +	}
> > > >  
> > > >  	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
> > > >  	list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
> > > > @@ -1248,9 +1251,11 @@ libxfs_bulkrelse(
> > > >  		return 0 ;
> > > >  
> > > >  	list_for_each_entry(bp, list, b_node.cn_mru) {
> > > > -		if (bp->b_flags & LIBXFS_B_DIRTY)
> > > > +		if (bp->b_flags & LIBXFS_B_DIRTY) {
> > > >  			fprintf(stderr,
> > > >  				"releasing dirty buffer (bulk) to free list!\n");
> > > > +			bp->b_target->lost_writes = true;
> > > > +		}
> > > >  		count++;
> > > >  	}
> > > >  
> > > > @@ -1479,6 +1484,24 @@ libxfs_irele(
> > > >  	kmem_cache_free(xfs_inode_zone, ip);
> > > >  }
> > > >  
> > > > +/*
> > > > + * Flush everything dirty in the kernel and disk write caches to stable media.
> > > > + * Returns 0 for success or a negative error code.
> > > > + */
> > > > +int
> > > > +libxfs_blkdev_issue_flush(
> > > > +	struct xfs_buftarg	*btp)
> > > > +{
> > > > +	int			fd, ret;
> > > > +
> > > > +	if (btp->dev == 0)
> > > > +		return 0;
> > > > +
> > > > +	fd = libxfs_device_to_fd(btp->dev);
> > > > +	ret = platform_flush_device(fd, btp->dev);
> > > > +	return ret ? -errno : 0;
> > > > +}
> > > > +
> > > >  /*
> > > >   * Write out a buffer list synchronously.
> > > >   *
> > > > 
> > > 
> > 
>
Brian Foster Feb. 20, 2020, 6:50 p.m. UTC | #5
On Thu, Feb 20, 2020 at 10:26:42AM -0800, Darrick J. Wong wrote:
> On Thu, Feb 20, 2020 at 12:58:50PM -0500, Brian Foster wrote:
> > On Thu, Feb 20, 2020 at 08:46:38AM -0800, Darrick J. Wong wrote:
> > > On Thu, Feb 20, 2020 at 09:06:12AM -0500, Brian Foster wrote:
> > > > On Wed, Feb 19, 2020 at 05:42:06PM -0800, Darrick J. Wong wrote:
> > > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > 
> > > > > Add a new function that will ensure that everything we changed has
> > > > > landed on stable media, and report the results.  Subsequent commits will
> > > > > teach the individual programs to report when things go wrong.
> > > > > 
> > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > ---
> > > > >  include/xfs_mount.h |    3 +++
> > > > >  libxfs/init.c       |   43 +++++++++++++++++++++++++++++++++++++++++++
> > > > >  libxfs/libxfs_io.h  |    2 ++
> > > > >  libxfs/rdwr.c       |   27 +++++++++++++++++++++++++--
> > > > >  4 files changed, 73 insertions(+), 2 deletions(-)
> > > > > 
> > > > > 
> > > > > diff --git a/include/xfs_mount.h b/include/xfs_mount.h
> > > > > index 29b3cc1b..c80aaf69 100644
> > > > > --- a/include/xfs_mount.h
> > > > > +++ b/include/xfs_mount.h
> > > > > @@ -187,4 +187,7 @@ extern xfs_mount_t	*libxfs_mount (xfs_mount_t *, xfs_sb_t *,
> > > > >  extern void	libxfs_umount (xfs_mount_t *);
> > > > >  extern void	libxfs_rtmount_destroy (xfs_mount_t *);
> > > > >  
> > > > > +void libxfs_flush_devices(struct xfs_mount *mp, int *datadev, int *logdev,
> > > > > +		int *rtdev);
> > > > > +
> > > > >  #endif	/* __XFS_MOUNT_H__ */
> > > > > diff --git a/libxfs/init.c b/libxfs/init.c
> > > > > index a0d4b7f4..d1d3f4df 100644
> > > > > --- a/libxfs/init.c
> > > > > +++ b/libxfs/init.c
> > > > > @@ -569,6 +569,8 @@ libxfs_buftarg_alloc(
> > > > >  	}
> > > > >  	btp->bt_mount = mp;
> > > > >  	btp->dev = dev;
> > > > > +	btp->lost_writes = false;
> > > > > +
> > > > >  	return btp;
> > > > >  }
> > > > >  
> > > > > @@ -791,6 +793,47 @@ libxfs_rtmount_destroy(xfs_mount_t *mp)
> > > > >  	mp->m_rsumip = mp->m_rbmip = NULL;
> > > > >  }
> > > > >  
> > > > > +static inline int
> > > > > +libxfs_flush_buftarg(
> > > > > +	struct xfs_buftarg	*btp)
> > > > > +{
> > > > > +	if (btp->lost_writes)
> > > > > +		return -ENOTRECOVERABLE;
> > > > 
> > > > I'm curious why we'd want to skip the flush just because some writes
> > > > happened to fail..? I suppose the fs might be borked, but it seems a
> > > > little strange to at least not try the flush, particularly since we
> > > > might still flush any of the other two possible devices.
> > > 
> > > My thinking here was that if the write verifiers (or the pwrite() calls
> > > themselves) failed then there's no point in telling the disk to flush
> > > its write cache since we already know it's not in sync with the buffer
> > > cache.
> > > 
> > 
> > I suppose, but it seems there is some value in flushing what we did
> > write.. That's effectively historical behavior (since we ignored
> > errors), right?
> 
> It's the historical behavior, yes.  I don't think it makes much sense,
> but OTOH I'm not opposed to restoring that.
> 

The way I think about it is if repair or something happens to rewrite a
bunch of metadata structures, etc. and then a particular I/O happens to
fail, we'll still end up with a corrupted fs in the end, but I don't see
that as a reason not to care about the integrity of the data that might
have been successfully written. We're most likely borked either way,
this just seems a bit less risky (and also less of a wart/landmine given
that the close codepath is still going to do the flush anyways).

> > > > > +
> > > > > +	return libxfs_blkdev_issue_flush(btp);
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * Purge the buffer cache to write all dirty buffers to disk and free all
> > > > > + * incore buffers.  Buffers that cannot be written will cause the lost_writes
> > > > > + * flag to be set in the buftarg.  If there were no lost writes, flush the
> > > > > + * device to make sure the writes made it to stable storage.
> > > > > + *
> > > > > + * For each device, the return code will be set to -ENOTRECOVERABLE if we
> > > > > + * couldn't write something to disk; or the results of the block device flush
> > > > > + * operation.
> > > > 
> > > > Why not -EIO?
> > > 
> > > Originally I thought it might be useful to be able to distinguish
> > > between "dirty buffers never even made it out of the buffer cache" vs.
> > > "dirty buffers were sent to the disk but the disk sent back media
> > > errors", though in the end the userspace tools don't make any
> > > distinction.
> > > 
> > > That said, looking at this again, maybe I should track write verifier
> > > failure separately so that we can return EFSCORRUPTED for that?
> > > 
> > 
> > It's not clear to me that anything application level would care much
> > about verifier failure vs. I/O failure, but I've no objection to doing
> > something like that either.
> 
> Yeah.  The single usecase I can think of is where repair trips over a
> write verifier and we should make it really obvious to the sysadmin that
> repair is buggy and needs either (a) an upgrade or (b) a complaint filed
> on linux-xfs.
> 

We do have the write verifier failure messages, but yeah, I can see that
being a more accurate distinction between -EIO and -EFSCORRUPTED.

Brian

> --D
> 
> > Brian
> > 
> > > --D
> > > 
> > > > 
> > > > Brian
> > > > 
> > > > > + */
> > > > > +void
> > > > > +libxfs_flush_devices(
> > > > > +	struct xfs_mount	*mp,
> > > > > +	int			*datadev,
> > > > > +	int			*logdev,
> > > > > +	int			*rtdev)
> > > > > +{
> > > > > +	*datadev = *logdev = *rtdev = 0;
> > > > > +
> > > > > +	libxfs_bcache_purge();
> > > > > +
> > > > > +	if (mp->m_ddev_targp)
> > > > > +		*datadev = libxfs_flush_buftarg(mp->m_ddev_targp);
> > > > > +
> > > > > +	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
> > > > > +		*logdev = libxfs_flush_buftarg(mp->m_logdev_targp);
> > > > > +
> > > > > +	if (mp->m_rtdev_targp)
> > > > > +		*rtdev = libxfs_flush_buftarg(mp->m_rtdev_targp);
> > > > > +}
> > > > > +
> > > > >  /*
> > > > >   * Release any resource obtained during a mount.
> > > > >   */
> > > > > diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h
> > > > > index 579df52b..fc0fd060 100644
> > > > > --- a/libxfs/libxfs_io.h
> > > > > +++ b/libxfs/libxfs_io.h
> > > > > @@ -23,10 +23,12 @@ struct xfs_perag;
> > > > >  struct xfs_buftarg {
> > > > >  	struct xfs_mount	*bt_mount;
> > > > >  	dev_t			dev;
> > > > > +	bool			lost_writes;
> > > > >  };
> > > > >  
> > > > >  extern void	libxfs_buftarg_init(struct xfs_mount *mp, dev_t ddev,
> > > > >  				    dev_t logdev, dev_t rtdev);
> > > > > +int libxfs_blkdev_issue_flush(struct xfs_buftarg *btp);
> > > > >  
> > > > >  #define LIBXFS_BBTOOFF64(bbs)	(((xfs_off_t)(bbs)) << BBSHIFT)
> > > > >  
> > > > > diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c
> > > > > index 8b47d438..92e497f9 100644
> > > > > --- a/libxfs/rdwr.c
> > > > > +++ b/libxfs/rdwr.c
> > > > > @@ -17,6 +17,7 @@
> > > > >  #include "xfs_inode_fork.h"
> > > > >  #include "xfs_inode.h"
> > > > >  #include "xfs_trans.h"
> > > > > +#include "libfrog/platform.h"
> > > > >  
> > > > >  #include "libxfs.h"		/* for LIBXFS_EXIT_ON_FAILURE */
> > > > >  
> > > > > @@ -1227,9 +1228,11 @@ libxfs_brelse(
> > > > >  
> > > > >  	if (!bp)
> > > > >  		return;
> > > > > -	if (bp->b_flags & LIBXFS_B_DIRTY)
> > > > > +	if (bp->b_flags & LIBXFS_B_DIRTY) {
> > > > >  		fprintf(stderr,
> > > > >  			"releasing dirty buffer to free list!\n");
> > > > > +		bp->b_target->lost_writes = true;
> > > > > +	}
> > > > >  
> > > > >  	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
> > > > >  	list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
> > > > > @@ -1248,9 +1251,11 @@ libxfs_bulkrelse(
> > > > >  		return 0 ;
> > > > >  
> > > > >  	list_for_each_entry(bp, list, b_node.cn_mru) {
> > > > > -		if (bp->b_flags & LIBXFS_B_DIRTY)
> > > > > +		if (bp->b_flags & LIBXFS_B_DIRTY) {
> > > > >  			fprintf(stderr,
> > > > >  				"releasing dirty buffer (bulk) to free list!\n");
> > > > > +			bp->b_target->lost_writes = true;
> > > > > +		}
> > > > >  		count++;
> > > > >  	}
> > > > >  
> > > > > @@ -1479,6 +1484,24 @@ libxfs_irele(
> > > > >  	kmem_cache_free(xfs_inode_zone, ip);
> > > > >  }
> > > > >  
> > > > > +/*
> > > > > + * Flush everything dirty in the kernel and disk write caches to stable media.
> > > > > + * Returns 0 for success or a negative error code.
> > > > > + */
> > > > > +int
> > > > > +libxfs_blkdev_issue_flush(
> > > > > +	struct xfs_buftarg	*btp)
> > > > > +{
> > > > > +	int			fd, ret;
> > > > > +
> > > > > +	if (btp->dev == 0)
> > > > > +		return 0;
> > > > > +
> > > > > +	fd = libxfs_device_to_fd(btp->dev);
> > > > > +	ret = platform_flush_device(fd, btp->dev);
> > > > > +	return ret ? -errno : 0;
> > > > > +}
> > > > > +
> > > > >  /*
> > > > >   * Write out a buffer list synchronously.
> > > > >   *
> > > > > 
> > > > 
> > > 
> > 
>
Dave Chinner Feb. 20, 2020, 11:40 p.m. UTC | #6
On Wed, Feb 19, 2020 at 05:42:06PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Add a new function that will ensure that everything we changed has
> landed on stable media, and report the results.  Subsequent commits will
> teach the individual programs to report when things go wrong.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  include/xfs_mount.h |    3 +++
>  libxfs/init.c       |   43 +++++++++++++++++++++++++++++++++++++++++++
>  libxfs/libxfs_io.h  |    2 ++
>  libxfs/rdwr.c       |   27 +++++++++++++++++++++++++--
>  4 files changed, 73 insertions(+), 2 deletions(-)
> 
> 
> diff --git a/include/xfs_mount.h b/include/xfs_mount.h
> index 29b3cc1b..c80aaf69 100644
> --- a/include/xfs_mount.h
> +++ b/include/xfs_mount.h
> @@ -187,4 +187,7 @@ extern xfs_mount_t	*libxfs_mount (xfs_mount_t *, xfs_sb_t *,
>  extern void	libxfs_umount (xfs_mount_t *);
>  extern void	libxfs_rtmount_destroy (xfs_mount_t *);
>  
> +void libxfs_flush_devices(struct xfs_mount *mp, int *datadev, int *logdev,
> +		int *rtdev);
> +
>  #endif	/* __XFS_MOUNT_H__ */
> diff --git a/libxfs/init.c b/libxfs/init.c
> index a0d4b7f4..d1d3f4df 100644
> --- a/libxfs/init.c
> +++ b/libxfs/init.c
> @@ -569,6 +569,8 @@ libxfs_buftarg_alloc(
>  	}
>  	btp->bt_mount = mp;
>  	btp->dev = dev;
> +	btp->lost_writes = false;
> +
>  	return btp;
>  }
>  
> @@ -791,6 +793,47 @@ libxfs_rtmount_destroy(xfs_mount_t *mp)
>  	mp->m_rsumip = mp->m_rbmip = NULL;
>  }
>  
> +static inline int
> +libxfs_flush_buftarg(
> +	struct xfs_buftarg	*btp)
> +{
> +	if (btp->lost_writes)
> +		return -ENOTRECOVERABLE;
> +
> +	return libxfs_blkdev_issue_flush(btp);
> +}
> +
> +/*
> + * Purge the buffer cache to write all dirty buffers to disk and free all
> + * incore buffers.  Buffers that cannot be written will cause the lost_writes
> + * flag to be set in the buftarg.  If there were no lost writes, flush the
> + * device to make sure the writes made it to stable storage.
> + *
> + * For each device, the return code will be set to -ENOTRECOVERABLE if we
> + * couldn't write something to disk; or the results of the block device flush
> + * operation.
> + */
> +void
> +libxfs_flush_devices(
> +	struct xfs_mount	*mp,
> +	int			*datadev,
> +	int			*logdev,
> +	int			*rtdev)
> +{
> +	*datadev = *logdev = *rtdev = 0;
> +
> +	libxfs_bcache_purge();
> +
> +	if (mp->m_ddev_targp)
> +		*datadev = libxfs_flush_buftarg(mp->m_ddev_targp);
> +
> +	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
> +		*logdev = libxfs_flush_buftarg(mp->m_logdev_targp);
> +
> +	if (mp->m_rtdev_targp)
> +		*rtdev = libxfs_flush_buftarg(mp->m_rtdev_targp);
> +}

So one of the things I'm trying to do is make this use similar code
to the kernel. basically this close process is equivalent of
xfs_wait_buftarg() which waits for all references to buffers to
got away, and if any buffer it tagged with WRITE_FAIL then it issues
and alert before it frees the buffer.

IOWs, any sort of delayed/async write failure that hasn't been
otherwise caught by the async/delwri code is caught by the device
close code....

Doing things like this (storing a "lost writes" flag on the buftarg)
I think is just going to make it harder to switch to kernel
equivalent code because it just introduces even more of an impedance
mismatch between userspace and kernel code...

Cheers,

Dave.
Darrick J. Wong Feb. 21, 2020, 12:33 a.m. UTC | #7
On Fri, Feb 21, 2020 at 10:40:55AM +1100, Dave Chinner wrote:
> On Wed, Feb 19, 2020 at 05:42:06PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Add a new function that will ensure that everything we changed has
> > landed on stable media, and report the results.  Subsequent commits will
> > teach the individual programs to report when things go wrong.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  include/xfs_mount.h |    3 +++
> >  libxfs/init.c       |   43 +++++++++++++++++++++++++++++++++++++++++++
> >  libxfs/libxfs_io.h  |    2 ++
> >  libxfs/rdwr.c       |   27 +++++++++++++++++++++++++--
> >  4 files changed, 73 insertions(+), 2 deletions(-)
> > 
> > 
> > diff --git a/include/xfs_mount.h b/include/xfs_mount.h
> > index 29b3cc1b..c80aaf69 100644
> > --- a/include/xfs_mount.h
> > +++ b/include/xfs_mount.h
> > @@ -187,4 +187,7 @@ extern xfs_mount_t	*libxfs_mount (xfs_mount_t *, xfs_sb_t *,
> >  extern void	libxfs_umount (xfs_mount_t *);
> >  extern void	libxfs_rtmount_destroy (xfs_mount_t *);
> >  
> > +void libxfs_flush_devices(struct xfs_mount *mp, int *datadev, int *logdev,
> > +		int *rtdev);
> > +
> >  #endif	/* __XFS_MOUNT_H__ */
> > diff --git a/libxfs/init.c b/libxfs/init.c
> > index a0d4b7f4..d1d3f4df 100644
> > --- a/libxfs/init.c
> > +++ b/libxfs/init.c
> > @@ -569,6 +569,8 @@ libxfs_buftarg_alloc(
> >  	}
> >  	btp->bt_mount = mp;
> >  	btp->dev = dev;
> > +	btp->lost_writes = false;
> > +
> >  	return btp;
> >  }
> >  
> > @@ -791,6 +793,47 @@ libxfs_rtmount_destroy(xfs_mount_t *mp)
> >  	mp->m_rsumip = mp->m_rbmip = NULL;
> >  }
> >  
> > +static inline int
> > +libxfs_flush_buftarg(
> > +	struct xfs_buftarg	*btp)
> > +{
> > +	if (btp->lost_writes)
> > +		return -ENOTRECOVERABLE;
> > +
> > +	return libxfs_blkdev_issue_flush(btp);
> > +}
> > +
> > +/*
> > + * Purge the buffer cache to write all dirty buffers to disk and free all
> > + * incore buffers.  Buffers that cannot be written will cause the lost_writes
> > + * flag to be set in the buftarg.  If there were no lost writes, flush the
> > + * device to make sure the writes made it to stable storage.
> > + *
> > + * For each device, the return code will be set to -ENOTRECOVERABLE if we
> > + * couldn't write something to disk; or the results of the block device flush
> > + * operation.
> > + */
> > +void
> > +libxfs_flush_devices(
> > +	struct xfs_mount	*mp,
> > +	int			*datadev,
> > +	int			*logdev,
> > +	int			*rtdev)
> > +{
> > +	*datadev = *logdev = *rtdev = 0;
> > +
> > +	libxfs_bcache_purge();
> > +
> > +	if (mp->m_ddev_targp)
> > +		*datadev = libxfs_flush_buftarg(mp->m_ddev_targp);
> > +
> > +	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
> > +		*logdev = libxfs_flush_buftarg(mp->m_logdev_targp);
> > +
> > +	if (mp->m_rtdev_targp)
> > +		*rtdev = libxfs_flush_buftarg(mp->m_rtdev_targp);
> > +}
> 
> So one of the things I'm trying to do is make this use similar code
> to the kernel. basically this close process is equivalent of
> xfs_wait_buftarg() which waits for all references to buffers to
> got away, and if any buffer it tagged with WRITE_FAIL then it issues
> and alert before it frees the buffer.
> 
> IOWs, any sort of delayed/async write failure that hasn't been
> otherwise caught by the async/delwri code is caught by the device
> close code....
> 
> Doing things like this (storing a "lost writes" flag on the buftarg)
> I think is just going to make it harder to switch to kernel
> equivalent code because it just introduces even more of an impedance
> mismatch between userspace and kernel code...

Hmm.  In today's version of the code I've taken Brian's suggestions and
hidden all of the flushing and whinging details inside libxfs_umount.
This eliminates all of the external API and dependency hell (except that
_umount now returns The Usual Errorcode) which will make it easier(?) to
rip all of that out some day when we're ready to land your libaio port.
Maybe?

In the meantime, I really /do/ need to solve the user complaints about
xfs_repair spewing evidence on stderr that it hasn't fully fixed the fs
and yet exits with 0 status.

It would be much easier for me to design my patchsets to minimize your
rebasing headaches if you patchbombed your development tree on a
semi-regular basis like I do, if nothing else so that we can all see
what could be in the pipeline. :)

(I dunno, maybe we need a separate git backup^W^Wpatchbomb list. :P)

--D

> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
diff mbox series

Patch

diff --git a/include/xfs_mount.h b/include/xfs_mount.h
index 29b3cc1b..c80aaf69 100644
--- a/include/xfs_mount.h
+++ b/include/xfs_mount.h
@@ -187,4 +187,7 @@  extern xfs_mount_t	*libxfs_mount (xfs_mount_t *, xfs_sb_t *,
 extern void	libxfs_umount (xfs_mount_t *);
 extern void	libxfs_rtmount_destroy (xfs_mount_t *);
 
+void libxfs_flush_devices(struct xfs_mount *mp, int *datadev, int *logdev,
+		int *rtdev);
+
 #endif	/* __XFS_MOUNT_H__ */
diff --git a/libxfs/init.c b/libxfs/init.c
index a0d4b7f4..d1d3f4df 100644
--- a/libxfs/init.c
+++ b/libxfs/init.c
@@ -569,6 +569,8 @@  libxfs_buftarg_alloc(
 	}
 	btp->bt_mount = mp;
 	btp->dev = dev;
+	btp->lost_writes = false;
+
 	return btp;
 }
 
@@ -791,6 +793,47 @@  libxfs_rtmount_destroy(xfs_mount_t *mp)
 	mp->m_rsumip = mp->m_rbmip = NULL;
 }
 
+static inline int
+libxfs_flush_buftarg(
+	struct xfs_buftarg	*btp)
+{
+	if (btp->lost_writes)
+		return -ENOTRECOVERABLE;
+
+	return libxfs_blkdev_issue_flush(btp);
+}
+
+/*
+ * Purge the buffer cache to write all dirty buffers to disk and free all
+ * incore buffers.  Buffers that cannot be written will cause the lost_writes
+ * flag to be set in the buftarg.  If there were no lost writes, flush the
+ * device to make sure the writes made it to stable storage.
+ *
+ * For each device, the return code will be set to -ENOTRECOVERABLE if we
+ * couldn't write something to disk; or the results of the block device flush
+ * operation.
+ */
+void
+libxfs_flush_devices(
+	struct xfs_mount	*mp,
+	int			*datadev,
+	int			*logdev,
+	int			*rtdev)
+{
+	*datadev = *logdev = *rtdev = 0;
+
+	libxfs_bcache_purge();
+
+	if (mp->m_ddev_targp)
+		*datadev = libxfs_flush_buftarg(mp->m_ddev_targp);
+
+	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
+		*logdev = libxfs_flush_buftarg(mp->m_logdev_targp);
+
+	if (mp->m_rtdev_targp)
+		*rtdev = libxfs_flush_buftarg(mp->m_rtdev_targp);
+}
+
 /*
  * Release any resource obtained during a mount.
  */
diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h
index 579df52b..fc0fd060 100644
--- a/libxfs/libxfs_io.h
+++ b/libxfs/libxfs_io.h
@@ -23,10 +23,12 @@  struct xfs_perag;
 struct xfs_buftarg {
 	struct xfs_mount	*bt_mount;
 	dev_t			dev;
+	bool			lost_writes;
 };
 
 extern void	libxfs_buftarg_init(struct xfs_mount *mp, dev_t ddev,
 				    dev_t logdev, dev_t rtdev);
+int libxfs_blkdev_issue_flush(struct xfs_buftarg *btp);
 
 #define LIBXFS_BBTOOFF64(bbs)	(((xfs_off_t)(bbs)) << BBSHIFT)
 
diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c
index 8b47d438..92e497f9 100644
--- a/libxfs/rdwr.c
+++ b/libxfs/rdwr.c
@@ -17,6 +17,7 @@ 
 #include "xfs_inode_fork.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
+#include "libfrog/platform.h"
 
 #include "libxfs.h"		/* for LIBXFS_EXIT_ON_FAILURE */
 
@@ -1227,9 +1228,11 @@  libxfs_brelse(
 
 	if (!bp)
 		return;
-	if (bp->b_flags & LIBXFS_B_DIRTY)
+	if (bp->b_flags & LIBXFS_B_DIRTY) {
 		fprintf(stderr,
 			"releasing dirty buffer to free list!\n");
+		bp->b_target->lost_writes = true;
+	}
 
 	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
 	list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
@@ -1248,9 +1251,11 @@  libxfs_bulkrelse(
 		return 0 ;
 
 	list_for_each_entry(bp, list, b_node.cn_mru) {
-		if (bp->b_flags & LIBXFS_B_DIRTY)
+		if (bp->b_flags & LIBXFS_B_DIRTY) {
 			fprintf(stderr,
 				"releasing dirty buffer (bulk) to free list!\n");
+			bp->b_target->lost_writes = true;
+		}
 		count++;
 	}
 
@@ -1479,6 +1484,24 @@  libxfs_irele(
 	kmem_cache_free(xfs_inode_zone, ip);
 }
 
+/*
+ * Flush everything dirty in the kernel and disk write caches to stable media.
+ * Returns 0 for success or a negative error code.
+ */
+int
+libxfs_blkdev_issue_flush(
+	struct xfs_buftarg	*btp)
+{
+	int			fd, ret;
+
+	if (btp->dev == 0)
+		return 0;
+
+	fd = libxfs_device_to_fd(btp->dev);
+	ret = platform_flush_device(fd, btp->dev);
+	return ret ? -errno : 0;
+}
+
 /*
  * Write out a buffer list synchronously.
  *