diff mbox series

[07/10] xfs_repair: set NEEDSREPAIR when we deliberately corrupt directories

Message ID 161284384405.3057868.8114203697655713495.stgit@magnolia (mailing list archive)
State Superseded
Headers show
Series xfs: add the ability to flag a fs for repair | expand

Commit Message

Darrick J. Wong Feb. 9, 2021, 4:10 a.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

There are a few places in xfs_repair's directory checking code where we
deliberately corrupt a directory entry as a sentinel to trigger a
correction in a later repair phase.  In the meantime, the filesystem is
inconsistent, so set the needsrepair flag to force a re-run of repair if
the system goes down.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/agheader.h   |    2 ++
 repair/dir2.c       |    3 +++
 repair/phase6.c     |    7 +++++++
 repair/xfs_repair.c |   37 +++++++++++++++++++++++++++++++++++++
 4 files changed, 49 insertions(+)

Comments

Christoph Hellwig Feb. 9, 2021, 9:13 a.m. UTC | #1
On Mon, Feb 08, 2021 at 08:10:44PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> There are a few places in xfs_repair's directory checking code where we
> deliberately corrupt a directory entry as a sentinel to trigger a
> correction in later repair phase.  In the mean time, the filesystem is
> inconsistent, so set the needsrepair flag to force a re-run of repair if
> the system goes down.

I guess this is an improvement over what we have now, but I suspect an
in-core side band way to notify the later phases would be much better
than these corrupt sentinel values.
Brian Foster Feb. 9, 2021, 5:20 p.m. UTC | #2
On Mon, Feb 08, 2021 at 08:10:44PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> There are a few places in xfs_repair's directory checking code where we
> deliberately corrupt a directory entry as a sentinel to trigger a
> correction in later repair phase.  In the mean time, the filesystem is
> inconsistent, so set the needsrepair flag to force a re-run of repair if
> the system goes down.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---

Hmm.. this seems orthogonal to the rest of the series. I'm sure we can
come up with various additional uses for the bit, but it seems a little
odd to me that repair might set it in some cases after a crash but not
others (if the filesystem happens to already be corrupt, for example).

Brian

>  repair/agheader.h   |    2 ++
>  repair/dir2.c       |    3 +++
>  repair/phase6.c     |    7 +++++++
>  repair/xfs_repair.c |   37 +++++++++++++++++++++++++++++++++++++
>  4 files changed, 49 insertions(+)
> 
> 
> diff --git a/repair/agheader.h b/repair/agheader.h
> index a63827c8..fa6fe596 100644
> --- a/repair/agheader.h
> +++ b/repair/agheader.h
> @@ -82,3 +82,5 @@ typedef struct fs_geo_list  {
>  #define XR_AG_AGF	0x2
>  #define XR_AG_AGI	0x4
>  #define XR_AG_SB_SEC	0x8
> +
> +void force_needsrepair(struct xfs_mount *mp);
> diff --git a/repair/dir2.c b/repair/dir2.c
> index eabdb4f2..922b8a3e 100644
> --- a/repair/dir2.c
> +++ b/repair/dir2.c
> @@ -15,6 +15,7 @@
>  #include "da_util.h"
>  #include "prefetch.h"
>  #include "progress.h"
> +#include "agheader.h"
>  
>  /*
>   * Known bad inode list.  These are seen when the leaf and node
> @@ -774,6 +775,7 @@ _("entry at block %u offset %" PRIdPTR " in directory inode %" PRIu64
>  				do_warn(
>  _("\tclearing inode number in entry at offset %" PRIdPTR "...\n"),
>  					(intptr_t)ptr - (intptr_t)d);
> +				force_needsrepair(mp);
>  				dep->name[0] = '/';
>  				*dirty = 1;
>  			} else {
> @@ -914,6 +916,7 @@ _("entry \"%*.*s\" in directory inode %" PRIu64 " points to self: "),
>  		 */
>  		if (junkit) {
>  			if (!no_modify) {
> +				force_needsrepair(mp);
>  				dep->name[0] = '/';
>  				*dirty = 1;
>  				do_warn(_("clearing entry\n"));
> diff --git a/repair/phase6.c b/repair/phase6.c
> index 14464bef..5ecbe9b2 100644
> --- a/repair/phase6.c
> +++ b/repair/phase6.c
> @@ -1649,6 +1649,7 @@ longform_dir2_entry_check_data(
>  			if (entry_junked(
>  	_("entry \"%s\" in directory inode %" PRIu64 " points to non-existent inode %" PRIu64 ""),
>  					fname, ip->i_ino, inum)) {
> +				force_needsrepair(mp);
>  				dep->name[0] = '/';
>  				libxfs_dir2_data_log_entry(&da, bp, dep);
>  			}
> @@ -1666,6 +1667,7 @@ longform_dir2_entry_check_data(
>  			if (entry_junked(
>  	_("entry \"%s\" in directory inode %" PRIu64 " points to free inode %" PRIu64),
>  					fname, ip->i_ino, inum)) {
> +				force_needsrepair(mp);
>  				dep->name[0] = '/';
>  				libxfs_dir2_data_log_entry(&da, bp, dep);
>  			}
> @@ -1684,6 +1686,7 @@ longform_dir2_entry_check_data(
>  				if (entry_junked(
>  	_("%s (ino %" PRIu64 ") in root (%" PRIu64 ") is not a directory"),
>  						ORPHANAGE, inum, ip->i_ino)) {
> +					force_needsrepair(mp);
>  					dep->name[0] = '/';
>  					libxfs_dir2_data_log_entry(&da, bp, dep);
>  				}
> @@ -1706,6 +1709,7 @@ longform_dir2_entry_check_data(
>  			if (entry_junked(
>  	_("entry \"%s\" (ino %" PRIu64 ") in dir %" PRIu64 " is a duplicate name"),
>  					fname, inum, ip->i_ino)) {
> +				force_needsrepair(mp);
>  				dep->name[0] = '/';
>  				libxfs_dir2_data_log_entry(&da, bp, dep);
>  			}
> @@ -1737,6 +1741,7 @@ longform_dir2_entry_check_data(
>  				if (entry_junked(
>  	_("entry \"%s\" (ino %" PRIu64 ") in dir %" PRIu64 " is not in the the first block"), fname,
>  						inum, ip->i_ino)) {
> +					force_needsrepair(mp);
>  					dep->name[0] = '/';
>  					libxfs_dir2_data_log_entry(&da, bp, dep);
>  				}
> @@ -1764,6 +1769,7 @@ longform_dir2_entry_check_data(
>  				if (entry_junked(
>  	_("entry \"%s\" in dir %" PRIu64 " is not the first entry"),
>  						fname, inum, ip->i_ino)) {
> +					force_needsrepair(mp);
>  					dep->name[0] = '/';
>  					libxfs_dir2_data_log_entry(&da, bp, dep);
>  				}
> @@ -1852,6 +1858,7 @@ _("entry \"%s\" in dir inode %" PRIu64 " inconsistent with .. value (%" PRIu64 "
>  				orphanage_ino = 0;
>  			nbad++;
>  			if (!no_modify)  {
> +				force_needsrepair(mp);
>  				dep->name[0] = '/';
>  				libxfs_dir2_data_log_entry(&da, bp, dep);
>  				if (verbose)
> diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c
> index f607afcb..9dc73854 100644
> --- a/repair/xfs_repair.c
> +++ b/repair/xfs_repair.c
> @@ -754,6 +754,43 @@ clear_needsrepair(
>  		libxfs_buf_relse(bp);
>  }
>  
> +/*
> + * Mark the filesystem as needing repair.  This should only be called by code
> + * that deliberately sets invalid sentinel values in the on-disk metadata to
> + * trigger a later reconstruction, and only after we've settled the primary
> + * super contents (i.e. after phase 1).
> + */
> +void
> +force_needsrepair(
> +	struct xfs_mount	*mp)
> +{
> +	struct xfs_buf		*bp;
> +	int			error;
> +
> +	if (!xfs_sb_version_hascrc(&mp->m_sb) ||
> +	    xfs_sb_version_needsrepair(&mp->m_sb))
> +		return;
> +
> +	bp = libxfs_getsb(mp);
> +	if (!bp || bp->b_error) {
> +		do_log(
> +	_("couldn't get superblock to set needsrepair, err=%d\n"),
> +				bp ? bp->b_error : ENOMEM);
> +		return;
> +	} else {
> +		mp->m_sb.sb_features_incompat |=
> +				XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR;
> +		libxfs_sb_to_disk(bp->b_addr, &mp->m_sb);
> +
> +		/* Force the primary super to disk immediately. */
> +		error = -libxfs_bwrite(bp);
> +		if (error)
> +			do_log(_("couldn't force needsrepair, err=%d\n"), error);
> +	}
> +	if (bp)
> +		libxfs_buf_relse(bp);
> +}
> +
>  int
>  main(int argc, char **argv)
>  {
>
Darrick J. Wong Feb. 9, 2021, 6:35 p.m. UTC | #3
On Tue, Feb 09, 2021 at 12:20:59PM -0500, Brian Foster wrote:
> On Mon, Feb 08, 2021 at 08:10:44PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > There are a few places in xfs_repair's directory checking code where we
> > deliberately corrupt a directory entry as a sentinel to trigger a
> > correction in later repair phase.  In the mean time, the filesystem is
> > inconsistent, so set the needsrepair flag to force a re-run of repair if
> > the system goes down.
> > 
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > ---
> 
> Hmm.. this seems orthogonal to the rest of the series. I'm sure we can
> come up with various additional uses for the bit, but it seems a little
> odd to me that repair might set it in some cases after a crash but not
> others (if the filesystem happens to already be corrupt, for example).

<nod> Another option I thought of is to add a hook to the buffer cache
so that the first time anyone tries to bwrite a buffer (either directly
or via a delwri list or normal buffer cache writeback) we'll also set
needsrepair on the ondisk primary super.  That would protect us against
other scenarios like crashing after writing a new AGF but before writing
the new AGI, where the fs is left in an indeterminate state.

Hmm, maybe I should pursue /that/ instead.

--D

> Brian
> 
> >  repair/agheader.h   |    2 ++
> >  repair/dir2.c       |    3 +++
> >  repair/phase6.c     |    7 +++++++
> >  repair/xfs_repair.c |   37 +++++++++++++++++++++++++++++++++++++
> >  4 files changed, 49 insertions(+)
> > 
> > 
> > diff --git a/repair/agheader.h b/repair/agheader.h
> > index a63827c8..fa6fe596 100644
> > --- a/repair/agheader.h
> > +++ b/repair/agheader.h
> > @@ -82,3 +82,5 @@ typedef struct fs_geo_list  {
> >  #define XR_AG_AGF	0x2
> >  #define XR_AG_AGI	0x4
> >  #define XR_AG_SB_SEC	0x8
> > +
> > +void force_needsrepair(struct xfs_mount *mp);
> > diff --git a/repair/dir2.c b/repair/dir2.c
> > index eabdb4f2..922b8a3e 100644
> > --- a/repair/dir2.c
> > +++ b/repair/dir2.c
> > @@ -15,6 +15,7 @@
> >  #include "da_util.h"
> >  #include "prefetch.h"
> >  #include "progress.h"
> > +#include "agheader.h"
> >  
> >  /*
> >   * Known bad inode list.  These are seen when the leaf and node
> > @@ -774,6 +775,7 @@ _("entry at block %u offset %" PRIdPTR " in directory inode %" PRIu64
> >  				do_warn(
> >  _("\tclearing inode number in entry at offset %" PRIdPTR "...\n"),
> >  					(intptr_t)ptr - (intptr_t)d);
> > +				force_needsrepair(mp);
> >  				dep->name[0] = '/';
> >  				*dirty = 1;
> >  			} else {
> > @@ -914,6 +916,7 @@ _("entry \"%*.*s\" in directory inode %" PRIu64 " points to self: "),
> >  		 */
> >  		if (junkit) {
> >  			if (!no_modify) {
> > +				force_needsrepair(mp);
> >  				dep->name[0] = '/';
> >  				*dirty = 1;
> >  				do_warn(_("clearing entry\n"));
> > diff --git a/repair/phase6.c b/repair/phase6.c
> > index 14464bef..5ecbe9b2 100644
> > --- a/repair/phase6.c
> > +++ b/repair/phase6.c
> > @@ -1649,6 +1649,7 @@ longform_dir2_entry_check_data(
> >  			if (entry_junked(
> >  	_("entry \"%s\" in directory inode %" PRIu64 " points to non-existent inode %" PRIu64 ""),
> >  					fname, ip->i_ino, inum)) {
> > +				force_needsrepair(mp);
> >  				dep->name[0] = '/';
> >  				libxfs_dir2_data_log_entry(&da, bp, dep);
> >  			}
> > @@ -1666,6 +1667,7 @@ longform_dir2_entry_check_data(
> >  			if (entry_junked(
> >  	_("entry \"%s\" in directory inode %" PRIu64 " points to free inode %" PRIu64),
> >  					fname, ip->i_ino, inum)) {
> > +				force_needsrepair(mp);
> >  				dep->name[0] = '/';
> >  				libxfs_dir2_data_log_entry(&da, bp, dep);
> >  			}
> > @@ -1684,6 +1686,7 @@ longform_dir2_entry_check_data(
> >  				if (entry_junked(
> >  	_("%s (ino %" PRIu64 ") in root (%" PRIu64 ") is not a directory"),
> >  						ORPHANAGE, inum, ip->i_ino)) {
> > +					force_needsrepair(mp);
> >  					dep->name[0] = '/';
> >  					libxfs_dir2_data_log_entry(&da, bp, dep);
> >  				}
> > @@ -1706,6 +1709,7 @@ longform_dir2_entry_check_data(
> >  			if (entry_junked(
> >  	_("entry \"%s\" (ino %" PRIu64 ") in dir %" PRIu64 " is a duplicate name"),
> >  					fname, inum, ip->i_ino)) {
> > +				force_needsrepair(mp);
> >  				dep->name[0] = '/';
> >  				libxfs_dir2_data_log_entry(&da, bp, dep);
> >  			}
> > @@ -1737,6 +1741,7 @@ longform_dir2_entry_check_data(
> >  				if (entry_junked(
> >  	_("entry \"%s\" (ino %" PRIu64 ") in dir %" PRIu64 " is not in the the first block"), fname,
> >  						inum, ip->i_ino)) {
> > +					force_needsrepair(mp);
> >  					dep->name[0] = '/';
> >  					libxfs_dir2_data_log_entry(&da, bp, dep);
> >  				}
> > @@ -1764,6 +1769,7 @@ longform_dir2_entry_check_data(
> >  				if (entry_junked(
> >  	_("entry \"%s\" in dir %" PRIu64 " is not the first entry"),
> >  						fname, inum, ip->i_ino)) {
> > +					force_needsrepair(mp);
> >  					dep->name[0] = '/';
> >  					libxfs_dir2_data_log_entry(&da, bp, dep);
> >  				}
> > @@ -1852,6 +1858,7 @@ _("entry \"%s\" in dir inode %" PRIu64 " inconsistent with .. value (%" PRIu64 "
> >  				orphanage_ino = 0;
> >  			nbad++;
> >  			if (!no_modify)  {
> > +				force_needsrepair(mp);
> >  				dep->name[0] = '/';
> >  				libxfs_dir2_data_log_entry(&da, bp, dep);
> >  				if (verbose)
> > diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c
> > index f607afcb..9dc73854 100644
> > --- a/repair/xfs_repair.c
> > +++ b/repair/xfs_repair.c
> > @@ -754,6 +754,43 @@ clear_needsrepair(
> >  		libxfs_buf_relse(bp);
> >  }
> >  
> > +/*
> > + * Mark the filesystem as needing repair.  This should only be called by code
> > + * that deliberately sets invalid sentinel values in the on-disk metadata to
> > + * trigger a later reconstruction, and only after we've settled the primary
> > + * super contents (i.e. after phase 1).
> > + */
> > +void
> > +force_needsrepair(
> > +	struct xfs_mount	*mp)
> > +{
> > +	struct xfs_buf		*bp;
> > +	int			error;
> > +
> > +	if (!xfs_sb_version_hascrc(&mp->m_sb) ||
> > +	    xfs_sb_version_needsrepair(&mp->m_sb))
> > +		return;
> > +
> > +	bp = libxfs_getsb(mp);
> > +	if (!bp || bp->b_error) {
> > +		do_log(
> > +	_("couldn't get superblock to set needsrepair, err=%d\n"),
> > +				bp ? bp->b_error : ENOMEM);
> > +		return;
> > +	} else {
> > +		mp->m_sb.sb_features_incompat |=
> > +				XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR;
> > +		libxfs_sb_to_disk(bp->b_addr, &mp->m_sb);
> > +
> > +		/* Force the primary super to disk immediately. */
> > +		error = -libxfs_bwrite(bp);
> > +		if (error)
> > +			do_log(_("couldn't force needsrepair, err=%d\n"), error);
> > +	}
> > +	if (bp)
> > +		libxfs_buf_relse(bp);
> > +}
> > +
> >  int
> >  main(int argc, char **argv)
> >  {
> > 
>
Darrick J. Wong Feb. 9, 2021, 6:45 p.m. UTC | #4
On Tue, Feb 09, 2021 at 09:13:36AM +0000, Christoph Hellwig wrote:
> On Mon, Feb 08, 2021 at 08:10:44PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > There are a few places in xfs_repair's directory checking code where we
> > deliberately corrupt a directory entry as a sentinel to trigger a
> > correction in later repair phase.  In the mean time, the filesystem is
> > inconsistent, so set the needsrepair flag to force a re-run of repair if
> > the system goes down.
> 
> I guess this is an improvement over what we have no, but I suspect an
> in-core side band way to notify the later phases would be much better
> than these corrupt sentinel values..

It would be, but we'd still have to track which directory entries are
the ones that we don't want to preserve.  We could rewrite the
directory entry to point to a plausible inode number that isn't
allocated, but that runs the risk that someone will come along and
allocate that inode for lost+found and then we're really in a mess.

Alternately, I suppose we could rewrite the directory entry to be a
DHT_WHITEOUT entry that doesn't look like any that the kernel would
produce ... insofar as I'm not even sure what the real ones were
supposed to look like, given the weird left turn that overlayfs took on
that. :(

--D
Brian Foster Feb. 9, 2021, 7:14 p.m. UTC | #5
On Tue, Feb 09, 2021 at 10:35:42AM -0800, Darrick J. Wong wrote:
> On Tue, Feb 09, 2021 at 12:20:59PM -0500, Brian Foster wrote:
> > On Mon, Feb 08, 2021 at 08:10:44PM -0800, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <djwong@kernel.org>
> > > 
> > > There are a few places in xfs_repair's directory checking code where we
> > > deliberately corrupt a directory entry as a sentinel to trigger a
> > > correction in later repair phase.  In the mean time, the filesystem is
> > > inconsistent, so set the needsrepair flag to force a re-run of repair if
> > > the system goes down.
> > > 
> > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > ---
> > 
> > Hmm.. this seems orthogonal to the rest of the series. I'm sure we can
> > come up with various additional uses for the bit, but it seems a little
> > odd to me that repair might set it in some cases after a crash but not
> > others (if the filesystem happens to already be corrupt, for example).
> 
> <nod> Another option I thought of is to add a hook to the buffer cache
> so that the first time anyone tries to bwrite a buffer (either directly
> or via a delwri list or normal buffer cache writeback) we'll also set
> needsrepair on the ondisk primary super.  That would protect us against
> other scenarios like crashing after writing a new AGF but before writing
> the new AGI, where the fs is left in an indeterminate state.
> 

Yeah, that _seems_ more appropriate to me. It's not immediately clear to
me what the implementation should look like, but in general behavior
that sets needsrepair on first modification and clears it as a final
step sounds more practical. Then the behavior can be easily explained as
"once repair starts on an fs, it must be completed before it is allowed
to mount." I do think this should be lifted off for a followon series so
we can make progress on the feature upgrade bits without growing more
requirements and complexity..

Brian

> Hmm, maybe I should pursue /that/ instead.
> 
> --D
> 
> > Brian
> > 
> > >  repair/agheader.h   |    2 ++
> > >  repair/dir2.c       |    3 +++
> > >  repair/phase6.c     |    7 +++++++
> > >  repair/xfs_repair.c |   37 +++++++++++++++++++++++++++++++++++++
> > >  4 files changed, 49 insertions(+)
> > > 
> > > 
> > > diff --git a/repair/agheader.h b/repair/agheader.h
> > > index a63827c8..fa6fe596 100644
> > > --- a/repair/agheader.h
> > > +++ b/repair/agheader.h
> > > @@ -82,3 +82,5 @@ typedef struct fs_geo_list  {
> > >  #define XR_AG_AGF	0x2
> > >  #define XR_AG_AGI	0x4
> > >  #define XR_AG_SB_SEC	0x8
> > > +
> > > +void force_needsrepair(struct xfs_mount *mp);
> > > diff --git a/repair/dir2.c b/repair/dir2.c
> > > index eabdb4f2..922b8a3e 100644
> > > --- a/repair/dir2.c
> > > +++ b/repair/dir2.c
> > > @@ -15,6 +15,7 @@
> > >  #include "da_util.h"
> > >  #include "prefetch.h"
> > >  #include "progress.h"
> > > +#include "agheader.h"
> > >  
> > >  /*
> > >   * Known bad inode list.  These are seen when the leaf and node
> > > @@ -774,6 +775,7 @@ _("entry at block %u offset %" PRIdPTR " in directory inode %" PRIu64
> > >  				do_warn(
> > >  _("\tclearing inode number in entry at offset %" PRIdPTR "...\n"),
> > >  					(intptr_t)ptr - (intptr_t)d);
> > > +				force_needsrepair(mp);
> > >  				dep->name[0] = '/';
> > >  				*dirty = 1;
> > >  			} else {
> > > @@ -914,6 +916,7 @@ _("entry \"%*.*s\" in directory inode %" PRIu64 " points to self: "),
> > >  		 */
> > >  		if (junkit) {
> > >  			if (!no_modify) {
> > > +				force_needsrepair(mp);
> > >  				dep->name[0] = '/';
> > >  				*dirty = 1;
> > >  				do_warn(_("clearing entry\n"));
> > > diff --git a/repair/phase6.c b/repair/phase6.c
> > > index 14464bef..5ecbe9b2 100644
> > > --- a/repair/phase6.c
> > > +++ b/repair/phase6.c
> > > @@ -1649,6 +1649,7 @@ longform_dir2_entry_check_data(
> > >  			if (entry_junked(
> > >  	_("entry \"%s\" in directory inode %" PRIu64 " points to non-existent inode %" PRIu64 ""),
> > >  					fname, ip->i_ino, inum)) {
> > > +				force_needsrepair(mp);
> > >  				dep->name[0] = '/';
> > >  				libxfs_dir2_data_log_entry(&da, bp, dep);
> > >  			}
> > > @@ -1666,6 +1667,7 @@ longform_dir2_entry_check_data(
> > >  			if (entry_junked(
> > >  	_("entry \"%s\" in directory inode %" PRIu64 " points to free inode %" PRIu64),
> > >  					fname, ip->i_ino, inum)) {
> > > +				force_needsrepair(mp);
> > >  				dep->name[0] = '/';
> > >  				libxfs_dir2_data_log_entry(&da, bp, dep);
> > >  			}
> > > @@ -1684,6 +1686,7 @@ longform_dir2_entry_check_data(
> > >  				if (entry_junked(
> > >  	_("%s (ino %" PRIu64 ") in root (%" PRIu64 ") is not a directory"),
> > >  						ORPHANAGE, inum, ip->i_ino)) {
> > > +					force_needsrepair(mp);
> > >  					dep->name[0] = '/';
> > >  					libxfs_dir2_data_log_entry(&da, bp, dep);
> > >  				}
> > > @@ -1706,6 +1709,7 @@ longform_dir2_entry_check_data(
> > >  			if (entry_junked(
> > >  	_("entry \"%s\" (ino %" PRIu64 ") in dir %" PRIu64 " is a duplicate name"),
> > >  					fname, inum, ip->i_ino)) {
> > > +				force_needsrepair(mp);
> > >  				dep->name[0] = '/';
> > >  				libxfs_dir2_data_log_entry(&da, bp, dep);
> > >  			}
> > > @@ -1737,6 +1741,7 @@ longform_dir2_entry_check_data(
> > >  				if (entry_junked(
> > >  	_("entry \"%s\" (ino %" PRIu64 ") in dir %" PRIu64 " is not in the the first block"), fname,
> > >  						inum, ip->i_ino)) {
> > > +					force_needsrepair(mp);
> > >  					dep->name[0] = '/';
> > >  					libxfs_dir2_data_log_entry(&da, bp, dep);
> > >  				}
> > > @@ -1764,6 +1769,7 @@ longform_dir2_entry_check_data(
> > >  				if (entry_junked(
> > >  	_("entry \"%s\" in dir %" PRIu64 " is not the first entry"),
> > >  						fname, inum, ip->i_ino)) {
> > > +					force_needsrepair(mp);
> > >  					dep->name[0] = '/';
> > >  					libxfs_dir2_data_log_entry(&da, bp, dep);
> > >  				}
> > > @@ -1852,6 +1858,7 @@ _("entry \"%s\" in dir inode %" PRIu64 " inconsistent with .. value (%" PRIu64 "
> > >  				orphanage_ino = 0;
> > >  			nbad++;
> > >  			if (!no_modify)  {
> > > +				force_needsrepair(mp);
> > >  				dep->name[0] = '/';
> > >  				libxfs_dir2_data_log_entry(&da, bp, dep);
> > >  				if (verbose)
> > > diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c
> > > index f607afcb..9dc73854 100644
> > > --- a/repair/xfs_repair.c
> > > +++ b/repair/xfs_repair.c
> > > @@ -754,6 +754,43 @@ clear_needsrepair(
> > >  		libxfs_buf_relse(bp);
> > >  }
> > >  
> > > +/*
> > > + * Mark the filesystem as needing repair.  This should only be called by code
> > > + * that deliberately sets invalid sentinel values in the on-disk metadata to
> > > + * trigger a later reconstruction, and only after we've settled the primary
> > > + * super contents (i.e. after phase 1).
> > > + */
> > > +void
> > > +force_needsrepair(
> > > +	struct xfs_mount	*mp)
> > > +{
> > > +	struct xfs_buf		*bp;
> > > +	int			error;
> > > +
> > > +	if (!xfs_sb_version_hascrc(&mp->m_sb) ||
> > > +	    xfs_sb_version_needsrepair(&mp->m_sb))
> > > +		return;
> > > +
> > > +	bp = libxfs_getsb(mp);
> > > +	if (!bp || bp->b_error) {
> > > +		do_log(
> > > +	_("couldn't get superblock to set needsrepair, err=%d\n"),
> > > +				bp ? bp->b_error : ENOMEM);
> > > +		return;
> > > +	} else {
> > > +		mp->m_sb.sb_features_incompat |=
> > > +				XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR;
> > > +		libxfs_sb_to_disk(bp->b_addr, &mp->m_sb);
> > > +
> > > +		/* Force the primary super to disk immediately. */
> > > +		error = -libxfs_bwrite(bp);
> > > +		if (error)
> > > +			do_log(_("couldn't force needsrepair, err=%d\n"), error);
> > > +	}
> > > +	if (bp)
> > > +		libxfs_buf_relse(bp);
> > > +}
> > > +
> > >  int
> > >  main(int argc, char **argv)
> > >  {
> > > 
> > 
>
Darrick J. Wong Feb. 9, 2021, 7:43 p.m. UTC | #6
On Tue, Feb 09, 2021 at 02:14:22PM -0500, Brian Foster wrote:
> On Tue, Feb 09, 2021 at 10:35:42AM -0800, Darrick J. Wong wrote:
> > On Tue, Feb 09, 2021 at 12:20:59PM -0500, Brian Foster wrote:
> > > On Mon, Feb 08, 2021 at 08:10:44PM -0800, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <djwong@kernel.org>
> > > > 
> > > > There are a few places in xfs_repair's directory checking code where we
> > > > deliberately corrupt a directory entry as a sentinel to trigger a
> > > > correction in later repair phase.  In the mean time, the filesystem is
> > > > inconsistent, so set the needsrepair flag to force a re-run of repair if
> > > > the system goes down.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > > ---
> > > 
> > > Hmm.. this seems orthogonal to the rest of the series. I'm sure we can
> > > come up with various additional uses for the bit, but it seems a little
> > > odd to me that repair might set it in some cases after a crash but not
> > > others (if the filesystem happens to already be corrupt, for example).
> > 
> > <nod> Another option I thought of is to add a hook to the buffer cache
> > so that the first time anyone tries to bwrite a buffer (either directly
> > or via a delwri list or normal buffer cache writeback) we'll also set
> > needsrepair on the ondisk primary super.  That would protect us against
> > other scenarios like crashing after writing a new AGF but before writing
> > the new AGI, where the fs is left in an indeterminate state.
> > 
> 
> Yeah, that _seems_ more appropriate to me. It's not immediately clear to
> me what the implementation should look like, but in general behavior
> that sets needsrepair on first modification and clears it as a final
> step sounds more practical. Then the behavior can be easily explained as
> "once repair starts on an fs, it must be completed before it is allowed
> to mount." I do think this should be lifted off for a followon series so
> we can make progress on the feature upgrade bits without growing more
> requirements and complexity..

Oh, definitely. I'll withdraw this patch for now in the interests of
getting everything else going for Eric. :)

--D

> Brian
> 
> > Hmm, maybe I should pursue /that/ instead.
> > 
> > --D
> > 
> > > Brian
> > > 
> > > >  repair/agheader.h   |    2 ++
> > > >  repair/dir2.c       |    3 +++
> > > >  repair/phase6.c     |    7 +++++++
> > > >  repair/xfs_repair.c |   37 +++++++++++++++++++++++++++++++++++++
> > > >  4 files changed, 49 insertions(+)
> > > > 
> > > > 
> > > > diff --git a/repair/agheader.h b/repair/agheader.h
> > > > index a63827c8..fa6fe596 100644
> > > > --- a/repair/agheader.h
> > > > +++ b/repair/agheader.h
> > > > @@ -82,3 +82,5 @@ typedef struct fs_geo_list  {
> > > >  #define XR_AG_AGF	0x2
> > > >  #define XR_AG_AGI	0x4
> > > >  #define XR_AG_SB_SEC	0x8
> > > > +
> > > > +void force_needsrepair(struct xfs_mount *mp);
> > > > diff --git a/repair/dir2.c b/repair/dir2.c
> > > > index eabdb4f2..922b8a3e 100644
> > > > --- a/repair/dir2.c
> > > > +++ b/repair/dir2.c
> > > > @@ -15,6 +15,7 @@
> > > >  #include "da_util.h"
> > > >  #include "prefetch.h"
> > > >  #include "progress.h"
> > > > +#include "agheader.h"
> > > >  
> > > >  /*
> > > >   * Known bad inode list.  These are seen when the leaf and node
> > > > @@ -774,6 +775,7 @@ _("entry at block %u offset %" PRIdPTR " in directory inode %" PRIu64
> > > >  				do_warn(
> > > >  _("\tclearing inode number in entry at offset %" PRIdPTR "...\n"),
> > > >  					(intptr_t)ptr - (intptr_t)d);
> > > > +				force_needsrepair(mp);
> > > >  				dep->name[0] = '/';
> > > >  				*dirty = 1;
> > > >  			} else {
> > > > @@ -914,6 +916,7 @@ _("entry \"%*.*s\" in directory inode %" PRIu64 " points to self: "),
> > > >  		 */
> > > >  		if (junkit) {
> > > >  			if (!no_modify) {
> > > > +				force_needsrepair(mp);
> > > >  				dep->name[0] = '/';
> > > >  				*dirty = 1;
> > > >  				do_warn(_("clearing entry\n"));
> > > > diff --git a/repair/phase6.c b/repair/phase6.c
> > > > index 14464bef..5ecbe9b2 100644
> > > > --- a/repair/phase6.c
> > > > +++ b/repair/phase6.c
> > > > @@ -1649,6 +1649,7 @@ longform_dir2_entry_check_data(
> > > >  			if (entry_junked(
> > > >  	_("entry \"%s\" in directory inode %" PRIu64 " points to non-existent inode %" PRIu64 ""),
> > > >  					fname, ip->i_ino, inum)) {
> > > > +				force_needsrepair(mp);
> > > >  				dep->name[0] = '/';
> > > >  				libxfs_dir2_data_log_entry(&da, bp, dep);
> > > >  			}
> > > > @@ -1666,6 +1667,7 @@ longform_dir2_entry_check_data(
> > > >  			if (entry_junked(
> > > >  	_("entry \"%s\" in directory inode %" PRIu64 " points to free inode %" PRIu64),
> > > >  					fname, ip->i_ino, inum)) {
> > > > +				force_needsrepair(mp);
> > > >  				dep->name[0] = '/';
> > > >  				libxfs_dir2_data_log_entry(&da, bp, dep);
> > > >  			}
> > > > @@ -1684,6 +1686,7 @@ longform_dir2_entry_check_data(
> > > >  				if (entry_junked(
> > > >  	_("%s (ino %" PRIu64 ") in root (%" PRIu64 ") is not a directory"),
> > > >  						ORPHANAGE, inum, ip->i_ino)) {
> > > > +					force_needsrepair(mp);
> > > >  					dep->name[0] = '/';
> > > >  					libxfs_dir2_data_log_entry(&da, bp, dep);
> > > >  				}
> > > > @@ -1706,6 +1709,7 @@ longform_dir2_entry_check_data(
> > > >  			if (entry_junked(
> > > >  	_("entry \"%s\" (ino %" PRIu64 ") in dir %" PRIu64 " is a duplicate name"),
> > > >  					fname, inum, ip->i_ino)) {
> > > > +				force_needsrepair(mp);
> > > >  				dep->name[0] = '/';
> > > >  				libxfs_dir2_data_log_entry(&da, bp, dep);
> > > >  			}
> > > > @@ -1737,6 +1741,7 @@ longform_dir2_entry_check_data(
> > > >  				if (entry_junked(
> > > >  	_("entry \"%s\" (ino %" PRIu64 ") in dir %" PRIu64 " is not in the the first block"), fname,
> > > >  						inum, ip->i_ino)) {
> > > > +					force_needsrepair(mp);
> > > >  					dep->name[0] = '/';
> > > >  					libxfs_dir2_data_log_entry(&da, bp, dep);
> > > >  				}
> > > > @@ -1764,6 +1769,7 @@ longform_dir2_entry_check_data(
> > > >  				if (entry_junked(
> > > >  	_("entry \"%s\" in dir %" PRIu64 " is not the first entry"),
> > > >  						fname, inum, ip->i_ino)) {
> > > > +					force_needsrepair(mp);
> > > >  					dep->name[0] = '/';
> > > >  					libxfs_dir2_data_log_entry(&da, bp, dep);
> > > >  				}
> > > > @@ -1852,6 +1858,7 @@ _("entry \"%s\" in dir inode %" PRIu64 " inconsistent with .. value (%" PRIu64 "
> > > >  				orphanage_ino = 0;
> > > >  			nbad++;
> > > >  			if (!no_modify)  {
> > > > +				force_needsrepair(mp);
> > > >  				dep->name[0] = '/';
> > > >  				libxfs_dir2_data_log_entry(&da, bp, dep);
> > > >  				if (verbose)
> > > > diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c
> > > > index f607afcb..9dc73854 100644
> > > > --- a/repair/xfs_repair.c
> > > > +++ b/repair/xfs_repair.c
> > > > @@ -754,6 +754,43 @@ clear_needsrepair(
> > > >  		libxfs_buf_relse(bp);
> > > >  }
> > > >  
> > > > +/*
> > > > + * Mark the filesystem as needing repair.  This should only be called by code
> > > > + * that deliberately sets invalid sentinel values in the on-disk metadata to
> > > > + * trigger a later reconstruction, and only after we've settled the primary
> > > > + * super contents (i.e. after phase 1).
> > > > + */
> > > > +void
> > > > +force_needsrepair(
> > > > +	struct xfs_mount	*mp)
> > > > +{
> > > > +	struct xfs_buf		*bp;
> > > > +	int			error;
> > > > +
> > > > +	if (!xfs_sb_version_hascrc(&mp->m_sb) ||
> > > > +	    xfs_sb_version_needsrepair(&mp->m_sb))
> > > > +		return;
> > > > +
> > > > +	bp = libxfs_getsb(mp);
> > > > +	if (!bp || bp->b_error) {
> > > > +		do_log(
> > > > +	_("couldn't get superblock to set needsrepair, err=%d\n"),
> > > > +				bp ? bp->b_error : ENOMEM);
> > > > +		return;
> > > > +	} else {
> > > > +		mp->m_sb.sb_features_incompat |=
> > > > +				XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR;
> > > > +		libxfs_sb_to_disk(bp->b_addr, &mp->m_sb);
> > > > +
> > > > +		/* Force the primary super to disk immediately. */
> > > > +		error = -libxfs_bwrite(bp);
> > > > +		if (error)
> > > > +			do_log(_("couldn't force needsrepair, err=%d\n"), error);
> > > > +	}
> > > > +	if (bp)
> > > > +		libxfs_buf_relse(bp);
> > > > +}
> > > > +
> > > >  int
> > > >  main(int argc, char **argv)
> > > >  {
> > > > 
> > > 
> > 
>
Eric Sandeen Feb. 10, 2021, 8:19 p.m. UTC | #7
On 2/9/21 1:43 PM, Darrick J. Wong wrote:
> On Tue, Feb 09, 2021 at 02:14:22PM -0500, Brian Foster wrote:
>> On Tue, Feb 09, 2021 at 10:35:42AM -0800, Darrick J. Wong wrote:
>>> On Tue, Feb 09, 2021 at 12:20:59PM -0500, Brian Foster wrote:
>>>> On Mon, Feb 08, 2021 at 08:10:44PM -0800, Darrick J. Wong wrote:
>>>>> From: Darrick J. Wong <djwong@kernel.org>
>>>>>
>>>>> There are a few places in xfs_repair's directory checking code where we
>>>>> deliberately corrupt a directory entry as a sentinel to trigger a
>>>>> correction in later repair phase.  In the mean time, the filesystem is
>>>>> inconsistent, so set the needsrepair flag to force a re-run of repair if
>>>>> the system goes down.
>>>>>
>>>>> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
>>>>> ---
>>>>
>>>> Hmm.. this seems orthogonal to the rest of the series. I'm sure we can
>>>> come up with various additional uses for the bit, but it seems a little
>>>> odd to me that repair might set it in some cases after a crash but not
>>>> others (if the filesystem happens to already be corrupt, for example).
>>>
>>> <nod> Another option I thought of is to add a hook to the buffer cache
>>> so that the first time anyone tries to bwrite a buffer (either directly
>>> or via a delwri list or normal buffer cache writeback) we'll also set
>>> needsrepair on the ondisk primary super.  That would protect us against
>>> other scenarios like crashing after writing a new AGF but before writing
>>> the new AGI, where the fs is left in an indeterminate state.
>>>
>>
>> Yeah, that _seems_ more appropriate to me. It's not immediately clear to
>> me what the implementation should look like, but in general behavior
>> that sets needsrepair on first modification and clears it as a final
>> step sounds more practical. Then the behavior can be easily explained as
>> "once repair starts on an fs, it must be completed before it is allowed
>> to mount." I do think this should be lifted off for a followon series so
>> we can make progress on the feature upgrade bits without growing more
>> requirements and complexity..
> 
> Oh, definitely. I'll withdraw this patch for now in the interests of
> getting everything else going for Eric. :)

Noted, I'll drop this one for now, thanks.

-Eric
diff mbox series

Patch

diff --git a/repair/agheader.h b/repair/agheader.h
index a63827c8..fa6fe596 100644
--- a/repair/agheader.h
+++ b/repair/agheader.h
@@ -82,3 +82,5 @@  typedef struct fs_geo_list  {
 #define XR_AG_AGF	0x2
 #define XR_AG_AGI	0x4
 #define XR_AG_SB_SEC	0x8
+
+void force_needsrepair(struct xfs_mount *mp);
diff --git a/repair/dir2.c b/repair/dir2.c
index eabdb4f2..922b8a3e 100644
--- a/repair/dir2.c
+++ b/repair/dir2.c
@@ -15,6 +15,7 @@ 
 #include "da_util.h"
 #include "prefetch.h"
 #include "progress.h"
+#include "agheader.h"
 
 /*
  * Known bad inode list.  These are seen when the leaf and node
@@ -774,6 +775,7 @@  _("entry at block %u offset %" PRIdPTR " in directory inode %" PRIu64
 				do_warn(
 _("\tclearing inode number in entry at offset %" PRIdPTR "...\n"),
 					(intptr_t)ptr - (intptr_t)d);
+				force_needsrepair(mp);
 				dep->name[0] = '/';
 				*dirty = 1;
 			} else {
@@ -914,6 +916,7 @@  _("entry \"%*.*s\" in directory inode %" PRIu64 " points to self: "),
 		 */
 		if (junkit) {
 			if (!no_modify) {
+				force_needsrepair(mp);
 				dep->name[0] = '/';
 				*dirty = 1;
 				do_warn(_("clearing entry\n"));
diff --git a/repair/phase6.c b/repair/phase6.c
index 14464bef..5ecbe9b2 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -1649,6 +1649,7 @@  longform_dir2_entry_check_data(
 			if (entry_junked(
 	_("entry \"%s\" in directory inode %" PRIu64 " points to non-existent inode %" PRIu64 ""),
 					fname, ip->i_ino, inum)) {
+				force_needsrepair(mp);
 				dep->name[0] = '/';
 				libxfs_dir2_data_log_entry(&da, bp, dep);
 			}
@@ -1666,6 +1667,7 @@  longform_dir2_entry_check_data(
 			if (entry_junked(
 	_("entry \"%s\" in directory inode %" PRIu64 " points to free inode %" PRIu64),
 					fname, ip->i_ino, inum)) {
+				force_needsrepair(mp);
 				dep->name[0] = '/';
 				libxfs_dir2_data_log_entry(&da, bp, dep);
 			}
@@ -1684,6 +1686,7 @@  longform_dir2_entry_check_data(
 				if (entry_junked(
 	_("%s (ino %" PRIu64 ") in root (%" PRIu64 ") is not a directory"),
 						ORPHANAGE, inum, ip->i_ino)) {
+					force_needsrepair(mp);
 					dep->name[0] = '/';
 					libxfs_dir2_data_log_entry(&da, bp, dep);
 				}
@@ -1706,6 +1709,7 @@  longform_dir2_entry_check_data(
 			if (entry_junked(
 	_("entry \"%s\" (ino %" PRIu64 ") in dir %" PRIu64 " is a duplicate name"),
 					fname, inum, ip->i_ino)) {
+				force_needsrepair(mp);
 				dep->name[0] = '/';
 				libxfs_dir2_data_log_entry(&da, bp, dep);
 			}
@@ -1737,6 +1741,7 @@  longform_dir2_entry_check_data(
 				if (entry_junked(
 	_("entry \"%s\" (ino %" PRIu64 ") in dir %" PRIu64 " is not in the the first block"), fname,
 						inum, ip->i_ino)) {
+					force_needsrepair(mp);
 					dep->name[0] = '/';
 					libxfs_dir2_data_log_entry(&da, bp, dep);
 				}
@@ -1764,6 +1769,7 @@  longform_dir2_entry_check_data(
 				if (entry_junked(
 	_("entry \"%s\" in dir %" PRIu64 " is not the first entry"),
 						fname, inum, ip->i_ino)) {
+					force_needsrepair(mp);
 					dep->name[0] = '/';
 					libxfs_dir2_data_log_entry(&da, bp, dep);
 				}
@@ -1852,6 +1858,7 @@  _("entry \"%s\" in dir inode %" PRIu64 " inconsistent with .. value (%" PRIu64 "
 				orphanage_ino = 0;
 			nbad++;
 			if (!no_modify)  {
+				force_needsrepair(mp);
 				dep->name[0] = '/';
 				libxfs_dir2_data_log_entry(&da, bp, dep);
 				if (verbose)
diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c
index f607afcb..9dc73854 100644
--- a/repair/xfs_repair.c
+++ b/repair/xfs_repair.c
@@ -754,6 +754,43 @@  clear_needsrepair(
 		libxfs_buf_relse(bp);
 }
 
+/*
+ * Mark the filesystem as needing repair.  This should only be called by code
+ * that deliberately sets invalid sentinel values in the on-disk metadata to
+ * trigger a later reconstruction, and only after we've settled the primary
+ * super contents (i.e. after phase 1).  Failures are logged, not returned.
+ */
+void
+force_needsrepair(
+	struct xfs_mount	*mp)
+{
+	struct xfs_buf		*bp;
+	int			error;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb) ||
+	    xfs_sb_version_needsrepair(&mp->m_sb))
+		return;
+
+	bp = libxfs_getsb(mp);
+	if (!bp || bp->b_error) {
+		do_log(
+	_("couldn't get superblock to set needsrepair, err=%d\n"),
+				bp ? bp->b_error : ENOMEM);
+		/* Fall through: an errored buffer must still be released. */
+	} else {
+		mp->m_sb.sb_features_incompat |=
+				XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR;
+		libxfs_sb_to_disk(bp->b_addr, &mp->m_sb);
+
+		/* Force the primary super to disk immediately. */
+		error = -libxfs_bwrite(bp);
+		if (error)
+			do_log(_("couldn't force needsrepair, err=%d\n"), error);
+	}
+	if (bp)
+		libxfs_buf_relse(bp);
+}
+
 int
 main(int argc, char **argv)
 {