Message ID | 171212151192.1535150.13198476701217286884.stgit@frogsfrogsfrogs (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | [1/3] xfs: pass xfs_buf lookup flags to xfs_*read_agi | expand |
On Tue, Apr 02, 2024 at 10:18:31PM -0700, Darrick J. Wong wrote: > From: Darrick J. Wong <djwong@kernel.org> > > While reviewing the next patch which fixes an ABBA deadlock between the > AGI and a directory ILOCK, someone asked a question about why we're > holding the AGI in the first place. The reason for that is to quiesce > the inode structures for that AG while we do a repair. > > I then realized that the xrep_dinode_findmode invokes xchk_iscan_iter, > which walks the inobts (and hence the AGIs) to find all the inodes. > This itself is also an ABBA vector, since the damaged inode could be in > AG 5, which we hold while we scan AG 0 for directories. 5 -> 0 is not > allowed. > > To address this, modify the iscan to allow trylock of the AGI buffer > using the flags argument to xfs_ialloc_read_agi that the previous patch > added. Well, I guess we need this as a quick fix, but any scheme based on trylock and return is just fundamentally broken. Same for the next one.
On Fri, Apr 05, 2024 at 07:54:26AM -0700, Christoph Hellwig wrote: > > > > To address this, modify the iscan to allow trylock of the AGI buffer > > using the flags argument to xfs_ialloc_read_agi that the previous patch > > added. > > Well, I guess we need this as a quick fix, but any scheme based on > trylock and return is just fundamentally broken. > > Same for the next one. And if this just sounded a little frustrated it with, but it should be counted as a grudging: Reviewed-by: Christoph Hellwig <hch@lst.de> for this and the next patch.
On Tue, Apr 02, 2024 at 10:18:31PM -0700, Darrick J. Wong wrote: > From: Darrick J. Wong <djwong@kernel.org> > > While reviewing the next patch which fixes an ABBA deadlock between the > AGI and a directory ILOCK, someone asked a question about why we're > holding the AGI in the first place. The reason for that is to quiesce > the inode structures for that AG while we do a repair. > > I then realized that the xrep_dinode_findmode invokes xchk_iscan_iter, > which walks the inobts (and hence the AGIs) to find all the inodes. > This itself is also an ABBA vector, since the damaged inode could be in > AG 5, which we hold while we scan AG 0 for directories. 5 -> 0 is not > allowed. > > To address this, modify the iscan to allow trylock of the AGI buffer > using the flags argument to xfs_ialloc_read_agi that the previous patch > added. > > Signed-off-by: Darrick J. Wong <djwong@kernel.org> > --- > fs/xfs/scrub/inode_repair.c | 1 + > fs/xfs/scrub/iscan.c | 36 +++++++++++++++++++++++++++++++++++- > fs/xfs/scrub/iscan.h | 15 +++++++++++++++ > fs/xfs/scrub/trace.h | 10 ++++++++-- > 4 files changed, 59 insertions(+), 3 deletions(-) > > > diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c > index eab380e95ef40..35da0193c919e 100644 > --- a/fs/xfs/scrub/inode_repair.c > +++ b/fs/xfs/scrub/inode_repair.c > @@ -356,6 +356,7 @@ xrep_dinode_find_mode( > * so there's a real possibility that _iscan_iter can return EBUSY. > */ > xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan); > + xchk_iscan_set_agi_trylock(&ri->ftype_iscan); > ri->ftype_iscan.skip_ino = sc->sm->sm_ino; > ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN; > while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) { > diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c > index 66ba0fbd059e0..736ce7c9de6a8 100644 > --- a/fs/xfs/scrub/iscan.c > +++ b/fs/xfs/scrub/iscan.c > @@ -243,6 +243,40 @@ xchk_iscan_finish( > mutex_unlock(&iscan->lock); > } > > +/* > + * Grab the AGI to advance the inode scan. Returns 0 if *agi_bpp is now set, > + * -ECANCELED if the live scan aborted, -EBUSY if the AGI could not be grabbed, > + * or the usual negative errno. > + */ > +STATIC int > +xchk_iscan_read_agi( > + struct xchk_iscan *iscan, > + struct xfs_perag *pag, > + struct xfs_buf **agi_bpp) > +{ > + struct xfs_scrub *sc = iscan->sc; > + unsigned long relax; > + int ret; > + > + if (!xchk_iscan_agi_trylock(iscan)) > + return xfs_ialloc_read_agi(pag, sc->tp, 0, agi_bpp); > + > + relax = msecs_to_jiffies(iscan->iget_retry_delay); > + do { > + ret = xfs_ialloc_read_agi(pag, sc->tp, XFS_IALLOC_FLAG_TRYLOCK, > + agi_bpp); Why is this using xfs_ialloc_read_agi() and not xfs_read_agi()? How do we get here without the perag AGI state not already initialised? i.e. if you just use xfs_read_agi(), all the code that has to plumb flags into xfs_ialloc_read_agi() goes away and this change because a lot less intrusive.... > + if (ret != -EAGAIN) > + return ret; > + if (!iscan->iget_timeout || > + time_is_before_jiffies(iscan->__iget_deadline)) > + return -EBUSY; > + > + trace_xchk_iscan_agi_retry_wait(iscan); > + } while (!schedule_timeout_killable(relax) && > + !xchk_iscan_aborted(iscan)); > + return -ECANCELED; > +} > + > /* > * Advance ino to the next inode that the inobt thinks is allocated, being > * careful to jump to the next AG if we've reached the right end of this AG's > @@ -281,7 +315,7 @@ xchk_iscan_advance( > if (!pag) > return -ECANCELED; > > - ret = xfs_ialloc_read_agi(pag, sc->tp, 0, &agi_bp); > + ret = xchk_iscan_read_agi(iscan, pag, &agi_bp); > if (ret) > goto out_pag; > > diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h > index 71f657552dfac..c9da8f7721f66 100644 > --- a/fs/xfs/scrub/iscan.h > +++ b/fs/xfs/scrub/iscan.h > @@ -59,6 +59,9 @@ struct xchk_iscan { > /* Set if the scan has been aborted due to some event in the fs. */ > #define XCHK_ISCAN_OPSTATE_ABORTED (1) > > +/* Use trylock to acquire the AGI */ > +#define XCHK_ISCAN_OPSTATE_TRYLOCK_AGI (2) > + > static inline bool > xchk_iscan_aborted(const struct xchk_iscan *iscan) > { > @@ -71,6 +74,18 @@ xchk_iscan_abort(struct xchk_iscan *iscan) > set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); > } > > +static inline bool > +xchk_iscan_agi_trylock(const struct xchk_iscan *iscan) > +{ > + return test_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate); > +} Function does not actually do any locking, but the name implies it is actually doing a trylock operation. Perhaps xchk_iscan_agi_needs_trylock()? -Dave.
On Mon, Apr 08, 2024 at 08:34:04AM +1000, Dave Chinner wrote: > On Tue, Apr 02, 2024 at 10:18:31PM -0700, Darrick J. Wong wrote: > > From: Darrick J. Wong <djwong@kernel.org> > > > > While reviewing the next patch which fixes an ABBA deadlock between the > > AGI and a directory ILOCK, someone asked a question about why we're > > holding the AGI in the first place. The reason for that is to quiesce > > the inode structures for that AG while we do a repair. > > > > I then realized that the xrep_dinode_findmode invokes xchk_iscan_iter, > > which walks the inobts (and hence the AGIs) to find all the inodes. > > This itself is also an ABBA vector, since the damaged inode could be in > > AG 5, which we hold while we scan AG 0 for directories. 5 -> 0 is not > > allowed. > > > > To address this, modify the iscan to allow trylock of the AGI buffer > > using the flags argument to xfs_ialloc_read_agi that the previous patch > > added. > > > > Signed-off-by: Darrick J. Wong <djwong@kernel.org> > > --- > > fs/xfs/scrub/inode_repair.c | 1 + > > fs/xfs/scrub/iscan.c | 36 +++++++++++++++++++++++++++++++++++- > > fs/xfs/scrub/iscan.h | 15 +++++++++++++++ > > fs/xfs/scrub/trace.h | 10 ++++++++-- > > 4 files changed, 59 insertions(+), 3 deletions(-) > > > > > > diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c > > index eab380e95ef40..35da0193c919e 100644 > > --- a/fs/xfs/scrub/inode_repair.c > > +++ b/fs/xfs/scrub/inode_repair.c > > @@ -356,6 +356,7 @@ xrep_dinode_find_mode( > > * so there's a real possibility that _iscan_iter can return EBUSY. > > */ > > xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan); > > + xchk_iscan_set_agi_trylock(&ri->ftype_iscan); > > ri->ftype_iscan.skip_ino = sc->sm->sm_ino; > > ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN; > > while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) { > > diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c > > index 66ba0fbd059e0..736ce7c9de6a8 100644 > > --- a/fs/xfs/scrub/iscan.c > > +++ b/fs/xfs/scrub/iscan.c > > @@ -243,6 +243,40 @@ xchk_iscan_finish( > > mutex_unlock(&iscan->lock); > > } > > > > +/* > > + * Grab the AGI to advance the inode scan. Returns 0 if *agi_bpp is now set, > > + * -ECANCELED if the live scan aborted, -EBUSY if the AGI could not be grabbed, > > + * or the usual negative errno. > > + */ > > +STATIC int > > +xchk_iscan_read_agi( > > + struct xchk_iscan *iscan, > > + struct xfs_perag *pag, > > + struct xfs_buf **agi_bpp) > > +{ > > + struct xfs_scrub *sc = iscan->sc; > > + unsigned long relax; > > + int ret; > > + > > + if (!xchk_iscan_agi_trylock(iscan)) > > + return xfs_ialloc_read_agi(pag, sc->tp, 0, agi_bpp); > > + > > + relax = msecs_to_jiffies(iscan->iget_retry_delay); > > + do { > > + ret = xfs_ialloc_read_agi(pag, sc->tp, XFS_IALLOC_FLAG_TRYLOCK, > > + agi_bpp); > > Why is this using xfs_ialloc_read_agi() and not xfs_read_agi()? > How do we get here without the perag AGI state not already > initialised? !finobt filesystems won't have called xfs_ialloc_read_agi to initialize the incore per-ag reservation during mount. That's a bit of a corner case since there shouldn't be /so/ many filesystems without finobt these days, but it's theoretically possible. > i.e. if you just use xfs_read_agi(), all the code that has to plumb > flags into xfs_ialloc_read_agi() goes away and this change because a > lot less intrusive.... > > > + if (ret != -EAGAIN) > > + return ret; > > + if (!iscan->iget_timeout || > > + time_is_before_jiffies(iscan->__iget_deadline)) > > + return -EBUSY; > > + > > + trace_xchk_iscan_agi_retry_wait(iscan); > > + } while (!schedule_timeout_killable(relax) && > > + !xchk_iscan_aborted(iscan)); > > + return -ECANCELED; > > +} > > + > > /* > > * Advance ino to the next inode that the inobt thinks is allocated, being > > * careful to jump to the next AG if we've reached the right end of this AG's > > @@ -281,7 +315,7 @@ xchk_iscan_advance( > > if (!pag) > > return -ECANCELED; > > > > - ret = xfs_ialloc_read_agi(pag, sc->tp, 0, &agi_bp); > > + ret = xchk_iscan_read_agi(iscan, pag, &agi_bp); > > if (ret) > > goto out_pag; > > > > diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h > > index 71f657552dfac..c9da8f7721f66 100644 > > --- a/fs/xfs/scrub/iscan.h > > +++ b/fs/xfs/scrub/iscan.h > > @@ -59,6 +59,9 @@ struct xchk_iscan { > > /* Set if the scan has been aborted due to some event in the fs. */ > > #define XCHK_ISCAN_OPSTATE_ABORTED (1) > > > > +/* Use trylock to acquire the AGI */ > > +#define XCHK_ISCAN_OPSTATE_TRYLOCK_AGI (2) > > + > > static inline bool > > xchk_iscan_aborted(const struct xchk_iscan *iscan) > > { > > @@ -71,6 +74,18 @@ xchk_iscan_abort(struct xchk_iscan *iscan) > > set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); > > } > > > > +static inline bool > > +xchk_iscan_agi_trylock(const struct xchk_iscan *iscan) > > +{ > > + return test_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate); > > +} > > Function does not actually do any locking, but the name implies it > is actually doing a trylock operation. Perhaps > xchk_iscan_agi_needs_trylock()? Ok. I apologize for the rage of the last few days. I need a loooooooong vacation. --D > > -Dave. > -- > Dave Chinner > david@fromorbit.com >
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index eab380e95ef40..35da0193c919e 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -356,6 +356,7 @@ xrep_dinode_find_mode( * so there's a real possibility that _iscan_iter can return EBUSY. */ xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan); + xchk_iscan_set_agi_trylock(&ri->ftype_iscan); ri->ftype_iscan.skip_ino = sc->sm->sm_ino; ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN; while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) { diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c index 66ba0fbd059e0..736ce7c9de6a8 100644 --- a/fs/xfs/scrub/iscan.c +++ b/fs/xfs/scrub/iscan.c @@ -243,6 +243,40 @@ xchk_iscan_finish( mutex_unlock(&iscan->lock); } +/* + * Grab the AGI to advance the inode scan. Returns 0 if *agi_bpp is now set, + * -ECANCELED if the live scan aborted, -EBUSY if the AGI could not be grabbed, + * or the usual negative errno. + */ +STATIC int +xchk_iscan_read_agi( + struct xchk_iscan *iscan, + struct xfs_perag *pag, + struct xfs_buf **agi_bpp) +{ + struct xfs_scrub *sc = iscan->sc; + unsigned long relax; + int ret; + + if (!xchk_iscan_agi_trylock(iscan)) + return xfs_ialloc_read_agi(pag, sc->tp, 0, agi_bpp); + + relax = msecs_to_jiffies(iscan->iget_retry_delay); + do { + ret = xfs_ialloc_read_agi(pag, sc->tp, XFS_IALLOC_FLAG_TRYLOCK, + agi_bpp); + if (ret != -EAGAIN) + return ret; + if (!iscan->iget_timeout || + time_is_before_jiffies(iscan->__iget_deadline)) + return -EBUSY; + + trace_xchk_iscan_agi_retry_wait(iscan); + } while (!schedule_timeout_killable(relax) && + !xchk_iscan_aborted(iscan)); + return -ECANCELED; +} + /* * Advance ino to the next inode that the inobt thinks is allocated, being * careful to jump to the next AG if we've reached the right end of this AG's @@ -281,7 +315,7 @@ xchk_iscan_advance( if (!pag) return -ECANCELED; - ret = xfs_ialloc_read_agi(pag, sc->tp, 0, &agi_bp); + ret = xchk_iscan_read_agi(iscan, pag, &agi_bp); if (ret) goto out_pag; diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h index 71f657552dfac..c9da8f7721f66 100644 --- a/fs/xfs/scrub/iscan.h +++ b/fs/xfs/scrub/iscan.h @@ -59,6 +59,9 @@ struct xchk_iscan { /* Set if the scan has been aborted due to some event in the fs. */ #define XCHK_ISCAN_OPSTATE_ABORTED (1) +/* Use trylock to acquire the AGI */ +#define XCHK_ISCAN_OPSTATE_TRYLOCK_AGI (2) + static inline bool xchk_iscan_aborted(const struct xchk_iscan *iscan) { @@ -71,6 +74,18 @@ xchk_iscan_abort(struct xchk_iscan *iscan) set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); } +static inline bool +xchk_iscan_agi_trylock(const struct xchk_iscan *iscan) +{ + return test_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate); +} + +static inline void +xchk_iscan_set_agi_trylock(struct xchk_iscan *iscan) +{ + set_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate); +} + void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout, unsigned int iget_retry_delay, struct xchk_iscan *iscan); void xchk_iscan_teardown(struct xchk_iscan *iscan); diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 5b294be52c558..b1c7c79760d44 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1300,7 +1300,7 @@ TRACE_EVENT(xchk_iscan_iget_batch, __entry->unavail) ); -TRACE_EVENT(xchk_iscan_iget_retry_wait, +DECLARE_EVENT_CLASS(xchk_iscan_retry_wait_class, TP_PROTO(struct xchk_iscan *iscan), TP_ARGS(iscan), TP_STRUCT__entry( @@ -1326,7 +1326,13 @@ TRACE_EVENT(xchk_iscan_iget_retry_wait, __entry->remaining, __entry->iget_timeout, __entry->retry_delay) -); +) +#define DEFINE_ISCAN_RETRY_WAIT_EVENT(name) \ +DEFINE_EVENT(xchk_iscan_retry_wait_class, name, \ + TP_PROTO(struct xchk_iscan *iscan), \ + TP_ARGS(iscan)) +DEFINE_ISCAN_RETRY_WAIT_EVENT(xchk_iscan_iget_retry_wait); +DEFINE_ISCAN_RETRY_WAIT_EVENT(xchk_iscan_agi_retry_wait); TRACE_EVENT(xchk_nlinks_collect_dirent, TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp,