[11/11] xfs: create a polled function to force inode inactivation

Message ID 161543200190.1947934.3117722394191799491.stgit@magnolia
State Superseded
Series xfs: deferred inode inactivation

Commit Message

Darrick J. Wong March 11, 2021, 3:06 a.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

Create a polled version of xfs_inodegc_force so that we can force
inactivation while holding a lock (usually the umount lock) without
tripping over the softlockup timer.  This is for callers that hold vfs
locks while calling inactivation, which is currently unmount, iunlink
processing during mount, and rw->ro remount.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_icache.c |   38 +++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_icache.h |    1 +
 fs/xfs/xfs_mount.c  |    2 +-
 fs/xfs/xfs_mount.h  |    5 +++++
 fs/xfs/xfs_super.c  |    3 ++-
 5 files changed, 46 insertions(+), 3 deletions(-)
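
The core of the change is the poll loop at the bottom of the
xfs_icache.c hunk.  A minimal annotated sketch (nothing here beyond the
patch itself plus the stock wait_event_timeout() contract, which
returns 0 when the timeout expires with the condition still false):

	/* Kick the per-AG workers, then poll until the tag clears. */
	while (!wait_event_timeout(mp->m_inactive_wait,
				   !radix_tree_tagged(&mp->m_perag_tree,
						      XFS_ICI_INACTIVE_TAG),
				   HZ / 10))
		touch_softlockup_watchdog();	/* every 100ms at HZ / 10 */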

Comments

Dave Chinner March 23, 2021, 10:31 p.m. UTC | #1
On Wed, Mar 10, 2021 at 07:06:41PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> Create a polled version of xfs_inodegc_force so that we can force
> inactivation while holding a lock (usually the umount lock) without
> tripping over the softlockup timer.  This is for callers that hold vfs
> locks while calling inactivation, which is currently unmount, iunlink
> processing during mount, and rw->ro remount.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
>  fs/xfs/xfs_icache.c |   38 +++++++++++++++++++++++++++++++++++++-
>  fs/xfs/xfs_icache.h |    1 +
>  fs/xfs/xfs_mount.c  |    2 +-
>  fs/xfs/xfs_mount.h  |    5 +++++
>  fs/xfs/xfs_super.c  |    3 ++-
>  5 files changed, 46 insertions(+), 3 deletions(-)
> 
> 
> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> index d5f580b92e48..9db2beb4e732 100644
> --- a/fs/xfs/xfs_icache.c
> +++ b/fs/xfs/xfs_icache.c
> @@ -25,6 +25,7 @@
>  #include "xfs_ialloc.h"
>  
>  #include <linux/iversion.h>
> +#include <linux/nmi.h>

This stuff goes in fs/xfs/xfs_linux.h, not here.

>  
>  /*
>   * Allocate and initialise an xfs_inode.
> @@ -2067,8 +2068,12 @@ xfs_inodegc_free_space(
>  	struct xfs_mount	*mp,
>  	struct xfs_eofblocks	*eofb)
>  {
> -	return xfs_inode_walk(mp, XFS_INODE_WALK_INACTIVE,
> +	int			error;
> +
> +	error = xfs_inode_walk(mp, XFS_INODE_WALK_INACTIVE,
>  			xfs_inactive_inode, eofb, XFS_ICI_INACTIVE_TAG);
> +	wake_up(&mp->m_inactive_wait);
> +	return error;
>  }
>  
>  /* Try to get inode inactivation moving. */
> @@ -2138,6 +2143,37 @@ xfs_inodegc_force(
>  	flush_workqueue(mp->m_gc_workqueue);
>  }
>  
> +/*
> + * Force all inode inactivation work to run immediately, and poll until the
> + * work is complete.  Callers should only use this function if they must
> + * inactivate inodes while holding VFS locks, and must be prepared to prevent
> + * or to wait for inodes that are queued for inactivation while this runs.
> + */
> +void
> +xfs_inodegc_force_poll(
> +	struct xfs_mount	*mp)
> +{
> +	struct xfs_perag	*pag;
> +	xfs_agnumber_t		agno;
> +	bool			queued = false;
> +
> +	for_each_perag_tag(mp, agno, pag, XFS_ICI_INACTIVE_TAG)
> +		queued |= xfs_inodegc_force_pag(pag);
> +	if (!queued)
> +		return;
> +
> +	/*
> +	 * Touch the softlockup watchdog every 1/10th of a second while there
> +	 * are still inactivation-tagged inodes in the filesystem.
> +	 */
> +	while (!wait_event_timeout(mp->m_inactive_wait,
> +				   !radix_tree_tagged(&mp->m_perag_tree,
> +						      XFS_ICI_INACTIVE_TAG),
> +				   HZ / 10)) {
> +		touch_softlockup_watchdog();
> +	}
> +}

This looks like a deadlock waiting to be tripped over. As long as
there is something still able to queue inodes for inactivation,
that radix tree tag check will always trigger and put us back to
sleep.
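
i.e. a hypothetical interleaving (thread names made up) is:

	polling thread				queueing thread
	--------------				---------------
	kicks the per-AG workers
	wait_event_timeout(..., HZ / 10)
						tags another inode for
						inactivation in m_perag_tree
	times out, tag still set
	touch_softlockup_watchdog()
	back to sleep, forever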

Also, in terms of workqueues, this is a "sync flush" because we
are waiting for it. e.g. the difference between cancel_work() and
cancel_work_sync() is that the latter waits for all the work in
progress to complete before returning and the former doesn't wait...
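
A minimal sketch of that distinction, using the stock workqueue API
(the work item is made up for illustration):

	#include <linux/workqueue.h>

	static struct work_struct frob_work;	/* hypothetical */

	static void frob_stop(void)
	{
		/* Dequeues if not yet running, but returns immediately
		 * even if the callback is still executing elsewhere. */
		cancel_work(&frob_work);
	}

	static void frob_stop_sync(void)
	{
		/* Dequeues and also waits for a running callback to
		 * finish, so nothing is in flight on return. */
		cancel_work_sync(&frob_work);
	}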

Cheers,

Dave.
Darrick J. Wong March 24, 2021, 3:34 a.m. UTC | #2
On Wed, Mar 24, 2021 at 09:31:58AM +1100, Dave Chinner wrote:
> On Wed, Mar 10, 2021 at 07:06:41PM -0800, Darrick J. Wong wrote:
> > [...]
> 
> This looks like a deadlock waiting to be tripped over. As long as
> there is something still able to queue inodes for inactivation,
> that radix tree tag check will always trigger and put us back to
> sleep.

Yes, I know this is a total livelock vector.  This ugly function exists
to avoid stall warnings when the VFS has called us with s_umount held
and there are a lot of inodes to process.

As the function comment points out, callers must prevent anyone else
from inactivating inodes or be prepared to deal with the consequences,
which the current callers are prepared to do.

I can't think of a better way to handle this, since we need to bail out
of the workqueue flush periodically to make the softlockup thing happy.
Alternately we could just let the stall warnings happen and deal with
the people who file a bug for every stack trace they see the kernel emit.

> Also, in terms of workqueues, this is a "sync flush" because we
> are waiting for it. e.g. the difference between cancel_work() and
> cancel_work_sync() is that the latter waits for all the work in
> progress to complete before returning and the former doesn't wait...

Yeah, I'll change all the xfs_inodegc_force_* -> xfs_inodegc_flush_*.

--D

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com

Patch

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index d5f580b92e48..9db2beb4e732 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -25,6 +25,7 @@ 
 #include "xfs_ialloc.h"
 
 #include <linux/iversion.h>
+#include <linux/nmi.h>
 
 /*
  * Allocate and initialise an xfs_inode.
@@ -2067,8 +2068,12 @@  xfs_inodegc_free_space(
 	struct xfs_mount	*mp,
 	struct xfs_eofblocks	*eofb)
 {
-	return xfs_inode_walk(mp, XFS_INODE_WALK_INACTIVE,
+	int			error;
+
+	error = xfs_inode_walk(mp, XFS_INODE_WALK_INACTIVE,
 			xfs_inactive_inode, eofb, XFS_ICI_INACTIVE_TAG);
+	wake_up(&mp->m_inactive_wait);
+	return error;
 }
 
 /* Try to get inode inactivation moving. */
@@ -2138,6 +2143,37 @@  xfs_inodegc_force(
 	flush_workqueue(mp->m_gc_workqueue);
 }
 
+/*
+ * Force all inode inactivation work to run immediately, and poll until the
+ * work is complete.  Callers should only use this function if they must
+ * inactivate inodes while holding VFS locks, and must be prepared to prevent
+ * or to wait for inodes that are queued for inactivation while this runs.
+ */
+void
+xfs_inodegc_force_poll(
+	struct xfs_mount	*mp)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+	bool			queued = false;
+
+	for_each_perag_tag(mp, agno, pag, XFS_ICI_INACTIVE_TAG)
+		queued |= xfs_inodegc_force_pag(pag);
+	if (!queued)
+		return;
+
+	/*
+	 * Touch the softlockup watchdog every 1/10th of a second while there
+	 * are still inactivation-tagged inodes in the filesystem.
+	 */
+	while (!wait_event_timeout(mp->m_inactive_wait,
+				   !radix_tree_tagged(&mp->m_perag_tree,
+						      XFS_ICI_INACTIVE_TAG),
+				   HZ / 10)) {
+		touch_softlockup_watchdog();
+	}
+}
+
 /* Stop all queued inactivation work. */
 void
 xfs_inodegc_stop(
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 9d5a1f4c0369..80a79bace641 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -84,6 +84,7 @@  void xfs_blockgc_start(struct xfs_mount *mp);
 
 void xfs_inodegc_worker(struct work_struct *work);
 void xfs_inodegc_force(struct xfs_mount *mp);
+void xfs_inodegc_force_poll(struct xfs_mount *mp);
 void xfs_inodegc_stop(struct xfs_mount *mp);
 void xfs_inodegc_start(struct xfs_mount *mp);
 int xfs_inodegc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index a5963061485c..1012b1b361ba 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1109,7 +1109,7 @@  xfs_unmountfs(
 	 * Since this can involve finobt updates, do it now before we lose the
 	 * per-AG space reservations.
 	 */
-	xfs_inodegc_force(mp);
+	xfs_inodegc_force_poll(mp);
 
 	xfs_blockgc_stop(mp);
 	xfs_fs_unreserve_ag_blocks(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 835c07d00cd7..23d9888d2b82 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -213,6 +213,11 @@  typedef struct xfs_mount {
 	unsigned int		*m_errortag;
 	struct xfs_kobj		m_errortag_kobj;
 #endif
+	/*
+	 * Use this to wait for the inode inactivation workqueue to finish
+	 * inactivating all the inodes.
+	 */
+	struct wait_queue_head	m_inactive_wait;
 } xfs_mount_t;
 
 #define M_IGEO(mp)		(&(mp)->m_ino_geo)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 566e5657c1b0..8329a3efced7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1754,7 +1754,7 @@  xfs_remount_ro(
 	 * Since this can involve finobt updates, do it now before we lose the
 	 * per-AG space reservations.
 	 */
-	xfs_inodegc_force(mp);
+	xfs_inodegc_force_poll(mp);
 
 	/* Free the per-AG metadata reservation pool. */
 	error = xfs_fs_unreserve_ag_blocks(mp);
@@ -1880,6 +1880,7 @@  static int xfs_init_fs_context(
 	INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 	mp->m_kobj.kobject.kset = xfs_kset;
+	init_waitqueue_head(&mp->m_inactive_wait);
 	/*
 	 * We don't create the finobt per-ag space reservation until after log
 	 * recovery, so we must set this to true so that an ifree transaction