[19/30] xfs: allow multiple reclaimers per AG
diff mbox series

Message ID 20200604074606.266213-20-david@fromorbit.com
State Superseded
Headers show
Series
  • xfs: rework inode flushing to make inode reclaim fully asynchronous
Related show

Commit Message

Dave Chinner June 4, 2020, 7:45 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Inode reclaim will still throttle direct reclaim on the per-ag
reclaim locks. This is no longer necessary as reclaim can run
non-blocking now. Hence we can remove these locks so that we don't
arbitrarily block reclaimers just because there are more direct
reclaimers than there are AGs.

This can result in multiple reclaimers working on the same range of
an AG, but this doesn't cause any apparent issues. Optimising the
spread of concurrent reclaimers for best efficiency can be done in a
future patchset.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_icache.c | 31 ++++++++++++-------------------
 fs/xfs/xfs_mount.c  |  4 ----
 fs/xfs/xfs_mount.h  |  1 -
 3 files changed, 12 insertions(+), 24 deletions(-)

Comments

Brian Foster June 5, 2020, 4:26 p.m. UTC | #1
On Thu, Jun 04, 2020 at 05:45:55PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Inode reclaim will still throttle direct reclaim on the per-ag
> reclaim locks. This is no longer necessary as reclaim can run
> non-blocking now. Hence we can remove these locks so that we don't
> arbitrarily block reclaimers just because there are more direct
> reclaimers than there are AGs.
> 
> This can result in multiple reclaimers working on the same range of
> an AG, but this doesn't cause any apparent issues. Optimising the
> spread of concurrent reclaimers for best efficiency can be done in a
> future patchset.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_icache.c | 31 ++++++++++++-------------------
>  fs/xfs/xfs_mount.c  |  4 ----
>  fs/xfs/xfs_mount.h  |  1 -
>  3 files changed, 12 insertions(+), 24 deletions(-)
> 
> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> index 74032316ce5cc..c4ba8d7bc45bc 100644
> --- a/fs/xfs/xfs_icache.c
> +++ b/fs/xfs/xfs_icache.c
...
> @@ -1298,11 +1293,9 @@ xfs_reclaim_inodes_ag(
>  
>  		} while (nr_found && !done && *nr_to_scan > 0);
>  
> -		if (trylock && !done)
> -			pag->pag_ici_reclaim_cursor = first_index;
> -		else
> -			pag->pag_ici_reclaim_cursor = 0;
> -		mutex_unlock(&pag->pag_ici_reclaim_lock);
> +		if (done)
> +			first_index = 0;
> +		WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);

I thought the [READ|WRITE]_ONCE() macros had to do with ordering, not
necessarily atomicity. Is this write safe if we're running a 32-bit
kernel, for example? Outside of that the broader functional change seems
reasonable.

Brian

>  		xfs_perag_put(pag);
>  	}
>  	return skipped;
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index d5dcf98698600..03158b42a1943 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -148,7 +148,6 @@ xfs_free_perag(
>  		ASSERT(atomic_read(&pag->pag_ref) == 0);
>  		xfs_iunlink_destroy(pag);
>  		xfs_buf_hash_destroy(pag);
> -		mutex_destroy(&pag->pag_ici_reclaim_lock);
>  		call_rcu(&pag->rcu_head, __xfs_free_perag);
>  	}
>  }
> @@ -200,7 +199,6 @@ xfs_initialize_perag(
>  		pag->pag_agno = index;
>  		pag->pag_mount = mp;
>  		spin_lock_init(&pag->pag_ici_lock);
> -		mutex_init(&pag->pag_ici_reclaim_lock);
>  		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
>  		if (xfs_buf_hash_init(pag))
>  			goto out_free_pag;
> @@ -242,7 +240,6 @@ xfs_initialize_perag(
>  out_hash_destroy:
>  	xfs_buf_hash_destroy(pag);
>  out_free_pag:
> -	mutex_destroy(&pag->pag_ici_reclaim_lock);
>  	kmem_free(pag);
>  out_unwind_new_pags:
>  	/* unwind any prior newly initialized pags */
> @@ -252,7 +249,6 @@ xfs_initialize_perag(
>  			break;
>  		xfs_buf_hash_destroy(pag);
>  		xfs_iunlink_destroy(pag);
> -		mutex_destroy(&pag->pag_ici_reclaim_lock);
>  		kmem_free(pag);
>  	}
>  	return error;
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 3725d25ad97e8..a72cfcaa4ad12 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -354,7 +354,6 @@ typedef struct xfs_perag {
>  	spinlock_t	pag_ici_lock;	/* incore inode cache lock */
>  	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
>  	int		pag_ici_reclaimable;	/* reclaimable inodes */
> -	struct mutex	pag_ici_reclaim_lock;	/* serialisation point */
>  	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
>  
>  	/* buffer cache index */
> -- 
> 2.26.2.761.g0e0b3e54be
>
Dave Chinner June 5, 2020, 9:07 p.m. UTC | #2
On Fri, Jun 05, 2020 at 12:26:11PM -0400, Brian Foster wrote:
> On Thu, Jun 04, 2020 at 05:45:55PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Inode reclaim will still throttle direct reclaim on the per-ag
> > reclaim locks. This is no longer necessary as reclaim can run
> > non-blocking now. Hence we can remove these locks so that we don't
> > arbitrarily block reclaimers just because there are more direct
> > reclaimers than there are AGs.
> > 
> > This can result in multiple reclaimers working on the same range of
> > an AG, but this doesn't cause any apparent issues. Optimising the
> > spread of concurrent reclaimers for best efficiency can be done in a
> > future patchset.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/xfs_icache.c | 31 ++++++++++++-------------------
> >  fs/xfs/xfs_mount.c  |  4 ----
> >  fs/xfs/xfs_mount.h  |  1 -
> >  3 files changed, 12 insertions(+), 24 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> > index 74032316ce5cc..c4ba8d7bc45bc 100644
> > --- a/fs/xfs/xfs_icache.c
> > +++ b/fs/xfs/xfs_icache.c
> ...
> > @@ -1298,11 +1293,9 @@ xfs_reclaim_inodes_ag(
> >  
> >  		} while (nr_found && !done && *nr_to_scan > 0);
> >  
> > -		if (trylock && !done)
> > -			pag->pag_ici_reclaim_cursor = first_index;
> > -		else
> > -			pag->pag_ici_reclaim_cursor = 0;
> > -		mutex_unlock(&pag->pag_ici_reclaim_lock);
> > +		if (done)
> > +			first_index = 0;
> > +		WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
> 
> I thought the [READ|WRITE]_ONCE() macros had to do with ordering, not
> necessarily atomicity. Is this write safe if we're running a 32-bit
> kernel, for example? Outside of that the broader functional change seems
> reasonable.

They are used for documenting intentional data races now, too.
That's what these are - we don't care about serialisation, but there
are static checkers that will now spew "data race" warnings because
multiple threads can race reading and writing unserialised
variables.

It is safe on 32 bit machines because these variables are 32 bit on
32 bit machines, and reads/writes of 32 bit variables on 32 bit
machines are atomic (though not serialised).

Cheers,

Dave.
Brian Foster June 8, 2020, 4:44 p.m. UTC | #3
On Sat, Jun 06, 2020 at 07:07:46AM +1000, Dave Chinner wrote:
> On Fri, Jun 05, 2020 at 12:26:11PM -0400, Brian Foster wrote:
> > On Thu, Jun 04, 2020 at 05:45:55PM +1000, Dave Chinner wrote:
> > > From: Dave Chinner <dchinner@redhat.com>
> > > 
> > > Inode reclaim will still throttle direct reclaim on the per-ag
> > > reclaim locks. This is no longer necessary as reclaim can run
> > > non-blocking now. Hence we can remove these locks so that we don't
> > > arbitrarily block reclaimers just because there are more direct
> > > reclaimers than there are AGs.
> > > 
> > > This can result in multiple reclaimers working on the same range of
> > > an AG, but this doesn't cause any apparent issues. Optimising the
> > > spread of concurrent reclaimers for best efficiency can be done in a
> > > future patchset.
> > > 
> > > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > >  fs/xfs/xfs_icache.c | 31 ++++++++++++-------------------
> > >  fs/xfs/xfs_mount.c  |  4 ----
> > >  fs/xfs/xfs_mount.h  |  1 -
> > >  3 files changed, 12 insertions(+), 24 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> > > index 74032316ce5cc..c4ba8d7bc45bc 100644
> > > --- a/fs/xfs/xfs_icache.c
> > > +++ b/fs/xfs/xfs_icache.c
> > ...
> > > @@ -1298,11 +1293,9 @@ xfs_reclaim_inodes_ag(
> > >  
> > >  		} while (nr_found && !done && *nr_to_scan > 0);
> > >  
> > > -		if (trylock && !done)
> > > -			pag->pag_ici_reclaim_cursor = first_index;
> > > -		else
> > > -			pag->pag_ici_reclaim_cursor = 0;
> > > -		mutex_unlock(&pag->pag_ici_reclaim_lock);
> > > +		if (done)
> > > +			first_index = 0;
> > > +		WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
> > 
> > I thought the [READ|WRITE]_ONCE() macros had to do with ordering, not
> > necessarily atomicity. Is this write safe if we're running a 32-bit
> > kernel, for example? Outside of that the broader functional change seems
> > reasonable.
> 
> They are used for documenting intentional data races now, too.
> That's what these are - we don't care about serialisation, but there
> are static checkers that will now spew "data race" warnings because
> multiple threads can race reading and writing unserialised
> variables.
> 

I wasn't aware of that. I'm not sure how widely known that is so it
might be worth a one liner comment to ensure these are preserved (if
they survive the end of the series).

> It is safe on 32 bit machines because these variables are 32 bit on
> 32 bit machines, and reads/writes of 32 bit variables on 32 bit
> machines are atomic (though not serialised).
> 

Ah, right. I was thinking they were always 64-bit but that is not the
case. With that:

Reviewed-by: Brian Foster <bfoster@redhat.com>

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>

Patch
diff mbox series

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 74032316ce5cc..c4ba8d7bc45bc 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1211,12 +1211,9 @@  xfs_reclaim_inodes_ag(
 	int			*nr_to_scan)
 {
 	struct xfs_perag	*pag;
-	xfs_agnumber_t		ag;
-	int			trylock = flags & SYNC_TRYLOCK;
-	int			skipped;
+	xfs_agnumber_t		ag = 0;
+	int			skipped = 0;
 
-	ag = 0;
-	skipped = 0;
 	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
 		unsigned long	first_index = 0;
 		int		done = 0;
@@ -1224,15 +1221,13 @@  xfs_reclaim_inodes_ag(
 
 		ag = pag->pag_agno + 1;
 
-		if (trylock) {
-			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
-				skipped++;
-				xfs_perag_put(pag);
-				continue;
-			}
-			first_index = pag->pag_ici_reclaim_cursor;
-		} else
-			mutex_lock(&pag->pag_ici_reclaim_lock);
+		/*
+		 * If the cursor is not zero, we haven't scanned the whole AG
+		 * so we might have skipped inodes here.
+		 */
+		first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
+		if (first_index)
+			skipped++;
 
 		do {
 			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
@@ -1298,11 +1293,9 @@  xfs_reclaim_inodes_ag(
 
 		} while (nr_found && !done && *nr_to_scan > 0);
 
-		if (trylock && !done)
-			pag->pag_ici_reclaim_cursor = first_index;
-		else
-			pag->pag_ici_reclaim_cursor = 0;
-		mutex_unlock(&pag->pag_ici_reclaim_lock);
+		if (done)
+			first_index = 0;
+		WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
 		xfs_perag_put(pag);
 	}
 	return skipped;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d5dcf98698600..03158b42a1943 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -148,7 +148,6 @@  xfs_free_perag(
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
 		xfs_iunlink_destroy(pag);
 		xfs_buf_hash_destroy(pag);
-		mutex_destroy(&pag->pag_ici_reclaim_lock);
 		call_rcu(&pag->rcu_head, __xfs_free_perag);
 	}
 }
@@ -200,7 +199,6 @@  xfs_initialize_perag(
 		pag->pag_agno = index;
 		pag->pag_mount = mp;
 		spin_lock_init(&pag->pag_ici_lock);
-		mutex_init(&pag->pag_ici_reclaim_lock);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
 		if (xfs_buf_hash_init(pag))
 			goto out_free_pag;
@@ -242,7 +240,6 @@  xfs_initialize_perag(
 out_hash_destroy:
 	xfs_buf_hash_destroy(pag);
 out_free_pag:
-	mutex_destroy(&pag->pag_ici_reclaim_lock);
 	kmem_free(pag);
 out_unwind_new_pags:
 	/* unwind any prior newly initialized pags */
@@ -252,7 +249,6 @@  xfs_initialize_perag(
 			break;
 		xfs_buf_hash_destroy(pag);
 		xfs_iunlink_destroy(pag);
-		mutex_destroy(&pag->pag_ici_reclaim_lock);
 		kmem_free(pag);
 	}
 	return error;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 3725d25ad97e8..a72cfcaa4ad12 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -354,7 +354,6 @@  typedef struct xfs_perag {
 	spinlock_t	pag_ici_lock;	/* incore inode cache lock */
 	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
-	struct mutex	pag_ici_reclaim_lock;	/* serialisation point */
 	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
 
 	/* buffer cache index */