diff mbox series

[24/27] libxfs: add a buftarg cache shrinker implementation

Message ID 20201015072155.1631135-25-david@fromorbit.com (mailing list archive)
State New
Headers show
Series xfsprogs: xfs_buf unification and AIO | expand

Commit Message

Dave Chinner Oct. 15, 2020, 7:21 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Add a list_lru scanner that runs from the memory pressure detection
to free an amount of the buffer cache that will keep the cache from
growing when there is memory pressure.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 libxfs/buftarg.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

Comments

Darrick J. Wong Oct. 15, 2020, 6:01 p.m. UTC | #1
On Thu, Oct 15, 2020 at 06:21:52PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Add a list_lru scanner that runs from the memory pressure detection
> to free an amount of the buffer cache that will keep the cache from
> growing when there is memory pressure.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  libxfs/buftarg.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 51 insertions(+)
> 
> diff --git a/libxfs/buftarg.c b/libxfs/buftarg.c
> index 6c7142d41eb1..8332bf3341b6 100644
> --- a/libxfs/buftarg.c
> +++ b/libxfs/buftarg.c
> @@ -62,6 +62,19 @@ xfs_buftarg_setsize_early(
>  	return xfs_buftarg_setsize(btp, bsize);
>  }
>  
> +static void
> +dispose_list(
> +	struct list_head	*dispose)
> +{
> +	struct xfs_buf		*bp;
> +
> +	while (!list_empty(dispose)) {
> +		bp = list_first_entry(dispose, struct xfs_buf, b_lru);
> +		list_del_init(&bp->b_lru);
> +		xfs_buf_rele(bp);
> +	}
> +}
> +
>  /*
>   * Scan a chunk of the buffer cache and drop LRU reference counts. If the
>   * count goes to zero, dispose of the buffer.
> @@ -70,6 +83,13 @@ static void
>  xfs_buftarg_shrink(
>  	struct xfs_buftarg	*btc)
>  {
> +	struct list_lru		*lru = &btc->bt_lru;
> +	struct xfs_buf		*bp;
> +	int			count;
> +	int			progress = 16384;
> +	int			rotate = 0;
> +	LIST_HEAD(dispose);
> +
>  	/*
>  	 * Make the fact we are in memory reclaim externally visible. This
>  	 * allows buffer cache allocation throttling while we are trying to
> @@ -79,6 +99,37 @@ xfs_buftarg_shrink(
>  
>  	fprintf(stderr, "Got memory pressure event. Shrinking caches!\n");
>  
> +	spin_lock(&lru->l_lock);
> +	count = lru->l_count / 50;	/* 2% */

If I'm reading this correctly, we react to a memory pressure event by
trying to skim 2% of the oldest disposable buffers off the buftarg LRU?
And every 16384 loop iterations we'll dispose the list even if we
haven't gotten our 2% yet?  How did you arrive at 2%?

(Also, I'm assuming that some of these stderr printfs will at some point
get turned into tracepoints or dbg_printf or the like?)

--D

> +	fprintf(stderr, "cache size before %ld/%d\n", lru->l_count, count);
> +	while (count-- > 0 && !list_empty(&lru->l_lru)) {
> +		bp = list_first_entry(&lru->l_lru, struct xfs_buf, b_lru);
> +		spin_lock(&bp->b_lock);
> +		if (!atomic_add_unless(&bp->b_lru_ref, -1, 1)) {
> +			atomic_set(&bp->b_lru_ref, 0);
> +			bp->b_state |= XFS_BSTATE_DISPOSE;
> +			list_move(&bp->b_lru, &dispose);
> +			lru->l_count--;
> +		} else {
> +			rotate++;
> +			list_move_tail(&bp->b_lru, &lru->l_lru);
> +		}
> +
> +		spin_unlock(&bp->b_lock);
> +		if (--progress == 0) {
> +			fprintf(stderr, "Disposing! rotated %d, lru %ld\n", rotate, lru->l_count);
> +			spin_unlock(&lru->l_lock);
> +			dispose_list(&dispose);
> +			spin_lock(&lru->l_lock);
> +			progress = 16384;
> +			rotate = 0;
> +		}
> +	}
> +	spin_unlock(&lru->l_lock);
> +
> +	dispose_list(&dispose);
> +	fprintf(stderr, "cache size after %ld, count remaining %d\n", lru->l_count, count);
> +
>  	/*
>  	 * Now we've free a bunch of memory, trim the heap down to release the
>  	 * freed memory back to the kernel and reduce the pressure we are
> -- 
> 2.28.0
>
Dave Chinner Oct. 15, 2020, 9:33 p.m. UTC | #2
On Thu, Oct 15, 2020 at 11:01:41AM -0700, Darrick J. Wong wrote:
> On Thu, Oct 15, 2020 at 06:21:52PM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Add a list_lru scanner that runs from the memory pressure detection
> > to free an amount of the buffer cache that will keep the cache from
> > growing when there is memory pressure.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  libxfs/buftarg.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 51 insertions(+)
> > 
> > diff --git a/libxfs/buftarg.c b/libxfs/buftarg.c
> > index 6c7142d41eb1..8332bf3341b6 100644
> > --- a/libxfs/buftarg.c
> > +++ b/libxfs/buftarg.c
> > @@ -62,6 +62,19 @@ xfs_buftarg_setsize_early(
> >  	return xfs_buftarg_setsize(btp, bsize);
> >  }
> >  
> > +static void
> > +dispose_list(
> > +	struct list_head	*dispose)
> > +{
> > +	struct xfs_buf		*bp;
> > +
> > +	while (!list_empty(dispose)) {
> > +		bp = list_first_entry(dispose, struct xfs_buf, b_lru);
> > +		list_del_init(&bp->b_lru);
> > +		xfs_buf_rele(bp);
> > +	}
> > +}
> > +
> >  /*
> >   * Scan a chunk of the buffer cache and drop LRU reference counts. If the
> >   * count goes to zero, dispose of the buffer.
> > @@ -70,6 +83,13 @@ static void
> >  xfs_buftarg_shrink(
> >  	struct xfs_buftarg	*btc)
> >  {
> > +	struct list_lru		*lru = &btc->bt_lru;
> > +	struct xfs_buf		*bp;
> > +	int			count;
> > +	int			progress = 16384;
> > +	int			rotate = 0;
> > +	LIST_HEAD(dispose);
> > +
> >  	/*
> >  	 * Make the fact we are in memory reclaim externally visible. This
> >  	 * allows buffer cache allocation throttling while we are trying to
> > @@ -79,6 +99,37 @@ xfs_buftarg_shrink(
> >  
> >  	fprintf(stderr, "Got memory pressure event. Shrinking caches!\n");
> >  
> > +	spin_lock(&lru->l_lock);
> > +	count = lru->l_count / 50;	/* 2% */
> 
> If I'm reading this correctly, we react to a memory pressure event by
> trying to skim 2% of the oldest disposable buffers off the buftarg LRU?
> And every 16384 loop iterations we'll dispose the list even if we
> haven't gotten our 2% yet?  How did you arrive at 2%?

Yup, 2% was the number I came up with. It's a trade-off between
scanning enough to keep the cache growth in check but not so much as
to trash the entire cache as stall events roll in. Also, the PSI
monitor will report at most 1 event per second with the current
config, so the amount of work the shrinker does doesn't need to
consume lots of time.

The system I was testing on ended up OOMing at around 1.2M cached
buffers. Hence each invocation was scanning ~25-30k buffers. This
was sufficient to control memory usage, without the PSI stall event
tail trashing the cache once repair-triggered memory pressure had
been brought under control.

The "progress" thing is just a way of batching up the work so that
we free memory sooner. The number of 16384 was from when I was
discovering how this behaved and I was trimming up to 50% of the
cache in a single event. I needed some kind of progress indication
while it stalled for seconds freeing memory. It may stay, it may go.
We'll see.

> (Also, I'm assuming that some of these stderr printfs will at some point
> get turned into tracepoints or dbg_printf or the like?)

Maybe. I would prefer the tracepoint model over dbg_printf(), but
that's not something I'm thinking about right now...

Note that this shrinker does not rotate buffers by default. The
kernel rotates buffers once through the LRU before they are
reclaimed. If I try to do that with PSI events, the system is
completely out of memory by the time 5-6 events have been delivered,
and we get OOM killed. Hence it reclaims immediately, but that can
be tuned for repair by converting the cache priorities for buffers
into LRU references...

Cheers,

Dave.
diff mbox series

Patch

diff --git a/libxfs/buftarg.c b/libxfs/buftarg.c
index 6c7142d41eb1..8332bf3341b6 100644
--- a/libxfs/buftarg.c
+++ b/libxfs/buftarg.c
@@ -62,6 +62,19 @@  xfs_buftarg_setsize_early(
 	return xfs_buftarg_setsize(btp, bsize);
 }
 
+static void
+dispose_list(
+	struct list_head	*dispose)
+{
+	struct xfs_buf		*bp;
+
+	while (!list_empty(dispose)) {
+		bp = list_first_entry(dispose, struct xfs_buf, b_lru);
+		list_del_init(&bp->b_lru);
+		xfs_buf_rele(bp);
+	}
+}
+
 /*
  * Scan a chunk of the buffer cache and drop LRU reference counts. If the
  * count goes to zero, dispose of the buffer.
@@ -70,6 +83,13 @@  static void
 xfs_buftarg_shrink(
 	struct xfs_buftarg	*btc)
 {
+	struct list_lru		*lru = &btc->bt_lru;
+	struct xfs_buf		*bp;
+	int			count;
+	int			progress = 16384;
+	int			rotate = 0;
+	LIST_HEAD(dispose);
+
 	/*
 	 * Make the fact we are in memory reclaim externally visible. This
 	 * allows buffer cache allocation throttling while we are trying to
@@ -79,6 +99,37 @@  xfs_buftarg_shrink(
 
 	fprintf(stderr, "Got memory pressure event. Shrinking caches!\n");
 
+	spin_lock(&lru->l_lock);
+	count = lru->l_count / 50;	/* 2% */
+	fprintf(stderr, "cache size before %ld/%d\n", lru->l_count, count);
+	while (count-- > 0 && !list_empty(&lru->l_lru)) {
+		bp = list_first_entry(&lru->l_lru, struct xfs_buf, b_lru);
+		spin_lock(&bp->b_lock);
+		if (!atomic_add_unless(&bp->b_lru_ref, -1, 1)) {
+			atomic_set(&bp->b_lru_ref, 0);
+			bp->b_state |= XFS_BSTATE_DISPOSE;
+			list_move(&bp->b_lru, &dispose);
+			lru->l_count--;
+		} else {
+			rotate++;
+			list_move_tail(&bp->b_lru, &lru->l_lru);
+		}
+
+		spin_unlock(&bp->b_lock);
+		if (--progress == 0) {
+			fprintf(stderr, "Disposing! rotated %d, lru %ld\n", rotate, lru->l_count);
+			spin_unlock(&lru->l_lock);
+			dispose_list(&dispose);
+			spin_lock(&lru->l_lock);
+			progress = 16384;
+			rotate = 0;
+		}
+	}
+	spin_unlock(&lru->l_lock);
+
+	dispose_list(&dispose);
+	fprintf(stderr, "cache size after %ld, count remaining %d\n", lru->l_count, count);
+
 	/*
 	 * Now we've free a bunch of memory, trim the heap down to release the
 	 * freed memory back to the kernel and reduce the pressure we are