diff mbox series

[23/27] libxfs: use PSI information to detect memory pressure

Message ID 20201015072155.1631135-24-david@fromorbit.com (mailing list archive)
State Deferred, archived
Headers show
Series xfsprogs: xfs_buf unification and AIO | expand

Commit Message

Dave Chinner Oct. 15, 2020, 7:21 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

The buffer cache needs to have a reliable trigger for shrinking
the cache. Modern kernels track and report memory pressure events to
the userspace via the Pressure Stall Interface (PSI). Create a PSI
memory pressure monitoring thread to listen for memory pressure
events and use that to drive buffer cache shrinking interfaces.

Add the shrinker framework that will allow us to implement LRU
reclaim of buffers when memory pressure occurs.  We also create a
low memory detection and reclaim wait mechanism to allow us to
throttle back new allocations while we are shrinking the buffer
cache.

We also include malloc heap trimming callouts so that once the
shrinker frees the memory, we trim the malloc heap to release the
freed memory back to the system.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 libxfs/buftarg.c     | 142 ++++++++++++++++++++++++++++++++++++++++++-
 libxfs/xfs_buftarg.h |   9 +++
 2 files changed, 150 insertions(+), 1 deletion(-)

Comments

Darrick J. Wong Oct. 15, 2020, 5:56 p.m. UTC | #1
On Thu, Oct 15, 2020 at 06:21:51PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> The buffer cache needs to have a reliable trigger for shrinking
> the cache. Modern kernels track and report memory pressure events to
> the userspace via the Pressure Stall Interface (PSI). Create a PSI
> memory pressure monitoring thread to listen for memory pressure
> events and use that to drive buffer cache shrinking interfaces.
> 
> Add the shrinker framework that will allow us to implement LRU
> reclaim of buffers when memory pressure occues.  We also create a
> low memory detection and reclaim wait mechanism to allow use to
> throttle back new allocations while we are shrinking the buffer
> cache.
> 
> We also include malloc heap trimming callouts so that once the
> shrinker frees the memory, we trim the malloc heap to release the
> freed memory back to the system.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  libxfs/buftarg.c     | 142 ++++++++++++++++++++++++++++++++++++++++++-
>  libxfs/xfs_buftarg.h |   9 +++
>  2 files changed, 150 insertions(+), 1 deletion(-)
> 
> diff --git a/libxfs/buftarg.c b/libxfs/buftarg.c
> index 42806e433715..6c7142d41eb1 100644
> --- a/libxfs/buftarg.c
> +++ b/libxfs/buftarg.c
> @@ -62,6 +62,128 @@ xfs_buftarg_setsize_early(
>  	return xfs_buftarg_setsize(btp, bsize);
>  }
>  
> +/*
> + * Scan a chunk of the buffer cache and drop LRU reference counts. If the
> + * count goes to zero, dispose of the buffer.
> + */
> +static void
> +xfs_buftarg_shrink(
> +	struct xfs_buftarg	*btc)
> +{
> +	/*
> +	 * Make the fact we are in memory reclaim externally visible. This
> +	 * allows buffer cache allocation throttling while we are trying to
> +	 * free memory.
> +	 */
> +	atomic_inc_return(&btc->bt_low_mem);
> +
> +	fprintf(stderr, "Got memory pressure event. Shrinking caches!\n");
> +
> +	/*
> +	 * Now we've free a bunch of memory, trim the heap down to release the
> +	 * freed memory back to the kernel and reduce the pressure we are
> +	 * placing on the system.
> +	 */
> +	malloc_trim(0);
> +
> +	/*
> +	 * Done, wake anyone waiting on memory reclaim to complete.
> +	 */
> +	atomic_dec_return(&btc->bt_low_mem);
> +	complete(&btc->bt_low_mem_wait);
> +}
> +
> +static void *
> +xfs_buftarg_shrinker(
> +	void			*args)
> +{
> +	struct xfs_buftarg	*btp = args;
> +	struct pollfd		 fds = {
> +		.fd = btp->bt_psi_fd,
> +		.events = POLLPRI,
> +	};
> +
> +	rcu_register_thread();
> +	while (!btp->bt_exiting) {
> +		int	n;
> +
> +		n = poll(&fds, 1, 100);
> +		if (n == 0)
> +			continue;	/* timeout */
> +		if (n < 0) {
> +			perror("poll(PSI)");
> +			break;
> +		}
> +		if (fds.revents & POLLERR) {
> +			fprintf(stderr,
> +				"poll(psi) POLLERR: event source dead?\n");
> +			break;
> +		}
> +		if (!(fds.revents & POLLPRI)) {
> +			fprintf(stderr,
> +				"poll(psi): unknown event.  Ignoring.\n");
> +			continue;
> +		}
> +
> +		/* run the shrinker here */
> +		xfs_buftarg_shrink(btp);
> +
> +	}
> +	rcu_unregister_thread();
> +	return NULL;
> +}
> +
> +/*
> + * This only picks up on global memory pressure. Maybe in future we can detect
> + * whether we are running inside a container and use the PSI information for the
> + * container.
> + *
> + * We want relatively early notification of memory pressure stalls because
> + * xfs_repair will consume lots of memory. Hence set a low trigger threshold for
> + * reclaim to run - a partial stall of 5ms over a 1s sample period will trigger

The trigger string looks like it's configuring for a partial stall of
10ms over a 1s sample period?

> + * reclaim algorithms.
> + */
> +static int
> +xfs_buftarg_mempressue_init(

xfs_buftarg_mempressure_init() ?

> +	struct xfs_buftarg	*btp)
> +{
> +	const char		*fname = "/proc/pressure/memory";
> +	const char		*trigger = "some 10000 1000000";
> +	int			error;
> +
> +	btp->bt_psi_fd = open(fname, O_RDWR | O_NONBLOCK);
> +	if (btp->bt_psi_fd < 0) {
> +		perror("open(PSI)");
> +		return -errno;
> +	}
> +	if (write(btp->bt_psi_fd, trigger, strlen(trigger) + 1) !=
> +						strlen(trigger) + 1) {
> +		perror("write(PSI)");
> +		error = -errno;
> +		goto out_close;
> +	}
> +
> +	atomic_set(&btp->bt_low_mem, 0);
> +	init_completion(&btp->bt_low_mem_wait);
> +
> +	/*
> +	 * Now create the monitoring reclaim thread. This will run until the
> +	 * buftarg is torn down.
> +	 */
> +	error = pthread_create(&btp->bt_psi_tid, NULL,
> +				xfs_buftarg_shrinker, btp);
> +	if (error)
> +		goto out_close;
> +
> +	return 0;
> +
> +out_close:
> +	close(btp->bt_psi_fd);
> +	btp->bt_psi_fd = -1;
> +	return error;
> +}
> +
> +
>  struct xfs_buftarg *
>  xfs_buftarg_alloc(
>  	struct xfs_mount	*mp,
> @@ -74,6 +196,8 @@ xfs_buftarg_alloc(
>  	btp->bt_mount = mp;
>  	btp->bt_fd = libxfs_device_to_fd(bdev);
>  	btp->bt_bdev = bdev;
> +	btp->bt_psi_fd = -1;
> +	btp->bt_exiting = false;
>  
>  	if (xfs_buftarg_setsize_early(btp))
>  		goto error_free;
> @@ -84,8 +208,13 @@ xfs_buftarg_alloc(
>  	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
>  		goto error_lru;
>  
> +	if (xfs_buftarg_mempressue_init(btp))

So what happens if PSI isn't enabled or procfs isn't mounted yet?
xfs_repair just ... fails?  That seems disappointing, particularly if
the admin is trying to fix a dead root fs from the initramfs premount
shell and /proc isn't set up yet.

Hmm, looks like Debian actually /does/ set up procfs nowadays.  Still,
if we're going to add a hard requirement on CONFIG_PSI=y and
CONFIG_PSI_DEFAULT_DISABLED=n, we need to advertise this kind of loudly.

(Personally, I thought that if there's no pressure stall information,
we'd just fall back to not having a shrinker and daring the system to
OOM us like it does now...)

--D

> +		goto error_pcp;
> +
>  	return btp;
>  
> +error_pcp:
> +	percpu_counter_destroy(&btp->bt_io_count);
>  error_lru:
>  	list_lru_destroy(&btp->bt_lru);
>  error_free:
> @@ -97,6 +226,12 @@ void
>  xfs_buftarg_free(
>  	struct xfs_buftarg	*btp)
>  {
> +	btp->bt_exiting = true;
> +	if (btp->bt_psi_tid)
> +		pthread_join(btp->bt_psi_tid, NULL);
> +	if (btp->bt_psi_fd >= 0)
> +		close(btp->bt_psi_fd);
> +
>  	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
>  	percpu_counter_destroy(&btp->bt_io_count);
>  	platform_flush_device(btp->bt_fd, btp->bt_bdev);
> @@ -121,10 +256,15 @@ xfs_buf_allocate_memory(
>  	struct xfs_buf		*bp,
>  	uint			flags)
>  {
> +	struct xfs_buftarg	*btp = bp->b_target;
>  	size_t			size;
>  
> +	/* Throttle allocation while dealing with low memory events */
> +	while (atomic_read(&btp->bt_low_mem))
> +		wait_for_completion(&btp->bt_low_mem_wait);
> +
>  	size = BBTOB(bp->b_length);
> -	bp->b_addr = memalign(bp->b_target->bt_meta_sectorsize, size);
> +	bp->b_addr = memalign(btp->bt_meta_sectorsize, size);
>  	if (!bp->b_addr)
>  		return -ENOMEM;
>  	return 0;
> diff --git a/libxfs/xfs_buftarg.h b/libxfs/xfs_buftarg.h
> index 798980fdafeb..d2ce47e22545 100644
> --- a/libxfs/xfs_buftarg.h
> +++ b/libxfs/xfs_buftarg.h
> @@ -41,7 +41,16 @@ struct xfs_buftarg {
>  
>  	uint32_t		bt_io_count;
>  	unsigned int		flags;
> +
> +	/*
> +	 * Memory pressure (PSI) and cache reclaim infrastructure
> +	 */
>  	struct list_lru		bt_lru;
> +	int			bt_psi_fd;
> +	pthread_t		bt_psi_tid;
> +	bool			bt_exiting;
> +	bool			bt_low_mem;
> +	struct completion	bt_low_mem_wait;
>  };
>  
>  /* We purged a dirty buffer and lost a write. */
> -- 
> 2.28.0
>
Dave Chinner Oct. 15, 2020, 9:20 p.m. UTC | #2
On Thu, Oct 15, 2020 at 10:56:11AM -0700, Darrick J. Wong wrote:
> On Thu, Oct 15, 2020 at 06:21:51PM +1100, Dave Chinner wrote:
> > @@ -74,6 +196,8 @@ xfs_buftarg_alloc(
> >  	btp->bt_mount = mp;
> >  	btp->bt_fd = libxfs_device_to_fd(bdev);
> >  	btp->bt_bdev = bdev;
> > +	btp->bt_psi_fd = -1;
> > +	btp->bt_exiting = false;
> >  
> >  	if (xfs_buftarg_setsize_early(btp))
> >  		goto error_free;
> > @@ -84,8 +208,13 @@ xfs_buftarg_alloc(
> >  	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
> >  		goto error_lru;
> >  
> > +	if (xfs_buftarg_mempressue_init(btp))
> 
> So what happens if PSI isn't enabled or procfs isn't mounted yet?
> xfs_repair just ... fails?  That seems disappointing, particularly if
> the admin is trying to fix a dead root fs from the initramfs premount
> shell and /proc isn't set up yet.

Yes, right now it just fails. I'm more interested right now in
getting the new infrastructure working such that the kernel buffer
cache "just works" when there's more metadata than RAM to cache it
in.

> Hmm, looks like Debian actually /does/ set up procfs nowadays.  Still,
> if we're going to add a hard requirement on CONFIG_PSI=y and
> CONFIG_PSI_DEFAULT_DISABLED=n, we need to advertise this kind of loudly.
> 
> (Personally, I thought that if there's no pressure stall information,
> we'd just fall back to not having a shrinker and daring the system to
> OOM us like it does now...)

Well, the existing buffer cache does have a shrinker mechanism - it
will shake the cache down when it is full to free up old buffers.
That's what all the MRU lists and buffer priority stuff in the
repair prefetch code is all about.

repair tries to bound the maximum size of the buffer cache and
prevent OOM that way. If it calculates that the memory requirement
is larger than RAM, that's when it gets into OOM trouble because we
still allow it to use lots of memory and then just hope...

I kind of want to get away from all those messy static heuristics.
I'd much prefer that we do dynamic cache growth detection and size
calculations in repair and determine if we should purge the cache at
the end of each AG or retain it in RAM. i.e. if ((per ag cache size
* no. of AGs) > 75% RAM) then purge the AG cache when the phase scan
is done. This way we run with minimal caching (just what is needed
for prefetching to be efficient) when it is likely we can't fit all
the metadata in RAM, and otherwise we behave like we currently do.

That sort of setup will go a long way to avoiding OOM kill and the
need for actual memory shrinkers to activate. This mode could be
activated if the PSI information is not there, hence might also
solve most of the rescue situation problems.

Cheers,

Dave.
diff mbox series

Patch

diff --git a/libxfs/buftarg.c b/libxfs/buftarg.c
index 42806e433715..6c7142d41eb1 100644
--- a/libxfs/buftarg.c
+++ b/libxfs/buftarg.c
@@ -62,6 +62,128 @@  xfs_buftarg_setsize_early(
 	return xfs_buftarg_setsize(btp, bsize);
 }
 
+/*
+ * Scan a chunk of the buffer cache and drop LRU reference counts. If the
+ * count goes to zero, dispose of the buffer.
+ */
+static void
+xfs_buftarg_shrink(
+	struct xfs_buftarg	*btc)
+{
+	/*
+	 * Make the fact we are in memory reclaim externally visible. This
+	 * allows buffer cache allocation throttling while we are trying to
+	 * free memory.
+	 */
+	atomic_inc_return(&btc->bt_low_mem);
+
+	fprintf(stderr, "Got memory pressure event. Shrinking caches!\n");
+
+	/*
+	 * Now we've freed a bunch of memory, trim the heap down to release the
+	 * freed memory back to the kernel and reduce the pressure we are
+	 * placing on the system.
+	 */
+	malloc_trim(0);
+
+	/*
+	 * Done, wake anyone waiting on memory reclaim to complete.
+	 */
+	atomic_dec_return(&btc->bt_low_mem);
+	complete(&btc->bt_low_mem_wait);
+}
+
+static void *
+xfs_buftarg_shrinker(
+	void			*args)
+{
+	struct xfs_buftarg	*btp = args;
+	struct pollfd		 fds = {
+		.fd = btp->bt_psi_fd,
+		.events = POLLPRI,
+	};
+
+	rcu_register_thread();
+	while (!btp->bt_exiting) {
+		int	n;
+
+		n = poll(&fds, 1, 100);
+		if (n == 0)
+			continue;	/* timeout */
+		if (n < 0) {
+			perror("poll(PSI)");
+			break;
+		}
+		if (fds.revents & POLLERR) {
+			fprintf(stderr,
+				"poll(psi) POLLERR: event source dead?\n");
+			break;
+		}
+		if (!(fds.revents & POLLPRI)) {
+			fprintf(stderr,
+				"poll(psi): unknown event.  Ignoring.\n");
+			continue;
+		}
+
+		/* run the shrinker here */
+		xfs_buftarg_shrink(btp);
+
+	}
+	rcu_unregister_thread();
+	return NULL;
+}
+
+/*
+ * This only picks up on global memory pressure. Maybe in future we can detect
+ * whether we are running inside a container and use the PSI information for the
+ * container.
+ *
+ * We want relatively early notification of memory pressure stalls because
+ * xfs_repair will consume lots of memory. Hence set a low trigger threshold for
+ * reclaim to run - a partial stall of 10ms over a 1s sample period will
+ * trigger reclaim algorithms.
+ */
+static int
+xfs_buftarg_mempressue_init(
+	struct xfs_buftarg	*btp)
+{
+	const char		*fname = "/proc/pressure/memory";
+	const char		*trigger = "some 10000 1000000";
+	int			error;
+
+	btp->bt_psi_fd = open(fname, O_RDWR | O_NONBLOCK);
+	if (btp->bt_psi_fd < 0) {
+		perror("open(PSI)");
+		return -errno;
+	}
+	if (write(btp->bt_psi_fd, trigger, strlen(trigger) + 1) !=
+						strlen(trigger) + 1) {
+		perror("write(PSI)");
+		error = -errno;
+		goto out_close;
+	}
+
+	atomic_set(&btp->bt_low_mem, 0);
+	init_completion(&btp->bt_low_mem_wait);
+
+	/*
+	 * Now create the monitoring reclaim thread. This will run until the
+	 * buftarg is torn down.
+	 */
+	error = pthread_create(&btp->bt_psi_tid, NULL,
+				xfs_buftarg_shrinker, btp);
+	if (error)
+		goto out_close;
+
+	return 0;
+
+out_close:
+	close(btp->bt_psi_fd);
+	btp->bt_psi_fd = -1;
+	return error;
+}
+
+
 struct xfs_buftarg *
 xfs_buftarg_alloc(
 	struct xfs_mount	*mp,
@@ -74,6 +196,8 @@  xfs_buftarg_alloc(
 	btp->bt_mount = mp;
 	btp->bt_fd = libxfs_device_to_fd(bdev);
 	btp->bt_bdev = bdev;
+	btp->bt_psi_fd = -1;
+	btp->bt_exiting = false;
 
 	if (xfs_buftarg_setsize_early(btp))
 		goto error_free;
@@ -84,8 +208,13 @@  xfs_buftarg_alloc(
 	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
 		goto error_lru;
 
+	if (xfs_buftarg_mempressue_init(btp))
+		goto error_pcp;
+
 	return btp;
 
+error_pcp:
+	percpu_counter_destroy(&btp->bt_io_count);
 error_lru:
 	list_lru_destroy(&btp->bt_lru);
 error_free:
@@ -97,6 +226,12 @@  void
 xfs_buftarg_free(
 	struct xfs_buftarg	*btp)
 {
+	btp->bt_exiting = true;
+	if (btp->bt_psi_tid)
+		pthread_join(btp->bt_psi_tid, NULL);
+	if (btp->bt_psi_fd >= 0)
+		close(btp->bt_psi_fd);
+
 	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
 	percpu_counter_destroy(&btp->bt_io_count);
 	platform_flush_device(btp->bt_fd, btp->bt_bdev);
@@ -121,10 +256,15 @@  xfs_buf_allocate_memory(
 	struct xfs_buf		*bp,
 	uint			flags)
 {
+	struct xfs_buftarg	*btp = bp->b_target;
 	size_t			size;
 
+	/* Throttle allocation while dealing with low memory events */
+	while (atomic_read(&btp->bt_low_mem))
+		wait_for_completion(&btp->bt_low_mem_wait);
+
 	size = BBTOB(bp->b_length);
-	bp->b_addr = memalign(bp->b_target->bt_meta_sectorsize, size);
+	bp->b_addr = memalign(btp->bt_meta_sectorsize, size);
 	if (!bp->b_addr)
 		return -ENOMEM;
 	return 0;
diff --git a/libxfs/xfs_buftarg.h b/libxfs/xfs_buftarg.h
index 798980fdafeb..d2ce47e22545 100644
--- a/libxfs/xfs_buftarg.h
+++ b/libxfs/xfs_buftarg.h
@@ -41,7 +41,16 @@  struct xfs_buftarg {
 
 	uint32_t		bt_io_count;
 	unsigned int		flags;
+
+	/*
+	 * Memory pressure (PSI) and cache reclaim infrastructure
+	 */
 	struct list_lru		bt_lru;
+	int			bt_psi_fd;
+	pthread_t		bt_psi_tid;
+	bool			bt_exiting;
+	bool			bt_low_mem;
+	struct completion	bt_low_mem_wait;
 };
 
 /* We purged a dirty buffer and lost a write. */