Message ID | 20201015072155.1631135-24-david@fromorbit.com (mailing list archive) |
---|---|
State | Deferred, archived |
Headers | show |
Series | xfsprogs: xfs_buf unification and AIO | expand |
On Thu, Oct 15, 2020 at 06:21:51PM +1100, Dave Chinner wrote: > From: Dave Chinner <dchinner@redhat.com> > > The buffer cache needs to have a reliable trigger for shrinking > the cache. Modern kernels track and report memory pressure events to > the userspace via the Pressure Stall Interface (PSI). Create a PSI > memory pressure monitoring thread to listen for memory pressure > events and use that to drive buffer cache shrinking interfaces. > > Add the shrinker framework that will allow us to implement LRU > reclaim of buffers when memory pressure occurs. We also create a > low memory detection and reclaim wait mechanism to allow us to > throttle back new allocations while we are shrinking the buffer > cache. > > We also include malloc heap trimming callouts so that once the > shrinker frees the memory, we trim the malloc heap to release the > freed memory back to the system. > > Signed-off-by: Dave Chinner <dchinner@redhat.com> > --- > libxfs/buftarg.c | 142 ++++++++++++++++++++++++++++++++++++++++++- > libxfs/xfs_buftarg.h | 9 +++ > 2 files changed, 150 insertions(+), 1 deletion(-) > > diff --git a/libxfs/buftarg.c b/libxfs/buftarg.c > index 42806e433715..6c7142d41eb1 100644 > --- a/libxfs/buftarg.c > +++ b/libxfs/buftarg.c > @@ -62,6 +62,128 @@ xfs_buftarg_setsize_early( > return xfs_buftarg_setsize(btp, bsize); > } > > +/* > + * Scan a chunk of the buffer cache and drop LRU reference counts. If the > + * count goes to zero, dispose of the buffer. > + */ > +static void > +xfs_buftarg_shrink( > + struct xfs_buftarg *btc) > +{ > + /* > + * Make the fact we are in memory reclaim externally visible. This > + * allows buffer cache allocation throttling while we are trying to > + * free memory. > + */ > + atomic_inc_return(&btc->bt_low_mem); > + > + fprintf(stderr, "Got memory pressure event. 
Shrinking caches!\n"); > + > + /* > + * Now we've free a bunch of memory, trim the heap down to release the > + * freed memory back to the kernel and reduce the pressure we are > + * placing on the system. > + */ > + malloc_trim(0); > + > + /* > + * Done, wake anyone waiting on memory reclaim to complete. > + */ > + atomic_dec_return(&btc->bt_low_mem); > + complete(&btc->bt_low_mem_wait); > +} > + > +static void * > +xfs_buftarg_shrinker( > + void *args) > +{ > + struct xfs_buftarg *btp = args; > + struct pollfd fds = { > + .fd = btp->bt_psi_fd, > + .events = POLLPRI, > + }; > + > + rcu_register_thread(); > + while (!btp->bt_exiting) { > + int n; > + > + n = poll(&fds, 1, 100); > + if (n == 0) > + continue; /* timeout */ > + if (n < 0) { > + perror("poll(PSI)"); > + break; > + } > + if (fds.revents & POLLERR) { > + fprintf(stderr, > + "poll(psi) POLLERR: event source dead?\n"); > + break; > + } > + if (!(fds.revents & POLLPRI)) { > + fprintf(stderr, > + "poll(psi): unknown event. Ignoring.\n"); > + continue; > + } > + > + /* run the shrinker here */ > + xfs_buftarg_shrink(btp); > + > + } > + rcu_unregister_thread(); > + return NULL; > +} > + > +/* > + * This only picks up on global memory pressure. Maybe in future we can detect > + * whether we are running inside a container and use the PSI information for the > + * container. > + * > + * We want relatively early notification of memory pressure stalls because > + * xfs_repair will consume lots of memory. Hence set a low trigger threshold for > + * reclaim to run - a partial stall of 5ms over a 1s sample period will trigger The trigger string looks like it's configuring for a partial stall of 10ms over a 1s sample period? > + * reclaim algorithms. > + */ > +static int > +xfs_buftarg_mempressue_init( xfs_buftarg_mempressure_init() ? 
> + struct xfs_buftarg *btp) > +{ > + const char *fname = "/proc/pressure/memory"; > + const char *trigger = "some 10000 1000000"; > + int error; > + > + btp->bt_psi_fd = open(fname, O_RDWR | O_NONBLOCK); > + if (btp->bt_psi_fd < 0) { > + perror("open(PSI)"); > + return -errno; > + } > + if (write(btp->bt_psi_fd, trigger, strlen(trigger) + 1) != > + strlen(trigger) + 1) { > + perror("write(PSI)"); > + error = -errno; > + goto out_close; > + } > + > + atomic_set(&btp->bt_low_mem, 0); > + init_completion(&btp->bt_low_mem_wait); > + > + /* > + * Now create the monitoring reclaim thread. This will run until the > + * buftarg is torn down. > + */ > + error = pthread_create(&btp->bt_psi_tid, NULL, > + xfs_buftarg_shrinker, btp); > + if (error) > + goto out_close; > + > + return 0; > + > +out_close: > + close(btp->bt_psi_fd); > + btp->bt_psi_fd = -1; > + return error; > +} > + > + > struct xfs_buftarg * > xfs_buftarg_alloc( > struct xfs_mount *mp, > @@ -74,6 +196,8 @@ xfs_buftarg_alloc( > btp->bt_mount = mp; > btp->bt_fd = libxfs_device_to_fd(bdev); > btp->bt_bdev = bdev; > + btp->bt_psi_fd = -1; > + btp->bt_exiting = false; > > if (xfs_buftarg_setsize_early(btp)) > goto error_free; > @@ -84,8 +208,13 @@ xfs_buftarg_alloc( > if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) > goto error_lru; > > + if (xfs_buftarg_mempressue_init(btp)) So what happens if PSI isn't enabled or procfs isn't mounted yet? xfs_repair just ... fails? That seems disappointing, particularly if the admin is trying to fix a dead root fs from the initramfs premount shell and /proc isn't set up yet. Hmm, looks like Debian actually /does/ set up procfs nowadays. Still, if we're going to add a hard requirement on CONFIG_PSI=y and CONFIG_PSI_DEFAULT_DISABLED=n, we need to advertise this kind of loudly. (Personally, I thought that if there's no pressure stall information, we'd just fall back to not having a shrinker and daring the system to OOM us like it does now...) 
--D > + goto error_pcp; > + > return btp; > > +error_pcp: > + percpu_counter_destroy(&btp->bt_io_count); > error_lru: > list_lru_destroy(&btp->bt_lru); > error_free: > @@ -97,6 +226,12 @@ void > xfs_buftarg_free( > struct xfs_buftarg *btp) > { > + btp->bt_exiting = true; > + if (btp->bt_psi_tid) > + pthread_join(btp->bt_psi_tid, NULL); > + if (btp->bt_psi_fd >= 0) > + close(btp->bt_psi_fd); > + > ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); > percpu_counter_destroy(&btp->bt_io_count); > platform_flush_device(btp->bt_fd, btp->bt_bdev); > @@ -121,10 +256,15 @@ xfs_buf_allocate_memory( > struct xfs_buf *bp, > uint flags) > { > + struct xfs_buftarg *btp = bp->b_target; > size_t size; > > + /* Throttle allocation while dealing with low memory events */ > + while (atomic_read(&btp->bt_low_mem)) > + wait_for_completion(&btp->bt_low_mem_wait); > + > size = BBTOB(bp->b_length); > - bp->b_addr = memalign(bp->b_target->bt_meta_sectorsize, size); > + bp->b_addr = memalign(btp->bt_meta_sectorsize, size); > if (!bp->b_addr) > return -ENOMEM; > return 0; > diff --git a/libxfs/xfs_buftarg.h b/libxfs/xfs_buftarg.h > index 798980fdafeb..d2ce47e22545 100644 > --- a/libxfs/xfs_buftarg.h > +++ b/libxfs/xfs_buftarg.h > @@ -41,7 +41,16 @@ struct xfs_buftarg { > > uint32_t bt_io_count; > unsigned int flags; > + > + /* > + * Memory pressure (PSI) and cache reclaim infrastructure > + */ > struct list_lru bt_lru; > + int bt_psi_fd; > + pthread_t bt_psi_tid; > + bool bt_exiting; > + bool bt_low_mem; > + struct completion bt_low_mem_wait; > }; > > /* We purged a dirty buffer and lost a write. */ > -- > 2.28.0 >
On Thu, Oct 15, 2020 at 10:56:11AM -0700, Darrick J. Wong wrote: > On Thu, Oct 15, 2020 at 06:21:51PM +1100, Dave Chinner wrote: > > @@ -74,6 +196,8 @@ xfs_buftarg_alloc( > > btp->bt_mount = mp; > > btp->bt_fd = libxfs_device_to_fd(bdev); > > btp->bt_bdev = bdev; > > + btp->bt_psi_fd = -1; > > + btp->bt_exiting = false; > > > > if (xfs_buftarg_setsize_early(btp)) > > goto error_free; > > @@ -84,8 +208,13 @@ xfs_buftarg_alloc( > > if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) > > goto error_lru; > > > > + if (xfs_buftarg_mempressue_init(btp)) > > So what happens if PSI isn't enabled or procfs isn't mounted yet? > xfs_repair just ... fails? That seems disappointing, particularly if > the admin is trying to fix a dead root fs from the initramfs premount > shell and /proc isn't set up yet. Yes, right now it just fails. I'm more interested right now in getting the new infrastructure working such that the kernel buffer cache "just works" when there's more metadata than RAM to cache it in. > Hmm, looks like Debian actually /does/ set up procfs nowadays. Still, > if we're going to add a hard requirement on CONFIG_PSI=y and > CONFIG_PSI_DEFAULT_DISABLED=n, we need to advertise this kind of loudly. > > (Personally, I thought that if there's no pressure stall information, > we'd just fall back to not having a shrinker and daring the system to > OOM us like it does now...) Well, the existing buffer cache does have a shrinker mechanism - it will shake the cache down when it is full to free up old buffers. That's what all the MRU lists and buffer priority stuff in the repair prefetch code is all about. repair tries to bound the maximum size of the buffer cache and prevent OOM that way. If it calculates that the memory requirement is larger than RAM, that's when it gets into OOM trouble because we still allow it to use lots of memory and then just hope... I kind of want to get away from all those messy static heuristics. 
I'd much prefer that we do dynamic cache growth detection and size calculations in repair and determine if we should purge the cache at the end of each AG or retain it in RAM. i.e. if ((per ag cache size * no. of AGs) > 75% RAM) then purge the AG cache when the phase scan is done. This way we run with minimal caching (just what is needed for prefetching to be efficient) when it is likely we can't fit all the metadata in RAM, and otherwise we behave like we currently do. That sort of setup will go a long way to avoiding OOM kill and the need for actual memory shrinkers to activate. This mode could be activated if the PSI information is not there, hence might also solve most of the rescue situation problems. Cheers, Dave.
diff --git a/libxfs/buftarg.c b/libxfs/buftarg.c index 42806e433715..6c7142d41eb1 100644 --- a/libxfs/buftarg.c +++ b/libxfs/buftarg.c @@ -62,6 +62,128 @@ xfs_buftarg_setsize_early( return xfs_buftarg_setsize(btp, bsize); } +/* + * Scan a chunk of the buffer cache and drop LRU reference counts. If the + * count goes to zero, dispose of the buffer. + */ +static void +xfs_buftarg_shrink( + struct xfs_buftarg *btc) +{ + /* + * Make the fact we are in memory reclaim externally visible. This + * allows buffer cache allocation throttling while we are trying to + * free memory. + */ + atomic_inc_return(&btc->bt_low_mem); + + fprintf(stderr, "Got memory pressure event. Shrinking caches!\n"); + + /* + * Now we've free a bunch of memory, trim the heap down to release the + * freed memory back to the kernel and reduce the pressure we are + * placing on the system. + */ + malloc_trim(0); + + /* + * Done, wake anyone waiting on memory reclaim to complete. + */ + atomic_dec_return(&btc->bt_low_mem); + complete(&btc->bt_low_mem_wait); +} + +static void * +xfs_buftarg_shrinker( + void *args) +{ + struct xfs_buftarg *btp = args; + struct pollfd fds = { + .fd = btp->bt_psi_fd, + .events = POLLPRI, + }; + + rcu_register_thread(); + while (!btp->bt_exiting) { + int n; + + n = poll(&fds, 1, 100); + if (n == 0) + continue; /* timeout */ + if (n < 0) { + perror("poll(PSI)"); + break; + } + if (fds.revents & POLLERR) { + fprintf(stderr, + "poll(psi) POLLERR: event source dead?\n"); + break; + } + if (!(fds.revents & POLLPRI)) { + fprintf(stderr, + "poll(psi): unknown event. Ignoring.\n"); + continue; + } + + /* run the shrinker here */ + xfs_buftarg_shrink(btp); + + } + rcu_unregister_thread(); + return NULL; +} + +/* + * This only picks up on global memory pressure. Maybe in future we can detect + * whether we are running inside a container and use the PSI information for the + * container. 
+ * + * We want relatively early notification of memory pressure stalls because + * xfs_repair will consume lots of memory. Hence set a low trigger threshold for + * reclaim to run - a partial stall of 5ms over a 1s sample period will trigger + * reclaim algorithms. + */ +static int +xfs_buftarg_mempressue_init( + struct xfs_buftarg *btp) +{ + const char *fname = "/proc/pressure/memory"; + const char *trigger = "some 10000 1000000"; + int error; + + btp->bt_psi_fd = open(fname, O_RDWR | O_NONBLOCK); + if (btp->bt_psi_fd < 0) { + perror("open(PSI)"); + return -errno; + } + if (write(btp->bt_psi_fd, trigger, strlen(trigger) + 1) != + strlen(trigger) + 1) { + perror("write(PSI)"); + error = -errno; + goto out_close; + } + + atomic_set(&btp->bt_low_mem, 0); + init_completion(&btp->bt_low_mem_wait); + + /* + * Now create the monitoring reclaim thread. This will run until the + * buftarg is torn down. + */ + error = pthread_create(&btp->bt_psi_tid, NULL, + xfs_buftarg_shrinker, btp); + if (error) + goto out_close; + + return 0; + +out_close: + close(btp->bt_psi_fd); + btp->bt_psi_fd = -1; + return error; +} + + struct xfs_buftarg * xfs_buftarg_alloc( struct xfs_mount *mp, @@ -74,6 +196,8 @@ xfs_buftarg_alloc( btp->bt_mount = mp; btp->bt_fd = libxfs_device_to_fd(bdev); btp->bt_bdev = bdev; + btp->bt_psi_fd = -1; + btp->bt_exiting = false; if (xfs_buftarg_setsize_early(btp)) goto error_free; @@ -84,8 +208,13 @@ xfs_buftarg_alloc( if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) goto error_lru; + if (xfs_buftarg_mempressue_init(btp)) + goto error_pcp; + return btp; +error_pcp: + percpu_counter_destroy(&btp->bt_io_count); error_lru: list_lru_destroy(&btp->bt_lru); error_free: @@ -97,6 +226,12 @@ void xfs_buftarg_free( struct xfs_buftarg *btp) { + btp->bt_exiting = true; + if (btp->bt_psi_tid) + pthread_join(btp->bt_psi_tid, NULL); + if (btp->bt_psi_fd >= 0) + close(btp->bt_psi_fd); + ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); 
percpu_counter_destroy(&btp->bt_io_count); platform_flush_device(btp->bt_fd, btp->bt_bdev); @@ -121,10 +256,15 @@ xfs_buf_allocate_memory( struct xfs_buf *bp, uint flags) { + struct xfs_buftarg *btp = bp->b_target; size_t size; + /* Throttle allocation while dealing with low memory events */ + while (atomic_read(&btp->bt_low_mem)) + wait_for_completion(&btp->bt_low_mem_wait); + size = BBTOB(bp->b_length); - bp->b_addr = memalign(bp->b_target->bt_meta_sectorsize, size); + bp->b_addr = memalign(btp->bt_meta_sectorsize, size); if (!bp->b_addr) return -ENOMEM; return 0; diff --git a/libxfs/xfs_buftarg.h b/libxfs/xfs_buftarg.h index 798980fdafeb..d2ce47e22545 100644 --- a/libxfs/xfs_buftarg.h +++ b/libxfs/xfs_buftarg.h @@ -41,7 +41,16 @@ struct xfs_buftarg { uint32_t bt_io_count; unsigned int flags; + + /* + * Memory pressure (PSI) and cache reclaim infrastructure + */ struct list_lru bt_lru; + int bt_psi_fd; + pthread_t bt_psi_tid; + bool bt_exiting; + bool bt_low_mem; + struct completion bt_low_mem_wait; }; /* We purged a dirty buffer and lost a write. */