| Message ID | 20201103131754.94949-2-laoar.shao@gmail.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | avoid xfs transaction reservation recursion |
On Tue, Nov 03, 2020 at 09:17:53PM +0800, Yafang Shao wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
>
> Since XFS needs to pretend to be kswapd in some of its worker threads,
> create methods to save & restore kswapd state. Don't bother restoring
> kswapd state in kswapd -- the only time we reach this code is when we're
> exiting and the task_struct is about to be destroyed anyway.
>
> Cc: Dave Chinner <david@fromorbit.com>
> Acked-by: Michal Hocko <mhocko@suse.com>
> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> ---
>  fs/xfs/libxfs/xfs_btree.c | 14 ++++++++------
>  include/linux/sched/mm.h  | 23 +++++++++++++++++++++++
>  mm/vmscan.c               | 16 +---------------
>  3 files changed, 32 insertions(+), 21 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> index 2d25bab..a04a442 100644
> --- a/fs/xfs/libxfs/xfs_btree.c
> +++ b/fs/xfs/libxfs/xfs_btree.c
> @@ -2813,8 +2813,9 @@ struct xfs_btree_split_args {
>  {
>      struct xfs_btree_split_args *args = container_of(work,
>                  struct xfs_btree_split_args, work);
> +    bool is_kswapd = args->kswapd;
>      unsigned long pflags;
> -    unsigned long new_pflags = PF_MEMALLOC_NOFS;
> +    int memalloc_nofs;
>
>      /*
>       * we are in a transaction context here, but may also be doing work
> @@ -2822,16 +2823,17 @@ struct xfs_btree_split_args {
>       * temporarily to ensure that we don't block waiting for memory reclaim
>       * in any way.
>       */
> -    if (args->kswapd)
> -        new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
> -
> -    current_set_flags_nested(&pflags, new_pflags);
> +    if (is_kswapd)
> +        pflags = become_kswapd();
> +    memalloc_nofs = memalloc_nofs_save();
>
>      args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
>                      args->key, args->curp, args->stat);
>      complete(args->done);
>
> -    current_restore_flags_nested(&pflags, new_pflags);
> +    memalloc_nofs_restore(memalloc_nofs);
> +    if (is_kswapd)
> +        restore_kswapd(pflags);

Note that there's a trivial merge conflict with the mrlock_t removal
series.  I'll carry the fix in the tree, assuming that everything
passes.

--D

>  }
>
>  /*
> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index d5ece7a..2faf03e 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -278,6 +278,29 @@ static inline void memalloc_nocma_restore(unsigned int flags)
>  }
>  #endif
>
> +/*
> + * Tell the memory management code that this thread is working on behalf
> + * of background memory reclaim (like kswapd). That means that it will
> + * get access to memory reserves should it need to allocate memory in
> + * order to make forward progress. With this great power comes great
> + * responsibility to not exhaust those reserves.
> + */
> +#define KSWAPD_PF_FLAGS (PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD)
> +
> +static inline unsigned long become_kswapd(void)
> +{
> +    unsigned long flags = current->flags & KSWAPD_PF_FLAGS;
> +
> +    current->flags |= KSWAPD_PF_FLAGS;
> +
> +    return flags;
> +}
> +
> +static inline void restore_kswapd(unsigned long flags)
> +{
> +    current->flags &= ~(flags ^ KSWAPD_PF_FLAGS);
> +}
> +
>  #ifdef CONFIG_MEMCG
>  DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
>  /**
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 1b8f0e0..77bc1dd 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3869,19 +3869,7 @@ static int kswapd(void *p)
>      if (!cpumask_empty(cpumask))
>          set_cpus_allowed_ptr(tsk, cpumask);
>
> -    /*
> -     * Tell the memory management that we're a "memory allocator",
> -     * and that if we need more memory we should get access to it
> -     * regardless (see "__alloc_pages()"). "kswapd" should
> -     * never get caught in the normal page freeing logic.
> -     *
> -     * (Kswapd normally doesn't need memory anyway, but sometimes
> -     * you need a small amount of memory in order to be able to
> -     * page out something else, and this flag essentially protects
> -     * us from recursively trying to free more memory as we're
> -     * trying to free the first piece of memory in the first place).
> -     */
> -    tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
> +    become_kswapd();
>      set_freezable();
>
>      WRITE_ONCE(pgdat->kswapd_order, 0);
> @@ -3931,8 +3919,6 @@ static int kswapd(void *p)
>          goto kswapd_try_sleep;
>      }
>
> -    tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
> -
>      return 0;
>  }
>
> --
> 1.8.3.1
>
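[Editor's note: the XOR in restore_kswapd() is the subtle part of the helper pair above. become_kswapd() returns only those KSWAPD_PF_FLAGS bits that were already set, so flags ^ KSWAPD_PF_FLAGS is exactly the set of bits become_kswapd() newly turned on, and the restore clears those bits and no others. A minimal standalone sketch of that logic follows -- not kernel code: current->flags is modelled by a plain global and the PF_* values here are made up for illustration.]

```c
/* Standalone sketch of the become_kswapd()/restore_kswapd() round trip. */
#include <assert.h>
#include <stdio.h>

#define PF_MEMALLOC     0x1     /* illustrative values, not the real PF_* bits */
#define PF_SWAPWRITE    0x2
#define PF_KSWAPD       0x4
#define KSWAPD_PF_FLAGS (PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD)

static unsigned long task_flags;        /* stands in for current->flags */

static unsigned long become_kswapd(void)
{
    /* Save only the KSWAPD_PF_FLAGS bits that were already set. */
    unsigned long flags = task_flags & KSWAPD_PF_FLAGS;

    task_flags |= KSWAPD_PF_FLAGS;
    return flags;
}

static void restore_kswapd(unsigned long flags)
{
    /*
     * flags ^ KSWAPD_PF_FLAGS is exactly the set of bits that were clear
     * before become_kswapd(), so clear those bits and nothing else.
     */
    task_flags &= ~(flags ^ KSWAPD_PF_FLAGS);
}

int main(void)
{
    /* PF_MEMALLOC was already set; it must survive the round trip. */
    task_flags = PF_MEMALLOC;
    unsigned long saved = become_kswapd();

    assert(task_flags == KSWAPD_PF_FLAGS);
    restore_kswapd(saved);
    assert(task_flags == PF_MEMALLOC);
    printf("flags restored: %#lx\n", task_flags);
    return 0;
}
```

This round trip is what makes the helpers safe to call from a thread that may already have some of these flags set.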
On Wed, Nov 4, 2020 at 3:48 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
>
> On Tue, Nov 03, 2020 at 09:17:53PM +0800, Yafang Shao wrote:
> > From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> >
> > Since XFS needs to pretend to be kswapd in some of its worker threads,
> > create methods to save & restore kswapd state. Don't bother restoring
> > kswapd state in kswapd -- the only time we reach this code is when we're
> > exiting and the task_struct is about to be destroyed anyway.
> >
> > Cc: Dave Chinner <david@fromorbit.com>
> > Acked-by: Michal Hocko <mhocko@suse.com>
> > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> > Reviewed-by: Christoph Hellwig <hch@lst.de>
> > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > ---
> >  fs/xfs/libxfs/xfs_btree.c | 14 ++++++++------
> >  include/linux/sched/mm.h  | 23 +++++++++++++++++++++++
> >  mm/vmscan.c               | 16 +---------------
> >  3 files changed, 32 insertions(+), 21 deletions(-)
> >
> > diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> > index 2d25bab..a04a442 100644
> > --- a/fs/xfs/libxfs/xfs_btree.c
> > +++ b/fs/xfs/libxfs/xfs_btree.c
> > @@ -2813,8 +2813,9 @@ struct xfs_btree_split_args {
> >  {
> >      struct xfs_btree_split_args *args = container_of(work,
> >                  struct xfs_btree_split_args, work);
> > +    bool is_kswapd = args->kswapd;
> >      unsigned long pflags;
> > -    unsigned long new_pflags = PF_MEMALLOC_NOFS;
> > +    int memalloc_nofs;
> >
> >      /*
> >       * we are in a transaction context here, but may also be doing work
> > @@ -2822,16 +2823,17 @@ struct xfs_btree_split_args {
> >       * temporarily to ensure that we don't block waiting for memory reclaim
> >       * in any way.
> >       */
> > -    if (args->kswapd)
> > -        new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
> > -
> > -    current_set_flags_nested(&pflags, new_pflags);
> > +    if (is_kswapd)
> > +        pflags = become_kswapd();
> > +    memalloc_nofs = memalloc_nofs_save();
> >
> >      args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
> >                      args->key, args->curp, args->stat);
> >      complete(args->done);
> >
> > -    current_restore_flags_nested(&pflags, new_pflags);
> > +    memalloc_nofs_restore(memalloc_nofs);
> > +    if (is_kswapd)
> > +        restore_kswapd(pflags);
>
> Note that there's a trivial merge conflict with the mrlock_t removal
> series.  I'll carry the fix in the tree, assuming that everything
> passes.
>

This patchset is based on Andrew's tree currently.
Seems I should rebase this patchset on your tree instead of Andrew's tree ?

> --D
>
> >  }
> >
> >  /*
> > diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> > index d5ece7a..2faf03e 100644
> > --- a/include/linux/sched/mm.h
> > +++ b/include/linux/sched/mm.h
> > @@ -278,6 +278,29 @@ static inline void memalloc_nocma_restore(unsigned int flags)
> >  }
> >  #endif
> >
> > +/*
> > + * Tell the memory management code that this thread is working on behalf
> > + * of background memory reclaim (like kswapd). That means that it will
> > + * get access to memory reserves should it need to allocate memory in
> > + * order to make forward progress. With this great power comes great
> > + * responsibility to not exhaust those reserves.
> > + */
> > +#define KSWAPD_PF_FLAGS (PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD)
> > +
> > +static inline unsigned long become_kswapd(void)
> > +{
> > +    unsigned long flags = current->flags & KSWAPD_PF_FLAGS;
> > +
> > +    current->flags |= KSWAPD_PF_FLAGS;
> > +
> > +    return flags;
> > +}
> > +
> > +static inline void restore_kswapd(unsigned long flags)
> > +{
> > +    current->flags &= ~(flags ^ KSWAPD_PF_FLAGS);
> > +}
> > +
> >  #ifdef CONFIG_MEMCG
> >  DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
> >  /**
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 1b8f0e0..77bc1dd 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -3869,19 +3869,7 @@ static int kswapd(void *p)
> >      if (!cpumask_empty(cpumask))
> >          set_cpus_allowed_ptr(tsk, cpumask);
> >
> > -    /*
> > -     * Tell the memory management that we're a "memory allocator",
> > -     * and that if we need more memory we should get access to it
> > -     * regardless (see "__alloc_pages()"). "kswapd" should
> > -     * never get caught in the normal page freeing logic.
> > -     *
> > -     * (Kswapd normally doesn't need memory anyway, but sometimes
> > -     * you need a small amount of memory in order to be able to
> > -     * page out something else, and this flag essentially protects
> > -     * us from recursively trying to free more memory as we're
> > -     * trying to free the first piece of memory in the first place).
> > -     */
> > -    tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
> > +    become_kswapd();
> >      set_freezable();
> >
> >      WRITE_ONCE(pgdat->kswapd_order, 0);
> > @@ -3931,8 +3919,6 @@ static int kswapd(void *p)
> >          goto kswapd_try_sleep;
> >      }
> >
> > -    tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
> > -
> >      return 0;
> >  }
> >
> > --
> > 1.8.3.1
> >
On Wed, Nov 04, 2020 at 10:17:16PM +0800, Yafang Shao wrote:
> On Wed, Nov 4, 2020 at 3:48 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
> >
> > On Tue, Nov 03, 2020 at 09:17:53PM +0800, Yafang Shao wrote:
> > > From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> > >
> > > Since XFS needs to pretend to be kswapd in some of its worker threads,
> > > create methods to save & restore kswapd state. Don't bother restoring
> > > kswapd state in kswapd -- the only time we reach this code is when we're
> > > exiting and the task_struct is about to be destroyed anyway.
> > >
> > > Cc: Dave Chinner <david@fromorbit.com>
> > > Acked-by: Michal Hocko <mhocko@suse.com>
> > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > Reviewed-by: Christoph Hellwig <hch@lst.de>
> > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > > ---
> > >  fs/xfs/libxfs/xfs_btree.c | 14 ++++++++------
> > >  include/linux/sched/mm.h  | 23 +++++++++++++++++++++++
> > >  mm/vmscan.c               | 16 +---------------
> > >  3 files changed, 32 insertions(+), 21 deletions(-)
> > >
> > > diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> > > index 2d25bab..a04a442 100644
> > > --- a/fs/xfs/libxfs/xfs_btree.c
> > > +++ b/fs/xfs/libxfs/xfs_btree.c
> > > @@ -2813,8 +2813,9 @@ struct xfs_btree_split_args {
> > >  {
> > >      struct xfs_btree_split_args *args = container_of(work,
> > >                  struct xfs_btree_split_args, work);
> > > +    bool is_kswapd = args->kswapd;
> > >      unsigned long pflags;
> > > -    unsigned long new_pflags = PF_MEMALLOC_NOFS;
> > > +    int memalloc_nofs;
> > >
> > >      /*
> > >       * we are in a transaction context here, but may also be doing work
> > > @@ -2822,16 +2823,17 @@ struct xfs_btree_split_args {
> > >       * temporarily to ensure that we don't block waiting for memory reclaim
> > >       * in any way.
> > >       */
> > > -    if (args->kswapd)
> > > -        new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
> > > -
> > > -    current_set_flags_nested(&pflags, new_pflags);
> > > +    if (is_kswapd)
> > > +        pflags = become_kswapd();
> > > +    memalloc_nofs = memalloc_nofs_save();
> > >
> > >      args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
> > >                      args->key, args->curp, args->stat);
> > >      complete(args->done);
> > >
> > > -    current_restore_flags_nested(&pflags, new_pflags);
> > > +    memalloc_nofs_restore(memalloc_nofs);
> > > +    if (is_kswapd)
> > > +        restore_kswapd(pflags);
> >
> > Note that there's a trivial merge conflict with the mrlock_t removal
> > series.  I'll carry the fix in the tree, assuming that everything
> > passes.
> >
>
> This patchset is based on Andrew's tree currently.
> Seems I should rebase this patchset on your tree instead of Andrew's tree ?

That depends on whether or not you want me to push these two patches
through the xfs tree or if they're going through Andrew (Morton?)'s
quiltset.

Frankly I'd rather take them via xfs since most of the diff is xfs...

--D

> > --D
> >
> > >  }
> > >
> > >  /*
> > > diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> > > index d5ece7a..2faf03e 100644
> > > --- a/include/linux/sched/mm.h
> > > +++ b/include/linux/sched/mm.h
> > > @@ -278,6 +278,29 @@ static inline void memalloc_nocma_restore(unsigned int flags)
> > >  }
> > >  #endif
> > >
> > > +/*
> > > + * Tell the memory management code that this thread is working on behalf
> > > + * of background memory reclaim (like kswapd). That means that it will
> > > + * get access to memory reserves should it need to allocate memory in
> > > + * order to make forward progress. With this great power comes great
> > > + * responsibility to not exhaust those reserves.
> > > + */
> > > +#define KSWAPD_PF_FLAGS (PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD)
> > > +
> > > +static inline unsigned long become_kswapd(void)
> > > +{
> > > +    unsigned long flags = current->flags & KSWAPD_PF_FLAGS;
> > > +
> > > +    current->flags |= KSWAPD_PF_FLAGS;
> > > +
> > > +    return flags;
> > > +}
> > > +
> > > +static inline void restore_kswapd(unsigned long flags)
> > > +{
> > > +    current->flags &= ~(flags ^ KSWAPD_PF_FLAGS);
> > > +}
> > > +
> > >  #ifdef CONFIG_MEMCG
> > >  DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
> > >  /**
> > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > index 1b8f0e0..77bc1dd 100644
> > > --- a/mm/vmscan.c
> > > +++ b/mm/vmscan.c
> > > @@ -3869,19 +3869,7 @@ static int kswapd(void *p)
> > >      if (!cpumask_empty(cpumask))
> > >          set_cpus_allowed_ptr(tsk, cpumask);
> > >
> > > -    /*
> > > -     * Tell the memory management that we're a "memory allocator",
> > > -     * and that if we need more memory we should get access to it
> > > -     * regardless (see "__alloc_pages()"). "kswapd" should
> > > -     * never get caught in the normal page freeing logic.
> > > -     *
> > > -     * (Kswapd normally doesn't need memory anyway, but sometimes
> > > -     * you need a small amount of memory in order to be able to
> > > -     * page out something else, and this flag essentially protects
> > > -     * us from recursively trying to free more memory as we're
> > > -     * trying to free the first piece of memory in the first place).
> > > -     */
> > > -    tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
> > > +    become_kswapd();
> > >      set_freezable();
> > >
> > >      WRITE_ONCE(pgdat->kswapd_order, 0);
> > > @@ -3931,8 +3919,6 @@ static int kswapd(void *p)
> > >          goto kswapd_try_sleep;
> > >      }
> > >
> > > -    tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
> > > -
> > >      return 0;
> > >  }
> > >
> > > --
> > > 1.8.3.1
> > >
> >
> --
> Thanks
> Yafang
On Wed, Nov 4, 2020 at 8:30 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
>
> On Wed, Nov 04, 2020 at 10:17:16PM +0800, Yafang Shao wrote:
> > On Wed, Nov 4, 2020 at 3:48 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > >
> > > On Tue, Nov 03, 2020 at 09:17:53PM +0800, Yafang Shao wrote:
> > > > From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> > > >
> > > > Since XFS needs to pretend to be kswapd in some of its worker threads,
> > > > create methods to save & restore kswapd state. Don't bother restoring
> > > > kswapd state in kswapd -- the only time we reach this code is when we're
> > > > exiting and the task_struct is about to be destroyed anyway.
> > > >
> > > > Cc: Dave Chinner <david@fromorbit.com>
> > > > Acked-by: Michal Hocko <mhocko@suse.com>
> > > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > Reviewed-by: Christoph Hellwig <hch@lst.de>
> > > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > > > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > > > ---
> > > >  fs/xfs/libxfs/xfs_btree.c | 14 ++++++++------
> > > >  include/linux/sched/mm.h  | 23 +++++++++++++++++++++++
> > > >  mm/vmscan.c               | 16 +---------------
> > > >  3 files changed, 32 insertions(+), 21 deletions(-)
> > > >
> > > > diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> > > > index 2d25bab..a04a442 100644
> > > > --- a/fs/xfs/libxfs/xfs_btree.c
> > > > +++ b/fs/xfs/libxfs/xfs_btree.c
> > > > @@ -2813,8 +2813,9 @@ struct xfs_btree_split_args {
> > > >  {
> > > >      struct xfs_btree_split_args *args = container_of(work,
> > > >                  struct xfs_btree_split_args, work);
> > > > +    bool is_kswapd = args->kswapd;
> > > >      unsigned long pflags;
> > > > -    unsigned long new_pflags = PF_MEMALLOC_NOFS;
> > > > +    int memalloc_nofs;
> > > >
> > > >      /*
> > > >       * we are in a transaction context here, but may also be doing work
> > > > @@ -2822,16 +2823,17 @@ struct xfs_btree_split_args {
> > > >       * temporarily to ensure that we don't block waiting for memory reclaim
> > > >       * in any way.
> > > >       */
> > > > -    if (args->kswapd)
> > > > -        new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
> > > > -
> > > > -    current_set_flags_nested(&pflags, new_pflags);
> > > > +    if (is_kswapd)
> > > > +        pflags = become_kswapd();
> > > > +    memalloc_nofs = memalloc_nofs_save();
> > > >
> > > >      args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
> > > >                      args->key, args->curp, args->stat);
> > > >      complete(args->done);
> > > >
> > > > -    current_restore_flags_nested(&pflags, new_pflags);
> > > > +    memalloc_nofs_restore(memalloc_nofs);
> > > > +    if (is_kswapd)
> > > > +        restore_kswapd(pflags);
> > >
> > > Note that there's a trivial merge conflict with the mrlock_t removal
> > > series.  I'll carry the fix in the tree, assuming that everything
> > > passes.
> > >
> >
> > This patchset is based on Andrew's tree currently.
> > Seems I should rebase this patchset on your tree instead of Andrew's tree ?
>
> That depends on whether or not you want me to push these two patches
> through the xfs tree or if they're going through Andrew (Morton?)'s
> quiltset.

I think they could go through either, but the XFS tree might be
ever-so-slightly better.

> Frankly I'd rather take them via xfs since most of the diff is xfs...

Yeah, it's an XFS patch so let's throw it through the XFS tree.

>
> --D
>
> > > --D
> > >
> > > >  }
> > > >
> > > >  /*
> > > > diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> > > > index d5ece7a..2faf03e 100644
> > > > --- a/include/linux/sched/mm.h
> > > > +++ b/include/linux/sched/mm.h
> > > > @@ -278,6 +278,29 @@ static inline void memalloc_nocma_restore(unsigned int flags)
> > > >  }
> > > >  #endif
> > > >
> > > > +/*
> > > > + * Tell the memory management code that this thread is working on behalf
> > > > + * of background memory reclaim (like kswapd). That means that it will
> > > > + * get access to memory reserves should it need to allocate memory in
> > > > + * order to make forward progress. With this great power comes great
> > > > + * responsibility to not exhaust those reserves.
> > > > + */
> > > > +#define KSWAPD_PF_FLAGS (PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD)
> > > > +
> > > > +static inline unsigned long become_kswapd(void)
> > > > +{
> > > > +    unsigned long flags = current->flags & KSWAPD_PF_FLAGS;
> > > > +
> > > > +    current->flags |= KSWAPD_PF_FLAGS;
> > > > +
> > > > +    return flags;
> > > > +}
> > > > +
> > > > +static inline void restore_kswapd(unsigned long flags)
> > > > +{
> > > > +    current->flags &= ~(flags ^ KSWAPD_PF_FLAGS);
> > > > +}
> > > > +
> > > >  #ifdef CONFIG_MEMCG
> > > >  DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
> > > >  /**
> > > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > > index 1b8f0e0..77bc1dd 100644
> > > > --- a/mm/vmscan.c
> > > > +++ b/mm/vmscan.c
> > > > @@ -3869,19 +3869,7 @@ static int kswapd(void *p)
> > > >      if (!cpumask_empty(cpumask))
> > > >          set_cpus_allowed_ptr(tsk, cpumask);
> > > >
> > > > -    /*
> > > > -     * Tell the memory management that we're a "memory allocator",
> > > > -     * and that if we need more memory we should get access to it
> > > > -     * regardless (see "__alloc_pages()"). "kswapd" should
> > > > -     * never get caught in the normal page freeing logic.
> > > > -     *
> > > > -     * (Kswapd normally doesn't need memory anyway, but sometimes
> > > > -     * you need a small amount of memory in order to be able to
> > > > -     * page out something else, and this flag essentially protects
> > > > -     * us from recursively trying to free more memory as we're
> > > > -     * trying to free the first piece of memory in the first place).
> > > > -     */
> > > > -    tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
> > > > +    become_kswapd();
> > > >      set_freezable();
> > > >
> > > >      WRITE_ONCE(pgdat->kswapd_order, 0);
> > > > @@ -3931,8 +3919,6 @@ static int kswapd(void *p)
> > > >          goto kswapd_try_sleep;
> > > >      }
> > > >
> > > > -    tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
> > > > -
> > > >      return 0;
> > > >  }
> > > >
> > > > --
> > > > 1.8.3.1
> > > >
> > >
> > --
> > Thanks
> > Yafang

Best regards,
Amy Parker
(they/them)
On Thu, Nov 5, 2020 at 12:26 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
>
> On Wed, Nov 04, 2020 at 10:17:16PM +0800, Yafang Shao wrote:
> > On Wed, Nov 4, 2020 at 3:48 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > >
> > > On Tue, Nov 03, 2020 at 09:17:53PM +0800, Yafang Shao wrote:
> > > > From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> > > >
> > > > Since XFS needs to pretend to be kswapd in some of its worker threads,
> > > > create methods to save & restore kswapd state. Don't bother restoring
> > > > kswapd state in kswapd -- the only time we reach this code is when we're
> > > > exiting and the task_struct is about to be destroyed anyway.
> > > >
> > > > Cc: Dave Chinner <david@fromorbit.com>
> > > > Acked-by: Michal Hocko <mhocko@suse.com>
> > > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > Reviewed-by: Christoph Hellwig <hch@lst.de>
> > > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > > > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > > > ---
> > > >  fs/xfs/libxfs/xfs_btree.c | 14 ++++++++------
> > > >  include/linux/sched/mm.h  | 23 +++++++++++++++++++++++
> > > >  mm/vmscan.c               | 16 +---------------
> > > >  3 files changed, 32 insertions(+), 21 deletions(-)
> > > >
> > > > diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> > > > index 2d25bab..a04a442 100644
> > > > --- a/fs/xfs/libxfs/xfs_btree.c
> > > > +++ b/fs/xfs/libxfs/xfs_btree.c
> > > > @@ -2813,8 +2813,9 @@ struct xfs_btree_split_args {
> > > >  {
> > > >      struct xfs_btree_split_args *args = container_of(work,
> > > >                  struct xfs_btree_split_args, work);
> > > > +    bool is_kswapd = args->kswapd;
> > > >      unsigned long pflags;
> > > > -    unsigned long new_pflags = PF_MEMALLOC_NOFS;
> > > > +    int memalloc_nofs;
> > > >
> > > >      /*
> > > >       * we are in a transaction context here, but may also be doing work
> > > > @@ -2822,16 +2823,17 @@ struct xfs_btree_split_args {
> > > >       * temporarily to ensure that we don't block waiting for memory reclaim
> > > >       * in any way.
> > > >       */
> > > > -    if (args->kswapd)
> > > > -        new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
> > > > -
> > > > -    current_set_flags_nested(&pflags, new_pflags);
> > > > +    if (is_kswapd)
> > > > +        pflags = become_kswapd();
> > > > +    memalloc_nofs = memalloc_nofs_save();
> > > >
> > > >      args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
> > > >                      args->key, args->curp, args->stat);
> > > >      complete(args->done);
> > > >
> > > > -    current_restore_flags_nested(&pflags, new_pflags);
> > > > +    memalloc_nofs_restore(memalloc_nofs);
> > > > +    if (is_kswapd)
> > > > +        restore_kswapd(pflags);
> > >
> > > Note that there's a trivial merge conflict with the mrlock_t removal
> > > series.  I'll carry the fix in the tree, assuming that everything
> > > passes.
> > >
> >
> > This patchset is based on Andrew's tree currently.
> > Seems I should rebase this patchset on your tree instead of Andrew's tree ?
>
> That depends on whether or not you want me to push these two patches
> through the xfs tree or if they're going through Andrew (Morton?)'s
> quiltset.
>
> Frankly I'd rather take them via xfs since most of the diff is xfs...
>

Sure, I will rebase it on the xfs tree.

>
> --D
>
> > > --D
> > >
> > > >  }
> > > >
> > > >  /*
> > > > diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> > > > index d5ece7a..2faf03e 100644
> > > > --- a/include/linux/sched/mm.h
> > > > +++ b/include/linux/sched/mm.h
> > > > @@ -278,6 +278,29 @@ static inline void memalloc_nocma_restore(unsigned int flags)
> > > >  }
> > > >  #endif
> > > >
> > > > +/*
> > > > + * Tell the memory management code that this thread is working on behalf
> > > > + * of background memory reclaim (like kswapd). That means that it will
> > > > + * get access to memory reserves should it need to allocate memory in
> > > > + * order to make forward progress. With this great power comes great
> > > > + * responsibility to not exhaust those reserves.
> > > > + */
> > > > +#define KSWAPD_PF_FLAGS (PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD)
> > > > +
> > > > +static inline unsigned long become_kswapd(void)
> > > > +{
> > > > +    unsigned long flags = current->flags & KSWAPD_PF_FLAGS;
> > > > +
> > > > +    current->flags |= KSWAPD_PF_FLAGS;
> > > > +
> > > > +    return flags;
> > > > +}
> > > > +
> > > > +static inline void restore_kswapd(unsigned long flags)
> > > > +{
> > > > +    current->flags &= ~(flags ^ KSWAPD_PF_FLAGS);
> > > > +}
> > > > +
> > > >  #ifdef CONFIG_MEMCG
> > > >  DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
> > > >  /**
> > > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > > index 1b8f0e0..77bc1dd 100644
> > > > --- a/mm/vmscan.c
> > > > +++ b/mm/vmscan.c
> > > > @@ -3869,19 +3869,7 @@ static int kswapd(void *p)
> > > >      if (!cpumask_empty(cpumask))
> > > >          set_cpus_allowed_ptr(tsk, cpumask);
> > > >
> > > > -    /*
> > > > -     * Tell the memory management that we're a "memory allocator",
> > > > -     * and that if we need more memory we should get access to it
> > > > -     * regardless (see "__alloc_pages()"). "kswapd" should
> > > > -     * never get caught in the normal page freeing logic.
> > > > -     *
> > > > -     * (Kswapd normally doesn't need memory anyway, but sometimes
> > > > -     * you need a small amount of memory in order to be able to
> > > > -     * page out something else, and this flag essentially protects
> > > > -     * us from recursively trying to free more memory as we're
> > > > -     * trying to free the first piece of memory in the first place).
> > > > -     */
> > > > -    tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
> > > > +    become_kswapd();
> > > >      set_freezable();
> > > >
> > > >      WRITE_ONCE(pgdat->kswapd_order, 0);
> > > > @@ -3931,8 +3919,6 @@ static int kswapd(void *p)
> > > >          goto kswapd_try_sleep;
> > > >      }
> > > >
> > > > -    tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
> > > > -
> > > >      return 0;
> > > >  }
> > > >
> > > > --
> > > > 1.8.3.1
> > > >
> > >
> > --
> > Thanks
> > Yafang
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 2d25bab..a04a442 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2813,8 +2813,9 @@ struct xfs_btree_split_args {
 {
     struct xfs_btree_split_args *args = container_of(work,
                 struct xfs_btree_split_args, work);
+    bool is_kswapd = args->kswapd;
     unsigned long pflags;
-    unsigned long new_pflags = PF_MEMALLOC_NOFS;
+    int memalloc_nofs;
 
     /*
      * we are in a transaction context here, but may also be doing work
@@ -2822,16 +2823,17 @@ struct xfs_btree_split_args {
      * temporarily to ensure that we don't block waiting for memory reclaim
      * in any way.
      */
-    if (args->kswapd)
-        new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
-
-    current_set_flags_nested(&pflags, new_pflags);
+    if (is_kswapd)
+        pflags = become_kswapd();
+    memalloc_nofs = memalloc_nofs_save();
 
     args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
                     args->key, args->curp, args->stat);
     complete(args->done);
 
-    current_restore_flags_nested(&pflags, new_pflags);
+    memalloc_nofs_restore(memalloc_nofs);
+    if (is_kswapd)
+        restore_kswapd(pflags);
 }
 
 /*
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index d5ece7a..2faf03e 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -278,6 +278,29 @@ static inline void memalloc_nocma_restore(unsigned int flags)
 }
 #endif
 
+/*
+ * Tell the memory management code that this thread is working on behalf
+ * of background memory reclaim (like kswapd). That means that it will
+ * get access to memory reserves should it need to allocate memory in
+ * order to make forward progress. With this great power comes great
+ * responsibility to not exhaust those reserves.
+ */
+#define KSWAPD_PF_FLAGS (PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD)
+
+static inline unsigned long become_kswapd(void)
+{
+    unsigned long flags = current->flags & KSWAPD_PF_FLAGS;
+
+    current->flags |= KSWAPD_PF_FLAGS;
+
+    return flags;
+}
+
+static inline void restore_kswapd(unsigned long flags)
+{
+    current->flags &= ~(flags ^ KSWAPD_PF_FLAGS);
+}
+
 #ifdef CONFIG_MEMCG
 DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
 /**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1b8f0e0..77bc1dd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3869,19 +3869,7 @@ static int kswapd(void *p)
     if (!cpumask_empty(cpumask))
         set_cpus_allowed_ptr(tsk, cpumask);
 
-    /*
-     * Tell the memory management that we're a "memory allocator",
-     * and that if we need more memory we should get access to it
-     * regardless (see "__alloc_pages()"). "kswapd" should
-     * never get caught in the normal page freeing logic.
-     *
-     * (Kswapd normally doesn't need memory anyway, but sometimes
-     * you need a small amount of memory in order to be able to
-     * page out something else, and this flag essentially protects
-     * us from recursively trying to free more memory as we're
-     * trying to free the first piece of memory in the first place).
-     */
-    tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+    become_kswapd();
     set_freezable();
 
     WRITE_ONCE(pgdat->kswapd_order, 0);
@@ -3931,8 +3919,6 @@ static int kswapd(void *p)
         goto kswapd_try_sleep;
     }
 
-    tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
-
     return 0;
 }
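[Editor's note: the shape the btree-split worker takes after this patch is a plain save/work/restore bracket, with the restores unwound in the reverse order of the saves. Below is a standalone sketch of that bracketing -- not the real kernel code: the helpers are modelled on a global flags word, memalloc_nofs_save()/restore() are stubbed with the same save-and-set pattern, and do_split_work() is a placeholder for __xfs_btree_split().]

```c
/* Standalone sketch of the worker's save/do-work/restore bracketing. */
#include <stdio.h>

#define PF_MEMALLOC      0x1    /* illustrative values, not the real PF_* bits */
#define PF_SWAPWRITE     0x2
#define PF_KSWAPD        0x4
#define PF_MEMALLOC_NOFS 0x8
#define KSWAPD_PF_FLAGS  (PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD)

static unsigned long task_flags;        /* stands in for current->flags */

static unsigned long become_kswapd(void)
{
    unsigned long flags = task_flags & KSWAPD_PF_FLAGS;

    task_flags |= KSWAPD_PF_FLAGS;
    return flags;
}

static void restore_kswapd(unsigned long flags)
{
    task_flags &= ~(flags ^ KSWAPD_PF_FLAGS);
}

static unsigned int memalloc_nofs_save(void)
{
    unsigned int flags = task_flags & PF_MEMALLOC_NOFS;

    task_flags |= PF_MEMALLOC_NOFS;
    return flags;
}

static void memalloc_nofs_restore(unsigned int flags)
{
    task_flags = (task_flags & ~PF_MEMALLOC_NOFS) | flags;
}

static void do_split_work(void)
{
    /* Placeholder for the __xfs_btree_split() call. */
    printf("split running with flags %#lx\n", task_flags);
}

static void split_worker(int is_kswapd)
{
    unsigned long pflags = 0;
    unsigned int memalloc_nofs;

    if (is_kswapd)
        pflags = become_kswapd();       /* inherit reclaim powers */
    memalloc_nofs = memalloc_nofs_save();       /* no fs recursion from reclaim */

    do_split_work();

    /* Unwind in the reverse order of the saves. */
    memalloc_nofs_restore(memalloc_nofs);
    if (is_kswapd)
        restore_kswapd(pflags);
}

int main(void)
{
    split_worker(1);
    printf("flags after worker: %#lx\n", task_flags);  /* back to 0 */
    return 0;
}
```

The bracket leaves the task flags exactly as it found them, which is why the worker can be queued both from kswapd context and from regular task context.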