diff mbox series

[3/3] mm/vmscan: replace implicit RECLAIM_ZONE checks with explicit checks

Message ID 20200701152627.8761147E@viggo.jf.intel.com (mailing list archive)
State New, archived
Headers show
Series Repair and clean up vm.zone_reclaim_mode sysctl ABI | expand

Commit Message

Dave Hansen July 1, 2020, 3:26 p.m. UTC
From: Dave Hansen <dave.hansen@linux.intel.com>

RECLAIM_ZONE was assumed to be unused because it was never explicitly
used in the kernel.  However, there were a number of places where it
was checked implicitly by checking 'node_reclaim_mode' for a zero
value.

These zero checks are not great because it is not obvious what a zero
mode *means* in the code.  Replace them with a helper which makes it
more obvious: node_reclaim_enabled().

This helper also provides a handy place to explicitly check the
RECLAIM_ZONE bit itself.  Check it explicitly there to make it more
obvious where the bit can affect behavior.

This should have no functional impact.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ben Widawsky <ben.widawsky@intel.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Daniel Wagner <dwagner@suse.de>
Cc: "Tobin C. Harding" <tobin@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Qian Cai <cai@lca.pw>
Cc: Daniel Wagner <dwagner@suse.de>

--

Note: This is not cc'd to stable.  It does not fix any bugs.
---

 b/include/linux/swap.h |    7 +++++++
 b/mm/khugepaged.c      |    2 +-
 b/mm/page_alloc.c      |    2 +-
 3 files changed, 9 insertions(+), 2 deletions(-)

Comments

David Rientjes July 1, 2020, 8:03 p.m. UTC | #1
On Wed, 1 Jul 2020, Dave Hansen wrote:

> diff -puN include/linux/swap.h~mm-vmscan-node_reclaim_mode_helper include/linux/swap.h
> --- a/include/linux/swap.h~mm-vmscan-node_reclaim_mode_helper	2020-07-01 08:22:13.650955330 -0700
> +++ b/include/linux/swap.h	2020-07-01 08:22:13.659955330 -0700
> @@ -12,6 +12,7 @@
>  #include <linux/fs.h>
>  #include <linux/atomic.h>
>  #include <linux/page-flags.h>
> +#include <uapi/linux/mempolicy.h>
>  #include <asm/page.h>
>  
>  struct notifier_block;
> @@ -374,6 +375,12 @@ extern int sysctl_min_slab_ratio;
>  #define node_reclaim_mode 0
>  #endif
>  
> +static inline bool node_reclaim_enabled(void)
> +{
> +	/* Is any node_reclaim_mode bit set? */
> +	return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
> +}
> +
>  extern void check_move_unevictable_pages(struct pagevec *pvec);
>  
>  extern int kswapd_run(int nid);

If a user writes a bit that isn't a RECLAIM_* bit to vm.zone_reclaim_mode 
today, it acts as though RECLAIM_ZONE is enabled: we try to reclaim in 
zonelist order before falling back to the next zone in the page allocator.  
The sysctl doesn't enforce any max value :/  I don't know if there is any
such user, but this would break them if there is.

Should this simply be return !!node_reclaim_mode?
Ben Widawsky July 1, 2020, 8:04 p.m. UTC | #2
On 20-07-01 13:03:01, David Rientjes wrote:
> On Wed, 1 Jul 2020, Dave Hansen wrote:
> 
> > diff -puN include/linux/swap.h~mm-vmscan-node_reclaim_mode_helper include/linux/swap.h
> > --- a/include/linux/swap.h~mm-vmscan-node_reclaim_mode_helper	2020-07-01 08:22:13.650955330 -0700
> > +++ b/include/linux/swap.h	2020-07-01 08:22:13.659955330 -0700
> > @@ -12,6 +12,7 @@
> >  #include <linux/fs.h>
> >  #include <linux/atomic.h>
> >  #include <linux/page-flags.h>
> > +#include <uapi/linux/mempolicy.h>
> >  #include <asm/page.h>
> >  
> >  struct notifier_block;
> > @@ -374,6 +375,12 @@ extern int sysctl_min_slab_ratio;
> >  #define node_reclaim_mode 0
> >  #endif
> >  
> > +static inline bool node_reclaim_enabled(void)
> > +{
> > +	/* Is any node_reclaim_mode bit set? */
> > +	return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
> > +}
> > +
> >  extern void check_move_unevictable_pages(struct pagevec *pvec);
> >  
> >  extern int kswapd_run(int nid);
> 
> If a user writes a bit that isn't a RECLAIM_* bit to vm.zone_reclaim_mode 
> today, it acts as though RECLAIM_ZONE is enabled: we try to reclaim in 
> zonelist order before falling back to the next zone in the page allocator.  
> The sysctl doesn't enforce any max value :/  I don't know if there is any
> such user, but this would break them if there is.
> 
> Should this simply be return !!node_reclaim_mode?
> 

I don't think so because I don't think anything else validates the unused bits
remain unused.
Dave Hansen July 1, 2020, 9:29 p.m. UTC | #3
On 7/1/20 1:04 PM, Ben Widawsky wrote:
>> +static inline bool node_reclaim_enabled(void)
>> +{
>> +	/* Is any node_reclaim_mode bit set? */
>> +	return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
>> +}
>> +
>>  extern void check_move_unevictable_pages(struct pagevec *pvec);
>>  
>>  extern int kswapd_run(int nid);
> If a user writes a bit that isn't a RECLAIM_* bit to vm.zone_reclaim_mode 
> today, it acts as though RECLAIM_ZONE is enabled: we try to reclaim in 
> zonelist order before falling back to the next zone in the page allocator.  
> The sysctl doesn't enforce any max value :/  I dont know if there is any 
> such user, but this would break them if there is.
> 
> Should this simply be return !!node_reclaim_mode?

You're right that there _could_ be a user-visible behavior change here.
 But, if there were a change it would be for a bit which wasn't even
mentioned in the documentation.  Somebody would have had to look at the
doc mentioning 1,2,4 and written an 8.  If they did that, they're asking
for trouble because we could have defined the '8' bit to do nasty things
like auto-demote all your memory. :)

I'll mention it in the changelog, but I still think we should check the
actual, known bits rather than check for 0.

BTW, in the hardware, they almost invariably make unused bits "reserved"
and do mean things like #GP if someone tries to set them.  This is a
case where the kernel probably should have done the same.  It would have
saved us the trouble of asking these questions now.  Maybe we should
even do that going forward.
David Rientjes July 1, 2020, 10:01 p.m. UTC | #4
On Wed, 1 Jul 2020, Dave Hansen wrote:

> On 7/1/20 1:04 PM, Ben Widawsky wrote:
> >> +static inline bool node_reclaim_enabled(void)
> >> +{
> >> +	/* Is any node_reclaim_mode bit set? */
> >> +	return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
> >> +}
> >> +
> >>  extern void check_move_unevictable_pages(struct pagevec *pvec);
> >>  
> >>  extern int kswapd_run(int nid);
> > If a user writes a bit that isn't a RECLAIM_* bit to vm.zone_reclaim_mode 
> > today, it acts as though RECLAIM_ZONE is enabled: we try to reclaim in 
> > zonelist order before falling back to the next zone in the page allocator.  
> > The sysctl doesn't enforce any max value :/  I don't know if there is any
> > such user, but this would break them if there is.
> > 
> > Should this simply be return !!node_reclaim_mode?
> 
> You're right that there _could_ be a user-visible behavior change here.
>  But, if there were a change it would be for a bit which wasn't even
> mentioned in the documentation.  Somebody would have had to look at the
> doc mentioning 1,2,4 and written an 8.  If they did that, they're asking
> for trouble because we could have defined the '8' bit to do nasty things
> like auto-demote all your memory. :)
> 
> I'll mention it in the changelog, but I still think we should check the
> actual, known bits rather than check for 0.
> 
> BTW, in the hardware, they almost invariably make unused bits "reserved"
> and do mean things like #GP if someone tries to set them.  This is a
> case where the kernel probably should have done the same.  It would have
> saved us the trouble of asking these questions now.  Maybe we should
> even do that going forward.
> 

Maybe enforce it in a sysctl handler so the user catches any errors, which 
would be better than silently accepting some policy that doesn't exist?

RECLAIM_UNMAP and/or RECLAIM_WRITE should likely get -EINVAL if attempted 
to be set without RECLAIM_ZONE as well: they are no-ops without 
RECLAIM_ZONE.  This would likely have caught something wrong with commit 
648b5cf368e0 ("mm/vmscan: remove unused RECLAIM_OFF/RECLAIM_ZONE") if it 
would have already been in place.

I don't feel strongly about this, so feel free to ignore.
diff mbox series

Patch

diff -puN include/linux/swap.h~mm-vmscan-node_reclaim_mode_helper include/linux/swap.h
--- a/include/linux/swap.h~mm-vmscan-node_reclaim_mode_helper	2020-07-01 08:22:13.650955330 -0700
+++ b/include/linux/swap.h	2020-07-01 08:22:13.659955330 -0700
@@ -12,6 +12,7 @@ 
 #include <linux/fs.h>
 #include <linux/atomic.h>
 #include <linux/page-flags.h>
+#include <uapi/linux/mempolicy.h>
 #include <asm/page.h>
 
 struct notifier_block;
@@ -374,6 +375,12 @@  extern int sysctl_min_slab_ratio;
 #define node_reclaim_mode 0
 #endif
 
+static inline bool node_reclaim_enabled(void)
+{
+	/* Is any node_reclaim_mode bit set? */
+	return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
+}
+
 extern void check_move_unevictable_pages(struct pagevec *pvec);
 
 extern int kswapd_run(int nid);
diff -puN mm/khugepaged.c~mm-vmscan-node_reclaim_mode_helper mm/khugepaged.c
--- a/mm/khugepaged.c~mm-vmscan-node_reclaim_mode_helper	2020-07-01 08:22:13.652955330 -0700
+++ b/mm/khugepaged.c	2020-07-01 08:22:13.660955330 -0700
@@ -709,7 +709,7 @@  static bool khugepaged_scan_abort(int ni
 	 * If node_reclaim_mode is disabled, then no extra effort is made to
 	 * allocate memory locally.
 	 */
-	if (!node_reclaim_mode)
+	if (!node_reclaim_enabled())
 		return false;
 
 	/* If there is a count for this node already, it must be acceptable */
diff -puN mm/page_alloc.c~mm-vmscan-node_reclaim_mode_helper mm/page_alloc.c
--- a/mm/page_alloc.c~mm-vmscan-node_reclaim_mode_helper	2020-07-01 08:22:13.655955330 -0700
+++ b/mm/page_alloc.c	2020-07-01 08:22:13.662955330 -0700
@@ -3733,7 +3733,7 @@  retry:
 			if (alloc_flags & ALLOC_NO_WATERMARKS)
 				goto try_this_zone;
 
-			if (node_reclaim_mode == 0 ||
+			if (!node_reclaim_enabled() ||
 			    !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
 				continue;