diff mbox series

mm: always consider THP when adjusting min_free_kbytes

Message ID 20200204194156.61672-1-mike.kravetz@oracle.com (mailing list archive)
State New, archived
Headers show
Series mm: always consider THP when adjusting min_free_kbytes | expand

Commit Message

Mike Kravetz Feb. 4, 2020, 7:41 p.m. UTC
At system initialization time, min_free_kbytes is calculated based
on the amount of memory in the system.  If THP is enabled, then
khugepaged is started and min_free_kbytes may be adjusted in an
attempt to reserve some pageblocks for THP allocations.

When memory is offlined or onlined, min_free_kbytes is recalculated
and adjusted based on the amount of memory.  However, the adjustment
for THP is not considered.  Here is an example from a 2 node system
with 8GB of memory.

 # cat /proc/sys/vm/min_free_kbytes
 90112
 # echo 0 > /sys/devices/system/node/node1/memory56/online
 # cat /proc/sys/vm/min_free_kbytes
 11243
 # echo 1 > /sys/devices/system/node/node1/memory56/online
 # cat /proc/sys/vm/min_free_kbytes
 11412

One would expect that min_free_kbytes would return to it's original
value after the offline/online operations.

Create a simple interface for THP/khugepaged based adjustment and
call this whenever min_free_kbytes is adjusted.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 include/linux/khugepaged.h |  5 +++++
 mm/khugepaged.c            | 35 ++++++++++++++++++++++++++++++-----
 mm/page_alloc.c            |  4 +++-
 3 files changed, 38 insertions(+), 6 deletions(-)

Comments

David Rientjes Feb. 4, 2020, 8:33 p.m. UTC | #1
On Tue, 4 Feb 2020, Mike Kravetz wrote:

> At system initialization time, min_free_kbytes is calculated based
> on the amount of memory in the system.  If THP is enabled, then
> khugepaged is started and min_free_kbytes may be adjusted in an
> attempt to reserve some pageblocks for THP allocations.
> 
> When memory is offlined or onlined, min_free_kbytes is recalculated
> and adjusted based on the amount of memory.  However, the adjustment
> for THP is not considered.  Here is an example from a 2 node system
> with 8GB of memory.
> 
>  # cat /proc/sys/vm/min_free_kbytes
>  90112
>  # echo 0 > /sys/devices/system/node/node1/memory56/online
>  # cat /proc/sys/vm/min_free_kbytes
>  11243
>  # echo 1 > /sys/devices/system/node/node1/memory56/online
>  # cat /proc/sys/vm/min_free_kbytes
>  11412
> 

Ah, that doesn't look good.

> One would expect that min_free_kbytes would return to it's original
> value after the offline/online operations.
> 
> Create a simple interface for THP/khugepaged based adjustment and
> call this whenever min_free_kbytes is adjusted.
> 
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>  include/linux/khugepaged.h |  5 +++++
>  mm/khugepaged.c            | 35 ++++++++++++++++++++++++++++++-----
>  mm/page_alloc.c            |  4 +++-
>  3 files changed, 38 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> index bc45ea1efbf7..8f02d3575829 100644
> --- a/include/linux/khugepaged.h
> +++ b/include/linux/khugepaged.h
> @@ -15,6 +15,7 @@ extern int __khugepaged_enter(struct mm_struct *mm);
>  extern void __khugepaged_exit(struct mm_struct *mm);
>  extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
>  				      unsigned long vm_flags);
> +extern bool khugepaged_adjust_min_free_kbytes(void);
>  #ifdef CONFIG_SHMEM
>  extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr);
>  #else
> @@ -81,6 +82,10 @@ static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
>  {
>  	return 0;
>  }
> +static bool khugepaged_adjust_min_free_kbytes(void)
> +{
> +	return false;
> +}
>  static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
>  					   unsigned long addr)
>  {
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index b679908743cb..d8040cf19e98 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -2138,7 +2138,7 @@ static int khugepaged(void *none)
>  	return 0;
>  }
>  
> -static void set_recommended_min_free_kbytes(void)
> +bool __khugepaged_adjust_min_free_kbytes(void)
>  {
>  	struct zone *zone;
>  	int nr_zones = 0;
> @@ -2174,17 +2174,26 @@ static void set_recommended_min_free_kbytes(void)
>  
>  	if (recommended_min > min_free_kbytes) {
>  		if (user_min_free_kbytes >= 0)
> -			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
> +			pr_info_once("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
>  				min_free_kbytes, recommended_min);
>  
>  		min_free_kbytes = recommended_min;
> +		return true;
>  	}
> -	setup_per_zone_wmarks();
> +
> +	return false;
> +}
> +
> +static void set_recommended_min_free_kbytes(void)
> +{
> +	if (__khugepaged_adjust_min_free_kbytes())
> +		setup_per_zone_wmarks();
>  }
>  
> -int start_stop_khugepaged(void)
> +static struct task_struct *khugepaged_thread __read_mostly;
> +
> +int __ref start_stop_khugepaged(void)
>  {
> -	static struct task_struct *khugepaged_thread __read_mostly;
>  	static DEFINE_MUTEX(khugepaged_mutex);
>  	int err = 0;
>  
> @@ -2207,8 +2216,24 @@ int start_stop_khugepaged(void)
>  	} else if (khugepaged_thread) {
>  		kthread_stop(khugepaged_thread);
>  		khugepaged_thread = NULL;
> +		init_per_zone_wmark_min();
>  	}
>  fail:
>  	mutex_unlock(&khugepaged_mutex);
>  	return err;
>  }
> +
> +bool khugepaged_adjust_min_free_kbytes(void)
> +{
> +	bool ret = false;
> +
> +	/*
> +	 * This is a bit racy, and we could miss transitions.  However,
> +	 * start/stop code above will make additional adjustments at the
> +	 * end of transitions.
> +	 */
> +	if (khugepaged_enabled() && khugepaged_thread)
> +		ret = __khugepaged_adjust_min_free_kbytes();
> +
> +	return ret;
> +}
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index d047bf7d8fd4..a7b3a6663ba6 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -68,6 +68,7 @@
>  #include <linux/lockdep.h>
>  #include <linux/nmi.h>
>  #include <linux/psi.h>
> +#include <linux/khugepaged.h>
>  
>  #include <asm/sections.h>
>  #include <asm/tlbflush.h>
> @@ -7827,9 +7828,10 @@ int __meminit init_per_zone_wmark_min(void)
>  		if (min_free_kbytes > 65536)
>  			min_free_kbytes = 65536;
>  	} else {
> -		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
> +		pr_warn_once("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
>  				new_min_free_kbytes, user_min_free_kbytes);
>  	}
> +	(void)khugepaged_adjust_min_free_kbytes();
>  	setup_per_zone_wmarks();
>  	refresh_zone_stat_thresholds();
>  	setup_per_zone_lowmem_reserve();

Hmm, if khugepaged_adjust_min_free_kbytes() increases min_free_kbytes for 
thp, then the user has no ability to override this increase by using 
vm.min_free_kbytes?

IIUC, with this change, it looks like memory hotplug events properly 
increase min_free_kbytes for thp optimization but also doesn't respect a 
previous user-defined value?

So it looks like this is fixing an obvious correctness issue but also now 
requires users to rewrite the sysctl if they want to decrease the min 
watermark.
Mike Kravetz Feb. 4, 2020, 9:42 p.m. UTC | #2
On 2/4/20 12:33 PM, David Rientjes wrote:
> On Tue, 4 Feb 2020, Mike Kravetz wrote:
> 
> Hmm, if khugepaged_adjust_min_free_kbytes() increases min_free_kbytes for 
> thp, then the user has no ability to override this increase by using 
> vm.min_free_kbytes?
> 
> IIUC, with this change, it looks like memory hotplug events properly 
> increase min_free_kbytes for thp optimization but also doesn't respect a 
> previous user-defined value?

Good catch.

We should only call khugepaged_adjust_min_free_kbytes from the 'true'
block of this if statement in init_per_zone_wmark_min.

	if (new_min_free_kbytes > user_min_free_kbytes) {
		min_free_kbytes = new_min_free_kbytes;
		if (min_free_kbytes < 128)
			min_free_kbytes = 128;
		if (min_free_kbytes > 65536)
			min_free_kbytes = 65536;
	} else {
		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
				new_min_free_kbytes, user_min_free_kbytes);
	}

In the existing code, a hotplug event will cause min_free_kbytes to overwrite
the user defined value if the new value is greater.  However, you will get
the warning message if the user defined value is greater.  I am not sure if
this is the 'desired/expected' behavior?  We print a warning if the user value
takes precedence over our calculated value.  However, we do not print a message
if we overwrite the user defined value.  That doesn't seem right!

> So it looks like this is fixing an obvious correctness issue but also now 
> requires users to rewrite the sysctl if they want to decrease the min 
> watermark.

Moving the call to khugepaged_adjust_min_free_kbytes as described above
would avoid the THP adjustment unless we were going to overwrite the
user defined value.  Now, I am not sure overwriting the user defined value
as is done today is actually the correct thing to do.

Thoughts?
Perhaps we should never overwrite a user defined value?
Matthew Wilcox Feb. 4, 2020, 9:53 p.m. UTC | #3
On Tue, Feb 04, 2020 at 01:42:43PM -0800, Mike Kravetz wrote:
> On 2/4/20 12:33 PM, David Rientjes wrote:
> > On Tue, 4 Feb 2020, Mike Kravetz wrote:
> > 
> > Hmm, if khugepaged_adjust_min_free_kbytes() increases min_free_kbytes for 
> > thp, then the user has no ability to override this increase by using 
> > vm.min_free_kbytes?
> > 
> > IIUC, with this change, it looks like memory hotplug events properly 
> > increase min_free_kbytes for thp optimization but also doesn't respect a 
> > previous user-defined value?
> 
> Good catch.
> 
> We should only call khugepaged_adjust_min_free_kbytes from the 'true'
> block of this if statement in init_per_zone_wmark_min.
> 
> 	if (new_min_free_kbytes > user_min_free_kbytes) {
> 		min_free_kbytes = new_min_free_kbytes;
> 		if (min_free_kbytes < 128)
> 			min_free_kbytes = 128;
> 		if (min_free_kbytes > 65536)
> 			min_free_kbytes = 65536;
> 	} else {
> 		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
> 				new_min_free_kbytes, user_min_free_kbytes);
> 	}
> 
> In the existing code, a hotplug event will cause min_free_kbytes to overwrite
> the user defined value if the new value is greater.  However, you will get
> the warning message if the user defined value is greater.  I am not sure if
> this is the 'desired/expected' behavior?  We print a warning if the user value
> takes precedence over our calculated value.  However, we do not print a message
> if we overwrite the user defined value.  That doesn't seem right!
> 
> > So it looks like this is fixing an obvious correctness issue but also now 
> > requires users to rewrite the sysctl if they want to decrease the min 
> > watermark.
> 
> Moving the call to khugepaged_adjust_min_free_kbytes as described above
> would avoid the THP adjustment unless we were going to overwrite the
> user defined value.  Now, I am not sure overwriting the user defined value
> as is done today is actually the correct thing to do.
> 
> Thoughts?
> Perhaps we should never overwrite a user defined value?

We should certainly warn if we would have adjusted it, had they not
changed it!

I'm reluctant to suggest we do a more complex adjustment of the value
(eg figure out what the adjustment would have been, then apply some
fraction of that adjustment to keep the ratios in proportion) because
we don't really know why they adjusted it.

OTOH, we should adjust it if the user-set min_free_kbytes is now too
large for the amount of memory now in the machine.
Khalid Aziz Feb. 4, 2020, 11:37 p.m. UTC | #4
On Tue, 2020-02-04 at 13:42 -0800, Mike Kravetz wrote:
> On 2/4/20 12:33 PM, David Rientjes wrote:
> > 
> > So it looks like this is fixing an obvious correctness issue but
> > also now 
> > requires users to rewrite the sysctl if they want to decrease the
> > min 
> > watermark.
> 
> Moving the call to khugepaged_adjust_min_free_kbytes as described
> above
> would avoid the THP adjustment unless we were going to overwrite the
> user defined value.  Now, I am not sure overwriting the user defined
> value
> as is done today is actually the correct thing to do.
> 
> Thoughts?
> Perhaps we should never overwrite a user defined value?

We might need to override user defined value if it is too low but
overriding it silently is not quite right. We should print a warning
at least. On the other hand, a user setting min_free_kbytes should know
what they are doing and if they set it too low, they have been warned
in the sysctl documentation. I would say we never override user defined
value but print a warning if the value is too low and kernel would have
adjusted it if it were not for the user defined value.

--
Khalid
Mike Kravetz Feb. 5, 2020, 12:33 a.m. UTC | #5
On 2/4/20 1:53 PM, Matthew Wilcox wrote:
> On Tue, Feb 04, 2020 at 01:42:43PM -0800, Mike Kravetz wrote:
>> On 2/4/20 12:33 PM, David Rientjes wrote:
>>> On Tue, 4 Feb 2020, Mike Kravetz wrote:
>>>
>>> Hmm, if khugepaged_adjust_min_free_kbytes() increases min_free_kbytes for 
>>> thp, then the user has no ability to override this increase by using 
>>> vm.min_free_kbytes?
>>>
>>> IIUC, with this change, it looks like memory hotplug events properly 
>>> increase min_free_kbytes for thp optimization but also doesn't respect a 
>>> previous user-defined value?
>>
>> Good catch.
>>
>> We should only call khugepaged_adjust_min_free_kbytes from the 'true'
>> block of this if statement in init_per_zone_wmark_min.
>>
>> 	if (new_min_free_kbytes > user_min_free_kbytes) {
>> 		min_free_kbytes = new_min_free_kbytes;
>> 		if (min_free_kbytes < 128)
>> 			min_free_kbytes = 128;
>> 		if (min_free_kbytes > 65536)
>> 			min_free_kbytes = 65536;
>> 	} else {
>> 		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
>> 				new_min_free_kbytes, user_min_free_kbytes);
>> 	}
>>
>> In the existing code, a hotplug event will cause min_free_kbytes to overwrite
>> the user defined value if the new value is greater.  However, you will get
>> the warning message if the user defined value is greater.  I am not sure if
>> this is the 'desired/expected' behavior?  We print a warning if the user value
>> takes precedence over our calculated value.  However, we do not print a message
>> if we overwrite the user defined value.  That doesn't seem right!
>>
>>> So it looks like this is fixing an obvious correctness issue but also now 
>>> requires users to rewrite the sysctl if they want to decrease the min 
>>> watermark.
>>
>> Moving the call to khugepaged_adjust_min_free_kbytes as described above
>> would avoid the THP adjustment unless we were going to overwrite the
>> user defined value.  Now, I am not sure overwriting the user defined value
>> as is done today is actually the correct thing to do.
>>
>> Thoughts?
>> Perhaps we should never overwrite a user defined value?
> 
> We should certainly warn if we would have adjusted it, had they not
> changed it!

Ok, the code above does that today.

> I'm reluctant to suggest we do a more complex adjustment of the value
> (eg figure out what the adjustment would have been, then apply some
> fraction of that adjustment to keep the ratios in proportion) because
> we don't really know why they adjusted it.

Agree!

> OTOH, we should adjust it if the user-set min_free_kbytes is now too
> large for the amount of memory now in the machine.

Today, we never overwrite a user defined value that is larger than
that calculated by the code.  However, we will owerwrite a user defined
value if the code calculates a larger value.

I'm starting to think the best option is to NEVER overwrite a user defined
value.
Mike Kravetz Feb. 6, 2020, 1:36 a.m. UTC | #6
On 2/4/20 4:33 PM, Mike Kravetz wrote:
> Today, we never overwrite a user defined value that is larger than
> that calculated by the code.  However, we will owerwrite a user defined
> value if the code calculates a larger value.

If we overwrite a user defined value, we do not inform the user.

> I'm starting to think the best option is to NEVER overwrite a user defined
> value.

Below is an updated patch which never overwrites a user defined value.
We would only potentially overwrite a user value on a memory offline/online
or khugepaged start/stop operation.  Since no validation of user defined
values is performed, it seems the desired behavior is to never overwrite them?  

If user has specified a value and we calculate a value, a message containing
both values is logged.

I am not aware of anyone complaining about current behavior.  While working
a customer issue and playing with min_free_kbytes, I noticed the behavior
and thought it does not do what one would expect.

From: Mike Kravetz <mike.kravetz@oracle.com>
Subject: [PATCH v2] mm: Don't overwrite user min_free_kbytes, consider THP
 when adjusting

The value of min_free_kbytes is calculated in two routines:
1) init_per_zone_wmark_min based on available memory
2) set_recommended_min_free_kbytes may reserve extra space for
   THP allocations

In both of these routines, a user defined min_free_kbytes value will
be overwritten if the value calculated in the code is larger. No message
is logged if the user value is overwritten.

Change code to never overwrite user defined value.  However, do log a
message (once per value) showing the value calculated in code.

At system initialization time, both init_per_zone_wmark_min and
set_recommended_min_free_kbytes are called to set the initial value
for min_free_kbytes.  When memory is offlined or onlined, min_free_kbytes
is recalculated and adjusted based on the amount of memory.  However,
the adjustment for THP is not considered.  Here is an example from a 2
node system with 8GB of memory.

 # cat /proc/sys/vm/min_free_kbytes
 90112
 # echo 0 > /sys/devices/system/node/node1/memory56/online
 # cat /proc/sys/vm/min_free_kbytes
 11243
 # echo 1 > /sys/devices/system/node/node1/memory56/online
 # cat /proc/sys/vm/min_free_kbytes
 11412

One would expect that min_free_kbytes would return to it's original
value after the offline/online operations.

Create a simple interface for THP/khugepaged based adjustment and
call this whenever min_free_kbytes is adjusted.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 include/linux/khugepaged.h |  5 ++++
 mm/internal.h              |  2 ++
 mm/khugepaged.c            | 56 ++++++++++++++++++++++++++++++++------
 mm/page_alloc.c            | 35 ++++++++++++++++--------
 4 files changed, 78 insertions(+), 20 deletions(-)

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index bc45ea1efbf7..faa92923dbe5 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -15,6 +15,7 @@ extern int __khugepaged_enter(struct mm_struct *mm);
 extern void __khugepaged_exit(struct mm_struct *mm);
 extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
 				      unsigned long vm_flags);
+extern int khugepaged_adjust_min_free_kbytes(int mfk_value);
 #ifdef CONFIG_SHMEM
 extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr);
 #else
@@ -81,6 +82,10 @@ static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
 {
 	return 0;
 }
+static inline int khugepaged_adjust_min_free_kbytes(int mfk_value)
+{
+	return 0;
+}
 static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
 					   unsigned long addr)
 {
diff --git a/mm/internal.h b/mm/internal.h
index 3cf20ab3ca01..57bbc157124e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -164,6 +164,8 @@ extern void prep_compound_page(struct page *page, unsigned int order);
 extern void post_alloc_hook(struct page *page, unsigned int order,
 					gfp_t gfp_flags);
 extern int user_min_free_kbytes;
+#define UNSET_USER_MFK_VALUE -1
+extern int calc_min_free_kbytes;
 
 extern void zone_pcp_update(struct zone *zone);
 extern void zone_pcp_reset(struct zone *zone);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b679908743cb..6af72cb7e337 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2138,8 +2138,9 @@ static int khugepaged(void *none)
 	return 0;
 }
 
-static void set_recommended_min_free_kbytes(void)
+int __khugepaged_adjust_min_free_kbytes(int mfk_value)
 {
+	int ret = 0;
 	struct zone *zone;
 	int nr_zones = 0;
 	unsigned long recommended_min;
@@ -2172,19 +2173,40 @@ static void set_recommended_min_free_kbytes(void)
 			      (unsigned long) nr_free_buffer_pages() / 20);
 	recommended_min <<= (PAGE_SHIFT-10);
 
-	if (recommended_min > min_free_kbytes) {
-		if (user_min_free_kbytes >= 0)
-			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
-				min_free_kbytes, recommended_min);
+	if (recommended_min > mfk_value)
+		ret = (int)recommended_min - mfk_value;
+
+	return ret;
+}
 
-		min_free_kbytes = recommended_min;
+static void set_recommended_min_free_kbytes(void)
+{
+	int av = __khugepaged_adjust_min_free_kbytes(min_free_kbytes);
+
+	if (av) {
+		if (user_min_free_kbytes != UNSET_USER_MFK_VALUE) {
+			/* Do not change user defined value. */
+			if ((min_free_kbytes + av) != calc_min_free_kbytes) {
+				/*
+				 * Save calculated value so we only print
+				 * warning once per value.
+				 */
+				calc_min_free_kbytes = min_free_kbytes + av;
+				pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
+					calc_min_free_kbytes,
+					user_min_free_kbytes);
+			}
+		} else {
+			min_free_kbytes += av;
+			setup_per_zone_wmarks();
+		}
 	}
-	setup_per_zone_wmarks();
 }
 
-int start_stop_khugepaged(void)
+static struct task_struct *khugepaged_thread __read_mostly;
+
+int __ref start_stop_khugepaged(void)
 {
-	static struct task_struct *khugepaged_thread __read_mostly;
 	static DEFINE_MUTEX(khugepaged_mutex);
 	int err = 0;
 
@@ -2207,8 +2229,24 @@ int start_stop_khugepaged(void)
 	} else if (khugepaged_thread) {
 		kthread_stop(khugepaged_thread);
 		khugepaged_thread = NULL;
+		init_per_zone_wmark_min();
 	}
 fail:
 	mutex_unlock(&khugepaged_mutex);
 	return err;
 }
+
+int khugepaged_adjust_min_free_kbytes(int mfk_value)
+{
+	int ret = 0;
+
+	/*
+	 * This is a bit racy, and we could miss transitions.  However,
+	 * start/stop code above will make additional adjustments at the
+	 * end of transitions.
+	 */
+	if (khugepaged_enabled() && khugepaged_thread)
+		ret = __khugepaged_adjust_min_free_kbytes(mfk_value);
+
+	return ret;
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d047bf7d8fd4..73162a5bf296 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,6 +68,7 @@
 #include <linux/lockdep.h>
 #include <linux/nmi.h>
 #include <linux/psi.h>
+#include <linux/khugepaged.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -314,7 +315,8 @@ compound_page_dtor * const compound_page_dtors[] = {
 };
 
 int min_free_kbytes = 1024;
-int user_min_free_kbytes = -1;
+int user_min_free_kbytes = UNSET_USER_MFK_VALUE;
+int calc_min_free_kbytes = -1;
 #ifdef CONFIG_DISCONTIGMEM
 /*
  * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
@@ -7819,17 +7821,28 @@ int __meminit init_per_zone_wmark_min(void)
 
 	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
 	new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
-
-	if (new_min_free_kbytes > user_min_free_kbytes) {
-		min_free_kbytes = new_min_free_kbytes;
-		if (min_free_kbytes < 128)
-			min_free_kbytes = 128;
-		if (min_free_kbytes > 65536)
-			min_free_kbytes = 65536;
-	} else {
-		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
+	if (new_min_free_kbytes < 128)
+		new_min_free_kbytes = 128;
+	if (new_min_free_kbytes > 65536)
+		new_min_free_kbytes = 65536;
+	new_min_free_kbytes +=
+		khugepaged_adjust_min_free_kbytes(new_min_free_kbytes);
+
+	if (user_min_free_kbytes != UNSET_USER_MFK_VALUE) {
+		/* Do not change user defined value. */
+		if (new_min_free_kbytes != calc_min_free_kbytes) {
+			/*
+			 * Save calculated value so we only print warning
+			 * once per value.
+			 */
+			calc_min_free_kbytes = new_min_free_kbytes;
+			pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
 				new_min_free_kbytes, user_min_free_kbytes);
+		}
+		goto out;
 	}
+
+	min_free_kbytes = new_min_free_kbytes;
 	setup_per_zone_wmarks();
 	refresh_zone_stat_thresholds();
 	setup_per_zone_lowmem_reserve();
@@ -7838,7 +7851,7 @@ int __meminit init_per_zone_wmark_min(void)
 	setup_min_unmapped_ratio();
 	setup_min_slab_ratio();
 #endif
-
+out:
 	return 0;
 }
 core_initcall(init_per_zone_wmark_min)
Khalid Aziz Feb. 6, 2020, 8:09 p.m. UTC | #7
On Wed, 2020-02-05 at 17:36 -0800, Mike Kravetz wrote:
> On 2/4/20 4:33 PM, Mike Kravetz wrote:
> > Today, we never overwrite a user defined value that is larger than
> > that calculated by the code.  However, we will owerwrite a user
> > defined
> > value if the code calculates a larger value.
> 
> If we overwrite a user defined value, we do not inform the user.
> 
> > I'm starting to think the best option is to NEVER overwrite a user
> > defined
> > value.
> 
> Below is an updated patch which never overwrites a user defined
> value.
> We would only potentially overwrite a user value on a memory
> offline/online
> or khugepaged start/stop operation.  Since no validation of user
> defined
> values is performed, it seems the desired behavior is to never
> overwrite them?  
> 
> If user has specified a value and we calculate a value, a message
> containing
> both values is logged.
> 
> I am not aware of anyone complaining about current behavior.  While
> working
> a customer issue and playing with min_free_kbytes, I noticed the
> behavior
> and thought it does not do what one would expect.
> 
> From: Mike Kravetz <mike.kravetz@oracle.com>
> Subject: [PATCH v2] mm: Don't overwrite user min_free_kbytes,
> consider THP
>  when adjusting
> 
> The value of min_free_kbytes is calculated in two routines:
> 1) init_per_zone_wmark_min based on available memory
> 2) set_recommended_min_free_kbytes may reserve extra space for
>    THP allocations
> 
> In both of these routines, a user defined min_free_kbytes value will
> be overwritten if the value calculated in the code is larger. No
> message
> is logged if the user value is overwritten.
> 
> Change code to never overwrite user defined value.  However, do log a
> message (once per value) showing the value calculated in code.
> 
> At system initialization time, both init_per_zone_wmark_min and
> set_recommended_min_free_kbytes are called to set the initial value
> for min_free_kbytes.  When memory is offlined or onlined,
> min_free_kbytes
> is recalculated and adjusted based on the amount of memory.  However,
> the adjustment for THP is not considered.  Here is an example from a
> 2
> node system with 8GB of memory.
> 
>  # cat /proc/sys/vm/min_free_kbytes
>  90112
>  # echo 0 > /sys/devices/system/node/node1/memory56/online
>  # cat /proc/sys/vm/min_free_kbytes
>  11243
>  # echo 1 > /sys/devices/system/node/node1/memory56/online
>  # cat /proc/sys/vm/min_free_kbytes
>  11412
> 
> One would expect that min_free_kbytes would return to it's original
> value after the offline/online operations.
> 
> Create a simple interface for THP/khugepaged based adjustment and
> call this whenever min_free_kbytes is adjusted.
> 
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>  include/linux/khugepaged.h |  5 ++++
>  mm/internal.h              |  2 ++
>  mm/khugepaged.c            | 56 ++++++++++++++++++++++++++++++++--
> ----
>  mm/page_alloc.c            | 35 ++++++++++++++++--------
>  4 files changed, 78 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> index bc45ea1efbf7..faa92923dbe5 100644
> --- a/include/linux/khugepaged.h
> +++ b/include/linux/khugepaged.h
> @@ -15,6 +15,7 @@ extern int __khugepaged_enter(struct mm_struct
> *mm);
>  extern void __khugepaged_exit(struct mm_struct *mm);
>  extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
>  				      unsigned long vm_flags);
> +extern int khugepaged_adjust_min_free_kbytes(int mfk_value);
>  #ifdef CONFIG_SHMEM
>  extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned
> long addr);
>  #else
> @@ -81,6 +82,10 @@ static inline int
> khugepaged_enter_vma_merge(struct vm_area_struct *vma,
>  {
>  	return 0;
>  }
> +static inline int khugepaged_adjust_min_free_kbytes(int mfk_value)
> +{
> +	return 0;
> +}
>  static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
>  					   unsigned long addr)
>  {
> diff --git a/mm/internal.h b/mm/internal.h
> index 3cf20ab3ca01..57bbc157124e 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -164,6 +164,8 @@ extern void prep_compound_page(struct page *page,
> unsigned int order);
>  extern void post_alloc_hook(struct page *page, unsigned int order,
>  					gfp_t gfp_flags);
>  extern int user_min_free_kbytes;
> +#define UNSET_USER_MFK_VALUE -1
> +extern int calc_min_free_kbytes;
>  
>  extern void zone_pcp_update(struct zone *zone);
>  extern void zone_pcp_reset(struct zone *zone);
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index b679908743cb..6af72cb7e337 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -2138,8 +2138,9 @@ static int khugepaged(void *none)
>  	return 0;
>  }
>  
> -static void set_recommended_min_free_kbytes(void)
> +int __khugepaged_adjust_min_free_kbytes(int mfk_value)
>  {
> +	int ret = 0;
>  	struct zone *zone;
>  	int nr_zones = 0;
>  	unsigned long recommended_min;
> @@ -2172,19 +2173,40 @@ static void
> set_recommended_min_free_kbytes(void)
>  			      (unsigned long) nr_free_buffer_pages() /
> 20);
>  	recommended_min <<= (PAGE_SHIFT-10);
>  
> -	if (recommended_min > min_free_kbytes) {
> -		if (user_min_free_kbytes >= 0)
> -			pr_info("raising min_free_kbytes from %d to %lu
> to help transparent hugepage allocations\n",
> -				min_free_kbytes, recommended_min);
> +	if (recommended_min > mfk_value)
> +		ret = (int)recommended_min - mfk_value;
> +
> +	return ret;
> +}
>  
> -		min_free_kbytes = recommended_min;
> +static void set_recommended_min_free_kbytes(void)
> +{
> +	int av = __khugepaged_adjust_min_free_kbytes(min_free_kbytes);
> +
> +	if (av) {
> +		if (user_min_free_kbytes != UNSET_USER_MFK_VALUE) {
> +			/* Do not change user defined value. */
> +			if ((min_free_kbytes + av) !=
> calc_min_free_kbytes) {
> +				/*
> +				 * Save calculated value so we only
> print
> +				 * warning once per value.
> +				 */
> +				calc_min_free_kbytes = min_free_kbytes
> + av;
> +				pr_warn("min_free_kbytes is not updated
> to %d because user defined value %d is preferred\n",
> +					calc_min_free_kbytes,
> +					user_min_free_kbytes);
> +			}
> +		} else {
> +			min_free_kbytes += av;
> +			setup_per_zone_wmarks();
> +		}
>  	}
> -	setup_per_zone_wmarks();
>  }
>  
> -int start_stop_khugepaged(void)
> +static struct task_struct *khugepaged_thread __read_mostly;
> +
> +int __ref start_stop_khugepaged(void)
>  {
> -	static struct task_struct *khugepaged_thread __read_mostly;
>  	static DEFINE_MUTEX(khugepaged_mutex);
>  	int err = 0;
>  
> @@ -2207,8 +2229,24 @@ int start_stop_khugepaged(void)
>  	} else if (khugepaged_thread) {
>  		kthread_stop(khugepaged_thread);
>  		khugepaged_thread = NULL;
> +		init_per_zone_wmark_min();
>  	}
>  fail:
>  	mutex_unlock(&khugepaged_mutex);
>  	return err;
>  }
> +
> +int khugepaged_adjust_min_free_kbytes(int mfk_value)
> +{
> +	int ret = 0;
> +
> +	/*
> +	 * This is a bit racy, and we could miss transitions.  However,
> +	 * start/stop code above will make additional adjustments at
> the
> +	 * end of transitions.
> +	 */
> +	if (khugepaged_enabled() && khugepaged_thread)
> +		ret = __khugepaged_adjust_min_free_kbytes(mfk_value);
> +
> +	return ret;
> +}
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index d047bf7d8fd4..73162a5bf296 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -68,6 +68,7 @@
>  #include <linux/lockdep.h>
>  #include <linux/nmi.h>
>  #include <linux/psi.h>
> +#include <linux/khugepaged.h>
>  
>  #include <asm/sections.h>
>  #include <asm/tlbflush.h>
> @@ -314,7 +315,8 @@ compound_page_dtor * const compound_page_dtors[]
> = {
>  };
>  
>  int min_free_kbytes = 1024;
> -int user_min_free_kbytes = -1;
> +int user_min_free_kbytes = UNSET_USER_MFK_VALUE;
> +int calc_min_free_kbytes = -1;
>  #ifdef CONFIG_DISCONTIGMEM
>  /*
>   * DiscontigMem defines memory ranges as separate pg_data_t even if
> the ranges
> @@ -7819,17 +7821,28 @@ int __meminit init_per_zone_wmark_min(void)
>  
>  	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
>  	new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
> -
> -	if (new_min_free_kbytes > user_min_free_kbytes) {
> -		min_free_kbytes = new_min_free_kbytes;
> -		if (min_free_kbytes < 128)
> -			min_free_kbytes = 128;
> -		if (min_free_kbytes > 65536)
> -			min_free_kbytes = 65536;
> -	} else {
> -		pr_warn("min_free_kbytes is not updated to %d because
> user defined value %d is preferred\n",
> +	if (new_min_free_kbytes < 128)
> +		new_min_free_kbytes = 128;
> +	if (new_min_free_kbytes > 65536)
> +		new_min_free_kbytes = 65536;
> +	new_min_free_kbytes +=
> +		khugepaged_adjust_min_free_kbytes(new_min_free_kbytes);
> +
> +	if (user_min_free_kbytes != UNSET_USER_MFK_VALUE) {
> +		/* Do not change user defined value. */
> +		if (new_min_free_kbytes != calc_min_free_kbytes) {
> +			/*
> +			 * Save calculated value so we only print
> warning
> +			 * once per value.
> +			 */
> +			calc_min_free_kbytes = new_min_free_kbytes;
> +			pr_warn("min_free_kbytes is not updated to %d
> because user defined value %d is preferred\n",
>  				new_min_free_kbytes,
> user_min_free_kbytes);
> +		}
> +		goto out;
>  	}
> +
> +	min_free_kbytes = new_min_free_kbytes;
>  	setup_per_zone_wmarks();
>  	refresh_zone_stat_thresholds();
>  	setup_per_zone_lowmem_reserve();
> @@ -7838,7 +7851,7 @@ int __meminit init_per_zone_wmark_min(void)
>  	setup_min_unmapped_ratio();
>  	setup_min_slab_ratio();
>  #endif
> -
> +out:
>  	return 0;
>  }
>  core_initcall(init_per_zone_wmark_min)

This looks good to me.

Reviewed-by: Khalid Aziz <khalid.aziz@oracle.com>
Matthew Wilcox Feb. 6, 2020, 8:39 p.m. UTC | #8
On Wed, Feb 05, 2020 at 05:36:44PM -0800, Mike Kravetz wrote:
> The value of min_free_kbytes is calculated in two routines:
> 1) init_per_zone_wmark_min based on available memory
> 2) set_recommended_min_free_kbytes may reserve extra space for
>    THP allocations
> 
> In both of these routines, a user defined min_free_kbytes value will
> be overwritten if the value calculated in the code is larger. No message
> is logged if the user value is overwritten.
> 
> Change code to never overwrite user defined value.  However, do log a
> message (once per value) showing the value calculated in code.

But what if the user set min_free_kbytes to, say, half of system memory,
and then hot-unplugs three quarters of their memory?  I think the kernel
should protect itself against such foolishness.
Mike Kravetz Feb. 6, 2020, 9:23 p.m. UTC | #9
On 2/6/20 12:39 PM, Matthew Wilcox wrote:
> On Wed, Feb 05, 2020 at 05:36:44PM -0800, Mike Kravetz wrote:
>> The value of min_free_kbytes is calculated in two routines:
>> 1) init_per_zone_wmark_min based on available memory
>> 2) set_recommended_min_free_kbytes may reserve extra space for
>>    THP allocations
>>
>> In both of these routines, a user defined min_free_kbytes value will
>> be overwritten if the value calculated in the code is larger. No message
>> is logged if the user value is overwritten.
>>
>> Change code to never overwrite user defined value.  However, do log a
>> message (once per value) showing the value calculated in code.
> 
> But what if the user set min_free_kbytes to, say, half of system memory,
> and then hot-unplugs three quarters of their memory?  I think the kernel
> should protect itself against such foolishness.

I'm not sure what we should set it to in this case.  Previously you said,

>> I'm reluctant to suggest we do a more complex adjustment of the value
>> (eg figure out what the adjustment would have been, then apply some
>> fraction of that adjustment to keep the ratios in proportion) because
>> we don't really know why they adjusted it.

So, I suspect you would suggest setting it to the default computed value?
But then, when do we start adjusting?  What if they only remove a small
amount of memory?  And, then add the same amount back in?

BTW - In the above scenario existing code would not change min_free_kbytes
because the user defined value is greater than value computed in code.
Matthew Wilcox Feb. 6, 2020, 9:32 p.m. UTC | #10
On Thu, Feb 06, 2020 at 01:23:21PM -0800, Mike Kravetz wrote:
> On 2/6/20 12:39 PM, Matthew Wilcox wrote:
> > On Wed, Feb 05, 2020 at 05:36:44PM -0800, Mike Kravetz wrote:
> >> The value of min_free_kbytes is calculated in two routines:
> >> 1) init_per_zone_wmark_min based on available memory
> >> 2) set_recommended_min_free_kbytes may reserve extra space for
> >>    THP allocations
> >>
> >> In both of these routines, a user defined min_free_kbytes value will
> >> be overwritten if the value calculated in the code is larger. No message
> >> is logged if the user value is overwritten.
> >>
> >> Change code to never overwrite user defined value.  However, do log a
> >> message (once per value) showing the value calculated in code.
> > 
> > But what if the user set min_free_kbytes to, say, half of system memory,
> > and then hot-unplugs three quarters of their memory?  I think the kernel
> > should protect itself against such foolishness.
> 
> I'm not sure what we should set it to in this case.  Previously you said,
> 
> >> I'm reluctant to suggest we do a more complex adjustment of the value
> >> (eg figure out what the adjustment would have been, then apply some
> >> fraction of that adjustment to keep the ratios in proportion) because
> >> we don't really know why they adjusted it.
> 
> So, I suspect you would suggest setting it to the default computed value?
> But then, when do we start adjusting?  What if they only remove a small
> amount of memory?  And, then add the same amount back in?

I don't know about the default computed value ... we don't seem to have
any protection against the user setting min_free_kbytes to double the
amount of memory in the machine today.  Which would presumably cause
problems if I asked to maintain 32GB free at all times on my 16GB laptop?

Maybe we should have such protection?

> BTW - In the above scenario existing code would not change min_free_kbytes
> because the user defined value is greater than value computed in code.

True!
Mike Kravetz Feb. 10, 2020, 6:58 p.m. UTC | #11
On 2/6/20 1:32 PM, Matthew Wilcox wrote:
> On Thu, Feb 06, 2020 at 01:23:21PM -0800, Mike Kravetz wrote:
>> On 2/6/20 12:39 PM, Matthew Wilcox wrote:
>>> On Wed, Feb 05, 2020 at 05:36:44PM -0800, Mike Kravetz wrote:
>>>> The value of min_free_kbytes is calculated in two routines:
>>>> 1) init_per_zone_wmark_min based on available memory
>>>> 2) set_recommended_min_free_kbytes may reserve extra space for
>>>>    THP allocations
>>>>
>>>> In both of these routines, a user defined min_free_kbytes value will
>>>> be overwritten if the value calculated in the code is larger. No message
>>>> is logged if the user value is overwritten.
>>>>
>>>> Change code to never overwrite user defined value.  However, do log a
>>>> message (once per value) showing the value calculated in code.
>>>
>>> But what if the user set min_free_kbytes to, say, half of system memory,
>>> and then hot-unplugs three quarters of their memory?  I think the kernel
>>> should protect itself against such foolishness.
>>
>> I'm not sure what we should set it to in this case.  Previously you said,
>>
>>>> I'm reluctant to suggest we do a more complex adjustment of the value
>>>> (eg figure out what the adjustment would have been, then apply some
>>>> fraction of that adjustment to keep the ratios in proportion) because
>>>> we don't really know why they adjusted it.
>>
>> So, I suspect you would suggest setting it to the default computed value?
>> But then, when do we start adjusting?  What if they only remove a small
>> amount of memory?  And, then add the same amount back in?
> 
> I don't know about the default computed value ... we don't seem to have
> any protection against the user setting min_free_kbytes to double the
> amount of memory in the machine today.  Which would presumably cause
> problems if I asked to maintain 32GB free at all times on my 16GB laptop?
> 
> Maybe we should have such protection?

I am not going to attempt to define parameters for user defined values
of min_free_kbytes.  Someone smarter than me can take that on.  Documentation
is pretty clear that user is allowed to do bad things which will immediately
cause them problems.

I'm going to explicitly send out the v2 version of patch.  It is not something
I feel strongly about.  Just an attempt to clean up some inconsistencies
observed while looking at something else.
diff mbox series

Patch

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index bc45ea1efbf7..8f02d3575829 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -15,6 +15,7 @@  extern int __khugepaged_enter(struct mm_struct *mm);
 extern void __khugepaged_exit(struct mm_struct *mm);
 extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
 				      unsigned long vm_flags);
+extern bool khugepaged_adjust_min_free_kbytes(void);
 #ifdef CONFIG_SHMEM
 extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr);
 #else
@@ -81,6 +82,10 @@  static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
 {
 	return 0;
 }
+static bool khugepaged_adjust_min_free_kbytes(void)
+{
+	return false;
+}
 static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
 					   unsigned long addr)
 {
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b679908743cb..d8040cf19e98 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2138,7 +2138,7 @@  static int khugepaged(void *none)
 	return 0;
 }
 
-static void set_recommended_min_free_kbytes(void)
+bool __khugepaged_adjust_min_free_kbytes(void)
 {
 	struct zone *zone;
 	int nr_zones = 0;
@@ -2174,17 +2174,26 @@  static void set_recommended_min_free_kbytes(void)
 
 	if (recommended_min > min_free_kbytes) {
 		if (user_min_free_kbytes >= 0)
-			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
+			pr_info_once("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
 				min_free_kbytes, recommended_min);
 
 		min_free_kbytes = recommended_min;
+		return true;
 	}
-	setup_per_zone_wmarks();
+
+	return false;
+}
+
+static void set_recommended_min_free_kbytes(void)
+{
+	if (__khugepaged_adjust_min_free_kbytes())
+		setup_per_zone_wmarks();
 }
 
-int start_stop_khugepaged(void)
+static struct task_struct *khugepaged_thread __read_mostly;
+
+int __ref start_stop_khugepaged(void)
 {
-	static struct task_struct *khugepaged_thread __read_mostly;
 	static DEFINE_MUTEX(khugepaged_mutex);
 	int err = 0;
 
@@ -2207,8 +2216,24 @@  int start_stop_khugepaged(void)
 	} else if (khugepaged_thread) {
 		kthread_stop(khugepaged_thread);
 		khugepaged_thread = NULL;
+		init_per_zone_wmark_min();
 	}
 fail:
 	mutex_unlock(&khugepaged_mutex);
 	return err;
 }
+
+bool khugepaged_adjust_min_free_kbytes(void)
+{
+	bool ret = false;
+
+	/*
+	 * This is a bit racy, and we could miss transitions.  However,
+	 * start/stop code above will make additional adjustments at the
+	 * end of transitions.
+	 */
+	if (khugepaged_enabled() && khugepaged_thread)
+		ret = __khugepaged_adjust_min_free_kbytes();
+
+	return ret;
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d047bf7d8fd4..a7b3a6663ba6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,6 +68,7 @@ 
 #include <linux/lockdep.h>
 #include <linux/nmi.h>
 #include <linux/psi.h>
+#include <linux/khugepaged.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -7827,9 +7828,10 @@  int __meminit init_per_zone_wmark_min(void)
 		if (min_free_kbytes > 65536)
 			min_free_kbytes = 65536;
 	} else {
-		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
+		pr_warn_once("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
 				new_min_free_kbytes, user_min_free_kbytes);
 	}
+	(void)khugepaged_adjust_min_free_kbytes();
 	setup_per_zone_wmarks();
 	refresh_zone_stat_thresholds();
 	setup_per_zone_lowmem_reserve();