diff mbox series

[2/2,v3] mm/compaction: Disable compact_unevictable_allowed on RT

Message ID 20200303202225.nhqc3v5gwlb7x6et@linutronix.de (mailing list archive)
State New, archived
Headers show
Series [1/2] =?UTF-8?q?mm/compaction:=20Really=20limit=20compact?= =?UTF-8?q?=5Funevictable=5Fallowed=20to=200=E2=80=A61?= | expand

Commit Message

Sebastian Andrzej Siewior March 3, 2020, 8:22 p.m. UTC
Since commit
    5bbe3547aa3ba ("mm: allow compaction of unevictable pages")

it is allowed to examine mlocked pages and compact them by default.
On -RT even minor pagefaults are problematic because it may take a few
100us to resolve them and until then the task is blocked.

Make compact_unevictable_allowed = 0 default and issue a warning on RT
if it is changed.

Link: https://lore.kernel.org/linux-mm/20190710144138.qyn4tuttdq6h7kqx@linutronix.de/
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
v2…v3: - Allow to modify the value but issue a warning if it is changed.

v1…v2: - Make the proc file RO instead removing it.
       - Mention this change in Documentation/…/vm.rst.

 Documentation/admin-guide/sysctl/vm.rst |  3 +++
 kernel/sysctl.c                         | 27 ++++++++++++++++++++++++-
 mm/compaction.c                         |  4 ++++
 3 files changed, 33 insertions(+), 1 deletion(-)

Comments

Andrew Morton March 3, 2020, 11:56 p.m. UTC | #1
On Tue, 3 Mar 2020 21:22:25 +0100 Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> Since commit
>     5bbe3547aa3ba ("mm: allow compaction of unevictable pages")
> 
> it is allowed to examine mlocked pages and compact them by default.
> On -RT even minor pagefaults are problematic because it may take a few
> 100us to resolve them and until then the task is blocked.
> 
> Make compact_unevictable_allowed = 0 default and issue a warning on RT
> if it is changed.

Fair enough, I guess.

> @@ -2572,6 +2577,26 @@ int proc_dointvec(struct ctl_table *table, int write,
>  	return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
>  }
>  
> +#ifdef CONFIG_COMPACTION
> +static int proc_dointvec_warn_RT_change(struct ctl_table *table, int write,
> +					void __user *buffer, size_t *lenp,
> +					loff_t *ppos)
> +{
> +	int ret, old;
> +
> +	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
> +		return proc_dointvec(table, write, buffer, lenp, ppos);
> +
> +	old = *(int *)table->data;
> +	ret = proc_dointvec(table, write, buffer, lenp, ppos);
> +	if (ret)
> +		return ret;
> +	WARN_ONCE(old != *(int *)table->data, "sysctl attribute %s changed.",
> +		  table->procname);

The WARN will include a stack trace which just isn't interesting.  A
pr_warn() would be better?

> +	return ret;
> +}
> +#endif
Vlastimil Babka March 4, 2020, 8:18 a.m. UTC | #2
On 3/3/20 9:22 PM, Sebastian Andrzej Siewior wrote:
> Since commit
>     5bbe3547aa3ba ("mm: allow compaction of unevictable pages")
> 
> it is allowed to examine mlocked pages and compact them by default.
> On -RT even minor pagefaults are problematic because it may take a few
> 100us to resolve them and until then the task is blocked.
> 
> Make compact_unevictable_allowed = 0 default and issue a warning on RT
> if it is changed.
> 
> Link: https://lore.kernel.org/linux-mm/20190710144138.qyn4tuttdq6h7kqx@linutronix.de/
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
> v2…v3: - Allow to modify the value but issue a warning if it is changed.
> 
> v1…v2: - Make the proc file RO instead removing it.
>        - Mention this change in Documentation/…/vm.rst.
> 
>  Documentation/admin-guide/sysctl/vm.rst |  3 +++
>  kernel/sysctl.c                         | 27 ++++++++++++++++++++++++-
>  mm/compaction.c                         |  4 ++++
>  3 files changed, 33 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
> index 64aeee1009cab..0329a4d3fa9ec 100644
> --- a/Documentation/admin-guide/sysctl/vm.rst
> +++ b/Documentation/admin-guide/sysctl/vm.rst
> @@ -128,6 +128,9 @@ allowed to examine the unevictable lru (mlocked pages) for pages to compact.
>  This should be used on systems where stalls for minor page faults are an
>  acceptable trade for large contiguous free memory.  Set to 0 to prevent
>  compaction from moving pages that are unevictable.  Default value is 1.
> +On CONFIG_PREEMPT_RT the default value is 0 in order to avoid a page fault, due
> +to compaction, which would block the task from becomming active until the fault
> +is resolved.
>  
>  
>  dirty_background_bytes
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 982203101f961..3ace90b6ac57f 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -212,6 +212,11 @@ static int proc_do_cad_pid(struct ctl_table *table, int write,
>  		  void __user *buffer, size_t *lenp, loff_t *ppos);
>  static int proc_taint(struct ctl_table *table, int write,
>  			       void __user *buffer, size_t *lenp, loff_t *ppos);
> +#ifdef CONFIG_COMPACTION
> +static int proc_dointvec_warn_RT_change(struct ctl_table *table, int write,
> +					void __user *buffer, size_t *lenp,
> +					loff_t *ppos);
> +#endif
>  #endif
>  
>  #ifdef CONFIG_PRINTK
> @@ -1484,7 +1489,7 @@ static struct ctl_table vm_table[] = {
>  		.data		= &sysctl_compact_unevictable_allowed,
>  		.maxlen		= sizeof(int),
>  		.mode		= 0644,
> -		.proc_handler	= proc_dointvec_minmax,
> +		.proc_handler	= proc_dointvec_warn_RT_change,
>  		.extra1		= SYSCTL_ZERO,
>  		.extra2		= SYSCTL_ONE,
>  	},
> @@ -2572,6 +2577,26 @@ int proc_dointvec(struct ctl_table *table, int write,
>  	return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
>  }
>  
> +#ifdef CONFIG_COMPACTION
> +static int proc_dointvec_warn_RT_change(struct ctl_table *table, int write,
> +					void __user *buffer, size_t *lenp,
> +					loff_t *ppos)
> +{
> +	int ret, old;
> +
> +	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
> +		return proc_dointvec(table, write, buffer, lenp, ppos);

Shouldn't you use her proc_dointvec_minmax() per Patch 1/2 ?

> +
> +	old = *(int *)table->data;
> +	ret = proc_dointvec(table, write, buffer, lenp, ppos);

And here.

> +	if (ret)
> +		return ret;
> +	WARN_ONCE(old != *(int *)table->data, "sysctl attribute %s changed.",
> +		  table->procname);
> +	return ret;
> +}
> +#endif
> +
>  /**
>   * proc_douintvec - read a vector of unsigned integers
>   * @table: the sysctl table
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 672d3c78c6abf..ba77809a1666e 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -1590,7 +1590,11 @@ typedef enum {
>   * Allow userspace to control policy on scanning the unevictable LRU for
>   * compactable pages.
>   */
> +#ifdef CONFIG_PREEMPT_RT
> +int sysctl_compact_unevictable_allowed __read_mostly = 0;
> +#else
>  int sysctl_compact_unevictable_allowed __read_mostly = 1;
> +#endif
>  
>  static inline void
>  update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
>
Vlastimil Babka March 4, 2020, 8:19 a.m. UTC | #3
On 3/4/20 12:56 AM, Andrew Morton wrote:
>> @@ -2572,6 +2577,26 @@ int proc_dointvec(struct ctl_table *table, int write,
>>  	return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
>>  }
>>  
>> +#ifdef CONFIG_COMPACTION
>> +static int proc_dointvec_warn_RT_change(struct ctl_table *table, int write,
>> +					void __user *buffer, size_t *lenp,
>> +					loff_t *ppos)
>> +{
>> +	int ret, old;
>> +
>> +	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
>> +		return proc_dointvec(table, write, buffer, lenp, ppos);
>> +
>> +	old = *(int *)table->data;
>> +	ret = proc_dointvec(table, write, buffer, lenp, ppos);
>> +	if (ret)
>> +		return ret;
>> +	WARN_ONCE(old != *(int *)table->data, "sysctl attribute %s changed.",
>> +		  table->procname);
> 
> The WARN will include a stack trace which just isn't interesting.  A
> pr_warn() would be better?

Yeah, the only interesting part of full WARN would possibly be, which process
changed it. That might be useful to print.

>> +	return ret;
>> +}
>> +#endif
>
Mel Gorman March 4, 2020, 9:11 a.m. UTC | #4
On Tue, Mar 03, 2020 at 09:22:25PM +0100, Sebastian Andrzej Siewior wrote:
> Since commit
>     5bbe3547aa3ba ("mm: allow compaction of unevictable pages")
> 
> it is allowed to examine mlocked pages and compact them by default.
> On -RT even minor pagefaults are problematic because it may take a few
> 100us to resolve them and until then the task is blocked.
> 
> Make compact_unevictable_allowed = 0 default and issue a warning on RT
> if it is changed.
> 
> Link: https://lore.kernel.org/linux-mm/20190710144138.qyn4tuttdq6h7kqx@linutronix.de/
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

Acked-by: Mel Gorman <mgorman@techsingularity.net>

(caveat: I do not spend very much some on RT-specific topics)

While I ack'd this, an RT application using THP is playing with fire,
I know the RT extension for SLE explicitly disables it from being enabled
at kernel config time. At minimum the critical regions should be mlocked
followed by prctl to disable future THP faults that are non-deterministic,
both from an allocation point of view, and a TLB access point of view. It's
still reasonable to expect a smaller TLB reach for huge pages than
base pages.

It's a similar hazard with NUMA balancing, an RT application should either
disable balancing globally or set a memory policy that forces it to be
ignored. They should be doing this anyway to avoid non-deterministic
memory access costs due to NUMA artifacts but it wouldn't surprise me
if some applications got it wrong.  In that case, the SLE RT extension
disables balancing by default and it probably should warn if it's enabled
like this patch does.

It wouldn't surprise me to see patches like this in the future (completely
untested, illustrative only).

diff --git a/init/Kconfig b/init/Kconfig
index 452bc1835cd4..7a406e2b5580 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -797,7 +797,7 @@ config NUMA_BALANCING
 config NUMA_BALANCING_DEFAULT_ENABLED
 	bool "Automatically enable NUMA aware memory/task placement"
 	default y
-	depends on NUMA_BALANCING
+	depends on NUMA_BALANCING && !PREEMPT_RT
 	help
 	  If set, automatic NUMA balancing will be enabled if running on a NUMA
 	  machine.
diff --git a/mm/Kconfig b/mm/Kconfig
index ab80933be65f..313a5d794491 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -385,7 +385,7 @@ config TRANSPARENT_HUGEPAGE
 
 choice
 	prompt "Transparent Hugepage Support sysfs defaults"
-	depends on TRANSPARENT_HUGEPAGE
+	depends on TRANSPARENT_HUGEPAGE && !PREEMPT_RT
 	default TRANSPARENT_HUGEPAGE_ALWAYS
 	help
 	  Selects the sysfs defaults for Transparent Hugepage Support.

I would hope that a user of an RT kernel with an RT-aware application
would be aware of this anyway but ..... uhhhh.

Point for Andrew is that I would not be too surprised if there were more
RT-specific checks in the future that sanity checked some configuration
options in response to RT-specific bugs that were down to insane
configurations (be they kernel configs or sysctls)
Sebastian Andrzej Siewior March 4, 2020, 9:25 a.m. UTC | #5
On 2020-03-04 09:18:21 [+0100], Vlastimil Babka wrote:
> > @@ -2572,6 +2577,26 @@ int proc_dointvec(struct ctl_table *table, int write,
> >  	return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
> >  }
> >  
> > +#ifdef CONFIG_COMPACTION
> > +static int proc_dointvec_warn_RT_change(struct ctl_table *table, int write,
> > +					void __user *buffer, size_t *lenp,
> > +					loff_t *ppos)
> > +{
> > +	int ret, old;
> > +
> > +	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
> > +		return proc_dointvec(table, write, buffer, lenp, ppos);
> 
> Shouldn't you use her proc_dointvec_minmax() per Patch 1/2 ?
> 
> > +
> > +	old = *(int *)table->data;
> > +	ret = proc_dointvec(table, write, buffer, lenp, ppos);
> 
> And here.

Yes, thank you for noticing. It didn't make from editor to disk after
rebasing…

Sebastian
Sebastian Andrzej Siewior March 4, 2020, 9:27 a.m. UTC | #6
On 2020-03-04 09:19:21 [+0100], Vlastimil Babka wrote:
> >> +	WARN_ONCE(old != *(int *)table->data, "sysctl attribute %s changed.",
> >> +		  table->procname);
> > 
> > The WARN will include a stack trace which just isn't interesting.  A
> > pr_warn() would be better?
> 
> Yeah, the only interesting part of full WARN would possibly be, which process
> changed it. That might be useful to print.

Yes, the stack trace and register dump isn't interesting. But as
Vlastimil says, the task and pid are informative. So if that is too much
I could extract those two informations and include it in a pr_warn().

Sebastian
diff mbox series

Patch

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 64aeee1009cab..0329a4d3fa9ec 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -128,6 +128,9 @@  allowed to examine the unevictable lru (mlocked pages) for pages to compact.
 This should be used on systems where stalls for minor page faults are an
 acceptable trade for large contiguous free memory.  Set to 0 to prevent
 compaction from moving pages that are unevictable.  Default value is 1.
+On CONFIG_PREEMPT_RT the default value is 0 in order to avoid a page fault, due
+to compaction, which would block the task from becomming active until the fault
+is resolved.
 
 
 dirty_background_bytes
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 982203101f961..3ace90b6ac57f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -212,6 +212,11 @@  static int proc_do_cad_pid(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
 static int proc_taint(struct ctl_table *table, int write,
 			       void __user *buffer, size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_COMPACTION
+static int proc_dointvec_warn_RT_change(struct ctl_table *table, int write,
+					void __user *buffer, size_t *lenp,
+					loff_t *ppos);
+#endif
 #endif
 
 #ifdef CONFIG_PRINTK
@@ -1484,7 +1489,7 @@  static struct ctl_table vm_table[] = {
 		.data		= &sysctl_compact_unevictable_allowed,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_warn_RT_change,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_ONE,
 	},
@@ -2572,6 +2577,26 @@  int proc_dointvec(struct ctl_table *table, int write,
 	return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
 }
 
+#ifdef CONFIG_COMPACTION
+static int proc_dointvec_warn_RT_change(struct ctl_table *table, int write,
+					void __user *buffer, size_t *lenp,
+					loff_t *ppos)
+{
+	int ret, old;
+
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
+		return proc_dointvec(table, write, buffer, lenp, ppos);
+
+	old = *(int *)table->data;
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret)
+		return ret;
+	WARN_ONCE(old != *(int *)table->data, "sysctl attribute %s changed.",
+		  table->procname);
+	return ret;
+}
+#endif
+
 /**
  * proc_douintvec - read a vector of unsigned integers
  * @table: the sysctl table
diff --git a/mm/compaction.c b/mm/compaction.c
index 672d3c78c6abf..ba77809a1666e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1590,7 +1590,11 @@  typedef enum {
  * Allow userspace to control policy on scanning the unevictable LRU for
  * compactable pages.
  */
+#ifdef CONFIG_PREEMPT_RT
+int sysctl_compact_unevictable_allowed __read_mostly = 0;
+#else
 int sysctl_compact_unevictable_allowed __read_mostly = 1;
+#endif
 
 static inline void
 update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)