[v4] sched/numa: add per-process numa_balancing

Message ID 20220929064359.46932-1-ligang.bdlg@bytedance.com (mailing list archive)
State New, archived

Commit Message

Gang Li Sept. 29, 2022, 6:43 a.m. UTC
This patch adds a new API, PR_NUMA_BALANCING, to prctl.

NUMA balancing generates a large number of page faults while it is
running, which costs performance. Processes that care about worst-case
latency therefore need NUMA balancing disabled. Others, on the contrary,
can accept a temporary performance loss in exchange for higher average
performance, so enabling NUMA balancing is better for them.

Currently, NUMA balancing can only be controlled globally via
/proc/sys/kernel/numa_balancing. For the cases above, we want to be able
to enable/disable numa_balancing per process instead.

Add numab_enabled under mm_struct, then use it in task_tick_fair().

Set per-process numa balancing:
	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DISABLED); //disable
	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_ENABLED);  //enable
	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DEFAULT); //follow global
Get numa_balancing state:
	prctl(PR_NUMA_BALANCING, PR_GET_NUMAB, &ret);
	cat /proc/<pid>/status | grep NumaB_enabled
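
As an example, a minimal userspace sketch (the numeric values below
mirror the NUMAB_* enum order in the patched headers; they are not in
any released uapi header, so they are hard-coded here):

	#include <stdio.h>
	#include <sys/prctl.h>

	#define PR_NUMA_BALANCING	65
	#define PR_SET_NUMAB_DISABLED	0	/* NUMAB_DISABLED */
	#define PR_SET_NUMAB_ENABLED	1	/* NUMAB_ENABLED */
	#define PR_GET_NUMAB		3

	int main(void)
	{
		int state;

		/* Disable numa balancing for this process only. */
		if (prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DISABLED))
			perror("PR_SET_NUMAB_DISABLED");

		/* Read the per-process state back through arg3. */
		if (prctl(PR_NUMA_BALANCING, PR_GET_NUMAB, &state))
			perror("PR_GET_NUMAB");
		else
			printf("numab state: %d\n", state);
		return 0;
	}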

Cc: linux-api@vger.kernel.org
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
---
Changes in v4:
- Code cleanup: add wrapper function numa_balancing_enabled().

Changes in v3:
- Fix compile error.

Changes in v2:
- PR_NUMA_BALANCING now supports three states: enabled, disabled, and
  default. Enabled and disabled ignore the global setting, while default
  follows the global setting.
---
 Documentation/filesystems/proc.rst   |  2 ++
 fs/proc/task_mmu.c                   | 19 +++++++++++++++
 include/linux/mm_types.h             |  3 +++
 include/linux/sched/numa_balancing.h |  8 +++++++
 include/uapi/linux/prctl.h           |  7 ++++++
 kernel/fork.c                        |  3 +++
 kernel/sched/core.c                  | 15 ++++++++++++
 kernel/sched/fair.c                  | 32 ++++++++++++++++++++-----
 kernel/sys.c                         | 35 ++++++++++++++++++++++++++++
 9 files changed, 118 insertions(+), 6 deletions(-)

Comments

Bagas Sanjaya Sept. 29, 2022, 8:22 a.m. UTC | #1
On 9/29/22 13:43, Gang Li wrote:
> This patch adds a new API, PR_NUMA_BALANCING, to prctl.
> 

Use imperative mood for the patch description instead; hence, better to
say "Add PR_NUMA_BALANCING to prctl".
Peter Zijlstra Sept. 29, 2022, 8:45 a.m. UTC | #2
The alternative to this is of course to have your latency-critical
applications use mbind()/set_mempolicy() etc., because surely, them
being timing critical, they have the infrastructure to do this right?

Because timing-critical software doesn't want its memory spread
randomly, because, well, random is bad for performance, hmm?

And once numa balancing sees all the memory has an explicit policy, it
won't touch it.
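
For reference, a minimal sketch of that route (illustrative only, error
handling trimmed; mbind() comes from libnuma's <numaif.h>):

#include <numaif.h>		/* mbind(), MPOL_BIND */
#include <sys/mman.h>
#include <stddef.h>

/* Pin a latency-critical buffer to NUMA node 0; automatic balancing
 * leaves memory with an explicit policy alone. */
static void *alloc_on_node0(size_t len)
{
	unsigned long nodemask = 1UL << 0;	/* node 0 only */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return NULL;
	if (mbind(p, len, MPOL_BIND, &nodemask,
		  sizeof(nodemask) * 8, 0)) {
		munmap(p, len);
		return NULL;
	}
	return p;
}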

On Thu, Sep 29, 2022 at 02:43:58PM +0800, Gang Li wrote:

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ef0e6b3e08ff..87215b3776c9 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2818,6 +2818,24 @@ void task_numa_free(struct task_struct *p, bool final)
>  	}
>  }
>  
> +inline bool numa_balancing_enabled(struct task_struct *p)

Does this want to be static?

> +{
> +	if (p->mm) {
> +		int numab = p->mm->numab_enabled;
> +
> +		switch (numab) {
> +		case NUMAB_ENABLED:
> +			return true;
> +		case NUMAB_DISABLED:
> +			return false;
> +		case NUMAB_DEFAULT:
> +			break;
> +		}
> +	}
> +
> +	return static_branch_unlikely(&sched_numa_balancing);
> +}

Blergh, this sucks. Now you have the unconditional pointer chasing and
cache-misses. The advantage of sched_numa_balancing was that there is no
overhead when disabled.

Also, "numab" is a weird word.

What about something like:

static inline bool numa_balancing_enabled(struct task_struct *p)
{
	if (!static_branch_unlikely(&sched_numa_balancing))
		return false;

	if (p->mm) switch (p->mm->numa_balancing_mode) {
	case NUMA_BALANCING_ENABLED:
		return true;
	case NUMA_BALANCING_DISABLED:
		return false;
	default:
		break;
	}

	return sysctl_numa_balancing_mode;
}

( Note how that all follows the existing 'numa_balancing' wording
  without inventing weird new words. )

And then you frob the sysctl and prctl such that sched_numa_balancing
and sysctl_numa_balancing_mode are not tied together just so.
Specifically, I'm thinking you should use static_branch_inc() to count
how many enables you have, one for the default and one for each prctl().
Then it all just works.
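
A rough sketch of that counting scheme (untested; the
numa_balancing_mode field and NUMA_BALANCING_* values follow the naming
suggested above):

static DEFINE_MUTEX(numab_lock);

/* Global default, flipped from the sysctl handler. */
static void numab_set_global(bool enabled)
{
	mutex_lock(&numab_lock);
	if (enabled)
		static_branch_inc(&sched_numa_balancing);
	else
		static_branch_dec(&sched_numa_balancing);
	mutex_unlock(&numab_lock);
}

/* Per-mm state, flipped from prctl(); one count per enabling mm. */
static void numab_set_mm(struct mm_struct *mm, int mode)
{
	mutex_lock(&numab_lock);
	if (mode == NUMA_BALANCING_ENABLED &&
	    mm->numa_balancing_mode != NUMA_BALANCING_ENABLED)
		static_branch_inc(&sched_numa_balancing);
	else if (mode != NUMA_BALANCING_ENABLED &&
		 mm->numa_balancing_mode == NUMA_BALANCING_ENABLED)
		static_branch_dec(&sched_numa_balancing);
	mm->numa_balancing_mode = mode;
	mutex_unlock(&numab_lock);
}

That way the static key is only on when the global default is enabled or
at least one mm has force-enabled balancing, so the fast path stays a
single branch whenever nobody wants it.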

> @@ -11581,8 +11599,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>  		entity_tick(cfs_rq, se, queued);
>  	}
>  
> -	if (static_branch_unlikely(&sched_numa_balancing))
> +#ifdef CONFIG_NUMA_BALANCING
> +	if (numa_balancing_enabled(curr))
>  		task_tick_numa(rq, curr);
> +#endif
>  
>  	update_misfit_status(curr, rq);
>  	update_overutilized_status(task_rq(curr));

Surely you can make that #ifdef go away without much effort.
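
E.g. with a !CONFIG_NUMA_BALANCING stub next to the existing
declarations (sketch only; fair.c already provides a no-op
task_tick_numa() for that configuration):

/* include/linux/sched/numa_balancing.h, sketch: */
#ifdef CONFIG_NUMA_BALANCING
extern bool numa_balancing_enabled(struct task_struct *p);
#else
static inline bool numa_balancing_enabled(struct task_struct *p)
{
	return false;
}
#endif

Then task_tick_fair() can keep a plain
"if (numa_balancing_enabled(curr)) task_tick_numa(rq, curr);" with no
#ifdef at the call site.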

> diff --git a/kernel/sys.c b/kernel/sys.c
> index 8a6432465dc5..11720a35455a 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -59,6 +59,7 @@
>  #include <linux/sched/coredump.h>
>  #include <linux/sched/task.h>
>  #include <linux/sched/cputime.h>
> +#include <linux/sched/numa_balancing.h>
>  #include <linux/rcupdate.h>
>  #include <linux/uidgid.h>
>  #include <linux/cred.h>
> @@ -2101,6 +2102,23 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
>  	return 0;
>  }
>  
> +#ifdef CONFIG_NUMA_BALANCING
> +static int prctl_pid_numa_balancing_write(int numa_balancing)
> +{
> +	if (numa_balancing != PR_SET_NUMAB_DEFAULT
> +	    && numa_balancing != PR_SET_NUMAB_DISABLED
> +	    && numa_balancing != PR_SET_NUMAB_ENABLED)
> +		return -EINVAL;

Operators go at the end of the line.
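
i.e. something like:

	if (numa_balancing != PR_SET_NUMAB_DEFAULT &&
	    numa_balancing != PR_SET_NUMAB_DISABLED &&
	    numa_balancing != PR_SET_NUMAB_ENABLED)
		return -EINVAL;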

> +	current->mm->numab_enabled = numa_balancing;
> +	return 0;
> +}

Patch

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index e7aafc82be99..b2ddffad015f 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -192,6 +192,7 @@  read the file /proc/PID/status::
   VmLib:      1412 kB
   VmPTE:        20 kb
   VmSwap:        0 kB
+  NumaB_enabled:  default
   HugetlbPages:          0 kB
   CoreDumping:    0
   THP_enabled:	  1
@@ -273,6 +274,7 @@  It's slow but very precise.
  VmPTE                       size of page table entries
  VmSwap                      amount of swap used by anonymous private data
                              (shmem swap usage is not included)
+ NumaB_enabled               numa balancing state, set by prctl(PR_NUMA_BALANCING, ...)
  HugetlbPages                size of hugetlb memory portions
  CoreDumping                 process's memory is currently being dumped
                              (killing the process may lead to a corrupted core)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8b4f3073f8f5..7358a5932e5a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -19,6 +19,7 @@ 
 #include <linux/shmem_fs.h>
 #include <linux/uaccess.h>
 #include <linux/pkeys.h>
+#include <linux/sched/numa_balancing.h>
 
 #include <asm/elf.h>
 #include <asm/tlb.h>
@@ -75,6 +76,24 @@  void task_mem(struct seq_file *m, struct mm_struct *mm)
 		    " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
 	SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
 	seq_puts(m, " kB\n");
+#ifdef CONFIG_NUMA_BALANCING
+	seq_puts(m, "NumaB_enabled:\t");
+	switch (mm->numab_enabled) {
+	case NUMAB_DEFAULT:
+		seq_puts(m, "default");
+		break;
+	case NUMAB_DISABLED:
+		seq_puts(m, "disabled");
+		break;
+	case NUMAB_ENABLED:
+		seq_puts(m, "enabled");
+		break;
+	default:
+		seq_puts(m, "unknown");
+		break;
+	}
+	seq_putc(m, '\n');
+#endif
 	hugetlb_report_usage(m, mm);
 }
 #undef SEQ_PUT_DEC
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 500e536796ca..d9bfa740d905 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -665,6 +665,9 @@  struct mm_struct {
 
 		/* numa_scan_seq prevents two threads remapping PTEs. */
 		int numa_scan_seq;
+
+		/* Controls whether NUMA balancing is active for this mm. */
+		int numab_enabled;
 #endif
 		/*
 		 * An operation with batched TLB flushing is going on. Anything
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 3988762efe15..f4a4cdf264bc 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -16,12 +16,20 @@ 
 #define TNF_MIGRATE_FAIL 0x10
 
 #ifdef CONFIG_NUMA_BALANCING
+enum {
+	NUMAB_DISABLED,
+	NUMAB_ENABLED,
+	NUMAB_DEFAULT
+};
+DECLARE_STATIC_KEY_FALSE(sched_numa_balancing);
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
 extern void task_numa_free(struct task_struct *p, bool final);
 extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
 					int src_nid, int dst_cpu);
+extern bool numa_balancing_enabled(struct task_struct *p);
+extern int numa_balancing_mode(struct mm_struct *mm);
 #else
 static inline void task_numa_fault(int last_node, int node, int pages,
 				   int flags)
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index a5e06dcbba13..4c57724b04c3 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -284,4 +284,11 @@  struct prctl_mm_map {
 #define PR_SET_VMA		0x53564d41
 # define PR_SET_VMA_ANON_NAME		0
 
+/* Set/get enabled per-process numa_balancing */
+#define PR_NUMA_BALANCING		65
+# define PR_SET_NUMAB_DISABLED		NUMAB_DISABLED
+# define PR_SET_NUMAB_ENABLED		NUMAB_ENABLED
+# define PR_SET_NUMAB_DEFAULT		NUMAB_DEFAULT
+# define PR_GET_NUMAB			3
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 844dfdc8c639..1b9254315770 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1133,6 +1133,9 @@  static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	init_tlb_flush_pending(mm);
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	mm->pmd_huge_pte = NULL;
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+	mm->numab_enabled = NUMAB_DEFAULT;
 #endif
 	mm_init_uprobes_state(mm);
 	hugetlb_count_init(mm);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a77e8bfbfb5b..12d171978538 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4396,6 +4396,21 @@  void set_numabalancing_state(bool enabled)
 	__set_numabalancing_state(enabled);
 }
 
+inline int numa_balancing_mode(struct mm_struct *mm)
+{
+	int numab = mm->numab_enabled;
+
+	switch (numab) {
+	case NUMAB_ENABLED:
+		return NUMA_BALANCING_NORMAL;
+	case NUMAB_DISABLED:
+		return NUMA_BALANCING_DISABLED;
+	case NUMAB_DEFAULT:
+	default:
+		return sysctl_numa_balancing_mode;
+	}
+}
+
 #ifdef CONFIG_PROC_SYSCTL
 static void reset_memory_tiering(void)
 {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ef0e6b3e08ff..87215b3776c9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2818,6 +2818,24 @@  void task_numa_free(struct task_struct *p, bool final)
 	}
 }
 
+inline bool numa_balancing_enabled(struct task_struct *p)
+{
+	if (p->mm) {
+		int numab = p->mm->numab_enabled;
+
+		switch (numab) {
+		case NUMAB_ENABLED:
+			return true;
+		case NUMAB_DISABLED:
+			return false;
+		case NUMAB_DEFAULT:
+			break;
+		}
+	}
+
+	return static_branch_unlikely(&sched_numa_balancing);
+}
+
 /*
  * Got a PROT_NONE fault for a page on @node.
  */
@@ -2830,13 +2848,13 @@  void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	struct numa_group *ng;
 	int priv;
 
-	if (!static_branch_likely(&sched_numa_balancing))
-		return;
-
 	/* for example, ksmd faulting in a user's mm */
 	if (!p->mm)
 		return;
 
+	if (!numa_balancing_enabled(p))
+		return;
+
 	/*
 	 * NUMA faults statistics are unnecessary for the slow memory
 	 * node for memory tiering mode.
@@ -3151,7 +3169,7 @@  static void update_scan_period(struct task_struct *p, int new_cpu)
 	int src_nid = cpu_to_node(task_cpu(p));
 	int dst_nid = cpu_to_node(new_cpu);
 
-	if (!static_branch_likely(&sched_numa_balancing))
+	if (!numa_balancing_enabled(p))
 		return;
 
 	if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
@@ -7996,7 +8014,7 @@  static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 	unsigned long src_weight, dst_weight;
 	int src_nid, dst_nid, dist;
 
-	if (!static_branch_likely(&sched_numa_balancing))
+	if (!numa_balancing_enabled(p))
 		return -1;
 
 	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
@@ -11581,8 +11599,10 @@  static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		entity_tick(cfs_rq, se, queued);
 	}
 
-	if (static_branch_unlikely(&sched_numa_balancing))
+#ifdef CONFIG_NUMA_BALANCING
+	if (numa_balancing_enabled(curr))
 		task_tick_numa(rq, curr);
+#endif
 
 	update_misfit_status(curr, rq);
 	update_overutilized_status(task_rq(curr));
diff --git a/kernel/sys.c b/kernel/sys.c
index 8a6432465dc5..11720a35455a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -59,6 +59,7 @@ 
 #include <linux/sched/coredump.h>
 #include <linux/sched/task.h>
 #include <linux/sched/cputime.h>
+#include <linux/sched/numa_balancing.h>
 #include <linux/rcupdate.h>
 #include <linux/uidgid.h>
 #include <linux/cred.h>
@@ -2101,6 +2102,23 @@  static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
 	return 0;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static int prctl_pid_numa_balancing_write(int numa_balancing)
+{
+	if (numa_balancing != PR_SET_NUMAB_DEFAULT
+	    && numa_balancing != PR_SET_NUMAB_DISABLED
+	    && numa_balancing != PR_SET_NUMAB_ENABLED)
+		return -EINVAL;
+	current->mm->numab_enabled = numa_balancing;
+	return 0;
+}
+
+static int prctl_pid_numa_balancing_read(void)
+{
+	return current->mm->numab_enabled;
+}
+#endif
+
 static int prctl_set_mm(int opt, unsigned long addr,
 			unsigned long arg4, unsigned long arg5)
 {
@@ -2615,6 +2633,23 @@  SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		error = set_syscall_user_dispatch(arg2, arg3, arg4,
 						  (char __user *) arg5);
 		break;
+#ifdef CONFIG_NUMA_BALANCING
+	case PR_NUMA_BALANCING:
+		switch (arg2) {
+		case PR_SET_NUMAB_DEFAULT:
+		case PR_SET_NUMAB_DISABLED:
+		case PR_SET_NUMAB_ENABLED:
+			error = prctl_pid_numa_balancing_write((int)arg2);
+			break;
+		case PR_GET_NUMAB:
+			error = put_user(prctl_pid_numa_balancing_read(), (int __user *)arg3);
+			break;
+		default:
+			error = -EINVAL;
+			break;
+		}
+		break;
+#endif
 #ifdef CONFIG_SCHED_CORE
 	case PR_SCHED_CORE:
 		error = sched_core_share_pid(arg2, arg3, arg4, arg5);