diff mbox series

[-V7,3/3] NOT kernel/numactl: Support to enable Linux kernel NUMA balancing

Message ID 20201204091534.72239-4-ying.huang@intel.com (mailing list archive)
State New, archived
Headers show
Series numa balancing: Migrate on fault among multiple bound nodes | expand

Commit Message

Huang, Ying Dec. 4, 2020, 9:15 a.m. UTC
A new API: numa_set_membind_balancing() is added to libnuma.  It is
same as numa_set_membind() except that the Linux kernel NUMA balancing
will be enabled for the task if the feature is supported by the
kernel.

At the same time, a new option: --balancing (-b) is added to numactl.
Which can be used before the --membind/-m memory policy in the command
line.  With it, the Linux kernel NUMA balancing will be enabled for
the process if --membind/-m is used and the feature is supported by
the kernel.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
---
 libnuma.c         | 14 ++++++++++++++
 numa.3            | 15 +++++++++++++++
 numa.h            |  4 ++++
 numactl.8         | 12 ++++++++++++
 numactl.c         | 17 ++++++++++++++---
 numaif.h          |  3 +++
 versions.ldscript |  8 ++++++++
 7 files changed, 70 insertions(+), 3 deletions(-)
diff mbox series

Patch

diff --git a/libnuma.c b/libnuma.c
index 88f479b..f073c50 100644
--- a/libnuma.c
+++ b/libnuma.c
@@ -1064,6 +1064,20 @@  numa_set_membind_v2(struct bitmask *bmp)
 
 make_internal_alias(numa_set_membind_v2);
 
+void
+numa_set_membind_balancing(struct bitmask *bmp)
+{
+	/* MPOL_F_NUMA_BALANCING: ignore if unsupported */
+	if (set_mempolicy(MPOL_BIND | MPOL_F_NUMA_BALANCING,
+			  bmp->maskp, bmp->size + 1) < 0) {
+		if (errno == EINVAL) {
+			errno = 0;
+			numa_set_membind_v2(bmp);
+		} else
+			numa_error("set_mempolicy");
+	}
+}
+
 /*
  * copy a bitmask map body to a numa.h nodemask_t structure
  */
diff --git a/numa.3 b/numa.3
index 3e18098..af01c8f 100644
--- a/numa.3
+++ b/numa.3
@@ -80,6 +80,8 @@  numa \- NUMA policy library
 .br
 .BI "void numa_set_membind(struct bitmask *" nodemask );
 .br
+.BI "void numa_set_membind_balancing(struct bitmask *" nodemask );
+.br
 .B struct bitmask *numa_get_membind(void);
 .sp
 .BI "void *numa_alloc_onnode(size_t " size ", int " node );
@@ -538,6 +540,19 @@  that contains nodes other than those in the mask returned by
 .IR numa_get_mems_allowed ()
 will result in an error.
 
+.BR numa_set_membind_balancing ()
+sets the memory allocation mask and enable the Linux kernel NUMA
+balancing for the task if the feature is supported by the kernel.
+The task will only allocate memory from the nodes set in
+.IR nodemask .
+Passing an empty
+.I nodemask
+or a
+.I nodemask
+that contains nodes other than those in the mask returned by
+.IR numa_get_mems_allowed ()
+will result in an error.
+
 .BR numa_get_membind ()
 returns the mask of nodes from which memory can currently be allocated.
 If the returned mask is equal to
diff --git a/numa.h b/numa.h
index bd1d676..5d8543a 100644
--- a/numa.h
+++ b/numa.h
@@ -192,6 +192,10 @@  void numa_set_localalloc(void);
 /* Only allocate memory from the nodes set in mask. 0 to turn off */
 void numa_set_membind(struct bitmask *nodemask);
 
+/* Only allocate memory from the nodes set in mask. Optimize page
+   placement with Linux kernel NUMA balancing if possible. 0 to turn off */
+void numa_set_membind_balancing(struct bitmask *bmp);
+
 /* Return current membind */
 struct bitmask *numa_get_membind(void);
 
diff --git a/numactl.8 b/numactl.8
index f3bb22b..7d52688 100644
--- a/numactl.8
+++ b/numactl.8
@@ -25,6 +25,8 @@  numactl \- Control NUMA policy for processes or shared memory
 [
 .B \-\-all
 ] [
+.B \-\-balancing
+] [
 .B \-\-interleave nodes
 ] [
 .B \-\-preferred node 
@@ -168,6 +170,12 @@  but if memory cannot be allocated there fall back to other nodes.
 This option takes only a single node number.
 Relative notation may be used.
 .TP
+.B \-\-balancing, \-b
+Enable Linux kernel NUMA balancing for the process if it is supported by kernel.
+This should only be used with
+.I \-\-membind, \-m
+only, otherwise ignored.
+.TP
 .B \-\-show, \-s
 Show NUMA policy settings of the current process. 
 .TP
@@ -278,6 +286,10 @@  numactl \-\-cpunodebind=0 \-\-membind=0,1 -- process -l
 Run process as above, but with an option (-l) that would be confused with
 a numactl option.
 
+numactl \-\-cpunodebind=0 \-\-balancing \-\-membind=0,1 process
+Run process on node 0 with memory allocated on node 0 and 1.  Optimize the
+page placement with Linux kernel NUMA balancing mechanism if possible.
+
 numactl \-\-cpunodebind=netdev:eth0 \-\-membind=netdev:eth0 network-server
 Run network-server on the node of network device eth0 with its memory
 also in the same node.
diff --git a/numactl.c b/numactl.c
index df9dbcb..5a9d2df 100644
--- a/numactl.c
+++ b/numactl.c
@@ -45,6 +45,7 @@  struct option opts[] = {
 	{"membind", 1, 0, 'm'},
 	{"show", 0, 0, 's' },
 	{"localalloc", 0,0, 'l'},
+	{"balancing", 0, 0, 'b'},
 	{"hardware", 0,0,'H' },
 
 	{"shm", 1, 0, 'S'},
@@ -65,9 +66,10 @@  struct option opts[] = {
 void usage(void)
 {
 	fprintf(stderr,
-		"usage: numactl [--all | -a] [--interleave= | -i <nodes>] [--preferred= | -p <node>]\n"
-		"               [--physcpubind= | -C <cpus>] [--cpunodebind= | -N <nodes>]\n"
-		"               [--membind= | -m <nodes>] [--localalloc | -l] command args ...\n"
+		"usage: numactl [--all | -a] [--balancing | -b] [--interleave= | -i <nodes>]\n"
+		"               [--preferred= | -p <node>] [--physcpubind= | -C <cpus>]\n"
+		"               [--cpunodebind= | -N <nodes>] [--membind= | -m <nodes>]\n"
+		"               [--localalloc | -l] command args ...\n"
 		"       numactl [--show | -s]\n"
 		"       numactl [--hardware | -H]\n"
 		"       numactl [--length | -l <length>] [--offset | -o <offset>] [--shmmode | -M <shmmode>]\n"
@@ -90,6 +92,8 @@  void usage(void)
 		"all numbers and ranges can be made cpuset-relative with +\n"
 		"the old --cpubind argument is deprecated.\n"
 		"use --cpunodebind or --physcpubind instead\n"
+		"use --balancing | -b to enable Linux kernel NUMA balancing\n"
+		"for the process if it is supported by kernel\n"
 		"<length> can have g (GB), m (MB) or k (KB) suffixes\n");
 	exit(1);
 }
@@ -338,6 +342,7 @@  int do_dump = 0;
 int shmattached = 0;
 int did_node_cpu_parse = 0;
 int parse_all = 0;
+int numa_balancing = 0;
 char *shmoption;
 
 void check_cpubind(int flag)
@@ -431,6 +436,10 @@  int main(int ac, char **av)
 			nopolicy();
 			hardware();
 			exit(0);
+		case 'b': /* --balancing  */
+			nopolicy();
+			numa_balancing = 1;
+			break;
 		case 'i': /* --interleave */
 			checknuma();
 			if (parse_all)
@@ -507,6 +516,8 @@  int main(int ac, char **av)
 			numa_set_bind_policy(1);
 			if (shmfd >= 0) {
 				numa_tonodemask_memory(shmptr, shmlen, mask);
+			} else if (numa_balancing) {
+				numa_set_membind_balancing(mask);
 			} else {
 				numa_set_membind(mask);
 			}
diff --git a/numaif.h b/numaif.h
index 91aa230..32c12c3 100644
--- a/numaif.h
+++ b/numaif.h
@@ -29,6 +29,9 @@  extern long move_pages(int pid, unsigned long count,
 #define MPOL_LOCAL       4
 #define MPOL_MAX         5
 
+/* Flags for set_mempolicy, specified in mode */
+#define MPOL_F_NUMA_BALANCING	(1 << 13) /* Optimize with NUMA balancing if possible */
+
 /* Flags for get_mem_policy */
 #define MPOL_F_NODE    (1<<0)   /* return next il node or node of address */
 				/* Warning: MPOL_F_NODE is unsupported and
diff --git a/versions.ldscript b/versions.ldscript
index 23074a0..358eeeb 100644
--- a/versions.ldscript
+++ b/versions.ldscript
@@ -146,3 +146,11 @@  libnuma_1.4 {
   local:
     *;
 } libnuma_1.3;
+
+# New interface for membind with NUMA balancing optimization
+libnuma_1.5 {
+  global:
+    numa_set_membind_balancing;
+  local:
+    *;
+} libnuma_1.4;