diff mbox series

[RFC,07/11] mm/mempolicy: add task mempolicy syscall variants

Message ID 20231122211200.31620-8-gregory.price@memverge.com (mailing list archive)
State New
Headers show
Series mm/mempolicy: Make task->mempolicy externally modifiable via syscall and procfs | expand

Commit Message

Gregory Price Nov. 22, 2023, 9:11 p.m. UTC
Add system calls to allow one task to view or change another task's
mempolicy settings. The task mempolicy has traditionally been a feature
that could only be changed by the task itself.  This creates issues
with task migrations between cgroups where cpusets may differ.

Attempts were made to allow policy nodemasks to be shifted via a flag
(MPOL_F_RELATIVE_NODES), but this is not foolproof.

Additionally, as new policies emerge (like weighted interleave), it
may be necessary to allow changing not just the policy itself, but
also individual attributes of the policy (such as a node weight) in
response to other system events, such as memory hotplug.

If pid is 0, these syscalls operate on the calling task, exactly like
the original mempolicy syscalls; otherwise the caller needs CAP_SYS_NICE.

Syscalls in this patch:
	sys_set_task_mempolicy
	sys_get_task_mempolicy
	sys_set_task_mempolicy_home_node
	sys_task_mbind

Signed-off-by: Gregory Price <gregory.price@memverge.com>
---
 arch/x86/entry/syscalls/syscall_32.tbl |   4 +
 arch/x86/entry/syscalls/syscall_64.tbl |   4 +
 include/linux/syscalls.h               |  14 +++
 include/uapi/asm-generic/unistd.h      |  10 ++-
 include/uapi/linux/mempolicy.h         |  10 +++
 mm/mempolicy.c                         | 119 +++++++++++++++++++++++++
 6 files changed, 160 insertions(+), 1 deletion(-)
diff mbox series

Patch

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index c8fac5205803..358bd91d7461 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -461,3 +461,7 @@ 
 454	i386	futex_wake		sys_futex_wake
 455	i386	futex_wait		sys_futex_wait
 456	i386	futex_requeue		sys_futex_requeue
+457	i386	set_task_mempolicy	sys_set_task_mempolicy
+458	i386	get_task_mempolicy	sys_get_task_mempolicy
+459	i386	set_task_mempolicy_home_node	sys_set_task_mempolicy_home_node
+460	i386	task_mbind		sys_task_mbind
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 8cb8bf68721c..c83b0c5c1ff9 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -378,6 +378,10 @@ 
 454	common	futex_wake		sys_futex_wake
 455	common	futex_wait		sys_futex_wait
 456	common	futex_requeue		sys_futex_requeue
+457	common	set_task_mempolicy	sys_set_task_mempolicy
+458	common	get_task_mempolicy	sys_get_task_mempolicy
+459	common	set_task_mempolicy_home_node	sys_set_task_mempolicy_home_node
+460	common	task_mbind		sys_task_mbind
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index fd9d12de7e92..fd1a8863b5c1 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -816,12 +816,21 @@  asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 				const unsigned long __user *nmask,
 				unsigned long maxnode,
 				unsigned flags);
+asmlinkage long sys_task_mbind(const struct mbind_args __user *uargs,
+			       size_t usize);
 asmlinkage long sys_get_mempolicy(int __user *policy,
 				unsigned long __user *nmask,
 				unsigned long maxnode,
 				unsigned long addr, unsigned long flags);
 asmlinkage long sys_set_mempolicy(int mode, const unsigned long __user *nmask,
 				unsigned long maxnode);
+asmlinkage long sys_get_task_mempolicy(pid_t pid, int __user *policy,
+				unsigned long __user *nmask,
+				unsigned long maxnode,
+				unsigned long addr, unsigned long flags);
+asmlinkage long sys_set_task_mempolicy(pid_t pid, int mode,
+				       const unsigned long __user *nmask,
+				       unsigned long maxnode);
 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 				const unsigned long __user *from,
 				const unsigned long __user *to);
@@ -945,6 +954,11 @@  asmlinkage long sys_memfd_secret(unsigned int flags);
 asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
 					    unsigned long home_node,
 					    unsigned long flags);
+asmlinkage long sys_set_task_mempolicy_home_node(pid_t pid,
+						 unsigned long start,
+						 unsigned long len,
+						 unsigned long home_node,
+						 unsigned long flags);
 asmlinkage long sys_cachestat(unsigned int fd,
 		struct cachestat_range __user *cstat_range,
 		struct cachestat __user *cstat, unsigned int flags);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 756b013fb832..f179715f1d59 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -828,9 +828,17 @@  __SYSCALL(__NR_futex_wake, sys_futex_wake)
 __SYSCALL(__NR_futex_wait, sys_futex_wait)
 #define __NR_futex_requeue 456
 __SYSCALL(__NR_futex_requeue, sys_futex_requeue)
+#define __NR_set_task_mempolicy 457
+__SYSCALL(__NR_set_task_mempolicy, sys_set_task_mempolicy)
+#define __NR_get_task_mempolicy 458
+__SYSCALL(__NR_get_task_mempolicy, sys_get_task_mempolicy)
+#define __NR_set_task_mempolicy_home_node 459
+__SYSCALL(__NR_set_task_mempolicy_home_node, sys_set_task_mempolicy_home_node)
+#define __NR_task_mbind 460
+__SYSCALL(__NR_task_mbind, sys_task_mbind)
 
 #undef __NR_syscalls
-#define __NR_syscalls 457
+#define __NR_syscalls 461
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index a8963f7ef4c2..c29cfb25db29 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -26,6 +26,16 @@  enum {
 	MPOL_MAX,	/* always last member of enum */
 };
 
+struct mbind_args {
+	pid_t pid;			/* target task; 0 selects the caller */
+	unsigned long start;		/* as mbind()'s start argument */
+	unsigned long len;		/* as mbind()'s len argument */
+	unsigned long mode;		/* as mbind()'s mode argument */
+	unsigned long __user *nmask;	/* userspace nodemask, as for mbind() */
+	unsigned long maxnode;		/* as mbind()'s maxnode argument */
+	unsigned int flags;		/* as mbind()'s flags argument */
+};
+
 /* Flags for set_mempolicy */
 #define MPOL_F_STATIC_NODES	(1 << 15)
 #define MPOL_F_RELATIVE_NODES	(1 << 14)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3d2171ac4098..fb295ade8ad7 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1654,6 +1654,32 @@  SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
 	return __set_mempolicy_home_node(current, start, len, home_node, flags);
 }
 
+SYSCALL_DEFINE5(set_task_mempolicy_home_node, pid_t, pid, unsigned long, start,
+		unsigned long, len, unsigned long, home_node,
+		unsigned long, flags)
+{
+	struct task_struct *tsk = current;
+	int ret;

+	/* Changing a task other than the caller needs CAP_SYS_NICE. */
+	if (pid && !capable(CAP_SYS_NICE))
+		return -EPERM;

+	/* Take a reference on the target before leaving the RCU section. */
+	rcu_read_lock();
+	if (pid)
+		tsk = find_task_by_vpid(pid);
+	if (tsk)
+		get_task_struct(tsk);
+	rcu_read_unlock();
+	if (!tsk)
+		return -ESRCH;

+	ret = __set_mempolicy_home_node(tsk, start, len, home_node, flags);
+	put_task_struct(tsk);
+	return ret;
+}
+
 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
 		unsigned long, mode, const unsigned long __user *, nmask,
 		unsigned long, maxnode, unsigned int, flags)
@@ -1661,6 +1687,48 @@  SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
 	return kernel_mbind(current, start, len, mode, nmask, maxnode, flags);
 }
 
+/* Apply userspace-supplied mbind_args; pid 0 selects the calling task. */
+static long kernel_task_mbind(const struct mbind_args __user *uargs,
+			      size_t usize)
+{
+	struct mbind_args kargs;
+	struct task_struct *task;
+	int err;

+	/* Bound usize first, as copy_struct_from_user() callers are advised. */
+	if (usize > PAGE_SIZE)
+		return -E2BIG;
+	if (usize < sizeof(kargs))
+		return -EINVAL;

+	err = copy_struct_from_user(&kargs, sizeof(kargs), uargs, usize);
+	if (err)
+		return err;

+	if (kargs.pid && !capable(CAP_SYS_NICE))
+		return -EPERM;

+	rcu_read_lock();
+	task = kargs.pid ? find_task_by_vpid(kargs.pid) : current;
+	if (!task) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+	get_task_struct(task);
+	rcu_read_unlock();

+	err = kernel_mbind(task, kargs.start, kargs.len, kargs.mode,
+			   kargs.nmask, kargs.maxnode, kargs.flags);
+	put_task_struct(task);
+	return err;
+}

+SYSCALL_DEFINE2(task_mbind, const struct mbind_args __user *, args,
+		size_t, size)
+{
+	return kernel_task_mbind(args, size);
+}
+
 /* Set the process memory policy */
 static long kernel_set_mempolicy(struct task_struct *task, int mode,
 				 const unsigned long __user *nmask,
@@ -1688,6 +1756,31 @@  SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
 	return kernel_set_mempolicy(current, mode, nmask, maxnode);
 }
 
+SYSCALL_DEFINE4(set_task_mempolicy, pid_t, pid, int, mode,
+		const unsigned long __user *, nmask, unsigned long, maxnode)
+{
+	struct task_struct *tsk = current;
+	int ret;

+	/* Changing a task other than the caller needs CAP_SYS_NICE. */
+	if (pid && !capable(CAP_SYS_NICE))
+		return -EPERM;

+	/* Take a reference on the target before leaving the RCU section. */
+	rcu_read_lock();
+	if (pid)
+		tsk = find_task_by_vpid(pid);
+	if (tsk)
+		get_task_struct(tsk);
+	rcu_read_unlock();
+	if (!tsk)
+		return -ESRCH;

+	ret = kernel_set_mempolicy(tsk, mode, nmask, maxnode);
+	put_task_struct(tsk);
+	return ret;
+}
+
 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
 				const unsigned long __user *old_nodes,
 				const unsigned long __user *new_nodes)
@@ -1821,6 +1914,32 @@  SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
 				    flags);
 }
 
+SYSCALL_DEFINE6(get_task_mempolicy, pid_t, pid, int __user *, policy,
+		unsigned long __user *, nmask, unsigned long, maxnode,
+		unsigned long, addr, unsigned long, flags)
+{
+	struct task_struct *tsk = current;
+	int ret;

+	/* Querying a task other than the caller needs CAP_SYS_NICE. */
+	if (pid && !capable(CAP_SYS_NICE))
+		return -EPERM;

+	/* Take a reference on the target before leaving the RCU section. */
+	rcu_read_lock();
+	if (pid)
+		tsk = find_task_by_vpid(pid);
+	if (tsk)
+		get_task_struct(tsk);
+	rcu_read_unlock();
+	if (!tsk)
+		return -ESRCH;

+	ret = kernel_get_mempolicy(tsk, policy, nmask, maxnode, addr, flags);
+	put_task_struct(tsk);
+	return ret;
+}
+
 bool vma_migratable(struct vm_area_struct *vma)
 {
 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))