diff mbox series

[RFC,05/11] mm/mempolicy: modify set_mempolicy_home_node to take a task argument

Message ID 20231122211200.31620-6-gregory.price@memverge.com (mailing list archive)
State New
Headers show
Series mm/mempolicy: Make task->mempolicy externally modifiable via syscall and procfs | expand

Commit Message

Gregory Price Nov. 22, 2023, 9:11 p.m. UTC
To make mempolicy fetchable by external tasks, we must first change
the callstack to take a task as an argument.

Modify the following functions to require a task argument:
    set_mempolicy_home_node

First we refactor set_mempolicy_home_node to __set_mempolicy_home_node
which accepts a task argument, and change the syscall definition to pass
in (current).

The only functional change in this patch is related to the way task->mm
is acquired. Originally, set_mempolicy_home_node would acquire task->mm
directly via (current->mm).  This is unsafe to do in a non-current
context.  However, utilizing get_task_mm would break the original
functionality of do_get_mempolicy due to the following check
in get_task_mm:

  if (mm) {
    if (task->flags & PF_KTHREAD)
      mm = NULL;
    else
      mmget(mm);
  }

To retain the original behavior, if (task == current) we access
the task->mm directly, but if (task != current) we will utilize
get_task_mm to safely access the mm.  We always take a reference
to the mm to keep the cleanup semantics simple.

Signed-off-by: Gregory Price <gregory.price@memverge.com>
---
 mm/mempolicy.c | 62 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 49 insertions(+), 13 deletions(-)

Comments

Michal Hocko Nov. 28, 2023, 2:07 p.m. UTC | #1
On Wed 22-11-23 16:11:54, Gregory Price wrote:
[...]
> +
> +	/*
> +	 * Behavior when task == current allows a task modifying itself
> +	 * to bypass the check in get_task_mm and acquire the mm directly
> +	 */
> +	if (task == current) {
> +		mm = task->mm;
> +		mmget(mm);
> +	} else
> +		mm = get_task_mm(task);

Similar question as in the previous patch. When is a kthread legit to do
manipulate borrowed mm memory policy?

> +
> +	if (!mm)
> +		return -ENODEV;

Is this an expected error? No mm means task dying and mm has been
already released. Should we simply return ESRCH wouldbe a better error
code IMO. This should never happen for the current task so it sould be
safe to add.
Gregory Price Nov. 28, 2023, 2:14 p.m. UTC | #2
On Tue, Nov 28, 2023 at 03:07:38PM +0100, Michal Hocko wrote:
> On Wed 22-11-23 16:11:54, Gregory Price wrote:
> [...]
> > +
> > +	if (!mm)
> > +		return -ENODEV;
> 
> Is this an expected error? No mm means task dying and mm has been
> already released. Should we simply return ESRCH wouldbe a better error
> code IMO. This should never happen for the current task so it sould be
> safe to add.

Hm, good point, and in fact i should make sure that the new pidfd code
is not susceptible to this.

This particular set of lines is not present in the new RFC (not out yet)
but the access to the remote task mm is more controlled via mm_access,
so I would presume that mm_access either returns error, I will make sure
it cannot return null and check for this.

> 
> -- 
> Michal Hocko
> SUSE Labs
diff mbox series

Patch

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4519f39b1a07..540163f5d349 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1521,39 +1521,67 @@  static long kernel_mbind(unsigned long start, unsigned long len,
 	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
 }
 
-SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
-		unsigned long, home_node, unsigned long, flags)
+static long __set_mempolicy_home_node(struct task_struct *task,
+				      unsigned long start,
+				      unsigned long len,
+				      unsigned long home_node,
+				      unsigned long flags)
 {
-	struct mm_struct *mm = current->mm;
+	struct mm_struct *mm;
 	struct vm_area_struct *vma, *prev;
 	struct mempolicy *new, *old;
 	unsigned long end;
 	int err = -ENOENT;
+
+	/*
+	 * Behavior when task == current allows a task modifying itself
+	 * to bypass the check in get_task_mm and acquire the mm directly
+	 */
+	if (task == current) {
+		mm = task->mm;
+		mmget(mm);
+	} else
+		mm = get_task_mm(task);
+
+	if (!mm)
+		return -ENODEV;
+
 	VMA_ITERATOR(vmi, mm, start);
 
 	start = untagged_addr(start);
-	if (start & ~PAGE_MASK)
-		return -EINVAL;
+	if (start & ~PAGE_MASK) {
+		err = -EINVAL;
+		goto mm_out;
+	}
 	/*
 	 * flags is used for future extension if any.
 	 */
-	if (flags != 0)
-		return -EINVAL;
+	if (flags != 0) {
+		err = -EINVAL;
+		goto mm_out;
+	}
 
 	/*
 	 * Check home_node is online to avoid accessing uninitialized
 	 * NODE_DATA.
 	 */
-	if (home_node >= MAX_NUMNODES || !node_online(home_node))
-		return -EINVAL;
+	if (home_node >= MAX_NUMNODES || !node_online(home_node)) {
+		err = -EINVAL;
+		goto mm_out;
+	}
 
 	len = PAGE_ALIGN(len);
 	end = start + len;
 
-	if (end < start)
-		return -EINVAL;
-	if (end == start)
-		return 0;
+	if (end < start) {
+		err = -EINVAL;
+		goto mm_out;
+	}
+	if (end == start) {
+		err = 0;
+		goto mm_out;
+	}
+
 	mmap_write_lock(mm);
 	prev = vma_prev(&vmi);
 	for_each_vma_range(vmi, vma, end) {
@@ -1585,9 +1613,17 @@  SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
 			break;
 	}
 	mmap_write_unlock(mm);
+mm_out:
+	mmput(mm);
 	return err;
 }
 
+SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
+		unsigned long, home_node, unsigned long, flags)
+{
+	return __set_mempolicy_home_node(current, start, len, home_node, flags);
+}
+
 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
 		unsigned long, mode, const unsigned long __user *, nmask,
 		unsigned long, maxnode, unsigned int, flags)