diff mbox series

[v6,5/7] mm: support both pid and pidfd for process_madvise

Message ID 20200219014433.88424-6-minchan@kernel.org (mailing list archive)
State New, archived
Headers show
Series introduce memory hinting API for external process | expand

Commit Message

Minchan Kim Feb. 19, 2020, 1:44 a.m. UTC
There is a demand[1] to support pid as well pidfd for process_madvise
to reduce unnecessary syscall to get pidfd if the user has control of
the target process(ie, they could guarantee the process is not gone
or pid is not reused. Or, it might be okay to give a hint to wrong
process).

This patch aims for supporting both options like waitid(2). So, the
syscall is currently,

	int process_madvise(int which, pid_t pid, void *addr,
		size_t length, int advise, unsigned long flag);

@which is actually idtype_t for userspace libray and currently,
it supports P_PID and P_PIDFD.

[1]  https://lore.kernel.org/linux-mm/9d849087-3359-c4ab-fbec-859e8186c509@virtuozzo.com/

Cc: Christian Brauner <christian@brauner.io>
Suggested-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 include/linux/syscalls.h |  3 ++-
 mm/madvise.c             | 34 ++++++++++++++++++++++------------
 2 files changed, 24 insertions(+), 13 deletions(-)

Comments

Suren Baghdasaryan Feb. 28, 2020, 10:41 p.m. UTC | #1
On Tue, Feb 18, 2020 at 5:44 PM Minchan Kim <minchan@kernel.org> wrote:
>
> There is a demand[1] to support pid as well pidfd for process_madvise
> to reduce unnecessary syscall to get pidfd if the user has control of
> the target process(ie, they could guarantee the process is not gone
> or pid is not reused. Or, it might be okay to give a hint to wrong
> process).

nit: When would "give a hint to wrong process" be ok? I would just
remove this part.

>
> This patch aims for supporting both options like waitid(2). So, the
> syscall is currently,
>
>         int process_madvise(int which, pid_t pid, void *addr,
>                 size_t length, int advise, unsigned long flag);
>
> @which is actually idtype_t for userspace libray and currently,
> it supports P_PID and P_PIDFD.
>
> [1]  https://lore.kernel.org/linux-mm/9d849087-3359-c4ab-fbec-859e8186c509@virtuozzo.com/
>
> Cc: Christian Brauner <christian@brauner.io>
> Suggested-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
>  include/linux/syscalls.h |  3 ++-
>  mm/madvise.c             | 34 ++++++++++++++++++++++------------
>  2 files changed, 24 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index e4cd2c2f8bb4..f5ada20e2943 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -876,7 +876,8 @@ asmlinkage long sys_munlockall(void);
>  asmlinkage long sys_mincore(unsigned long start, size_t len,
>                                 unsigned char __user * vec);
>  asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
> -asmlinkage long sys_process_madvise(int pidfd, unsigned long start,
> +
> +asmlinkage long sys_process_madvise(int which, pid_t pid, unsigned long start,
>                         size_t len, int behavior, unsigned long flags);
>  asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
>                         unsigned long prot, unsigned long pgoff,
> diff --git a/mm/madvise.c b/mm/madvise.c
> index def1507c2030..f6d9b9e66243 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -1182,11 +1182,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
>         return do_madvise(current, current->mm, start, len_in, behavior);
>  }
>
> -SYSCALL_DEFINE5(process_madvise, int, pidfd, unsigned long, start,
> +SYSCALL_DEFINE6(process_madvise, int, which, pid_t, upid, unsigned long, start,
>                 size_t, len_in, int, behavior, unsigned long, flags)
>  {
>         int ret;
> -       struct fd f;
>         struct pid *pid;
>         struct task_struct *task;
>         struct mm_struct *mm;
> @@ -1197,20 +1196,31 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, unsigned long, start,
>         if (!process_madvise_behavior_valid(behavior))
>                 return -EINVAL;
>
> -       f = fdget(pidfd);
> -       if (!f.file)
> -               return -EBADF;
> +       switch (which) {
> +       case P_PID:
> +               if (upid <= 0)
> +                       return -EINVAL;
> +
> +               pid = find_get_pid(upid);
> +               if (!pid)
> +                       return -ESRCH;
> +               break;
> +       case P_PIDFD:
> +               if (upid < 0)
> +                       return -EINVAL;
>
> -       pid = pidfd_pid(f.file);
> -       if (IS_ERR(pid)) {
> -               ret = PTR_ERR(pid);
> -               goto fdput;
> +               pid = pidfd_get_pid(upid);
> +               if (IS_ERR(pid))
> +                       return PTR_ERR(pid);
> +               break;
> +       default:
> +               return -EINVAL;
>         }
>
>         task = get_pid_task(pid, PIDTYPE_PID);
>         if (!task) {
>                 ret = -ESRCH;
> -               goto fdput;
> +               goto put_pid;
>         }
>
>         mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
> @@ -1223,7 +1233,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, unsigned long, start,
>         mmput(mm);
>  release_task:
>         put_task_struct(task);
> -fdput:
> -       fdput(f);
> +put_pid:
> +       put_pid(pid);
>         return ret;
>  }
> --
> 2.25.0.265.gbab2e86ba0-goog
>

Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Minchan Kim March 2, 2020, 7:23 p.m. UTC | #2
On Fri, Feb 28, 2020 at 02:41:07PM -0800, Suren Baghdasaryan wrote:
> On Tue, Feb 18, 2020 at 5:44 PM Minchan Kim <minchan@kernel.org> wrote:
> >
> > There is a demand[1] to support pid as well pidfd for process_madvise
> > to reduce unnecessary syscall to get pidfd if the user has control of
> > the target process(ie, they could guarantee the process is not gone
> > or pid is not reused. Or, it might be okay to give a hint to wrong
> > process).
> 
> nit: When would "give a hint to wrong process" be ok? I would just
> remove this part.

I wanted to say non destructive hints. It's already true for other
some hints because they are just best effort so it's not critical
to be failed. If you mind it, I will remove the phrase.

Thanks.

> 
> >
> > This patch aims for supporting both options like waitid(2). So, the
> > syscall is currently,
> >
> >         int process_madvise(int which, pid_t pid, void *addr,
> >                 size_t length, int advise, unsigned long flag);
> >
> > @which is actually idtype_t for userspace libray and currently,
> > it supports P_PID and P_PIDFD.
> >
> > [1]  https://lore.kernel.org/linux-mm/9d849087-3359-c4ab-fbec-859e8186c509@virtuozzo.com/
> >
> > Cc: Christian Brauner <christian@brauner.io>
> > Suggested-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > ---
> >  include/linux/syscalls.h |  3 ++-
> >  mm/madvise.c             | 34 ++++++++++++++++++++++------------
> >  2 files changed, 24 insertions(+), 13 deletions(-)
> >
> > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> > index e4cd2c2f8bb4..f5ada20e2943 100644
> > --- a/include/linux/syscalls.h
> > +++ b/include/linux/syscalls.h
> > @@ -876,7 +876,8 @@ asmlinkage long sys_munlockall(void);
> >  asmlinkage long sys_mincore(unsigned long start, size_t len,
> >                                 unsigned char __user * vec);
> >  asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
> > -asmlinkage long sys_process_madvise(int pidfd, unsigned long start,
> > +
> > +asmlinkage long sys_process_madvise(int which, pid_t pid, unsigned long start,
> >                         size_t len, int behavior, unsigned long flags);
> >  asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
> >                         unsigned long prot, unsigned long pgoff,
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index def1507c2030..f6d9b9e66243 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -1182,11 +1182,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
> >         return do_madvise(current, current->mm, start, len_in, behavior);
> >  }
> >
> > -SYSCALL_DEFINE5(process_madvise, int, pidfd, unsigned long, start,
> > +SYSCALL_DEFINE6(process_madvise, int, which, pid_t, upid, unsigned long, start,
> >                 size_t, len_in, int, behavior, unsigned long, flags)
> >  {
> >         int ret;
> > -       struct fd f;
> >         struct pid *pid;
> >         struct task_struct *task;
> >         struct mm_struct *mm;
> > @@ -1197,20 +1196,31 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, unsigned long, start,
> >         if (!process_madvise_behavior_valid(behavior))
> >                 return -EINVAL;
> >
> > -       f = fdget(pidfd);
> > -       if (!f.file)
> > -               return -EBADF;
> > +       switch (which) {
> > +       case P_PID:
> > +               if (upid <= 0)
> > +                       return -EINVAL;
> > +
> > +               pid = find_get_pid(upid);
> > +               if (!pid)
> > +                       return -ESRCH;
> > +               break;
> > +       case P_PIDFD:
> > +               if (upid < 0)
> > +                       return -EINVAL;
> >
> > -       pid = pidfd_pid(f.file);
> > -       if (IS_ERR(pid)) {
> > -               ret = PTR_ERR(pid);
> > -               goto fdput;
> > +               pid = pidfd_get_pid(upid);
> > +               if (IS_ERR(pid))
> > +                       return PTR_ERR(pid);
> > +               break;
> > +       default:
> > +               return -EINVAL;
> >         }
> >
> >         task = get_pid_task(pid, PIDTYPE_PID);
> >         if (!task) {
> >                 ret = -ESRCH;
> > -               goto fdput;
> > +               goto put_pid;
> >         }
> >
> >         mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
> > @@ -1223,7 +1233,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, unsigned long, start,
> >         mmput(mm);
> >  release_task:
> >         put_task_struct(task);
> > -fdput:
> > -       fdput(f);
> > +put_pid:
> > +       put_pid(pid);
> >         return ret;
> >  }
> > --
> > 2.25.0.265.gbab2e86ba0-goog
> >
> 
> Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Suren Baghdasaryan March 2, 2020, 7:38 p.m. UTC | #3
On Mon, Mar 2, 2020 at 11:23 AM Minchan Kim <minchan@kernel.org> wrote:
>
> On Fri, Feb 28, 2020 at 02:41:07PM -0800, Suren Baghdasaryan wrote:
> > On Tue, Feb 18, 2020 at 5:44 PM Minchan Kim <minchan@kernel.org> wrote:
> > >
> > > There is a demand[1] to support pid as well pidfd for process_madvise
> > > to reduce unnecessary syscall to get pidfd if the user has control of
> > > the target process(ie, they could guarantee the process is not gone
> > > or pid is not reused. Or, it might be okay to give a hint to wrong
> > > process).
> >
> > nit: When would "give a hint to wrong process" be ok? I would just
> > remove this part.
>
> I wanted to say non destructive hints. It's already true for other
> some hints because they are just best effort so it's not critical
> to be failed. If you mind it, I will remove the phrase.

Up to you, or maybe call it a "non-fatal" error? Saying that it's ok
to hint a wrong process sounds wrong to me.

>
> Thanks.
>
> >
> > >
> > > This patch aims for supporting both options like waitid(2). So, the
> > > syscall is currently,
> > >
> > >         int process_madvise(int which, pid_t pid, void *addr,
> > >                 size_t length, int advise, unsigned long flag);
> > >
> > > @which is actually idtype_t for userspace libray and currently,
> > > it supports P_PID and P_PIDFD.
> > >
> > > [1]  https://lore.kernel.org/linux-mm/9d849087-3359-c4ab-fbec-859e8186c509@virtuozzo.com/
> > >
> > > Cc: Christian Brauner <christian@brauner.io>
> > > Suggested-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> > > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > ---
> > >  include/linux/syscalls.h |  3 ++-
> > >  mm/madvise.c             | 34 ++++++++++++++++++++++------------
> > >  2 files changed, 24 insertions(+), 13 deletions(-)
> > >
> > > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> > > index e4cd2c2f8bb4..f5ada20e2943 100644
> > > --- a/include/linux/syscalls.h
> > > +++ b/include/linux/syscalls.h
> > > @@ -876,7 +876,8 @@ asmlinkage long sys_munlockall(void);
> > >  asmlinkage long sys_mincore(unsigned long start, size_t len,
> > >                                 unsigned char __user * vec);
> > >  asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
> > > -asmlinkage long sys_process_madvise(int pidfd, unsigned long start,
> > > +
> > > +asmlinkage long sys_process_madvise(int which, pid_t pid, unsigned long start,
> > >                         size_t len, int behavior, unsigned long flags);
> > >  asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
> > >                         unsigned long prot, unsigned long pgoff,
> > > diff --git a/mm/madvise.c b/mm/madvise.c
> > > index def1507c2030..f6d9b9e66243 100644
> > > --- a/mm/madvise.c
> > > +++ b/mm/madvise.c
> > > @@ -1182,11 +1182,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
> > >         return do_madvise(current, current->mm, start, len_in, behavior);
> > >  }
> > >
> > > -SYSCALL_DEFINE5(process_madvise, int, pidfd, unsigned long, start,
> > > +SYSCALL_DEFINE6(process_madvise, int, which, pid_t, upid, unsigned long, start,
> > >                 size_t, len_in, int, behavior, unsigned long, flags)
> > >  {
> > >         int ret;
> > > -       struct fd f;
> > >         struct pid *pid;
> > >         struct task_struct *task;
> > >         struct mm_struct *mm;
> > > @@ -1197,20 +1196,31 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, unsigned long, start,
> > >         if (!process_madvise_behavior_valid(behavior))
> > >                 return -EINVAL;
> > >
> > > -       f = fdget(pidfd);
> > > -       if (!f.file)
> > > -               return -EBADF;
> > > +       switch (which) {
> > > +       case P_PID:
> > > +               if (upid <= 0)
> > > +                       return -EINVAL;
> > > +
> > > +               pid = find_get_pid(upid);
> > > +               if (!pid)
> > > +                       return -ESRCH;
> > > +               break;
> > > +       case P_PIDFD:
> > > +               if (upid < 0)
> > > +                       return -EINVAL;
> > >
> > > -       pid = pidfd_pid(f.file);
> > > -       if (IS_ERR(pid)) {
> > > -               ret = PTR_ERR(pid);
> > > -               goto fdput;
> > > +               pid = pidfd_get_pid(upid);
> > > +               if (IS_ERR(pid))
> > > +                       return PTR_ERR(pid);
> > > +               break;
> > > +       default:
> > > +               return -EINVAL;
> > >         }
> > >
> > >         task = get_pid_task(pid, PIDTYPE_PID);
> > >         if (!task) {
> > >                 ret = -ESRCH;
> > > -               goto fdput;
> > > +               goto put_pid;
> > >         }
> > >
> > >         mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
> > > @@ -1223,7 +1233,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, unsigned long, start,
> > >         mmput(mm);
> > >  release_task:
> > >         put_task_struct(task);
> > > -fdput:
> > > -       fdput(f);
> > > +put_pid:
> > > +       put_pid(pid);
> > >         return ret;
> > >  }
> > > --
> > > 2.25.0.265.gbab2e86ba0-goog
> > >
> >
> > Reviewed-by: Suren Baghdasaryan <surenb@google.com>
diff mbox series

Patch

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e4cd2c2f8bb4..f5ada20e2943 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -876,7 +876,8 @@  asmlinkage long sys_munlockall(void);
 asmlinkage long sys_mincore(unsigned long start, size_t len,
 				unsigned char __user * vec);
 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
-asmlinkage long sys_process_madvise(int pidfd, unsigned long start,
+
+asmlinkage long sys_process_madvise(int which, pid_t pid, unsigned long start,
 			size_t len, int behavior, unsigned long flags);
 asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 			unsigned long prot, unsigned long pgoff,
diff --git a/mm/madvise.c b/mm/madvise.c
index def1507c2030..f6d9b9e66243 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1182,11 +1182,10 @@  SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	return do_madvise(current, current->mm, start, len_in, behavior);
 }
 
-SYSCALL_DEFINE5(process_madvise, int, pidfd, unsigned long, start,
+SYSCALL_DEFINE6(process_madvise, int, which, pid_t, upid, unsigned long, start,
 		size_t, len_in, int, behavior, unsigned long, flags)
 {
 	int ret;
-	struct fd f;
 	struct pid *pid;
 	struct task_struct *task;
 	struct mm_struct *mm;
@@ -1197,20 +1196,31 @@  SYSCALL_DEFINE5(process_madvise, int, pidfd, unsigned long, start,
 	if (!process_madvise_behavior_valid(behavior))
 		return -EINVAL;
 
-	f = fdget(pidfd);
-	if (!f.file)
-		return -EBADF;
+	switch (which) {
+	case P_PID:
+		if (upid <= 0)
+			return -EINVAL;
+
+		pid = find_get_pid(upid);
+		if (!pid)
+			return -ESRCH;
+		break;
+	case P_PIDFD:
+		if (upid < 0)
+			return -EINVAL;
 
-	pid = pidfd_pid(f.file);
-	if (IS_ERR(pid)) {
-		ret = PTR_ERR(pid);
-		goto fdput;
+		pid = pidfd_get_pid(upid);
+		if (IS_ERR(pid))
+			return PTR_ERR(pid);
+		break;
+	default:
+		return -EINVAL;
 	}
 
 	task = get_pid_task(pid, PIDTYPE_PID);
 	if (!task) {
 		ret = -ESRCH;
-		goto fdput;
+		goto put_pid;
 	}
 
 	mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
@@ -1223,7 +1233,7 @@  SYSCALL_DEFINE5(process_madvise, int, pidfd, unsigned long, start,
 	mmput(mm);
 release_task:
 	put_task_struct(task);
-fdput:
-	fdput(f);
+put_pid:
+	put_pid(pid);
 	return ret;
 }