diff mbox series

[RFC] mm: add new syscall pidfd_set_mempolicy()

Message ID 20221010094842.4123037-1-hezhongkun.hzk@bytedance.com (mailing list archive)
State New
Headers show
Series [RFC] mm: add new syscall pidfd_set_mempolicy() | expand

Commit Message

Zhongkun He Oct. 10, 2022, 9:48 a.m. UTC
There is usecase that System Management Software(SMS) want to give a
memory policy to other processes to make better use of memory.

The information about how to use memory is not known to the app.
Instead, it is known to the userspace daemon(SMS), and that daemon
will decide the memory usage policy based on different factors.

To solve the issue, this patch introduces a new syscall
pidfd_set_mempolicy(2).  it sets the NUMA memory policy of the thread
specified in pidfd.

In current process context there is no locking because only the process
accesses its own memory policy, so task_work is used in
pidfd_set_mempolicy() to update the mempolicy of the process specified
in pidfd, avoid using locks and race conditions.

The API is as follows,

		long pidfd_set_mempolicy(int pidfd, int mode,
                                     const unsigned long __user *nmask,
                                     unsigned long maxnode,
                                     unsigned int flags);

Set's the [pidfd] task's "task/process memory policy". The pidfd argument
is a PID file descriptor (see pidfd_open(2) man page) that specifies the
process to which the mempolicy is to be applied. The flags argument is
reserved for future use; currently, this argument must be specified as 0.
Please see the set_mempolicy(2) man page for more details about
other's arguments.

Suggested-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Zhongkun He <hezhongkun.hzk@bytedance.com>
---
 .../admin-guide/mm/numa_memory_policy.rst     | 21 ++++-
 arch/alpha/kernel/syscalls/syscall.tbl        |  1 +
 arch/arm/tools/syscall.tbl                    |  1 +
 arch/arm64/include/asm/unistd.h               |  2 +-
 arch/arm64/include/asm/unistd32.h             |  3 +-
 arch/ia64/kernel/syscalls/syscall.tbl         |  1 +
 arch/m68k/kernel/syscalls/syscall.tbl         |  1 +
 arch/microblaze/kernel/syscalls/syscall.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl     |  1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl     |  1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl     |  1 +
 arch/parisc/kernel/syscalls/syscall.tbl       |  1 +
 arch/powerpc/kernel/syscalls/syscall.tbl      |  1 +
 arch/s390/kernel/syscalls/syscall.tbl         |  1 +
 arch/sh/kernel/syscalls/syscall.tbl           |  1 +
 arch/sparc/kernel/syscalls/syscall.tbl        |  1 +
 arch/x86/entry/syscalls/syscall_32.tbl        |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl        |  1 +
 arch/xtensa/kernel/syscalls/syscall.tbl       |  1 +
 include/linux/mempolicy.h                     | 11 +++
 include/linux/syscalls.h                      |  4 +
 include/uapi/asm-generic/unistd.h             |  5 +-
 kernel/sys_ni.c                               |  1 +
 mm/mempolicy.c                                | 89 +++++++++++++++++++
 24 files changed, 146 insertions(+), 6 deletions(-)

Comments

Frank van der Linden Oct. 10, 2022, 4:22 p.m. UTC | #1
For consistency with process_madvise(), I would suggest calling it
process_set_mempolicy. Other than that, this makes sense. To complete
the set, perhaps a process_mbind() should be added as well. What do
you think?

- Frank

On Mon, Oct 10, 2022 at 2:49 AM Zhongkun He
<hezhongkun.hzk@bytedance.com> wrote:
>
> There is usecase that System Management Software(SMS) want to give a
> memory policy to other processes to make better use of memory.
>
> The information about how to use memory is not known to the app.
> Instead, it is known to the userspace daemon(SMS), and that daemon
> will decide the memory usage policy based on different factors.
>
> To solve the issue, this patch introduces a new syscall
> pidfd_set_mempolicy(2).  it sets the NUMA memory policy of the thread
> specified in pidfd.
>
> In current process context there is no locking because only the process
> accesses its own memory policy, so task_work is used in
> pidfd_set_mempolicy() to update the mempolicy of the process specified
> in pidfd, avoid using locks and race conditions.
>
> The API is as follows,
>
>                 long pidfd_set_mempolicy(int pidfd, int mode,
>                                      const unsigned long __user *nmask,
>                                      unsigned long maxnode,
>                                      unsigned int flags);
>
> Set's the [pidfd] task's "task/process memory policy". The pidfd argument
> is a PID file descriptor (see pidfd_open(2) man page) that specifies the
> process to which the mempolicy is to be applied. The flags argument is
> reserved for future use; currently, this argument must be specified as 0.
> Please see the set_mempolicy(2) man page for more details about
> other's arguments.
>
> Suggested-by: Michal Hocko <mhocko@suse.com>
> Signed-off-by: Zhongkun He <hezhongkun.hzk@bytedance.com>
> ---
>  .../admin-guide/mm/numa_memory_policy.rst     | 21 ++++-
>  arch/alpha/kernel/syscalls/syscall.tbl        |  1 +
>  arch/arm/tools/syscall.tbl                    |  1 +
>  arch/arm64/include/asm/unistd.h               |  2 +-
>  arch/arm64/include/asm/unistd32.h             |  3 +-
>  arch/ia64/kernel/syscalls/syscall.tbl         |  1 +
>  arch/m68k/kernel/syscalls/syscall.tbl         |  1 +
>  arch/microblaze/kernel/syscalls/syscall.tbl   |  1 +
>  arch/mips/kernel/syscalls/syscall_n32.tbl     |  1 +
>  arch/mips/kernel/syscalls/syscall_n64.tbl     |  1 +
>  arch/mips/kernel/syscalls/syscall_o32.tbl     |  1 +
>  arch/parisc/kernel/syscalls/syscall.tbl       |  1 +
>  arch/powerpc/kernel/syscalls/syscall.tbl      |  1 +
>  arch/s390/kernel/syscalls/syscall.tbl         |  1 +
>  arch/sh/kernel/syscalls/syscall.tbl           |  1 +
>  arch/sparc/kernel/syscalls/syscall.tbl        |  1 +
>  arch/x86/entry/syscalls/syscall_32.tbl        |  1 +
>  arch/x86/entry/syscalls/syscall_64.tbl        |  1 +
>  arch/xtensa/kernel/syscalls/syscall.tbl       |  1 +
>  include/linux/mempolicy.h                     | 11 +++
>  include/linux/syscalls.h                      |  4 +
>  include/uapi/asm-generic/unistd.h             |  5 +-
>  kernel/sys_ni.c                               |  1 +
>  mm/mempolicy.c                                | 89 +++++++++++++++++++
>  24 files changed, 146 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
> index 5a6afecbb0d0..b864dd88b2d2 100644
> --- a/Documentation/admin-guide/mm/numa_memory_policy.rst
> +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
> @@ -408,9 +408,10 @@ follows:
>  Memory Policy APIs
>  ==================
>
> -Linux supports 4 system calls for controlling memory policy.  These APIS
> -always affect only the calling task, the calling task's address space, or
> -some shared object mapped into the calling task's address space.
> +Linux supports 5 system calls for controlling memory policy.  The first four
> +APIS affect only the calling task, the calling task's address space, or some
> +shared object mapped into the calling task's address space. The last one can
> +set the mempolicy of task specified in pidfd.
>
>  .. note::
>     the headers that define these APIs and the parameter data types for
> @@ -473,6 +474,20 @@ closest to which page allocation will come from. Specifying the home node overri
>  the default allocation policy to allocate memory close to the local node for an
>  executing CPU.
>
> +Set [pidfd Task] Memory Policy::
> +
> +        long sys_pidfd_set_mempolicy(int pidfd, int mode,
> +                                     const unsigned long __user *nmask,
> +                                     unsigned long maxnode,
> +                                     unsigned int flags);
> +
> +Set's the [pidfd] task's "task/process memory policy". The pidfd argument is
> +a PID file descriptor (see pidfd_open(2) man page) that specifies the process
> +to which the mempolicy is to be applied. The flags argument is reserved for
> +future use; currently, this argument must be specified as 0. Please see the
> +set_mempolicy(2) man page for more details about other's arguments.
> +
> +
>
>  Memory Policy Command Line Interface
>  ====================================
> diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
> index 3515bc4f16a4..d86baa5a6eca 100644
> --- a/arch/alpha/kernel/syscalls/syscall.tbl
> +++ b/arch/alpha/kernel/syscalls/syscall.tbl
> @@ -490,3 +490,4 @@
>  558    common  process_mrelease                sys_process_mrelease
>  559    common  futex_waitv                     sys_futex_waitv
>  560    common  set_mempolicy_home_node         sys_ni_syscall
> +561    common  pidfd_set_mempolicy             sys_ni_syscall
> diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
> index ac964612d8b0..a7ccab8aafef 100644
> --- a/arch/arm/tools/syscall.tbl
> +++ b/arch/arm/tools/syscall.tbl
> @@ -464,3 +464,4 @@
>  448    common  process_mrelease                sys_process_mrelease
>  449    common  futex_waitv                     sys_futex_waitv
>  450    common  set_mempolicy_home_node         sys_set_mempolicy_home_node
> +451    common  pidfd_set_mempolicy             sys_pidfd_set_mempolicy
> diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
> index 037feba03a51..64a514f90131 100644
> --- a/arch/arm64/include/asm/unistd.h
> +++ b/arch/arm64/include/asm/unistd.h
> @@ -39,7 +39,7 @@
>  #define __ARM_NR_compat_set_tls                (__ARM_NR_COMPAT_BASE + 5)
>  #define __ARM_NR_COMPAT_END            (__ARM_NR_COMPAT_BASE + 0x800)
>
> -#define __NR_compat_syscalls           451
> +#define __NR_compat_syscalls           452
>  #endif
>
>  #define __ARCH_WANT_SYS_CLONE
> diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
> index 604a2053d006..2e178e9152e6 100644
> --- a/arch/arm64/include/asm/unistd32.h
> +++ b/arch/arm64/include/asm/unistd32.h
> @@ -907,7 +907,8 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease)
>  __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
>  #define __NR_set_mempolicy_home_node 450
>  __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
> -
> +#define __NR_pidfd_set_mempolicy 451
> +__SYSCALL(__NR_pidfd_set_mempolicy, sys_pidfd_set_mempolicy)
>  /*
>   * Please add new compat syscalls above this comment and update
>   * __NR_compat_syscalls in asm/unistd.h.
> diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
> index 78b1d03e86e1..4e5bca7a985c 100644
> --- a/arch/ia64/kernel/syscalls/syscall.tbl
> +++ b/arch/ia64/kernel/syscalls/syscall.tbl
> @@ -371,3 +371,4 @@
>  448    common  process_mrelease                sys_process_mrelease
>  449    common  futex_waitv                     sys_futex_waitv
>  450    common  set_mempolicy_home_node         sys_set_mempolicy_home_node
> +451    common  pidfd_set_mempolicy             sys_pidfd_set_mempolicy
> diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
> index b1f3940bc298..8e50bf8ec41d 100644
> --- a/arch/m68k/kernel/syscalls/syscall.tbl
> +++ b/arch/m68k/kernel/syscalls/syscall.tbl
> @@ -450,3 +450,4 @@
>  448    common  process_mrelease                sys_process_mrelease
>  449    common  futex_waitv                     sys_futex_waitv
>  450    common  set_mempolicy_home_node         sys_set_mempolicy_home_node
> +451    common  pidfd_set_mempolicy             sys_pidfd_set_mempolicy
> diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
> index 820145e47350..f48e32532c5f 100644
> --- a/arch/microblaze/kernel/syscalls/syscall.tbl
> +++ b/arch/microblaze/kernel/syscalls/syscall.tbl
> @@ -456,3 +456,4 @@
>  448    common  process_mrelease                sys_process_mrelease
>  449    common  futex_waitv                     sys_futex_waitv
>  450    common  set_mempolicy_home_node         sys_set_mempolicy_home_node
> +451    common  pidfd_set_mempolicy             sys_pidfd_set_mempolicy
> diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
> index 253ff994ed2e..58e7c3140f4a 100644
> --- a/arch/mips/kernel/syscalls/syscall_n32.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
> @@ -389,3 +389,4 @@
>  448    n32     process_mrelease                sys_process_mrelease
>  449    n32     futex_waitv                     sys_futex_waitv
>  450    n32     set_mempolicy_home_node         sys_set_mempolicy_home_node
> +451    n32     pidfd_set_mempolicy             sys_pidfd_set_mempolicy
> diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
> index 3f1886ad9d80..70090c3ada16 100644
> --- a/arch/mips/kernel/syscalls/syscall_n64.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
> @@ -365,3 +365,4 @@
>  448    n64     process_mrelease                sys_process_mrelease
>  449    n64     futex_waitv                     sys_futex_waitv
>  450    common  set_mempolicy_home_node         sys_set_mempolicy_home_node
> +451    n64     pidfd_set_mempolicy             sys_pidfd_set_mempolicy
> diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
> index 8f243e35a7b2..b0b0bad64fa0 100644
> --- a/arch/mips/kernel/syscalls/syscall_o32.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
> @@ -438,3 +438,4 @@
>  448    o32     process_mrelease                sys_process_mrelease
>  449    o32     futex_waitv                     sys_futex_waitv
>  450    o32     set_mempolicy_home_node         sys_set_mempolicy_home_node
> +451    o32     pidfd_set_mempolicy             sys_pidfd_set_mempolicy
> diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
> index 8a99c998da9b..4dfd328490ad 100644
> --- a/arch/parisc/kernel/syscalls/syscall.tbl
> +++ b/arch/parisc/kernel/syscalls/syscall.tbl
> @@ -448,3 +448,4 @@
>  448    common  process_mrelease                sys_process_mrelease
>  449    common  futex_waitv                     sys_futex_waitv
>  450    common  set_mempolicy_home_node         sys_set_mempolicy_home_node
> +451    common  pidfd_set_mempolicy             sys_pidfd_set_mempolicy
> diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
> index 2600b4237292..20381463b2b5 100644
> --- a/arch/powerpc/kernel/syscalls/syscall.tbl
> +++ b/arch/powerpc/kernel/syscalls/syscall.tbl
> @@ -530,3 +530,4 @@
>  448    common  process_mrelease                sys_process_mrelease
>  449    common  futex_waitv                     sys_futex_waitv
>  450    nospu   set_mempolicy_home_node         sys_set_mempolicy_home_node
> +451    nospu   pidfd_set_mempolicy             sys_pidfd_set_mempolicy
> diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
> index 799147658dee..5e170dca81f6 100644
> --- a/arch/s390/kernel/syscalls/syscall.tbl
> +++ b/arch/s390/kernel/syscalls/syscall.tbl
> @@ -453,3 +453,4 @@
>  448  common    process_mrelease        sys_process_mrelease            sys_process_mrelease
>  449  common    futex_waitv             sys_futex_waitv                 sys_futex_waitv
>  450  common    set_mempolicy_home_node sys_set_mempolicy_home_node     sys_set_mempolicy_home_node
> +451  common    pidfd_set_mempolicy     sys_pidfd_set_mempolicy         sys_pidfd_set_mempolicy
> diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
> index 2de85c977f54..bd3a24a9b41f 100644
> --- a/arch/sh/kernel/syscalls/syscall.tbl
> +++ b/arch/sh/kernel/syscalls/syscall.tbl
> @@ -453,3 +453,4 @@
>  448    common  process_mrelease                sys_process_mrelease
>  449    common  futex_waitv                     sys_futex_waitv
>  450    common  set_mempolicy_home_node         sys_set_mempolicy_home_node
> +451    common  pidfd_set_mempolicy             sys_pidfd_set_mempolicy
> diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
> index 4398cc6fb68d..bea2b5c6314b 100644
> --- a/arch/sparc/kernel/syscalls/syscall.tbl
> +++ b/arch/sparc/kernel/syscalls/syscall.tbl
> @@ -496,3 +496,4 @@
>  448    common  process_mrelease                sys_process_mrelease
>  449    common  futex_waitv                     sys_futex_waitv
>  450    common  set_mempolicy_home_node         sys_set_mempolicy_home_node
> +451    common  pidfd_set_mempolicy             sys_pidfd_set_mempolicy
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 320480a8db4f..97bc70f5a8f7 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -455,3 +455,4 @@
>  448    i386    process_mrelease        sys_process_mrelease
>  449    i386    futex_waitv             sys_futex_waitv
>  450    i386    set_mempolicy_home_node         sys_set_mempolicy_home_node
> +451    i386    pidfd_set_mempolicy     sys_pidfd_set_mempolicy
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index c84d12608cd2..ba6d19dbd8f8 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -372,6 +372,7 @@
>  448    common  process_mrelease        sys_process_mrelease
>  449    common  futex_waitv             sys_futex_waitv
>  450    common  set_mempolicy_home_node sys_set_mempolicy_home_node
> +451    common  pidfd_set_mempolicy     sys_pidfd_set_mempolicy
>
>  #
>  # Due to a historical design error, certain syscalls are numbered differently
> diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
> index 52c94ab5c205..9f7c563da4fb 100644
> --- a/arch/xtensa/kernel/syscalls/syscall.tbl
> +++ b/arch/xtensa/kernel/syscalls/syscall.tbl
> @@ -421,3 +421,4 @@
>  448    common  process_mrelease                sys_process_mrelease
>  449    common  futex_waitv                     sys_futex_waitv
>  450    common  set_mempolicy_home_node         sys_set_mempolicy_home_node
> +451    common  pidfd_set_mempolicy             sys_pidfd_set_mempolicy
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> index 668389b4b53d..f3c522e8eb38 100644
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -54,6 +54,17 @@ struct mempolicy {
>         } w;
>  };
>
> +/*
> + *  The information of mempolicy used by task_work
> + *  to update it dynamically.
> + */
> +struct mpol_worker {
> +       unsigned short mode;
> +       unsigned short flags;
> +       nodemask_t nodes;
> +       struct rcu_head update_work;
> +};
> +
>  /*
>   * Support for managing mempolicy data objects (clone, copy, destroy)
>   * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index a34b0f9a9972..e611c088050b 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -1056,6 +1056,10 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
>  asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
>                                             unsigned long home_node,
>                                             unsigned long flags);
> +asmlinkage long sys_pidfd_set_mempolicy(int pidfd, int mode,
> +                                       const unsigned long __user *nmask,
> +                                       unsigned long maxnode,
> +                                       unsigned int flags);
>
>  /*
>   * Architecture-specific system calls
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 45fa180cc56a..c38013bbf5b0 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -886,8 +886,11 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
>  #define __NR_set_mempolicy_home_node 450
>  __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
>
> +#define __NR_pidfd_set_mempolicy 451
> +__SYSCALL(__NR_pidfd_set_mempolicy, sys_pidfd_set_mempolicy)
> +
>  #undef __NR_syscalls
> -#define __NR_syscalls 451
> +#define __NR_syscalls 452
>
>  /*
>   * 32 bit systems traditionally used different
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 860b2dcf3ac4..5053deb888f7 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -299,6 +299,7 @@ COND_SYSCALL(set_mempolicy);
>  COND_SYSCALL(migrate_pages);
>  COND_SYSCALL(move_pages);
>  COND_SYSCALL(set_mempolicy_home_node);
> +COND_SYSCALL(pidfd_set_mempolicy);
>
>  COND_SYSCALL(perf_event_open);
>  COND_SYSCALL(accept4);
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index b73d3248d976..2ce9be1a9dfd 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -102,6 +102,7 @@
>  #include <linux/mmu_notifier.h>
>  #include <linux/printk.h>
>  #include <linux/swapops.h>
> +#include <linux/task_work.h>
>
>  #include <asm/tlbflush.h>
>  #include <asm/tlb.h>
> @@ -1572,6 +1573,94 @@ SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
>         return kernel_set_mempolicy(mode, nmask, maxnode);
>  }
>
> +/*
> + *  In process context there is no locking, so task_work is used to update
> + *  the mempolicy of process specified in pidfd, avoid using locks.
> + */
> +static void pidfd_update_mpol(struct callback_head *cb)
> +{
> +       struct mpol_worker *worker = container_of(cb, struct mpol_worker, update_work);
> +
> +       do_set_mempolicy(worker->mode, worker->flags, &worker->nodes);
> +       kfree(worker);
> +}
> +
> +/* Set the memory policy of the process specified in pidfd. */
> +static long do_pidfd_set_mempolicy(int pidfd, int mode, const unsigned long __user *nmask,
> +               unsigned long maxnode, unsigned int flags)
> +{
> +       nodemask_t nodes, task_nodes, tmp;
> +       struct mpol_worker *worker;
> +       struct task_struct *task;
> +       unsigned short mode_flags;
> +       int ret, lmode = mode;
> +       unsigned int f_flags;
> +
> +       if (flags)
> +               return -EINVAL;
> +
> +       task = pidfd_get_task(pidfd, &f_flags);
> +       if (IS_ERR(task))
> +               return PTR_ERR(task);
> +       /*
> +        * Check if this process has the right to modify the specified
> +        * process.
> +        */
> +       if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) {
> +               ret = -EPERM;
> +               goto out;
> +       }
> +
> +       /*
> +        * Require CAP_SYS_NICE for influencing process performance.
> +        * User's task is allowed only.
> +        */
> +       if (!capable(CAP_SYS_NICE) || (task->flags & PF_KTHREAD)) {
> +               ret = -EPERM;
> +               goto out;
> +       }
> +
> +       ret = get_nodes(&nodes, nmask, maxnode);
> +       if (ret)
> +               goto out;
> +
> +       ret = sanitize_mpol_flags(&lmode, &mode_flags);
> +       if (ret)
> +               goto out;
> +
> +       task_nodes = cpuset_mems_allowed(task);
> +
> +       if (mode_flags & MPOL_F_RELATIVE_NODES)
> +               mpol_relative_nodemask(&tmp, &nodes, &task_nodes);
> +       else
> +               nodes_and(tmp, nodes, task_nodes);
> +
> +       if (nodes_empty(tmp)) {
> +               ret = -EINVAL;
> +               goto out;
> +       }
> +       worker = kzalloc(sizeof(struct mpol_worker), GFP_KERNEL | __GFP_NOWARN);
> +       if (!worker) {
> +               ret = -ENOMEM;
> +               goto out;
> +       }
> +       init_task_work(&worker->update_work, pidfd_update_mpol);
> +       worker->flags = mode_flags;
> +       worker->mode = lmode;
> +       worker->nodes = nodes;
> +
> +       ret = task_work_add(task, &worker->update_work, TWA_SIGNAL);
> +out:
> +       put_task_struct(task);
> +       return ret;
> +}
> +
> +SYSCALL_DEFINE5(pidfd_set_mempolicy, int, pidfd, int, mode, const unsigned long __user *, nmask,
> +               unsigned long, maxnode, unsigned int, flags)
> +{
> +       return do_pidfd_set_mempolicy(pidfd, mode, nmask, maxnode, flags);
> +}
> +
>  static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
>                                 const unsigned long __user *old_nodes,
>                                 const unsigned long __user *new_nodes)
> --
> 2.25.1
>
>
Michal Hocko Oct. 11, 2022, 2:57 p.m. UTC | #2
On Mon 10-10-22 17:48:42, Zhongkun He wrote:
> There is usecase that System Management Software(SMS) want to give a
> memory policy to other processes to make better use of memory.
> 
> The information about how to use memory is not known to the app.
> Instead, it is known to the userspace daemon(SMS), and that daemon
> will decide the memory usage policy based on different factors.

Please add some explanation why the cpuset interface is not usable for
that usecase.

> To solve the issue, this patch introduces a new syscall
> pidfd_set_mempolicy(2).  it sets the NUMA memory policy of the thread
> specified in pidfd.
> 
> In current process context there is no locking because only the process
> accesses its own memory policy, so task_work is used in
> pidfd_set_mempolicy() to update the mempolicy of the process specified
> in pidfd, avoid using locks and race conditions.

Why cannot you alter kernel_set_mempolicy (and do_set_mempolicy) to
accept a task rather than operate on current?

I have to really say that I dislike the task_work approach because it
detaches the syscall from the actual operation and the caller simply
doesn't know when the operation has been completed.
> 
> The API is as follows,
> 
> 		long pidfd_set_mempolicy(int pidfd, int mode,
>                                      const unsigned long __user *nmask,
>                                      unsigned long maxnode,
>                                      unsigned int flags);
> 
> Set's the [pidfd] task's "task/process memory policy". The pidfd argument
> is a PID file descriptor (see pidfd_open(2) man page) that specifies the
> process to which the mempolicy is to be applied. The flags argument is
> reserved for future use; currently, this argument must be specified as 0.
> Please see the set_mempolicy(2) man page for more details about
> other's arguments.

Please also describe the security model.
Michal Hocko Oct. 11, 2022, 3 p.m. UTC | #3
On Mon 10-10-22 09:22:13, Frank van der Linden wrote:
> For consistency with process_madvise(), I would suggest calling it
> process_set_mempolicy.

This operation has per-thread rather than per-process semantic so I do
not think your proposed naming is better.

> Other than that, this makes sense. To complete
> the set, perhaps a process_mbind() should be added as well. What do
> you think?

Is there any real usecase for this interface? How is the caller supposed
to make per-range decisions without a very involved coordination with
the target process?
Frank van der Linden Oct. 11, 2022, 5:22 p.m. UTC | #4
On Tue, Oct 11, 2022 at 8:00 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Mon 10-10-22 09:22:13, Frank van der Linden wrote:
> > For consistency with process_madvise(), I would suggest calling it
> > process_set_mempolicy.
>
> This operation has per-thread rather than per-process semantic so I do
> not think your proposed naming is better.

True. I suppose you could argue that it should have been
pidfd_madvise() then for consistency, but that ship has sailed.

>
> > Other than that, this makes sense. To complete
> > the set, perhaps a process_mbind() should be added as well. What do
> > you think?
>
> Is there any real usecase for this interface? How is the caller supposed
> to make per-range decisions without a very involved coordination with
> the target process?

The use case for a potential pidfd_mbind() is basically a combination
of what is described for in the process_madvise proposal (
https://lore.kernel.org/lkml/20200901000633.1920247-1-minchan@kernel.org/
), and what this proposal describes: system management software acting
as an orchestrator that has a better overview of the system as a whole
(NUMA nodes, memory tiering), and has knowledge of the layout of the
processes involved.

pidfd_mbind()  makes sense to me, since the notion of an external
agent with knowledge of the VM layout is already there with
process_madvise(). And since set_mempolicy and mbind are closely
related, it would seem logical to add an mbind variant as well as
pidfd_set_mempolicy().

Having said that, I'm fine with leaving that discussion for another time.

- Frank
Michal Hocko Oct. 11, 2022, 7:29 p.m. UTC | #5
On Tue 11-10-22 10:22:23, Frank van der Linden wrote:
> On Tue, Oct 11, 2022 at 8:00 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Mon 10-10-22 09:22:13, Frank van der Linden wrote:
> > > For consistency with process_madvise(), I would suggest calling it
> > > process_set_mempolicy.
> >
> > This operation has per-thread rather than per-process semantic so I do
> > not think your proposed naming is better.
> 
> True. I suppose you could argue that it should have been
> pidfd_madvise() then for consistency, but that ship has sailed.

madvise commands have per mm semantic. It is set_mempolicy which is
kinda special and it allows to have per task_struct semantic even when
the actual allocation is in the same address space. To be honest I am
not really sure why that is this way because threads aim to share memory
so why should they have different memory policies?

I suspect that the original thinking was that some portions that are
private to the process want to have different affinities (e.g. stacks
and dedicated per cpu heap arenas). E.g. worker pools which want to be
per-cpu local with their own allocations but operate on shared data that
requires different policies.

> > > Other than that, this makes sense. To complete
> > > the set, perhaps a process_mbind() should be added as well. What do
> > > you think?
> >
> > Is there any real usecase for this interface? How is the caller supposed
> > to make per-range decisions without a very involved coordination with
> > the target process?
> 
> The use case for a potential pidfd_mbind() is basically a combination
> of what is described for in the process_madvise proposal (
> https://lore.kernel.org/lkml/20200901000633.1920247-1-minchan@kernel.org/
> ), and what this proposal describes: system management software acting
> as an orchestrator that has a better overview of the system as a whole
> (NUMA nodes, memory tiering), and has knowledge of the layout of the
> processes involved.

Well, per address range operation is a completely different beast I
would say. External tool would need to a) understand what that range is
used for (e.g. stack/heap ranges, mmaped shared files like libraries or
private mappings) and b) by in sync with memory layout modifications
done by applications (e.g. that an mmap has been issued to back malloc
request). Quite a lot of understanding about the specific process. I
would say that with that intimate knowledge it is quite better to be
part of the process and do those changes from within of the process
itself.
Abel Wu Oct. 12, 2022, 3:14 a.m. UTC | #6
On 10/12/22 3:29 AM, Michal Hocko wrote:
> On Tue 11-10-22 10:22:23, Frank van der Linden wrote:
>> On Tue, Oct 11, 2022 at 8:00 AM Michal Hocko <mhocko@suse.com> wrote:
>>>
>>> On Mon 10-10-22 09:22:13, Frank van der Linden wrote:
>>>> For consistency with process_madvise(), I would suggest calling it
>>>> process_set_mempolicy.
>>>
>>> This operation has per-thread rather than per-process semantic so I do
>>> not think your proposed naming is better.
>>
>> True. I suppose you could argue that it should have been
>> pidfd_madvise() then for consistency, but that ship has sailed.
> 
> madvise commands have per mm semantic. It is set_mempolicy which is
> kinda special and it allows to have per task_struct semantic even when
> the actual allocation is in the same address space. To be honest I am
> not really sure why that is this way because threads aim to share memory
> so why should they have different memory policies?
> 
> I suspect that the original thinking was that some portions that are
> private to the process want to have different affinities (e.g. stacks
> and dedicated per cpu heap arenas). E.g. worker pools which want to be
> per-cpu local with their own allocations but operate on shared data that
> requires different policies.
> 
>>>> Other than that, this makes sense. To complete
>>>> the set, perhaps a process_mbind() should be added as well. What do
>>>> you think?
>>>
>>> Is there any real usecase for this interface? How is the caller supposed
>>> to make per-range decisions without a very involved coordination with
>>> the target process?
>>
>> The use case for a potential pidfd_mbind() is basically a combination
>> of what is described for in the process_madvise proposal (
>> https://lore.kernel.org/lkml/20200901000633.1920247-1-minchan@kernel.org/
>> ), and what this proposal describes: system management software acting
>> as an orchestrator that has a better overview of the system as a whole
>> (NUMA nodes, memory tiering), and has knowledge of the layout of the
>> processes involved.

This is exactly why we are proposing pidfd/process_set_mempolicy().

> 
> Well, per address range operation is a completely different beast I
> would say. External tool would need to a) understand what that range is
> used for (e.g. stack/heap ranges, mmaped shared files like libraries or
> private mappings) and b) by in sync with memory layout modifications
> done by applications (e.g. that an mmap has been issued to back malloc
> request). Quite a lot of understanding about the specific process. I
> would say that with that intimate knowledge it is quite better to be
> part of the process and do those changes from within of the process
> itself.

Agreed, the orchestrator like system management software may not have
enough knowledge about per address range. And I also don't think it is
appropriate for orchestrators to overwrite tasks' mempolicy as well,
they are set for some purpose by the apps themselves. So I suggested
a per-mm policy which have a lower priority than the tasks'.

Thanks & BR,
Abel
Bagas Sanjaya Oct. 12, 2022, 4:16 a.m. UTC | #7
On Mon, Oct 10, 2022 at 05:48:42PM +0800, Zhongkun He wrote:
> There is usecase that System Management Software(SMS) want to give a
> memory policy to other processes to make better use of memory.
> 

Better say "There are usecases when system management utilities
want to apply memory policy to processes to make better use of memory".

> The information about how to use memory is not known to the app.
> Instead, it is known to the userspace daemon(SMS), and that daemon
> will decide the memory usage policy based on different factors.
> 

Better say "These utilities doesn't set memory usage policy, but
rather the job of reporting memory usage and setting the policy is
offloaded to an userspace daemon."

> To solve the issue, this patch introduces a new syscall
> pidfd_set_mempolicy(2).  it sets the NUMA memory policy of the thread
> specified in pidfd.
> 

Better say "To solve the issue above, introduce new syscall
pidfd_set_mempolicy(2). The syscall sets NUMA memory policy for the
thread specified in pidfd".

> In current process context there is no locking because only the process
> accesses its own memory policy, so task_work is used in
> pidfd_set_mempolicy() to update the mempolicy of the process specified
> in pidfd, avoid using locks and race conditions.
> 

Better say "In current process context there is no locking because
only processes access their own memory policy. For this reason, task_work
is used in pidfd_set_mempolicy() to set or update the mempolicy of process
specified in pid. Thuse, it avoids into race conditions."

> The API is as follows,
> 
> 		long pidfd_set_mempolicy(int pidfd, int mode,
>                                      const unsigned long __user *nmask,
>                                      unsigned long maxnode,
>                                      unsigned int flags);
> 
> Set's the [pidfd] task's "task/process memory policy". The pidfd argument
> is a PID file descriptor (see pidfd_open(2) man page) that specifies the
> process to which the mempolicy is to be applied. The flags argument is
> reserved for future use; currently, this argument must be specified as 0.
> Please see the set_mempolicy(2) man page for more details about
> other's arguments.
> 

Why duplicating from the Documentation/ below?

> Suggested-by: Michal Hocko <mhocko@suse.com>
> Signed-off-by: Zhongkun He <hezhongkun.hzk@bytedance.com>
> ---
>  .../admin-guide/mm/numa_memory_policy.rst     | 21 ++++-
>  arch/alpha/kernel/syscalls/syscall.tbl        |  1 +
>  arch/arm/tools/syscall.tbl                    |  1 +
>  arch/arm64/include/asm/unistd.h               |  2 +-
>  arch/arm64/include/asm/unistd32.h             |  3 +-
>  arch/ia64/kernel/syscalls/syscall.tbl         |  1 +
>  arch/m68k/kernel/syscalls/syscall.tbl         |  1 +
>  arch/microblaze/kernel/syscalls/syscall.tbl   |  1 +
>  arch/mips/kernel/syscalls/syscall_n32.tbl     |  1 +
>  arch/mips/kernel/syscalls/syscall_n64.tbl     |  1 +
>  arch/mips/kernel/syscalls/syscall_o32.tbl     |  1 +
>  arch/parisc/kernel/syscalls/syscall.tbl       |  1 +
>  arch/powerpc/kernel/syscalls/syscall.tbl      |  1 +
>  arch/s390/kernel/syscalls/syscall.tbl         |  1 +
>  arch/sh/kernel/syscalls/syscall.tbl           |  1 +
>  arch/sparc/kernel/syscalls/syscall.tbl        |  1 +
>  arch/x86/entry/syscalls/syscall_32.tbl        |  1 +
>  arch/x86/entry/syscalls/syscall_64.tbl        |  1 +
>  arch/xtensa/kernel/syscalls/syscall.tbl       |  1 +
>  include/linux/mempolicy.h                     | 11 +++
>  include/linux/syscalls.h                      |  4 +
>  include/uapi/asm-generic/unistd.h             |  5 +-
>  kernel/sys_ni.c                               |  1 +
>  mm/mempolicy.c                                | 89 +++++++++++++++++++
>  24 files changed, 146 insertions(+), 6 deletions(-)
> 
> diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
> index 5a6afecbb0d0..b864dd88b2d2 100644
> --- a/Documentation/admin-guide/mm/numa_memory_policy.rst
> +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
> @@ -408,9 +408,10 @@ follows:
>  Memory Policy APIs
>  ==================
>  
> -Linux supports 4 system calls for controlling memory policy.  These APIS
> -always affect only the calling task, the calling task's address space, or
> -some shared object mapped into the calling task's address space.
> +Linux supports 5 system calls for controlling memory policy.  The first four
> +APIS affect only the calling task, the calling task's address space, or some
> +shared object mapped into the calling task's address space. The last one can
> +set the mempolicy of task specified in pidfd.
>  
>  .. note::
>     the headers that define these APIs and the parameter data types for
> @@ -473,6 +474,20 @@ closest to which page allocation will come from. Specifying the home node overri
>  the default allocation policy to allocate memory close to the local node for an
>  executing CPU.
>  
> +Set [pidfd Task] Memory Policy::
> +
> +        long sys_pidfd_set_mempolicy(int pidfd, int mode,
> +                                     const unsigned long __user *nmask,
> +                                     unsigned long maxnode,
> +                                     unsigned int flags);
> +
> +Set's the [pidfd] task's "task/process memory policy". The pidfd argument is
> +a PID file descriptor (see pidfd_open(2) man page) that specifies the process
> +to which the mempolicy is to be applied. The flags argument is reserved for
> +future use; currently, this argument must be specified as 0. Please see the
> +set_mempolicy(2) man page for more details about other's arguments.
> +
> +
>  
>  Memory Policy Command Line Interface
>  ====================================

The wording can be improved:

---- >8 ----

diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
index b864dd88b2d236..6df35bf4f960bd 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -410,8 +410,8 @@ Memory Policy APIs
 
 Linux supports 5 system calls for controlling memory policy.  The first four
 APIS affect only the calling task, the calling task's address space, or some
-shared object mapped into the calling task's address space. The last one can
-set the mempolicy of task specified in pidfd.
+shared object mapped into the calling task's address space. The last one
+sets the mempolicy of task specified in the pidfd.
 
 .. note::
    the headers that define these APIs and the parameter data types for
@@ -481,11 +481,11 @@ Set [pidfd Task] Memory Policy::
                                      unsigned long maxnode,
                                      unsigned int flags);
 
-Set's the [pidfd] task's "task/process memory policy". The pidfd argument is
-a PID file descriptor (see pidfd_open(2) man page) that specifies the process
-to which the mempolicy is to be applied. The flags argument is reserved for
-future use; currently, this argument must be specified as 0. Please see the
-set_mempolicy(2) man page for more details about other's arguments.
+Sets the task/process memory policy for the [pidfd] task. The pidfd argument
+is a PID file descriptor (see pidfd_open(2) man page for details) that
+specifies the process for which the mempolicy is applied to. The flags
+argument is reserved for future use; currently, it must be specified as 0.
+For the description of all other arguments, see set_mempolicy(2) man page.
 
 
 

Thanks.
Zhongkun He Oct. 12, 2022, 7:55 a.m. UTC | #8
Hi  michal, thanks for your reply and suggestiones.

> Please add some explanation why the cpuset interface is not usable for
> that usecase.
OK.

>> To solve the issue, this patch introduces a new syscall
>> pidfd_set_mempolicy(2).  it sets the NUMA memory policy of the thread
>> specified in pidfd.
>>
>> In current process context there is no locking because only the process
>> accesses its own memory policy, so task_work is used in
>> pidfd_set_mempolicy() to update the mempolicy of the process specified
>> in pidfd, avoid using locks and race conditions.
> 
> Why cannot you alter kernel_set_mempolicy (and do_set_mempolicy) to
> accept a task rather than operate on current?

I have tried it before this patch, but I found a problem.The allocation 
and update of mempolicy are in the current context, so it is not 
protected by any lock.But when the mempolicy is modified by other 
processes, the race condition appears.
Say something like the following

	pidfd_set_mempolicy	     target task stack
				       alloc_pages
					mpol = get_task_policy;
	 task_lock(task);
	 old = task->mempolicy;
	 task->mempolicy = new;
	 task_unlock(task);
	 mpol_put(old);			
					page = __alloc_pages(mpol);
There is a situation that when the old mempolicy is released, the target 
task is still using the policy.It would be better if there are 
suggestions on this case.

> I have to really say that I dislike the task_work approach because it
> detaches the syscall from the actual operation and the caller simply
> doesn't know when the operation has been completed.

I agree with you.This is indeed a problem.

> Please also describe the security model.got it.
Zhongkun He Oct. 12, 2022, 8:18 a.m. UTC | #9
> On Mon, Oct 10, 2022 at 05:48:42PM +0800, Zhongkun He wrote:
>> There is usecase that System Management Software(SMS) want to give a
>> memory policy to other processes to make better use of memory.
>>
> 
> Better say "There are usecases when system management utilities
> want to apply memory policy to processes to make better use of memory".
> 
>> The information about how to use memory is not known to the app.
>> Instead, it is known to the userspace daemon(SMS), and that daemon
>> will decide the memory usage policy based on different factors.
>>
> 
> Better say "These utilities doesn't set memory usage policy, but
> rather the job of reporting memory usage and setting the policy is
> offloaded to an userspace daemon."
> 
>> To solve the issue, this patch introduces a new syscall
>> pidfd_set_mempolicy(2).  it sets the NUMA memory policy of the thread
>> specified in pidfd.
>>
> 
> Better say "To solve the issue above, introduce new syscall
> pidfd_set_mempolicy(2). The syscall sets NUMA memory policy for the
> thread specified in pidfd".
> 
>> In current process context there is no locking because only the process
>> accesses its own memory policy, so task_work is used in
>> pidfd_set_mempolicy() to update the mempolicy of the process specified
>> in pidfd, avoid using locks and race conditions.
>>
> 
> Better say "In current process context there is no locking because
> only processes access their own memory policy. For this reason, task_work
> is used in pidfd_set_mempolicy() to set or update the mempolicy of process
> specified in pid. Thuse, it avoids into race conditions."
> 
>> The API is as follows,
>>
>> 		long pidfd_set_mempolicy(int pidfd, int mode,
>>                                       const unsigned long __user *nmask,
>>                                       unsigned long maxnode,
>>                                       unsigned int flags);
>>
>> Set's the [pidfd] task's "task/process memory policy". The pidfd argument
>> is a PID file descriptor (see pidfd_open(2) man page) that specifies the
>> process to which the mempolicy is to be applied. The flags argument is
>> reserved for future use; currently, this argument must be specified as 0.
>> Please see the set_mempolicy(2) man page for more details about
>> other's arguments.
>>
> 
> Why duplicating from the Documentation/ below?
> 
>> Suggested-by: Michal Hocko <mhocko@suse.com>
>> Signed-off-by: Zhongkun He <hezhongkun.hzk@bytedance.com>
>> ---
>>   .../admin-guide/mm/numa_memory_policy.rst     | 21 ++++-
>>   arch/alpha/kernel/syscalls/syscall.tbl        |  1 +
>>   arch/arm/tools/syscall.tbl                    |  1 +
>>   arch/arm64/include/asm/unistd.h               |  2 +-
>>   arch/arm64/include/asm/unistd32.h             |  3 +-
>>   arch/ia64/kernel/syscalls/syscall.tbl         |  1 +
>>   arch/m68k/kernel/syscalls/syscall.tbl         |  1 +
>>   arch/microblaze/kernel/syscalls/syscall.tbl   |  1 +
>>   arch/mips/kernel/syscalls/syscall_n32.tbl     |  1 +
>>   arch/mips/kernel/syscalls/syscall_n64.tbl     |  1 +
>>   arch/mips/kernel/syscalls/syscall_o32.tbl     |  1 +
>>   arch/parisc/kernel/syscalls/syscall.tbl       |  1 +
>>   arch/powerpc/kernel/syscalls/syscall.tbl      |  1 +
>>   arch/s390/kernel/syscalls/syscall.tbl         |  1 +
>>   arch/sh/kernel/syscalls/syscall.tbl           |  1 +
>>   arch/sparc/kernel/syscalls/syscall.tbl        |  1 +
>>   arch/x86/entry/syscalls/syscall_32.tbl        |  1 +
>>   arch/x86/entry/syscalls/syscall_64.tbl        |  1 +
>>   arch/xtensa/kernel/syscalls/syscall.tbl       |  1 +
>>   include/linux/mempolicy.h                     | 11 +++
>>   include/linux/syscalls.h                      |  4 +
>>   include/uapi/asm-generic/unistd.h             |  5 +-
>>   kernel/sys_ni.c                               |  1 +
>>   mm/mempolicy.c                                | 89 +++++++++++++++++++
>>   24 files changed, 146 insertions(+), 6 deletions(-)
>>
>> diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
>> index 5a6afecbb0d0..b864dd88b2d2 100644
>> --- a/Documentation/admin-guide/mm/numa_memory_policy.rst
>> +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
>> @@ -408,9 +408,10 @@ follows:
>>   Memory Policy APIs
>>   ==================
>>   
>> -Linux supports 4 system calls for controlling memory policy.  These APIS
>> -always affect only the calling task, the calling task's address space, or
>> -some shared object mapped into the calling task's address space.
>> +Linux supports 5 system calls for controlling memory policy.  The first four
>> +APIS affect only the calling task, the calling task's address space, or some
>> +shared object mapped into the calling task's address space. The last one can
>> +set the mempolicy of task specified in pidfd.
>>   
>>   .. note::
>>      the headers that define these APIs and the parameter data types for
>> @@ -473,6 +474,20 @@ closest to which page allocation will come from. Specifying the home node overri
>>   the default allocation policy to allocate memory close to the local node for an
>>   executing CPU.
>>   
>> +Set [pidfd Task] Memory Policy::
>> +
>> +        long sys_pidfd_set_mempolicy(int pidfd, int mode,
>> +                                     const unsigned long __user *nmask,
>> +                                     unsigned long maxnode,
>> +                                     unsigned int flags);
>> +
>> +Set's the [pidfd] task's "task/process memory policy". The pidfd argument is
>> +a PID file descriptor (see pidfd_open(2) man page) that specifies the process
>> +to which the mempolicy is to be applied. The flags argument is reserved for
>> +future use; currently, this argument must be specified as 0. Please see the
>> +set_mempolicy(2) man page for more details about other's arguments.
>> +
>> +
>>   
>>   Memory Policy Command Line Interface
>>   ====================================
> 
> The wording can be improved:
> 
> ---- >8 ----
> 
> diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
> index b864dd88b2d236..6df35bf4f960bd 100644
> --- a/Documentation/admin-guide/mm/numa_memory_policy.rst
> +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
> @@ -410,8 +410,8 @@ Memory Policy APIs
>   
>   Linux supports 5 system calls for controlling memory policy.  The first four
>   APIS affect only the calling task, the calling task's address space, or some
> -shared object mapped into the calling task's address space. The last one can
> -set the mempolicy of task specified in pidfd.
> +shared object mapped into the calling task's address space. The last one
> +sets the mempolicy of task specified in the pidfd.
>   
>   .. note::
>      the headers that define these APIs and the parameter data types for
> @@ -481,11 +481,11 @@ Set [pidfd Task] Memory Policy::
>                                        unsigned long maxnode,
>                                        unsigned int flags);
>   
> -Set's the [pidfd] task's "task/process memory policy". The pidfd argument is
> -a PID file descriptor (see pidfd_open(2) man page) that specifies the process
> -to which the mempolicy is to be applied. The flags argument is reserved for
> -future use; currently, this argument must be specified as 0. Please see the
> -set_mempolicy(2) man page for more details about other's arguments.
> +Sets the task/process memory policy for the [pidfd] task. The pidfd argument
> +is a PID file descriptor (see pidfd_open(2) man page for details) that
> +specifies the process for which the mempolicy is applied to. The flags
> +argument is reserved for future use; currently, it must be specified as 0.
> +For the description of all other arguments, see set_mempolicy(2) man page.
>   
>   
>   
> 
> Thanks.
> 

Hi Bagas

I got it, thanks for your suggestions.
Michal Hocko Oct. 12, 2022, 9:02 a.m. UTC | #10
On Wed 12-10-22 15:55:44, Zhongkun He wrote:
> Hi  michal, thanks for your reply and suggestiones.
> 
> > Please add some explanation why the cpuset interface is not usable for
> > that usecase.
> OK.
> 
> > > To solve the issue, this patch introduces a new syscall
> > > pidfd_set_mempolicy(2).  it sets the NUMA memory policy of the thread
> > > specified in pidfd.
> > > 
> > > In current process context there is no locking because only the process
> > > accesses its own memory policy, so task_work is used in
> > > pidfd_set_mempolicy() to update the mempolicy of the process specified
> > > in pidfd, avoid using locks and race conditions.
> > 
> > Why cannot you alter kernel_set_mempolicy (and do_set_mempolicy) to
> > accept a task rather than operate on current?
> 
> I have tried it before this patch, but I found a problem.The allocation and
> update of mempolicy are in the current context, so it is not protected by
> any lock.But when the mempolicy is modified by other processes, the race
> condition appears.
> Say something like the following
> 
> 	pidfd_set_mempolicy	     target task stack
> 				       alloc_pages
> 					mpol = get_task_policy;
> 	 task_lock(task);
> 	 old = task->mempolicy;
> 	 task->mempolicy = new;
> 	 task_unlock(task);
> 	 mpol_put(old);			
> 					page = __alloc_pages(mpol);
> There is a situation that when the old mempolicy is released, the target
> task is still using the policy.It would be better if there are suggestions
> on this case.

Yes, this will require some refactoring and one potential way is to make
mpol ref counting unconditional. The conditional ref. counting has
already caused issues in the past and the code is rather hard to follow
anyway. I am not really sure this optimization is worth it.

Another option would be to block the pidfd side of things on completion
which would wake it up from the task_work context but I would rather
explore the ref counting approach first and only if this is proven to be
too expensive to go with hacks like this.
Zhongkun He Oct. 12, 2022, 11:22 a.m. UTC | #11
> 
> Yes, this will require some refactoring and one potential way is to make
> mpol ref counting unconditional. The conditional ref. counting has
> already caused issues in the past and the code is rather hard to follow
> anyway. I am not really sure this optimization is worth it.
> 
> Another option would be to block the pidfd side of things on completion
> which would wake it up from the task_work context but I would rather
> explore the ref counting approach first and only if this is proven to be
> too expensive to go with hacks like this.

Hi Michal

The counting approach means executing mpol_get/put() when start/finish 
using mempolicy,right?  With the addition of lock add/dec on the hot 
path, the performance may be degraded.

I'll try it to see its performance impact in detail.


Thanks.
Michal Hocko Oct. 12, 2022, 12:15 p.m. UTC | #12
On Wed 12-10-22 19:22:21, Zhongkun He wrote:
> > 
> > Yes, this will require some refactoring and one potential way is to make
> > mpol ref counting unconditional. The conditional ref. counting has
> > already caused issues in the past and the code is rather hard to follow
> > anyway. I am not really sure this optimization is worth it.
> > 
> > Another option would be to block the pidfd side of things on completion
> > which would wake it up from the task_work context but I would rather
> > explore the ref counting approach first and only if this is proven to be
> > too expensive to go with hacks like this.
> 
> Hi Michal
> 
> The counting approach means executing mpol_get/put() when start/finish using
> mempolicy,right?

We already do that via mpol_{get,put} but there are cases where the
reference counting is ignored because it cannot be freed and also mpol_cond_put
resp. open coded versions of mpol_needs_cond_ref.
Vinicius Petrucci Oct. 12, 2022, 12:34 p.m. UTC | #13
> Well, per address range operation is a completely different beast I
> would say. External tool would need to a) understand what that range is
> used for (e.g. stack/heap ranges, mmaped shared files like libraries or
> private mappings) and b) by in sync with memory layout modifications
> done by applications (e.g. that an mmap has been issued to back malloc
> request). Quite a lot of understanding about the specific process. I
> would say that with that intimate knowledge it is quite better to be
> part of the process and do those changes from within of the process
> itself.

Sorry, this may be a digression, but just wanted to mention a
particular use case from a project I recently collaborated on (to
appear next month at IIWSC 2022:
http://www.iiswc.org/iiswc2022/index.html).

We carried out a performance analysis of the latest Linux AutoNUMA
memory tiering on graph processing applications. We noticed that hot
pages cannot be properly identified by the reactive approach used by
AutoNUMA due to irregular/random memory access patterns. Thus, as a
POC, we implemented and evaluated a simple idea of having an external
user-level process/agent that, based on prior profiling results of
memory regions, could make more effectively memory chunk/object-based
mappings (instead of page-level allocation/migration) in advance on
either DRAM or CXL/PMEM (via mbind calls). This kind of tiering
solution could deliver up to 2x more performance for graph analytics
workloads. We plan to evaluate other workloads as well.

Having a feature like "pidfd/process_mbind" would really simplify our
user-level agent implementation moving forward, as right now we are
adding a LD_PRELOAD wrapper (for signal handler) to listen and execute
"mbind" requests from another process. If there's any other
alternative solution to this already (via ptrace?), please let me
know.

Thank you!

Vinicius Petrucci
Principal Performance Engineer
Micron Technology
Michal Hocko Oct. 12, 2022, 1:07 p.m. UTC | #14
On Wed 12-10-22 07:34:06, Vinicius Petrucci wrote:
> > Well, per address range operation is a completely different beast I
> > would say. External tool would need to a) understand what that range is
> > used for (e.g. stack/heap ranges, mmaped shared files like libraries or
> > private mappings) and b) by in sync with memory layout modifications
> > done by applications (e.g. that an mmap has been issued to back malloc
> > request). Quite a lot of understanding about the specific process. I
> > would say that with that intimate knowledge it is quite better to be
> > part of the process and do those changes from within of the process
> > itself.
> 
> Sorry, this may be a digression, but just wanted to mention a
> particular use case from a project I recently collaborated on (to
> appear next month at IIWSC 2022:
> http://www.iiswc.org/iiswc2022/index.html).
> 
> We carried out a performance analysis of the latest Linux AutoNUMA
> memory tiering on graph processing applications. We noticed that hot
> pages cannot be properly identified by the reactive approach used by
> AutoNUMA due to irregular/random memory access patterns.

Yes, I can see how a reactive approach might not be the best fit.
Automatic NUMA balancing can help quite a lot where memory regions
are accessed consistently. I can imagine situations where the user space
agent can tell much better what is the best node to place data when the
access pattern is not obvbious or hard to deduce from local metrics.

My main argument is though that those are rather specialized and it is
much easier to implement the agent as a part of the process as they are
unlikely to be generic enough to serve many different processes. I might
be wrong in this of course and I am also not saying that pidfd_mbind is
a completely unreasonable idea. We just need a strong usecase before
going that way.

> Thus, as a
> POC, we implemented and evaluated a simple idea of having an external
> user-level process/agent that, based on prior profiling results of
> memory regions, could make more effectively memory chunk/object-based
> mappings (instead of page-level allocation/migration) in advance on
> either DRAM or CXL/PMEM (via mbind calls). This kind of tiering
> solution could deliver up to 2x more performance for graph analytics
> workloads. We plan to evaluate other workloads as well.
> 
> Having a feature like "pidfd/process_mbind" would really simplify our
> user-level agent implementation moving forward, as right now we are
> adding a LD_PRELOAD wrapper (for signal handler) to listen and execute
> "mbind" requests from another process. If there's any other
> alternative solution to this already (via ptrace?), please let me
> know.

userfaultfd sounds like the closest match if #PF handling under control
of an external agent is viable.
Michal Hocko Oct. 12, 2022, 1:23 p.m. UTC | #15
On Wed 12-10-22 15:07:48, Michal Hocko wrote:
> On Wed 12-10-22 07:34:06, Vinicius Petrucci wrote:
[...]
> > Having a feature like "pidfd/process_mbind" would really simplify our
> > user-level agent implementation moving forward, as right now we are
> > adding a LD_PRELOAD wrapper (for signal handler) to listen and execute
> > "mbind" requests from another process. If there's any other
> > alternative solution to this already (via ptrace?), please let me
> > know.
> 
> userfaultfd sounds like the closest match if #PF handling under control
> of an external agent is viable.

And just to clarify. I haven't ever played with using userfaultfd for
numa balancing so I might be completely wrong here.
Frank van der Linden Oct. 12, 2022, 4:51 p.m. UTC | #16
On Wed, Oct 12, 2022 at 5:34 AM Vinicius Petrucci <vpetrucci@gmail.com> wrote:
>
> > Well, per address range operation is a completely different beast I
> > would say. External tool would need to a) understand what that range is
> > used for (e.g. stack/heap ranges, mmaped shared files like libraries or
> > private mappings) and b) by in sync with memory layout modifications
> > done by applications (e.g. that an mmap has been issued to back malloc
> > request). Quite a lot of understanding about the specific process. I
> > would say that with that intimate knowledge it is quite better to be
> > part of the process and do those changes from within of the process
> > itself.
>
> Sorry, this may be a digression, but just wanted to mention a
> particular use case from a project I recently collaborated on (to
> appear next month at IIWSC 2022:
> http://www.iiswc.org/iiswc2022/index.html).
>
> We carried out a performance analysis of the latest Linux AutoNUMA
> memory tiering on graph processing applications. We noticed that hot
> pages cannot be properly identified by the reactive approach used by
> AutoNUMA due to irregular/random memory access patterns. Thus, as a
> POC, we implemented and evaluated a simple idea of having an external
> user-level process/agent that, based on prior profiling results of
> memory regions, could make more effectively memory chunk/object-based
> mappings (instead of page-level allocation/migration) in advance on
> either DRAM or CXL/PMEM (via mbind calls). This kind of tiering
> solution could deliver up to 2x more performance for graph analytics
> workloads. We plan to evaluate other workloads as well.
>
> Having a feature like "pidfd/process_mbind" would really simplify our
> user-level agent implementation moving forward, as right now we are
> adding a LD_PRELOAD wrapper (for signal handler) to listen and execute
> "mbind" requests from another process. If there's any other
> alternative solution to this already (via ptrace?), please let me
> know.
>

Interesting, looking forward to seeing your paper! This is the kind of
use case I was trying to describe for pidfd_mbind() - a userspace
orchestrator with some intimate knowledge of the process' memory
layout (through profiling, like in your case, or otherwise), that can
direct memory to the right nodes / memory tiers.

- Frank
Zhongkun He Oct. 13, 2022, 10:44 a.m. UTC | #17
> On Wed 12-10-22 19:22:21, Zhongkun He wrote:
>>>
>>> Yes, this will require some refactoring and one potential way is to make
>>> mpol ref counting unconditional. The conditional ref. counting has
>>> already caused issues in the past and the code is rather hard to follow
>>> anyway. I am not really sure this optimization is worth it.
>>>
>>> Another option would be to block the pidfd side of things on completion
>>> which would wake it up from the task_work context but I would rather
>>> explore the ref counting approach first and only if this is proven to be
>>> too expensive to go with hacks like this.
>>
>> Hi Michal
>>
>> The counting approach means executing mpol_get/put() when start/finish using
>> mempolicy,right?
> 
> We already do that via mpol_{get,put} but there are cases where the
> reference counting is ignored because it cannot be freed and also mpol_cond_put
> resp. open coded versions of mpol_needs_cond_ref.

Hi Michal

Could we try to change the MPOL_F_SHARED flag to MPOL_F_STATIC to
mark static mempolicy which cannot be freed, and mpol_needs_cond_ref
can use MPOL_F_STATIC to avoid freeing  the static mempolicy.
MPOL_F_SHARED loses its original meaning in counting approach.

Thanks.
Michal Hocko Oct. 13, 2022, 11:26 a.m. UTC | #18
On Thu 13-10-22 18:44:55, Zhongkun He wrote:
> > On Wed 12-10-22 19:22:21, Zhongkun He wrote:
> > > > 
> > > > Yes, this will require some refactoring and one potential way is to make
> > > > mpol ref counting unconditional. The conditional ref. counting has
> > > > already caused issues in the past and the code is rather hard to follow
> > > > anyway. I am not really sure this optimization is worth it.
> > > > 
> > > > Another option would be to block the pidfd side of things on completion
> > > > which would wake it up from the task_work context but I would rather
> > > > explore the ref counting approach first and only if this is proven to be
> > > > too expensive to go with hacks like this.
> > > 
> > > Hi Michal
> > > 
> > > The counting approach means executing mpol_get/put() when start/finish using
> > > mempolicy,right?
> > 
> > We already do that via mpol_{get,put} but there are cases where the
> > reference counting is ignored because it cannot be freed and also mpol_cond_put
> > resp. open coded versions of mpol_needs_cond_ref.
> 
> Hi Michal
> 
> Could we try to change the MPOL_F_SHARED flag to MPOL_F_STATIC to
> mark static mempolicy which cannot be freed, and mpol_needs_cond_ref
> can use MPOL_F_STATIC to avoid freeing  the static mempolicy.

Wouldn't it make more sense to get rid of a different treatment and
treat all memory policies the same way?
Zhongkun He Oct. 13, 2022, 12:50 p.m. UTC | #19
>> Hi Michal
>>
>> Could we try to change the MPOL_F_SHARED flag to MPOL_F_STATIC to
>> mark static mempolicy which cannot be freed, and mpol_needs_cond_ref
>> can use MPOL_F_STATIC to avoid freeing  the static mempolicy.
> 
> Wouldn't it make more sense to get rid of a different treatment and
> treat all memory policies the same way?

I found a case, not sure if it makes sense. If there is no policy
in task->mempolicy, the use of atomic_{inc,dec} can be skiped
according  to MPOL_F_STATIC. Atomic_{inc,dec} in hot path may reduces 
performance.

Thanks.
Michal Hocko Oct. 13, 2022, 1:17 p.m. UTC | #20
On Thu 13-10-22 20:50:48, Zhongkun He wrote:
> > > Hi Michal
> > > 
> > > Could we try to change the MPOL_F_SHARED flag to MPOL_F_STATIC to
> > > mark static mempolicy which cannot be freed, and mpol_needs_cond_ref
> > > can use MPOL_F_STATIC to avoid freeing  the static mempolicy.
> > 
> > Wouldn't it make more sense to get rid of a different treatment and
> > treat all memory policies the same way?
> 
> I found a case, not sure if it makes sense. If there is no policy
> in task->mempolicy, the use of atomic_{inc,dec} can be skiped
> according  to MPOL_F_STATIC. Atomic_{inc,dec} in hot path may reduces
> performance.

I would start with a simple conversion and do any potential
optimizations on top of that based on actual numbers. Maybe we can
special case default_policy to avoid reference counting a default (no
policy case). A simple check for pol == &default_policy should be
negligible.
Zhongkun He Oct. 13, 2022, 1:42 p.m. UTC | #21
> On Thu 13-10-22 20:50:48, Zhongkun He wrote:
>>>> Hi Michal
>>>>
>>>> Could we try to change the MPOL_F_SHARED flag to MPOL_F_STATIC to
>>>> mark static mempolicy which cannot be freed, and mpol_needs_cond_ref
>>>> can use MPOL_F_STATIC to avoid freeing  the static mempolicy.
>>>
>>> Wouldn't it make more sense to get rid of a different treatment and
>>> treat all memory policies the same way?
>>
>> I found a case, not sure if it makes sense. If there is no policy
>> in task->mempolicy, the use of atomic_{inc,dec} can be skiped
>> according  to MPOL_F_STATIC. Atomic_{inc,dec} in hot path may reduces
>> performance.
> 
> I would start with a simple conversion and do any potential
> optimizations on top of that based on actual numbers. Maybe we can
> special case default_policy to avoid reference counting a default (no
> policy case). A simple check for pol == &default_policy should be
> negligible.

Got it, thanks for your reply and suggestions.
diff mbox series

Patch

diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
index 5a6afecbb0d0..b864dd88b2d2 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -408,9 +408,10 @@  follows:
 Memory Policy APIs
 ==================
 
-Linux supports 4 system calls for controlling memory policy.  These APIS
-always affect only the calling task, the calling task's address space, or
-some shared object mapped into the calling task's address space.
+Linux supports 5 system calls for controlling memory policy.  The first four
+APIS affect only the calling task, the calling task's address space, or some
+shared object mapped into the calling task's address space. The last one can
+set the mempolicy of task specified in pidfd.
 
 .. note::
    the headers that define these APIs and the parameter data types for
@@ -473,6 +474,20 @@  closest to which page allocation will come from. Specifying the home node overri
 the default allocation policy to allocate memory close to the local node for an
 executing CPU.
 
+Set [pidfd Task] Memory Policy::
+
+        long sys_pidfd_set_mempolicy(int pidfd, int mode,
+                                     const unsigned long __user *nmask,
+                                     unsigned long maxnode,
+                                     unsigned int flags);
+
+Set's the [pidfd] task's "task/process memory policy". The pidfd argument is
+a PID file descriptor (see pidfd_open(2) man page) that specifies the process
+to which the mempolicy is to be applied. The flags argument is reserved for
+future use; currently, this argument must be specified as 0. Please see the
+set_mempolicy(2) man page for more details about other's arguments.
+
+
 
 Memory Policy Command Line Interface
 ====================================
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 3515bc4f16a4..d86baa5a6eca 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -490,3 +490,4 @@ 
 558	common	process_mrelease		sys_process_mrelease
 559	common  futex_waitv                     sys_futex_waitv
 560	common	set_mempolicy_home_node		sys_ni_syscall
+561	common  pidfd_set_mempolicy		sys_ni_syscall
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index ac964612d8b0..a7ccab8aafef 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -464,3 +464,4 @@ 
 448	common	process_mrelease		sys_process_mrelease
 449	common	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pidfd_set_mempolicy		sys_pidfd_set_mempolicy
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 037feba03a51..64a514f90131 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -39,7 +39,7 @@ 
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		451
+#define __NR_compat_syscalls		452
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 604a2053d006..2e178e9152e6 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -907,7 +907,8 @@  __SYSCALL(__NR_process_mrelease, sys_process_mrelease)
 __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
 #define __NR_set_mempolicy_home_node 450
 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
-
+#define __NR_pidfd_set_mempolicy 451
+__SYSCALL(__NR_pidfd_set_mempolicy, sys_pidfd_set_mempolicy)
 /*
  * Please add new compat syscalls above this comment and update
  * __NR_compat_syscalls in asm/unistd.h.
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 78b1d03e86e1..4e5bca7a985c 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -371,3 +371,4 @@ 
 448	common	process_mrelease		sys_process_mrelease
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pidfd_set_mempolicy		sys_pidfd_set_mempolicy
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index b1f3940bc298..8e50bf8ec41d 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -450,3 +450,4 @@ 
 448	common	process_mrelease		sys_process_mrelease
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pidfd_set_mempolicy		sys_pidfd_set_mempolicy
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 820145e47350..f48e32532c5f 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -456,3 +456,4 @@ 
 448	common	process_mrelease		sys_process_mrelease
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pidfd_set_mempolicy		sys_pidfd_set_mempolicy
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 253ff994ed2e..58e7c3140f4a 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -389,3 +389,4 @@ 
 448	n32	process_mrelease		sys_process_mrelease
 449	n32	futex_waitv			sys_futex_waitv
 450	n32	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	n32	pidfd_set_mempolicy		sys_pidfd_set_mempolicy
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 3f1886ad9d80..70090c3ada16 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -365,3 +365,4 @@ 
 448	n64	process_mrelease		sys_process_mrelease
 449	n64	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	n64	pidfd_set_mempolicy		sys_pidfd_set_mempolicy
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 8f243e35a7b2..b0b0bad64fa0 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -438,3 +438,4 @@ 
 448	o32	process_mrelease		sys_process_mrelease
 449	o32	futex_waitv			sys_futex_waitv
 450	o32	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	o32	pidfd_set_mempolicy		sys_pidfd_set_mempolicy
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 8a99c998da9b..4dfd328490ad 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -448,3 +448,4 @@ 
 448	common	process_mrelease		sys_process_mrelease
 449	common	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pidfd_set_mempolicy		sys_pidfd_set_mempolicy
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 2600b4237292..20381463b2b5 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -530,3 +530,4 @@ 
 448	common	process_mrelease		sys_process_mrelease
 449	common  futex_waitv                     sys_futex_waitv
 450 	nospu	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	nospu	pidfd_set_mempolicy		sys_pidfd_set_mempolicy
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 799147658dee..5e170dca81f6 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -453,3 +453,4 @@ 
 448  common	process_mrelease	sys_process_mrelease		sys_process_mrelease
 449  common	futex_waitv		sys_futex_waitv			sys_futex_waitv
 450  common	set_mempolicy_home_node	sys_set_mempolicy_home_node	sys_set_mempolicy_home_node
+451  common	pidfd_set_mempolicy	sys_pidfd_set_mempolicy		sys_pidfd_set_mempolicy
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 2de85c977f54..bd3a24a9b41f 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -453,3 +453,4 @@ 
 448	common	process_mrelease		sys_process_mrelease
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pidfd_set_mempolicy		sys_pidfd_set_mempolicy
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 4398cc6fb68d..bea2b5c6314b 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -496,3 +496,4 @@ 
 448	common	process_mrelease		sys_process_mrelease
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pidfd_set_mempolicy		sys_pidfd_set_mempolicy
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 320480a8db4f..97bc70f5a8f7 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -455,3 +455,4 @@ 
 448	i386	process_mrelease	sys_process_mrelease
 449	i386	futex_waitv		sys_futex_waitv
 450	i386	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	i386	pidfd_set_mempolicy	sys_pidfd_set_mempolicy
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index c84d12608cd2..ba6d19dbd8f8 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -372,6 +372,7 @@ 
 448	common	process_mrelease	sys_process_mrelease
 449	common	futex_waitv		sys_futex_waitv
 450	common	set_mempolicy_home_node	sys_set_mempolicy_home_node
+451	common	pidfd_set_mempolicy	sys_pidfd_set_mempolicy
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 52c94ab5c205..9f7c563da4fb 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -421,3 +421,4 @@ 
 448	common	process_mrelease		sys_process_mrelease
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pidfd_set_mempolicy		sys_pidfd_set_mempolicy
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 668389b4b53d..f3c522e8eb38 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -54,6 +54,17 @@  struct mempolicy {
 	} w;
 };
 
+/*
+ *  The information of mempolicy used by task_work
+ *  to update it dynamically.
+ */
+struct mpol_worker {
+	unsigned short mode;
+	unsigned short flags;
+	nodemask_t nodes;
+	struct rcu_head update_work;
+};
+
 /*
  * Support for managing mempolicy data objects (clone, copy, destroy)
  * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a34b0f9a9972..e611c088050b 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1056,6 +1056,10 @@  asmlinkage long sys_memfd_secret(unsigned int flags);
 asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
 					    unsigned long home_node,
 					    unsigned long flags);
+asmlinkage long sys_pidfd_set_mempolicy(int pidfd, int mode,
+					const unsigned long __user *nmask,
+					unsigned long maxnode,
+					unsigned int flags);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 45fa180cc56a..c38013bbf5b0 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -886,8 +886,11 @@  __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
 #define __NR_set_mempolicy_home_node 450
 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
 
+#define __NR_pidfd_set_mempolicy 451
+__SYSCALL(__NR_pidfd_set_mempolicy, sys_pidfd_set_mempolicy)
+
 #undef __NR_syscalls
-#define __NR_syscalls 451
+#define __NR_syscalls 452
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 860b2dcf3ac4..5053deb888f7 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -299,6 +299,7 @@  COND_SYSCALL(set_mempolicy);
 COND_SYSCALL(migrate_pages);
 COND_SYSCALL(move_pages);
 COND_SYSCALL(set_mempolicy_home_node);
+COND_SYSCALL(pidfd_set_mempolicy);
 
 COND_SYSCALL(perf_event_open);
 COND_SYSCALL(accept4);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b73d3248d976..2ce9be1a9dfd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -102,6 +102,7 @@ 
 #include <linux/mmu_notifier.h>
 #include <linux/printk.h>
 #include <linux/swapops.h>
+#include <linux/task_work.h>
 
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
@@ -1572,6 +1573,94 @@  SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
 	return kernel_set_mempolicy(mode, nmask, maxnode);
 }
 
+/*
+ *  In process context there is no locking, so task_work is used to update
+ *  the mempolicy of process specified in pidfd, avoid using locks.
+ */
+static void pidfd_update_mpol(struct callback_head *cb)
+{
+	struct mpol_worker *worker = container_of(cb, struct mpol_worker, update_work);
+
+	do_set_mempolicy(worker->mode, worker->flags, &worker->nodes);
+	kfree(worker);
+}
+
+/* Set the memory policy of the process specified in pidfd. */
+static long do_pidfd_set_mempolicy(int pidfd, int mode, const unsigned long __user *nmask,
+		unsigned long maxnode, unsigned int flags)
+{
+	nodemask_t nodes, task_nodes, tmp;
+	struct mpol_worker *worker;
+	struct task_struct *task;
+	unsigned short mode_flags;
+	int ret, lmode = mode;
+	unsigned int f_flags;
+
+	if (flags)
+		return -EINVAL;
+
+	task = pidfd_get_task(pidfd, &f_flags);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process.
+	 */
+	if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	/*
+	 * Require CAP_SYS_NICE for influencing process performance.
+	 * User's task is allowed only.
+	 */
+	if (!capable(CAP_SYS_NICE) || (task->flags & PF_KTHREAD)) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	ret = get_nodes(&nodes, nmask, maxnode);
+	if (ret)
+		goto out;
+
+	ret = sanitize_mpol_flags(&lmode, &mode_flags);
+	if (ret)
+		goto out;
+
+	task_nodes = cpuset_mems_allowed(task);
+
+	if (mode_flags & MPOL_F_RELATIVE_NODES)
+		mpol_relative_nodemask(&tmp, &nodes, &task_nodes);
+	else
+		nodes_and(tmp, nodes, task_nodes);
+
+	if (nodes_empty(tmp)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	worker = kzalloc(sizeof(struct mpol_worker), GFP_KERNEL | __GFP_NOWARN);
+	if (!worker) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	init_task_work(&worker->update_work, pidfd_update_mpol);
+	worker->flags = mode_flags;
+	worker->mode = lmode;
+	worker->nodes = nodes;
+
+	ret = task_work_add(task, &worker->update_work, TWA_SIGNAL);
+out:
+	put_task_struct(task);
+	return ret;
+}
+
+SYSCALL_DEFINE5(pidfd_set_mempolicy, int, pidfd, int, mode, const unsigned long __user *, nmask,
+		unsigned long, maxnode, unsigned int, flags)
+{
+	return do_pidfd_set_mempolicy(pidfd, mode, nmask, maxnode, flags);
+}
+
 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
 				const unsigned long __user *old_nodes,
 				const unsigned long __user *new_nodes)