diff mbox series

[RFC,v3,1/2] mempinfd: Add new syscall to provide memory pin

Message ID 1612685884-19514-2-git-send-email-wangzhou1@hisilicon.com (mailing list archive)
State New, archived
Headers show
Series mempinfd: Add new syscall to provide memory pin | expand

Commit Message

Zhou Wang Feb. 7, 2021, 8:18 a.m. UTC
SVA(share virtual address) offers a way for device to share process virtual
address space safely, which makes more convenient for user space device
driver coding. However, IO page faults may happen when doing DMA
operations. As the latency of IO page fault is relatively big, DMA
performance will be affected severely when there are IO page faults.
From a long term view, DMA performance will be not stable.

In high-performance I/O cases, accelerators might want to perform
I/O on a memory without IO page faults which can result in dramatically
increased latency. Current memory related APIs could not achieve this
requirement, e.g. mlock can only avoid memory to swap to backup device,
page migration can still trigger IO page fault.

Various drivers working under traditional non-SVA mode are using
their own specific ioctl to do pin. Such ioctl can be seen in v4l2,
gpu, infiniband, media, vfio, etc. Drivers are usually doing dma
mapping while doing pin.

But, in SVA mode, pin could be a common need which isn't necessarily
bound with any drivers, and neither is dma mapping needed by drivers
since devices are using the virtual address of CPU. Thus, It is better
to introduce a new common syscall for it.

This patch leverages the design of userfaultfd and adds mempinfd for pin
to avoid messing up mm_struct. A fd will be got by mempinfd, then user
space can do pin/unpin pages by ioctls of this fd, all pinned pages under
one file will be unpinned in file release process. Like pin page cases in
other places, can_do_mlock is used to check permission and input
parameters.

Signed-off-by: Zhou Wang <wangzhou1@hisilicon.com>
Signed-off-by: Sihang Chen <chensihang1@hisilicon.com>
Suggested-by: Barry Song <song.bao.hua@hisilicon.com>
---
 arch/arm64/include/asm/unistd.h   |   2 +-
 arch/arm64/include/asm/unistd32.h |   2 +
 fs/Makefile                       |   1 +
 fs/mempinfd.c                     | 199 ++++++++++++++++++++++++++++++++++++++
 include/linux/syscalls.h          |   1 +
 include/uapi/asm-generic/unistd.h |   4 +-
 include/uapi/linux/mempinfd.h     |  23 +++++
 init/Kconfig                      |   6 ++
 8 files changed, 236 insertions(+), 2 deletions(-)
 create mode 100644 fs/mempinfd.c
 create mode 100644 include/uapi/linux/mempinfd.h

Comments

Matthew Wilcox Feb. 7, 2021, 9:34 p.m. UTC | #1
On Sun, Feb 07, 2021 at 04:18:03PM +0800, Zhou Wang wrote:
> SVA(share virtual address) offers a way for device to share process virtual
> address space safely, which makes more convenient for user space device
> driver coding. However, IO page faults may happen when doing DMA
> operations. As the latency of IO page fault is relatively big, DMA
> performance will be affected severely when there are IO page faults.
> >From a long term view, DMA performance will be not stable.
> 
> In high-performance I/O cases, accelerators might want to perform
> I/O on a memory without IO page faults which can result in dramatically
> increased latency. Current memory related APIs could not achieve this
> requirement, e.g. mlock can only avoid memory to swap to backup device,
> page migration can still trigger IO page fault.

Well ... we have two requirements.  The application wants to not take
page faults.  The system wants to move the application to a different
NUMA node in order to optimise overall performance.  Why should the
application's desires take precedence over the kernel's desires?  And why
should it be done this way rather than by the sysadmin using numactl to
lock the application to a particular node?

> +struct mem_pin_container {
> +	struct xarray array;
> +	struct mutex lock;
> +};

I don't understand what the lock actually protects.

> +struct pin_pages {
> +	unsigned long first;
> +	unsigned long nr_pages;
> +	struct page **pages;
> +};

I don't think you need 'first', and I think you can embed the pages
array into this struct, removing one allocation.

> +	xa_for_each(&priv->array, idx, p) {
> +		unpin_user_pages(p->pages, p->nr_pages);
> +		xa_erase(&priv->array, p->first);
> +		vfree(p->pages);
> +		kfree(p);
> +	}
> +
> +	mutex_destroy(&priv->lock);
> +	xa_destroy(&priv->array);

If you just called xa_erase() on every element of the array, you don't need
to call xa_destroy().

> +	if (!can_do_mlock())
> +		return -EPERM;

You check for can_do_mlock(), but you don't account the pages to this
rlimit.

> +	first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;

You don't need to mask off the bits, the shift will remove them.

> +	last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;

DIV_ROUND_UP()?

> +	pages = vmalloc(nr_pages * sizeof(struct page *));

kvmalloc().  vmalloc() always allocates at least a page, so we want to
use kmalloc if the size is small.  Also, use array_size() -- I know this
can't overflow, but let's be clear

> +	ret = pin_user_pages_fast(addr->addr & PAGE_MASK, nr_pages,
> +				  flags | FOLL_LONGTERM, pages);
> +	if (ret != nr_pages) {
> +		pr_err("mempinfd: Failed to pin page\n");

No.  You mustn't allow the user to be able to generate messages to syslog,
just by passing garbage to a syscall.

> +	ret = xa_insert(&priv->array, p->first, p, GFP_KERNEL);
> +	if (ret)
> +		goto unpin_pages;

Hmm.  So we can't pin two ranges which start at the same address, but we
can pin two overlapping ranges.  Is that OK?
Arnd Bergmann Feb. 7, 2021, 9:51 p.m. UTC | #2
On Sun, Feb 7, 2021 at 9:18 AM Zhou Wang <wangzhou1@hisilicon.com> wrote:

> diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
> index cccfbbe..3f49529 100644
> --- a/arch/arm64/include/asm/unistd32.h
> +++ b/arch/arm64/include/asm/unistd32.h
> @@ -891,6 +891,8 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2)
>  __SYSCALL(__NR_process_madvise, sys_process_madvise)
>  #define __NR_epoll_pwait2 441
>  __SYSCALL(__NR_epoll_pwait2, compat_sys_epoll_pwait2)
> +#define __NR_mempinfd 442
> +__SYSCALL(__NR_mempinfd, sys_mempinfd)

This adds a compat syscall for 32-bit tasks running on arm64 without adding
the same for the native arch/arm syscall table. Those two need to always
stay synchronized. In fact, new system call should ideally get assigned
on all architectures at the same time, with the same number (or +110
on arch/alpha).

         Arnd
Andy Lutomirski Feb. 7, 2021, 10:02 p.m. UTC | #3
> On Feb 7, 2021, at 12:31 AM, Zhou Wang <wangzhou1@hisilicon.com> wrote:
> 
> SVA(share virtual address) offers a way for device to share process virtual
> address space safely, which makes more convenient for user space device
> driver coding. However, IO page faults may happen when doing DMA
> operations. As the latency of IO page fault is relatively big, DMA
> performance will be affected severely when there are IO page faults.
> From a long term view, DMA performance will be not stable.
> 
> In high-performance I/O cases, accelerators might want to perform
> I/O on a memory without IO page faults which can result in dramatically
> increased latency. Current memory related APIs could not achieve this
> requirement, e.g. mlock can only avoid memory to swap to backup device,
> page migration can still trigger IO page fault.
> 
> Various drivers working under traditional non-SVA mode are using
> their own specific ioctl to do pin. Such ioctl can be seen in v4l2,
> gpu, infiniband, media, vfio, etc. Drivers are usually doing dma
> mapping while doing pin.
> 
> But, in SVA mode, pin could be a common need which isn't necessarily
> bound with any drivers, and neither is dma mapping needed by drivers
> since devices are using the virtual address of CPU. Thus, It is better
> to introduce a new common syscall for it.
> 
> This patch leverages the design of userfaultfd and adds mempinfd for pin
> to avoid messing up mm_struct. A fd will be got by mempinfd, then user
> space can do pin/unpin pages by ioctls of this fd, all pinned pages under
> one file will be unpinned in file release process. Like pin page cases in
> other places, can_do_mlock is used to check permission and input
> parameters.


Can you document what the syscall does?

Userfaultfd is an fd because one program controls another.  Is mempinfd like this?
Song Bao Hua (Barry Song) Feb. 7, 2021, 10:24 p.m. UTC | #4
> -----Original Message-----
> From: Matthew Wilcox [mailto:willy@infradead.org]
> Sent: Monday, February 8, 2021 10:34 AM
> To: Wangzhou (B) <wangzhou1@hisilicon.com>
> Cc: linux-kernel@vger.kernel.org; iommu@lists.linux-foundation.org;
> linux-mm@kvack.org; linux-arm-kernel@lists.infradead.org;
> linux-api@vger.kernel.org; Andrew Morton <akpm@linux-foundation.org>;
> Alexander Viro <viro@zeniv.linux.org.uk>; gregkh@linuxfoundation.org; Song
> Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>; jgg@ziepe.ca;
> kevin.tian@intel.com; jean-philippe@linaro.org; eric.auger@redhat.com;
> Liguozhu (Kenneth) <liguozhu@hisilicon.com>; zhangfei.gao@linaro.org;
> chensihang (A) <chensihang1@hisilicon.com>
> Subject: Re: [RFC PATCH v3 1/2] mempinfd: Add new syscall to provide memory
> pin
> 
> On Sun, Feb 07, 2021 at 04:18:03PM +0800, Zhou Wang wrote:
> > SVA(share virtual address) offers a way for device to share process virtual
> > address space safely, which makes more convenient for user space device
> > driver coding. However, IO page faults may happen when doing DMA
> > operations. As the latency of IO page fault is relatively big, DMA
> > performance will be affected severely when there are IO page faults.
> > >From a long term view, DMA performance will be not stable.
> >
> > In high-performance I/O cases, accelerators might want to perform
> > I/O on a memory without IO page faults which can result in dramatically
> > increased latency. Current memory related APIs could not achieve this
> > requirement, e.g. mlock can only avoid memory to swap to backup device,
> > page migration can still trigger IO page fault.
> 
> Well ... we have two requirements.  The application wants to not take
> page faults.  The system wants to move the application to a different
> NUMA node in order to optimise overall performance.  Why should the
> application's desires take precedence over the kernel's desires?  And why
> should it be done this way rather than by the sysadmin using numactl to
> lock the application to a particular node?

NUMA balancer is just one of many reasons for page migration. Even one
simple alloc_pages() can cause memory migration in just single NUMA
node or UMA system.

The other reasons for page migration include but are not limited to:
* memory move due to CMA
* memory move due to huge pages creation

Hardly we can ask users to disable the COMPACTION, CMA and Huge Page
in the whole system.

On the other hand, numactl doesn't always bind memory to single NUMA
node, sometimes, while applications require many cpu, it could bind
more than one memory node.

Thanks
Barry
Matthew Wilcox Feb. 8, 2021, 1:30 a.m. UTC | #5
On Sun, Feb 07, 2021 at 10:24:28PM +0000, Song Bao Hua (Barry Song) wrote:
> > > In high-performance I/O cases, accelerators might want to perform
> > > I/O on a memory without IO page faults which can result in dramatically
> > > increased latency. Current memory related APIs could not achieve this
> > > requirement, e.g. mlock can only avoid memory to swap to backup device,
> > > page migration can still trigger IO page fault.
> > 
> > Well ... we have two requirements.  The application wants to not take
> > page faults.  The system wants to move the application to a different
> > NUMA node in order to optimise overall performance.  Why should the
> > application's desires take precedence over the kernel's desires?  And why
> > should it be done this way rather than by the sysadmin using numactl to
> > lock the application to a particular node?
> 
> NUMA balancer is just one of many reasons for page migration. Even one
> simple alloc_pages() can cause memory migration in just single NUMA
> node or UMA system.
> 
> The other reasons for page migration include but are not limited to:
> * memory move due to CMA
> * memory move due to huge pages creation
> 
> Hardly we can ask users to disable the COMPACTION, CMA and Huge Page
> in the whole system.

You're dodging the question.  Should the CMA allocation fail because
another application is using SVA?

I would say no.  The application using SVA should take the one-time
performance hit from having its memory moved around.
David Rientjes Feb. 8, 2021, 2:18 a.m. UTC | #6
On Sun, 7 Feb 2021, Song Bao Hua (Barry Song) wrote:

> NUMA balancer is just one of many reasons for page migration. Even one
> simple alloc_pages() can cause memory migration in just single NUMA
> node or UMA system.
> 
> The other reasons for page migration include but are not limited to:
> * memory move due to CMA
> * memory move due to huge pages creation
> 
> Hardly we can ask users to disable the COMPACTION, CMA and Huge Page
> in the whole system.
> 

What about only for mlocked memory, i.e. disable 
vm.compact_unevictable_allowed?

Adding syscalls is a big deal, we can make a reasonable inference that 
we'll have to support this forever if it's merged.  I haven't seen mention 
of what other unevictable memory *should* be migratable that would be 
adversely affected if we disable that sysctl.  Maybe that gets you part of 
the way there and there are some other deficiencies, but it seems like a 
good start would be to describe how CONFIG_NUMA_BALANCING=n + 
vm.compact_unevcitable_allowed + mlock() doesn't get you mostly there and 
then look into what's missing.

If it's a very compelling case where there simply are no alternatives, it 
would make sense.  Alternative is to find a more generic way, perhaps in 
combination with vm.compact_unevictable_allowed, to achieve what you're 
looking to do that can be useful even beyond your originally intended use 
case.
Song Bao Hua (Barry Song) Feb. 8, 2021, 2:27 a.m. UTC | #7
> -----Original Message-----
> From: owner-linux-mm@kvack.org [mailto:owner-linux-mm@kvack.org] On Behalf Of
> Matthew Wilcox
> Sent: Monday, February 8, 2021 2:31 PM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> Cc: Wangzhou (B) <wangzhou1@hisilicon.com>; linux-kernel@vger.kernel.org;
> iommu@lists.linux-foundation.org; linux-mm@kvack.org;
> linux-arm-kernel@lists.infradead.org; linux-api@vger.kernel.org; Andrew
> Morton <akpm@linux-foundation.org>; Alexander Viro <viro@zeniv.linux.org.uk>;
> gregkh@linuxfoundation.org; jgg@ziepe.ca; kevin.tian@intel.com;
> jean-philippe@linaro.org; eric.auger@redhat.com; Liguozhu (Kenneth)
> <liguozhu@hisilicon.com>; zhangfei.gao@linaro.org; chensihang (A)
> <chensihang1@hisilicon.com>
> Subject: Re: [RFC PATCH v3 1/2] mempinfd: Add new syscall to provide memory
> pin
> 
> On Sun, Feb 07, 2021 at 10:24:28PM +0000, Song Bao Hua (Barry Song) wrote:
> > > > In high-performance I/O cases, accelerators might want to perform
> > > > I/O on a memory without IO page faults which can result in dramatically
> > > > increased latency. Current memory related APIs could not achieve this
> > > > requirement, e.g. mlock can only avoid memory to swap to backup device,
> > > > page migration can still trigger IO page fault.
> > >
> > > Well ... we have two requirements.  The application wants to not take
> > > page faults.  The system wants to move the application to a different
> > > NUMA node in order to optimise overall performance.  Why should the
> > > application's desires take precedence over the kernel's desires?  And why
> > > should it be done this way rather than by the sysadmin using numactl to
> > > lock the application to a particular node?
> >
> > NUMA balancer is just one of many reasons for page migration. Even one
> > simple alloc_pages() can cause memory migration in just single NUMA
> > node or UMA system.
> >
> > The other reasons for page migration include but are not limited to:
> > * memory move due to CMA
> > * memory move due to huge pages creation
> >
> > Hardly we can ask users to disable the COMPACTION, CMA and Huge Page
> > in the whole system.
> 
> You're dodging the question.  Should the CMA allocation fail because
> another application is using SVA?
> 
> I would say no.  

I would say no as well.

While IOMMU is enabled, CMA almost has one user only: IOMMU driver
as other drivers will depend on iommu to use non-contiguous memory
though they are still calling dma_alloc_coherent().

In iommu driver, dma_alloc_coherent is called during initialization
and there is no new allocation afterwards. So it wouldn't cause
runtime impact on SVA performance. Even there is new allocations,
CMA will fall back to general alloc_pages() and iommu drivers are
almost allocating small memory for command queues.

So I would say general compound pages, huge pages, especially
transparent huge pages, would be bigger concerns than CMA for
internal page migration within one NUMA. 

Not like CMA, general alloc_pages() can get memory by moving
pages other than those pinned.

And there is no guarantee we can always bind the memory of
SVA applications to single one NUMA, so NUMA balancing is
still a concern.

But I agree we need a way to make CMA success while the userspace
pages are pinned. Since pin has been viral in many drivers, I
assume there is a way to handle this. Otherwise, APIs like 
V4L2_MEMORY_USERPTR[1] will possibly make CMA fail as there
is no guarantee that usersspace will allocate unmovable memory
and there is no guarantee the fallback path- alloc_pages() can
succeed while allocating big memory.

Will investigate more.

> The application using SVA should take the one-time
> performance hit from having its memory moved around.

Sometimes I also feel SVA is doomed to suffer from performance
impact due to page migration. But we are still trying to
extend its use cases to high-performance I/O.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/media/v4l2-core/videobuf-dma-sg.c

Thanks
Barry
Song Bao Hua (Barry Song) Feb. 8, 2021, 5:34 a.m. UTC | #8
> -----Original Message-----
> From: David Rientjes [mailto:rientjes@google.com]
> Sent: Monday, February 8, 2021 3:18 PM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> Cc: Matthew Wilcox <willy@infradead.org>; Wangzhou (B)
> <wangzhou1@hisilicon.com>; linux-kernel@vger.kernel.org;
> iommu@lists.linux-foundation.org; linux-mm@kvack.org;
> linux-arm-kernel@lists.infradead.org; linux-api@vger.kernel.org; Andrew
> Morton <akpm@linux-foundation.org>; Alexander Viro <viro@zeniv.linux.org.uk>;
> gregkh@linuxfoundation.org; jgg@ziepe.ca; kevin.tian@intel.com;
> jean-philippe@linaro.org; eric.auger@redhat.com; Liguozhu (Kenneth)
> <liguozhu@hisilicon.com>; zhangfei.gao@linaro.org; chensihang (A)
> <chensihang1@hisilicon.com>
> Subject: RE: [RFC PATCH v3 1/2] mempinfd: Add new syscall to provide memory
> pin
> 
> On Sun, 7 Feb 2021, Song Bao Hua (Barry Song) wrote:
> 
> > NUMA balancer is just one of many reasons for page migration. Even one
> > simple alloc_pages() can cause memory migration in just single NUMA
> > node or UMA system.
> >
> > The other reasons for page migration include but are not limited to:
> > * memory move due to CMA
> > * memory move due to huge pages creation
> >
> > Hardly we can ask users to disable the COMPACTION, CMA and Huge Page
> > in the whole system.
> >
> 
> What about only for mlocked memory, i.e. disable
> vm.compact_unevictable_allowed?
> 
> Adding syscalls is a big deal, we can make a reasonable inference that
> we'll have to support this forever if it's merged.  I haven't seen mention
> of what other unevictable memory *should* be migratable that would be
> adversely affected if we disable that sysctl.  Maybe that gets you part of
> the way there and there are some other deficiencies, but it seems like a
> good start would be to describe how CONFIG_NUMA_BALANCING=n +
> vm.compact_unevcitable_allowed + mlock() doesn't get you mostly there and
> then look into what's missing.
> 

I believe it can resolve the performance problem for the SVA
applications if we disable vm.compact_unevcitable_allowed and
NUMA_BALANCE, and use mlock().

The problem is that it is insensible to ask users to disable
unevictable_allowed or numa balancing of the whole system
only because there is one SVA application in the system.

SVA, for itself, is a mechanism to let cpu and devices share same
address space. In a typical server system, there are many processes,
the better way would be only changing the behavior of the specific
process rather than changing the whole system. It is hard to ask
users to do that only because there is a SVA monster.
Plus, this might negatively affect those applications not using SVA.

> If it's a very compelling case where there simply are no alternatives, it
> would make sense.  Alternative is to find a more generic way, perhaps in
> combination with vm.compact_unevictable_allowed, to achieve what you're
> looking to do that can be useful even beyond your originally intended use
> case.

sensible. Actually pin is exactly the way to disable migration for specific
pages AKA. disabling "vm.compact_unevictable_allowed" on those pages.

It is hard to differentiate what pages should not be migrated. Only apps
know that as even SVA applications can allocate many non-IO pages which
should be able to move.

Thanks
Barry
David Hildenbrand Feb. 8, 2021, 8:14 a.m. UTC | #9
On 07.02.21 09:18, Zhou Wang wrote:
> SVA(share virtual address) offers a way for device to share process virtual
> address space safely, which makes more convenient for user space device
> driver coding. However, IO page faults may happen when doing DMA
> operations. As the latency of IO page fault is relatively big, DMA
> performance will be affected severely when there are IO page faults.
>  From a long term view, DMA performance will be not stable.
> 
> In high-performance I/O cases, accelerators might want to perform
> I/O on a memory without IO page faults which can result in dramatically
> increased latency. Current memory related APIs could not achieve this
> requirement, e.g. mlock can only avoid memory to swap to backup device,
> page migration can still trigger IO page fault.
> 
> Various drivers working under traditional non-SVA mode are using
> their own specific ioctl to do pin. Such ioctl can be seen in v4l2,
> gpu, infiniband, media, vfio, etc. Drivers are usually doing dma
> mapping while doing pin.
> 
> But, in SVA mode, pin could be a common need which isn't necessarily
> bound with any drivers, and neither is dma mapping needed by drivers
> since devices are using the virtual address of CPU. Thus, It is better
> to introduce a new common syscall for it.
> 
> This patch leverages the design of userfaultfd and adds mempinfd for pin
> to avoid messing up mm_struct. A fd will be got by mempinfd, then user
> space can do pin/unpin pages by ioctls of this fd, all pinned pages under
> one file will be unpinned in file release process. Like pin page cases in
> other places, can_do_mlock is used to check permission and input
> parameters.
> 
> Signed-off-by: Zhou Wang <wangzhou1@hisilicon.com>
> Signed-off-by: Sihang Chen <chensihang1@hisilicon.com>
> Suggested-by: Barry Song <song.bao.hua@hisilicon.com>
> ---
>   arch/arm64/include/asm/unistd.h   |   2 +-
>   arch/arm64/include/asm/unistd32.h |   2 +
>   fs/Makefile                       |   1 +
>   fs/mempinfd.c                     | 199 ++++++++++++++++++++++++++++++++++++++
>   include/linux/syscalls.h          |   1 +
>   include/uapi/asm-generic/unistd.h |   4 +-
>   include/uapi/linux/mempinfd.h     |  23 +++++
>   init/Kconfig                      |   6 ++
>   8 files changed, 236 insertions(+), 2 deletions(-)
>   create mode 100644 fs/mempinfd.c
>   create mode 100644 include/uapi/linux/mempinfd.h
> 
> diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
> index 86a9d7b3..949788f 100644
> --- a/arch/arm64/include/asm/unistd.h
> +++ b/arch/arm64/include/asm/unistd.h
> @@ -38,7 +38,7 @@
>   #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
>   #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
>   
> -#define __NR_compat_syscalls		442
> +#define __NR_compat_syscalls		443
>   #endif
>   
>   #define __ARCH_WANT_SYS_CLONE
> diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
> index cccfbbe..3f49529 100644
> --- a/arch/arm64/include/asm/unistd32.h
> +++ b/arch/arm64/include/asm/unistd32.h
> @@ -891,6 +891,8 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2)
>   __SYSCALL(__NR_process_madvise, sys_process_madvise)
>   #define __NR_epoll_pwait2 441
>   __SYSCALL(__NR_epoll_pwait2, compat_sys_epoll_pwait2)
> +#define __NR_mempinfd 442
> +__SYSCALL(__NR_mempinfd, sys_mempinfd)
>   
>   /*
>    * Please add new compat syscalls above this comment and update
> diff --git a/fs/Makefile b/fs/Makefile
> index 999d1a2..e1cbf12 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -54,6 +54,7 @@ obj-$(CONFIG_COREDUMP)		+= coredump.o
>   obj-$(CONFIG_SYSCTL)		+= drop_caches.o
>   
>   obj-$(CONFIG_FHANDLE)		+= fhandle.o
> +obj-$(CONFIG_MEMPINFD)		+= mempinfd.o
>   obj-y				+= iomap/
>   
>   obj-y				+= quota/
> diff --git a/fs/mempinfd.c b/fs/mempinfd.c
> new file mode 100644
> index 0000000..23d3911
> --- /dev/null
> +++ b/fs/mempinfd.c
> @@ -0,0 +1,199 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2021 HiSilicon Limited. */
> +#include <linux/anon_inodes.h>
> +#include <linux/fs.h>
> +#include <linux/mm.h>
> +#include <linux/mm_types.h>
> +#include <linux/slab.h>
> +#include <linux/syscalls.h>
> +#include <linux/uaccess.h>
> +#include <linux/vmalloc.h>
> +#include <linux/xarray.h>
> +#include <uapi/linux/mempinfd.h>
> +
> +struct mem_pin_container {
> +	struct xarray array;
> +	struct mutex lock;
> +};
> +
> +struct pin_pages {
> +	unsigned long first;
> +	unsigned long nr_pages;
> +	struct page **pages;
> +};
> +
> +static int mempinfd_release(struct inode *inode, struct file *file)
> +{
> +	struct mem_pin_container *priv = file->private_data;
> +	struct pin_pages *p;
> +	unsigned long idx;
> +
> +	xa_for_each(&priv->array, idx, p) {
> +		unpin_user_pages(p->pages, p->nr_pages);
> +		xa_erase(&priv->array, p->first);
> +		vfree(p->pages);
> +		kfree(p);
> +	}
> +
> +	mutex_destroy(&priv->lock);
> +	xa_destroy(&priv->array);
> +	kfree(priv);
> +
> +	return 0;
> +}
> +
> +static int mempinfd_input_check(u64 addr, u64 size)
> +{
> +	if (!size || addr + size < addr)
> +		return -EINVAL;
> +
> +	if (!can_do_mlock())
> +		return -EPERM;
> +
> +	return 0;
> +}
> +
> +static int mem_pin_page(struct mem_pin_container *priv,
> +			struct mem_pin_address *addr)
> +{
> +	unsigned int flags = FOLL_FORCE | FOLL_WRITE;
> +	unsigned long first, last, nr_pages;
> +	struct page **pages;
> +	struct pin_pages *p;
> +	int ret;
> +
> +	if (mempinfd_input_check(addr->addr, addr->size))
> +		return -EINVAL;
> +
> +	first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
> +	last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
> +	nr_pages = last - first + 1;
> +
> +	pages = vmalloc(nr_pages * sizeof(struct page *));
> +	if (!pages)
> +		return -ENOMEM;
> +
> +	p = kzalloc(sizeof(*p), GFP_KERNEL);
> +	if (!p) {
> +		ret = -ENOMEM;
> +		goto free;
> +	}
> +
> +	mutex_lock(&priv->lock);
> +
> +	ret = pin_user_pages_fast(addr->addr & PAGE_MASK, nr_pages,
> +				  flags | FOLL_LONGTERM, pages);
> +	if (ret != nr_pages) {
> +		pr_err("mempinfd: Failed to pin page\n");
> +		goto unlock;
> +	}

People are constantly struggling with the effects of long term pinnings 
under user space control, like we already have with vfio and RDMA.

And here we are, adding yet another, easier way to mess with core MM in 
the same way. This feels like a step backwards to me.
David Hildenbrand Feb. 8, 2021, 8:21 a.m. UTC | #10
On 08.02.21 03:27, Song Bao Hua (Barry Song) wrote:
> 
> 
>> -----Original Message-----
>> From: owner-linux-mm@kvack.org [mailto:owner-linux-mm@kvack.org] On Behalf Of
>> Matthew Wilcox
>> Sent: Monday, February 8, 2021 2:31 PM
>> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
>> Cc: Wangzhou (B) <wangzhou1@hisilicon.com>; linux-kernel@vger.kernel.org;
>> iommu@lists.linux-foundation.org; linux-mm@kvack.org;
>> linux-arm-kernel@lists.infradead.org; linux-api@vger.kernel.org; Andrew
>> Morton <akpm@linux-foundation.org>; Alexander Viro <viro@zeniv.linux.org.uk>;
>> gregkh@linuxfoundation.org; jgg@ziepe.ca; kevin.tian@intel.com;
>> jean-philippe@linaro.org; eric.auger@redhat.com; Liguozhu (Kenneth)
>> <liguozhu@hisilicon.com>; zhangfei.gao@linaro.org; chensihang (A)
>> <chensihang1@hisilicon.com>
>> Subject: Re: [RFC PATCH v3 1/2] mempinfd: Add new syscall to provide memory
>> pin
>>
>> On Sun, Feb 07, 2021 at 10:24:28PM +0000, Song Bao Hua (Barry Song) wrote:
>>>>> In high-performance I/O cases, accelerators might want to perform
>>>>> I/O on a memory without IO page faults which can result in dramatically
>>>>> increased latency. Current memory related APIs could not achieve this
>>>>> requirement, e.g. mlock can only avoid memory to swap to backup device,
>>>>> page migration can still trigger IO page fault.
>>>>
>>>> Well ... we have two requirements.  The application wants to not take
>>>> page faults.  The system wants to move the application to a different
>>>> NUMA node in order to optimise overall performance.  Why should the
>>>> application's desires take precedence over the kernel's desires?  And why
>>>> should it be done this way rather than by the sysadmin using numactl to
>>>> lock the application to a particular node?
>>>
>>> NUMA balancer is just one of many reasons for page migration. Even one
>>> simple alloc_pages() can cause memory migration in just single NUMA
>>> node or UMA system.
>>>
>>> The other reasons for page migration include but are not limited to:
>>> * memory move due to CMA
>>> * memory move due to huge pages creation
>>>
>>> Hardly we can ask users to disable the COMPACTION, CMA and Huge Page
>>> in the whole system.
>>
>> You're dodging the question.  Should the CMA allocation fail because
>> another application is using SVA?
>>
>> I would say no.
> 
> I would say no as well.
> 
> While IOMMU is enabled, CMA almost has one user only: IOMMU driver
> as other drivers will depend on iommu to use non-contiguous memory
> though they are still calling dma_alloc_coherent().
> 
> In iommu driver, dma_alloc_coherent is called during initialization
> and there is no new allocation afterwards. So it wouldn't cause
> runtime impact on SVA performance. Even there is new allocations,
> CMA will fall back to general alloc_pages() and iommu drivers are
> almost allocating small memory for command queues.
> 
> So I would say general compound pages, huge pages, especially
> transparent huge pages, would be bigger concerns than CMA for
> internal page migration within one NUMA.
> 
> Not like CMA, general alloc_pages() can get memory by moving
> pages other than those pinned.
> 
> And there is no guarantee we can always bind the memory of
> SVA applications to single one NUMA, so NUMA balancing is
> still a concern.
> 
> But I agree we need a way to make CMA success while the userspace
> pages are pinned. Since pin has been viral in many drivers, I
> assume there is a way to handle this. Otherwise, APIs like
> V4L2_MEMORY_USERPTR[1] will possibly make CMA fail as there
> is no guarantee that usersspace will allocate unmovable memory
> and there is no guarantee the fallback path- alloc_pages() can
> succeed while allocating big memory.
> 

Long term pinnings cannot go onto CMA-reserved memory, and there is 
similar work to also fix ZONE_MOVABLE in that regard.

https://lkml.kernel.org/r/20210125194751.1275316-1-pasha.tatashin@soleen.com

One of the reasons I detest using long term pinning of pages where it 
could be avoided. Take VFIO and RDMA as an example: these things 
currently can't work without them.

What I read here: "DMA performance will be affected severely". That does 
not sound like a compelling argument to me for long term pinnings. 
Please find another way to achieve the same goal without long term 
pinnings controlled by user space - e.g., controlling when migration 
actually happens.

For example, CMA/alloc_contig_range()/memory unplug are corner cases 
that happen rarely, you shouldn't have to worry about them messing with 
your DMA performance.
Song Bao Hua (Barry Song) Feb. 8, 2021, 10:13 a.m. UTC | #11
> -----Original Message-----
> From: owner-linux-mm@kvack.org [mailto:owner-linux-mm@kvack.org] On Behalf Of
> David Hildenbrand
> Sent: Monday, February 8, 2021 9:22 PM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>; Matthew Wilcox
> <willy@infradead.org>
> Cc: Wangzhou (B) <wangzhou1@hisilicon.com>; linux-kernel@vger.kernel.org;
> iommu@lists.linux-foundation.org; linux-mm@kvack.org;
> linux-arm-kernel@lists.infradead.org; linux-api@vger.kernel.org; Andrew
> Morton <akpm@linux-foundation.org>; Alexander Viro <viro@zeniv.linux.org.uk>;
> gregkh@linuxfoundation.org; jgg@ziepe.ca; kevin.tian@intel.com;
> jean-philippe@linaro.org; eric.auger@redhat.com; Liguozhu (Kenneth)
> <liguozhu@hisilicon.com>; zhangfei.gao@linaro.org; chensihang (A)
> <chensihang1@hisilicon.com>
> Subject: Re: [RFC PATCH v3 1/2] mempinfd: Add new syscall to provide memory
> pin
> 
> On 08.02.21 03:27, Song Bao Hua (Barry Song) wrote:
> >
> >
> >> -----Original Message-----
> >> From: owner-linux-mm@kvack.org [mailto:owner-linux-mm@kvack.org] On Behalf
> Of
> >> Matthew Wilcox
> >> Sent: Monday, February 8, 2021 2:31 PM
> >> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> >> Cc: Wangzhou (B) <wangzhou1@hisilicon.com>; linux-kernel@vger.kernel.org;
> >> iommu@lists.linux-foundation.org; linux-mm@kvack.org;
> >> linux-arm-kernel@lists.infradead.org; linux-api@vger.kernel.org; Andrew
> >> Morton <akpm@linux-foundation.org>; Alexander Viro
> <viro@zeniv.linux.org.uk>;
> >> gregkh@linuxfoundation.org; jgg@ziepe.ca; kevin.tian@intel.com;
> >> jean-philippe@linaro.org; eric.auger@redhat.com; Liguozhu (Kenneth)
> >> <liguozhu@hisilicon.com>; zhangfei.gao@linaro.org; chensihang (A)
> >> <chensihang1@hisilicon.com>
> >> Subject: Re: [RFC PATCH v3 1/2] mempinfd: Add new syscall to provide memory
> >> pin
> >>
> >> On Sun, Feb 07, 2021 at 10:24:28PM +0000, Song Bao Hua (Barry Song) wrote:
> >>>>> In high-performance I/O cases, accelerators might want to perform
> >>>>> I/O on a memory without IO page faults which can result in dramatically
> >>>>> increased latency. Current memory related APIs could not achieve this
> >>>>> requirement, e.g. mlock can only avoid memory to swap to backup device,
> >>>>> page migration can still trigger IO page fault.
> >>>>
> >>>> Well ... we have two requirements.  The application wants to not take
> >>>> page faults.  The system wants to move the application to a different
> >>>> NUMA node in order to optimise overall performance.  Why should the
> >>>> application's desires take precedence over the kernel's desires?  And why
> >>>> should it be done this way rather than by the sysadmin using numactl to
> >>>> lock the application to a particular node?
> >>>
> >>> NUMA balancer is just one of many reasons for page migration. Even one
> >>> simple alloc_pages() can cause memory migration in just single NUMA
> >>> node or UMA system.
> >>>
> >>> The other reasons for page migration include but are not limited to:
> >>> * memory move due to CMA
> >>> * memory move due to huge pages creation
> >>>
> >>> Hardly we can ask users to disable the COMPACTION, CMA and Huge Page
> >>> in the whole system.
> >>
> >> You're dodging the question.  Should the CMA allocation fail because
> >> another application is using SVA?
> >>
> >> I would say no.
> >
> > I would say no as well.
> >
> > While IOMMU is enabled, CMA almost has one user only: IOMMU driver
> > as other drivers will depend on iommu to use non-contiguous memory
> > though they are still calling dma_alloc_coherent().
> >
> > In iommu driver, dma_alloc_coherent is called during initialization
> > and there is no new allocation afterwards. So it wouldn't cause
> > runtime impact on SVA performance. Even there is new allocations,
> > CMA will fall back to general alloc_pages() and iommu drivers are
> > almost allocating small memory for command queues.
> >
> > So I would say general compound pages, huge pages, especially
> > transparent huge pages, would be bigger concerns than CMA for
> > internal page migration within one NUMA.
> >
> > Not like CMA, general alloc_pages() can get memory by moving
> > pages other than those pinned.
> >
> > And there is no guarantee we can always bind the memory of
> > SVA applications to single one NUMA, so NUMA balancing is
> > still a concern.
> >
> > But I agree we need a way to make CMA success while the userspace
> > pages are pinned. Since pin has been viral in many drivers, I
> > assume there is a way to handle this. Otherwise, APIs like
> > V4L2_MEMORY_USERPTR[1] will possibly make CMA fail as there
> > is no guarantee that usersspace will allocate unmovable memory
> > and there is no guarantee the fallback path- alloc_pages() can
> > succeed while allocating big memory.
> >
> 
> Long term pinnings cannot go onto CMA-reserved memory, and there is
> similar work to also fix ZONE_MOVABLE in that regard.
> 
> https://lkml.kernel.org/r/20210125194751.1275316-1-pasha.tatashin@soleen.c
> om
> 
> One of the reasons I detest using long term pinning of pages where it
> could be avoided. Take VFIO and RDMA as an example: these things
> currently can't work without them.
> 
> What I read here: "DMA performance will be affected severely". That does
> not sound like a compelling argument to me for long term pinnings.
> Please find another way to achieve the same goal without long term
> pinnings controlled by user space - e.g., controlling when migration
> actually happens.
> 
> For example, CMA/alloc_contig_range()/memory unplug are corner cases
> that happen rarely, you shouldn't have to worry about them messing with
> your DMA performance.

I agree CMA/alloc_contig_range()/memory unplug would be corner cases,
the major cases would be THP, NUMA balancing while we could totally
disable them but it seems insensible to do that only because there is
a process using SVA in the system.

> 
> --
> Thanks,
> 
> David / dhildenb

Thanks
Barry
David Hildenbrand Feb. 8, 2021, 10:37 a.m. UTC | #12
On 08.02.21 11:13, Song Bao Hua (Barry Song) wrote:
> 
> 
>> -----Original Message-----
>> From: owner-linux-mm@kvack.org [mailto:owner-linux-mm@kvack.org] On Behalf Of
>> David Hildenbrand
>> Sent: Monday, February 8, 2021 9:22 PM
>> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>; Matthew Wilcox
>> <willy@infradead.org>
>> Cc: Wangzhou (B) <wangzhou1@hisilicon.com>; linux-kernel@vger.kernel.org;
>> iommu@lists.linux-foundation.org; linux-mm@kvack.org;
>> linux-arm-kernel@lists.infradead.org; linux-api@vger.kernel.org; Andrew
>> Morton <akpm@linux-foundation.org>; Alexander Viro <viro@zeniv.linux.org.uk>;
>> gregkh@linuxfoundation.org; jgg@ziepe.ca; kevin.tian@intel.com;
>> jean-philippe@linaro.org; eric.auger@redhat.com; Liguozhu (Kenneth)
>> <liguozhu@hisilicon.com>; zhangfei.gao@linaro.org; chensihang (A)
>> <chensihang1@hisilicon.com>
>> Subject: Re: [RFC PATCH v3 1/2] mempinfd: Add new syscall to provide memory
>> pin
>>
>> On 08.02.21 03:27, Song Bao Hua (Barry Song) wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: owner-linux-mm@kvack.org [mailto:owner-linux-mm@kvack.org] On Behalf
>> Of
>>>> Matthew Wilcox
>>>> Sent: Monday, February 8, 2021 2:31 PM
>>>> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
>>>> Cc: Wangzhou (B) <wangzhou1@hisilicon.com>; linux-kernel@vger.kernel.org;
>>>> iommu@lists.linux-foundation.org; linux-mm@kvack.org;
>>>> linux-arm-kernel@lists.infradead.org; linux-api@vger.kernel.org; Andrew
>>>> Morton <akpm@linux-foundation.org>; Alexander Viro
>> <viro@zeniv.linux.org.uk>;
>>>> gregkh@linuxfoundation.org; jgg@ziepe.ca; kevin.tian@intel.com;
>>>> jean-philippe@linaro.org; eric.auger@redhat.com; Liguozhu (Kenneth)
>>>> <liguozhu@hisilicon.com>; zhangfei.gao@linaro.org; chensihang (A)
>>>> <chensihang1@hisilicon.com>
>>>> Subject: Re: [RFC PATCH v3 1/2] mempinfd: Add new syscall to provide memory
>>>> pin
>>>>
>>>> On Sun, Feb 07, 2021 at 10:24:28PM +0000, Song Bao Hua (Barry Song) wrote:
>>>>>>> In high-performance I/O cases, accelerators might want to perform
>>>>>>> I/O on a memory without IO page faults which can result in dramatically
>>>>>>> increased latency. Current memory related APIs could not achieve this
>>>>>>> requirement, e.g. mlock can only avoid memory to swap to backup device,
>>>>>>> page migration can still trigger IO page fault.
>>>>>>
>>>>>> Well ... we have two requirements.  The application wants to not take
>>>>>> page faults.  The system wants to move the application to a different
>>>>>> NUMA node in order to optimise overall performance.  Why should the
>>>>>> application's desires take precedence over the kernel's desires?  And why
>>>>>> should it be done this way rather than by the sysadmin using numactl to
>>>>>> lock the application to a particular node?
>>>>>
>>>>> NUMA balancer is just one of many reasons for page migration. Even one
>>>>> simple alloc_pages() can cause memory migration in just single NUMA
>>>>> node or UMA system.
>>>>>
>>>>> The other reasons for page migration include but are not limited to:
>>>>> * memory move due to CMA
>>>>> * memory move due to huge pages creation
>>>>>
>>>>> Hardly we can ask users to disable the COMPACTION, CMA and Huge Page
>>>>> in the whole system.
>>>>
>>>> You're dodging the question.  Should the CMA allocation fail because
>>>> another application is using SVA?
>>>>
>>>> I would say no.
>>>
>>> I would say no as well.
>>>
>>> While IOMMU is enabled, CMA almost has one user only: IOMMU driver
>>> as other drivers will depend on iommu to use non-contiguous memory
>>> though they are still calling dma_alloc_coherent().
>>>
>>> In iommu driver, dma_alloc_coherent is called during initialization
>>> and there is no new allocation afterwards. So it wouldn't cause
>>> runtime impact on SVA performance. Even there is new allocations,
>>> CMA will fall back to general alloc_pages() and iommu drivers are
>>> almost allocating small memory for command queues.
>>>
>>> So I would say general compound pages, huge pages, especially
>>> transparent huge pages, would be bigger concerns than CMA for
>>> internal page migration within one NUMA.
>>>
>>> Not like CMA, general alloc_pages() can get memory by moving
>>> pages other than those pinned.
>>>
>>> And there is no guarantee we can always bind the memory of
>>> SVA applications to single one NUMA, so NUMA balancing is
>>> still a concern.
>>>
>>> But I agree we need a way to make CMA success while the userspace
>>> pages are pinned. Since pin has been viral in many drivers, I
>>> assume there is a way to handle this. Otherwise, APIs like
>>> V4L2_MEMORY_USERPTR[1] will possibly make CMA fail as there
>>> is no guarantee that usersspace will allocate unmovable memory
>>> and there is no guarantee the fallback path- alloc_pages() can
>>> succeed while allocating big memory.
>>>
>>
>> Long term pinnings cannot go onto CMA-reserved memory, and there is
>> similar work to also fix ZONE_MOVABLE in that regard.
>>
>> https://lkml.kernel.org/r/20210125194751.1275316-1-pasha.tatashin@soleen.c
>> om
>>
>> One of the reasons I detest using long term pinning of pages where it
>> could be avoided. Take VFIO and RDMA as an example: these things
>> currently can't work without them.
>>
>> What I read here: "DMA performance will be affected severely". That does
>> not sound like a compelling argument to me for long term pinnings.
>> Please find another way to achieve the same goal without long term
>> pinnings controlled by user space - e.g., controlling when migration
>> actually happens.
>>
>> For example, CMA/alloc_contig_range()/memory unplug are corner cases
>> that happen rarely, you shouldn't have to worry about them messing with
>> your DMA performance.
> 
> I agree CMA/alloc_contig_range()/memory unplug would be corner cases,
> the major cases would be THP, NUMA balancing while we could totally
> disable them but it seems insensible to do that only because there is
> a process using SVA in the system.

Can't you use huge pages in your application that uses SVA and prevent 
THP/NUMA balancing from kicking in?
Jason Gunthorpe Feb. 8, 2021, 6:33 p.m. UTC | #13
On Mon, Feb 08, 2021 at 09:14:28AM +0100, David Hildenbrand wrote:

> People are constantly struggling with the effects of long term pinnings
> under user space control, like we already have with vfio and RDMA.
> 
> And here we are, adding yet another, easier way to mess with core MM in the
> same way. This feels like a step backwards to me.

Yes, this seems like a very poor candidate to be a system call in this
format. Much too narrow, poorly specified, and possibly security
implications to allow any process whatsoever to pin memory.

I keep encouraging people to explore a standard shared SVA interface
that can cover all these topics (and no, uaccel is not that
interface), that seems much more natural.

I still haven't seen an explanation why DMA is so special here,
migration and so forth jitter the CPU too, environments that care
about jitter have to turn this stuff off.

Jason
Song Bao Hua (Barry Song) Feb. 8, 2021, 8:35 p.m. UTC | #14
> -----Original Message-----
> From: Jason Gunthorpe [mailto:jgg@ziepe.ca]
> Sent: Tuesday, February 9, 2021 7:34 AM
> To: David Hildenbrand <david@redhat.com>
> Cc: Wangzhou (B) <wangzhou1@hisilicon.com>; linux-kernel@vger.kernel.org;
> iommu@lists.linux-foundation.org; linux-mm@kvack.org;
> linux-arm-kernel@lists.infradead.org; linux-api@vger.kernel.org; Andrew
> Morton <akpm@linux-foundation.org>; Alexander Viro <viro@zeniv.linux.org.uk>;
> gregkh@linuxfoundation.org; Song Bao Hua (Barry Song)
> <song.bao.hua@hisilicon.com>; kevin.tian@intel.com;
> jean-philippe@linaro.org; eric.auger@redhat.com; Liguozhu (Kenneth)
> <liguozhu@hisilicon.com>; zhangfei.gao@linaro.org; chensihang (A)
> <chensihang1@hisilicon.com>
> Subject: Re: [RFC PATCH v3 1/2] mempinfd: Add new syscall to provide memory
> pin
> 
> On Mon, Feb 08, 2021 at 09:14:28AM +0100, David Hildenbrand wrote:
> 
> > People are constantly struggling with the effects of long term pinnings
> > under user space control, like we already have with vfio and RDMA.
> >
> > And here we are, adding yet another, easier way to mess with core MM in the
> > same way. This feels like a step backwards to me.
> 
> Yes, this seems like a very poor candidate to be a system call in this
> format. Much too narrow, poorly specified, and possibly security
> implications to allow any process whatsoever to pin memory.
> 
> I keep encouraging people to explore a standard shared SVA interface
> that can cover all these topics (and no, uaccel is not that
> interface), that seems much more natural.
> 
> I still haven't seen an explanation why DMA is so special here,
> migration and so forth jitter the CPU too, environments that care
> about jitter have to turn this stuff off.

This paper has a good explanation:
https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7482091

mainly because page fault can go directly to the CPU and we have
many CPUs. But IO Page Faults go a different way, thus mean much
higher latency 3-80x slower than page fault:
events in hardware queue -> Interrupts -> cpu processing page fault
-> return events to iommu/device -> continue I/O.

Copied from the paper:

If the IOMMU's page table walker fails to find the desired
translation in the page table, it sends an ATS response to
the GPU notifying it of this failure. This in turn corresponds
to a page fault. In response, the GPU sends another request to
the IOMMU called a Peripheral Page Request (PPR). The IOMMU
places this request in a memory-mapped queue and raises an
interrupt on the CPU. Multiple PPR requests can be queued
before the CPU is interrupted. The OS must have a suitable
IOMMU driver to process this interrupt and the queued PPR
requests. In Linux, while in an interrupt context, the driver
pulls PPR requests from the queue and places them in a work-queue
for later processing. Presumably this design decision was made
to minimize the time spent executing in an interrupt context,
where lower priority interrupts would be dis-abled. At a later
time, an OS worker-thread calls back into the driver to process
page fault requests in the work-queue. Once the requests are
serviced, the driver notifies the IOMMU. In turn, the IOMMU
notifies the GPU. The GPU then sends an-other ATS request to
retry the translation for the original fault-ing address.

Comparison with CPU: On the CPU, a hardware excep-tion is
raised on a page fault, which immediately switches to the
OS. In most cases in Linux, this routine services the page
fault directly, instead of queuing it for later processing.
Con-trast this with a page fault from an accelerator, where
the IOMMU has to interrupt the CPU to request service on
its be-half, and also note the several back-and-forth messages
be-tween the accelerator, the IOMMU, and the CPU. Further-more,
page faults on the CPU are generally handled one at a time
on the CPU, while for the GPU they are batched by the IOMMU
and OS work-queue mechanism.

> 
> Jason

Thanks
Barry
Song Bao Hua (Barry Song) Feb. 8, 2021, 8:52 p.m. UTC | #15
> -----Original Message-----
> From: David Hildenbrand [mailto:david@redhat.com]
> Sent: Monday, February 8, 2021 11:37 PM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>; Matthew Wilcox
> <willy@infradead.org>
> Cc: Wangzhou (B) <wangzhou1@hisilicon.com>; linux-kernel@vger.kernel.org;
> iommu@lists.linux-foundation.org; linux-mm@kvack.org;
> linux-arm-kernel@lists.infradead.org; linux-api@vger.kernel.org; Andrew
> Morton <akpm@linux-foundation.org>; Alexander Viro <viro@zeniv.linux.org.uk>;
> gregkh@linuxfoundation.org; jgg@ziepe.ca; kevin.tian@intel.com;
> jean-philippe@linaro.org; eric.auger@redhat.com; Liguozhu (Kenneth)
> <liguozhu@hisilicon.com>; zhangfei.gao@linaro.org; chensihang (A)
> <chensihang1@hisilicon.com>
> Subject: Re: [RFC PATCH v3 1/2] mempinfd: Add new syscall to provide memory
> pin
> 
> On 08.02.21 11:13, Song Bao Hua (Barry Song) wrote:
> >
> >
> >> -----Original Message-----
> >> From: owner-linux-mm@kvack.org [mailto:owner-linux-mm@kvack.org] On Behalf
> Of
> >> David Hildenbrand
> >> Sent: Monday, February 8, 2021 9:22 PM
> >> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>; Matthew Wilcox
> >> <willy@infradead.org>
> >> Cc: Wangzhou (B) <wangzhou1@hisilicon.com>; linux-kernel@vger.kernel.org;
> >> iommu@lists.linux-foundation.org; linux-mm@kvack.org;
> >> linux-arm-kernel@lists.infradead.org; linux-api@vger.kernel.org; Andrew
> >> Morton <akpm@linux-foundation.org>; Alexander Viro
> <viro@zeniv.linux.org.uk>;
> >> gregkh@linuxfoundation.org; jgg@ziepe.ca; kevin.tian@intel.com;
> >> jean-philippe@linaro.org; eric.auger@redhat.com; Liguozhu (Kenneth)
> >> <liguozhu@hisilicon.com>; zhangfei.gao@linaro.org; chensihang (A)
> >> <chensihang1@hisilicon.com>
> >> Subject: Re: [RFC PATCH v3 1/2] mempinfd: Add new syscall to provide memory
> >> pin
> >>
> >> On 08.02.21 03:27, Song Bao Hua (Barry Song) wrote:
> >>>
> >>>
> >>>> -----Original Message-----
> >>>> From: owner-linux-mm@kvack.org [mailto:owner-linux-mm@kvack.org] On
> Behalf
> >> Of
> >>>> Matthew Wilcox
> >>>> Sent: Monday, February 8, 2021 2:31 PM
> >>>> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> >>>> Cc: Wangzhou (B) <wangzhou1@hisilicon.com>;
> linux-kernel@vger.kernel.org;
> >>>> iommu@lists.linux-foundation.org; linux-mm@kvack.org;
> >>>> linux-arm-kernel@lists.infradead.org; linux-api@vger.kernel.org; Andrew
> >>>> Morton <akpm@linux-foundation.org>; Alexander Viro
> >> <viro@zeniv.linux.org.uk>;
> >>>> gregkh@linuxfoundation.org; jgg@ziepe.ca; kevin.tian@intel.com;
> >>>> jean-philippe@linaro.org; eric.auger@redhat.com; Liguozhu (Kenneth)
> >>>> <liguozhu@hisilicon.com>; zhangfei.gao@linaro.org; chensihang (A)
> >>>> <chensihang1@hisilicon.com>
> >>>> Subject: Re: [RFC PATCH v3 1/2] mempinfd: Add new syscall to provide memory
> >>>> pin
> >>>>
> >>>> On Sun, Feb 07, 2021 at 10:24:28PM +0000, Song Bao Hua (Barry Song) wrote:
> >>>>>>> In high-performance I/O cases, accelerators might want to perform
> >>>>>>> I/O on a memory without IO page faults which can result in dramatically
> >>>>>>> increased latency. Current memory related APIs could not achieve this
> >>>>>>> requirement, e.g. mlock can only avoid memory to swap to backup device,
> >>>>>>> page migration can still trigger IO page fault.
> >>>>>>
> >>>>>> Well ... we have two requirements.  The application wants to not take
> >>>>>> page faults.  The system wants to move the application to a different
> >>>>>> NUMA node in order to optimise overall performance.  Why should the
> >>>>>> application's desires take precedence over the kernel's desires?  And
> why
> >>>>>> should it be done this way rather than by the sysadmin using numactl
> to
> >>>>>> lock the application to a particular node?
> >>>>>
> >>>>> NUMA balancer is just one of many reasons for page migration. Even one
> >>>>> simple alloc_pages() can cause memory migration in just single NUMA
> >>>>> node or UMA system.
> >>>>>
> >>>>> The other reasons for page migration include but are not limited to:
> >>>>> * memory move due to CMA
> >>>>> * memory move due to huge pages creation
> >>>>>
> >>>>> Hardly we can ask users to disable the COMPACTION, CMA and Huge Page
> >>>>> in the whole system.
> >>>>
> >>>> You're dodging the question.  Should the CMA allocation fail because
> >>>> another application is using SVA?
> >>>>
> >>>> I would say no.
> >>>
> >>> I would say no as well.
> >>>
> >>> While IOMMU is enabled, CMA almost has one user only: IOMMU driver
> >>> as other drivers will depend on iommu to use non-contiguous memory
> >>> though they are still calling dma_alloc_coherent().
> >>>
> >>> In iommu driver, dma_alloc_coherent is called during initialization
> >>> and there is no new allocation afterwards. So it wouldn't cause
> >>> runtime impact on SVA performance. Even there is new allocations,
> >>> CMA will fall back to general alloc_pages() and iommu drivers are
> >>> almost allocating small memory for command queues.
> >>>
> >>> So I would say general compound pages, huge pages, especially
> >>> transparent huge pages, would be bigger concerns than CMA for
> >>> internal page migration within one NUMA.
> >>>
> >>> Not like CMA, general alloc_pages() can get memory by moving
> >>> pages other than those pinned.
> >>>
> >>> And there is no guarantee we can always bind the memory of
> >>> SVA applications to single one NUMA, so NUMA balancing is
> >>> still a concern.
> >>>
> >>> But I agree we need a way to make CMA success while the userspace
> >>> pages are pinned. Since pin has been viral in many drivers, I
> >>> assume there is a way to handle this. Otherwise, APIs like
> >>> V4L2_MEMORY_USERPTR[1] will possibly make CMA fail as there
> >>> is no guarantee that usersspace will allocate unmovable memory
> >>> and there is no guarantee the fallback path- alloc_pages() can
> >>> succeed while allocating big memory.
> >>>
> >>
> >> Long term pinnings cannot go onto CMA-reserved memory, and there is
> >> similar work to also fix ZONE_MOVABLE in that regard.
> >>
> >>
> https://lkml.kernel.org/r/20210125194751.1275316-1-pasha.tatashin@soleen.c
> >> om
> >>
> >> One of the reasons I detest using long term pinning of pages where it
> >> could be avoided. Take VFIO and RDMA as an example: these things
> >> currently can't work without them.
> >>
> >> What I read here: "DMA performance will be affected severely". That does
> >> not sound like a compelling argument to me for long term pinnings.
> >> Please find another way to achieve the same goal without long term
> >> pinnings controlled by user space - e.g., controlling when migration
> >> actually happens.
> >>
> >> For example, CMA/alloc_contig_range()/memory unplug are corner cases
> >> that happen rarely, you shouldn't have to worry about them messing with
> >> your DMA performance.
> >
> > I agree CMA/alloc_contig_range()/memory unplug would be corner cases,
> > the major cases would be THP, NUMA balancing while we could totally
> > disable them but it seems insensible to do that only because there is
> > a process using SVA in the system.
> 
> Can't you use huge pages in your application that uses SVA and prevent
> THP/NUMA balancing from kicking in?

Yes. That's exactly we have done in userspace for the applications which
can directly call UADK (the user-space accelerator framework based on
uacce) to use accelerators for zip, encryption:

+-------------------------------------------+
 |                                           |
 |applications using accelerators            |
 +-------------------------------------------+


     alloc from pool             free to pool
           +                      ++
           |                       |
           |                       |
           |                       |
           |                       |
           |                       |
           |                       |
           |                       |
+----------+-----------------------+---------+
|                                            |
|                                            |
|      HugeTLB memory pool                   |
|                                            |
|                                            |
+--------------------------------------------+

Those applications can get memory from the hugetlb pool and avoid
IO page faults.

The problem is that not every application can do this. Many applications
such as Nginx, Ceph,  are just calling zlib/openssl to use accelerators,
they are not calling the UADK pool based on HugeTLB and they are not
customized.

"vm.compact_unevictable_allowed=0 + mlock + numa_balancing disabling"
which David Rientjes mentioned seem to be a good direction to
investigate on. but it would be better if those settings only affect
the specific process using SVA.

> 
> --
> Thanks,
> 
> David / dhildenb

Thanks
Barry
Jason Gunthorpe Feb. 8, 2021, 9:30 p.m. UTC | #16
On Mon, Feb 08, 2021 at 08:35:31PM +0000, Song Bao Hua (Barry Song) wrote:
> 
> 
> > From: Jason Gunthorpe [mailto:jgg@ziepe.ca]
> > Sent: Tuesday, February 9, 2021 7:34 AM
> > To: David Hildenbrand <david@redhat.com>
> > Cc: Wangzhou (B) <wangzhou1@hisilicon.com>; linux-kernel@vger.kernel.org;
> > iommu@lists.linux-foundation.org; linux-mm@kvack.org;
> > linux-arm-kernel@lists.infradead.org; linux-api@vger.kernel.org; Andrew
> > Morton <akpm@linux-foundation.org>; Alexander Viro <viro@zeniv.linux.org.uk>;
> > gregkh@linuxfoundation.org; Song Bao Hua (Barry Song)
> > <song.bao.hua@hisilicon.com>; kevin.tian@intel.com;
> > jean-philippe@linaro.org; eric.auger@redhat.com; Liguozhu (Kenneth)
> > <liguozhu@hisilicon.com>; zhangfei.gao@linaro.org; chensihang (A)
> > <chensihang1@hisilicon.com>
> > Subject: Re: [RFC PATCH v3 1/2] mempinfd: Add new syscall to provide memory
> > pin
> > 
> > On Mon, Feb 08, 2021 at 09:14:28AM +0100, David Hildenbrand wrote:
> > 
> > > People are constantly struggling with the effects of long term pinnings
> > > under user space control, like we already have with vfio and RDMA.
> > >
> > > And here we are, adding yet another, easier way to mess with core MM in the
> > > same way. This feels like a step backwards to me.
> > 
> > Yes, this seems like a very poor candidate to be a system call in this
> > format. Much too narrow, poorly specified, and possibly security
> > implications to allow any process whatsoever to pin memory.
> > 
> > I keep encouraging people to explore a standard shared SVA interface
> > that can cover all these topics (and no, uaccel is not that
> > interface), that seems much more natural.
> > 
> > I still haven't seen an explanation why DMA is so special here,
> > migration and so forth jitter the CPU too, environments that care
> > about jitter have to turn this stuff off.
> 
> This paper has a good explanation:
> https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7482091
> 
> mainly because page fault can go directly to the CPU and we have
> many CPUs. But IO Page Faults go a different way, thus mean much
> higher latency 3-80x slower than page fault:
> events in hardware queue -> Interrupts -> cpu processing page fault
> -> return events to iommu/device -> continue I/O.

The justifications for this was migration scenarios and migration is
short. If you take a fault on what you are migrating only then does it
slow down the CPU.

Are you also working with HW where the IOMMU becomes invalidated after
a migration and doesn't reload?

ie not true SVA but the sort of emulated SVA we see in a lot of
places?

It would be much better to work improve that to have closer sync with the
CPU page table than to use pinning.

Jason
Song Bao Hua (Barry Song) Feb. 9, 2021, 3:01 a.m. UTC | #17
> -----Original Message-----
> From: Jason Gunthorpe [mailto:jgg@ziepe.ca]
> Sent: Tuesday, February 9, 2021 10:30 AM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> Cc: David Hildenbrand <david@redhat.com>; Wangzhou (B)
> <wangzhou1@hisilicon.com>; linux-kernel@vger.kernel.org;
> iommu@lists.linux-foundation.org; linux-mm@kvack.org;
> linux-arm-kernel@lists.infradead.org; linux-api@vger.kernel.org; Andrew
> Morton <akpm@linux-foundation.org>; Alexander Viro <viro@zeniv.linux.org.uk>;
> gregkh@linuxfoundation.org; kevin.tian@intel.com; jean-philippe@linaro.org;
> eric.auger@redhat.com; Liguozhu (Kenneth) <liguozhu@hisilicon.com>;
> zhangfei.gao@linaro.org; chensihang (A) <chensihang1@hisilicon.com>
> Subject: Re: [RFC PATCH v3 1/2] mempinfd: Add new syscall to provide memory
> pin
> 
> On Mon, Feb 08, 2021 at 08:35:31PM +0000, Song Bao Hua (Barry Song) wrote:
> >
> >
> > > From: Jason Gunthorpe [mailto:jgg@ziepe.ca]
> > > Sent: Tuesday, February 9, 2021 7:34 AM
> > > To: David Hildenbrand <david@redhat.com>
> > > Cc: Wangzhou (B) <wangzhou1@hisilicon.com>; linux-kernel@vger.kernel.org;
> > > iommu@lists.linux-foundation.org; linux-mm@kvack.org;
> > > linux-arm-kernel@lists.infradead.org; linux-api@vger.kernel.org; Andrew
> > > Morton <akpm@linux-foundation.org>; Alexander Viro
> <viro@zeniv.linux.org.uk>;
> > > gregkh@linuxfoundation.org; Song Bao Hua (Barry Song)
> > > <song.bao.hua@hisilicon.com>; kevin.tian@intel.com;
> > > jean-philippe@linaro.org; eric.auger@redhat.com; Liguozhu (Kenneth)
> > > <liguozhu@hisilicon.com>; zhangfei.gao@linaro.org; chensihang (A)
> > > <chensihang1@hisilicon.com>
> > > Subject: Re: [RFC PATCH v3 1/2] mempinfd: Add new syscall to provide memory
> > > pin
> > >
> > > On Mon, Feb 08, 2021 at 09:14:28AM +0100, David Hildenbrand wrote:
> > >
> > > > People are constantly struggling with the effects of long term pinnings
> > > > under user space control, like we already have with vfio and RDMA.
> > > >
> > > > And here we are, adding yet another, easier way to mess with core MM in
> the
> > > > same way. This feels like a step backwards to me.
> > >
> > > Yes, this seems like a very poor candidate to be a system call in this
> > > format. Much too narrow, poorly specified, and possibly security
> > > implications to allow any process whatsoever to pin memory.
> > >
> > > I keep encouraging people to explore a standard shared SVA interface
> > > that can cover all these topics (and no, uaccel is not that
> > > interface), that seems much more natural.
> > >
> > > I still haven't seen an explanation why DMA is so special here,
> > > migration and so forth jitter the CPU too, environments that care
> > > about jitter have to turn this stuff off.
> >
> > This paper has a good explanation:
> > https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7482091
> >
> > mainly because page fault can go directly to the CPU and we have
> > many CPUs. But IO Page Faults go a different way, thus mean much
> > higher latency 3-80x slower than page fault:
> > events in hardware queue -> Interrupts -> cpu processing page fault
> > -> return events to iommu/device -> continue I/O.
> 
> The justifications for this was migration scenarios and migration is
> short. If you take a fault on what you are migrating only then does it
> slow down the CPU.

I agree this can slow down CPU, but not as much as IO page fault.

On the other hand, wouldn't it be the benefit of hardware accelerators
to have a lower and more stable latency zip/encryption than CPU?

> 
> Are you also working with HW where the IOMMU becomes invalidated after
> a migration and doesn't reload?
> 
> ie not true SVA but the sort of emulated SVA we see in a lot of
> places?

Yes. It is true SVA not emulated SVA.

> 
> It would be much better to work improve that to have closer sync with the
> CPU page table than to use pinning.

Absolutely I agree improving IOPF and making IOPF catch up with the 
performance of page fault is the best way. but it would take much
long time to optimize both HW and SW. While waiting for them to
mature, probably some way which can minimize IOPF should be used to
take the responsivity.

> 
> Jason

Thanks
Barry
Zhou Wang Feb. 9, 2021, 9:02 a.m. UTC | #18
On 2021/2/8 5:34, Matthew Wilcox wrote:
> On Sun, Feb 07, 2021 at 04:18:03PM +0800, Zhou Wang wrote:
>> SVA(share virtual address) offers a way for device to share process virtual
>> address space safely, which makes more convenient for user space device
>> driver coding. However, IO page faults may happen when doing DMA
>> operations. As the latency of IO page fault is relatively big, DMA
>> performance will be affected severely when there are IO page faults.
>> >From a long term view, DMA performance will be not stable.
>>
>> In high-performance I/O cases, accelerators might want to perform
>> I/O on a memory without IO page faults which can result in dramatically
>> increased latency. Current memory related APIs could not achieve this
>> requirement, e.g. mlock can only avoid memory to swap to backup device,
>> page migration can still trigger IO page fault.
> 
> Well ... we have two requirements.  The application wants to not take
> page faults.  The system wants to move the application to a different
> NUMA node in order to optimise overall performance.  Why should the
> application's desires take precedence over the kernel's desires?  And why
> should it be done this way rather than by the sysadmin using numactl to
> lock the application to a particular node?

Just as Barry mentioned, there are other cases which could trigger IOPF.
Only numactl is not enough.

> 
>> +struct mem_pin_container {
>> +	struct xarray array;
>> +	struct mutex lock;
>> +};
> 
> I don't understand what the lock actually protects.

This lock protects pin/unpin and record/remove.
 - pin pages and record them
 - unpin pages and remove them
should be exlusive.

> 
>> +struct pin_pages {
>> +	unsigned long first;
>> +	unsigned long nr_pages;
>> +	struct page **pages;
>> +};
> 
> I don't think you need 'first', and I think you can embed the pages
> array into this struct, removing one allocation.

'first' will be recorded and be used to unpin later. We use it
as an index to get pinned pages and do unpin operation.

> 
>> +	xa_for_each(&priv->array, idx, p) {
>> +		unpin_user_pages(p->pages, p->nr_pages);
>> +		xa_erase(&priv->array, p->first);
>> +		vfree(p->pages);
>> +		kfree(p);
>> +	}
>> +
>> +	mutex_destroy(&priv->lock);
>> +	xa_destroy(&priv->array);
> 
> If you just called xa_erase() on every element of the array, you don't need
> to call xa_destroy().

OK.

> 
>> +	if (!can_do_mlock())
>> +		return -EPERM;
>
> You check for can_do_mlock(), but you don't account the pages to this
> rlimit.

Here I just copied it from ib_umen_get and do_mlock. If needed, we can
add account for pages here.

> 
>> +	first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
> 
> You don't need to mask off the bits, the shift will remove them.

OK.

> 
>> +	last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
> 
> DIV_ROUND_UP()?

addr->size is input pin page size which is not same as PAGE_SIZE.
So seems we can not use this macro.

> 
>> +	pages = vmalloc(nr_pages * sizeof(struct page *));
> 
> kvmalloc().  vmalloc() always allocates at least a page, so we want to
> use kmalloc if the size is small.  Also, use array_size() -- I know this
> can't overflow, but let's be clear

Yes, will use kvmalloc and array_size here.

> 
>> +	ret = pin_user_pages_fast(addr->addr & PAGE_MASK, nr_pages,
>> +				  flags | FOLL_LONGTERM, pages);
>> +	if (ret != nr_pages) {
>> +		pr_err("mempinfd: Failed to pin page\n");
> 
> No.  You mustn't allow the user to be able to generate messages to syslog,
> just by passing garbage to a syscall.

OK, will remove this.

> 
>> +	ret = xa_insert(&priv->array, p->first, p, GFP_KERNEL);
>> +	if (ret)
>> +		goto unpin_pages;
> 
> Hmm.  So we can't pin two ranges which start at the same address, but we
> can pin two overlapping ranges.  Is that OK?

The design here is only supporting pin a range with different start address.
If two overlapping ranges with different start address, we will not prevent
this. If this will not go wrong, let's make it simple firstly.

Best,
Zhou

> 
> 
> .
>
Zhou Wang Feb. 9, 2021, 9:17 a.m. UTC | #19
On 2021/2/8 6:02, Andy Lutomirski wrote:
> 
> 
>> On Feb 7, 2021, at 12:31 AM, Zhou Wang <wangzhou1@hisilicon.com> wrote:
>>
>> SVA(share virtual address) offers a way for device to share process virtual
>> address space safely, which makes more convenient for user space device
>> driver coding. However, IO page faults may happen when doing DMA
>> operations. As the latency of IO page fault is relatively big, DMA
>> performance will be affected severely when there are IO page faults.
>> From a long term view, DMA performance will be not stable.
>>
>> In high-performance I/O cases, accelerators might want to perform
>> I/O on a memory without IO page faults which can result in dramatically
>> increased latency. Current memory related APIs could not achieve this
>> requirement, e.g. mlock can only avoid memory to swap to backup device,
>> page migration can still trigger IO page fault.
>>
>> Various drivers working under traditional non-SVA mode are using
>> their own specific ioctl to do pin. Such ioctl can be seen in v4l2,
>> gpu, infiniband, media, vfio, etc. Drivers are usually doing dma
>> mapping while doing pin.
>>
>> But, in SVA mode, pin could be a common need which isn't necessarily
>> bound with any drivers, and neither is dma mapping needed by drivers
>> since devices are using the virtual address of CPU. Thus, It is better
>> to introduce a new common syscall for it.
>>
>> This patch leverages the design of userfaultfd and adds mempinfd for pin
>> to avoid messing up mm_struct. A fd will be got by mempinfd, then user
>> space can do pin/unpin pages by ioctls of this fd, all pinned pages under
>> one file will be unpinned in file release process. Like pin page cases in
>> other places, can_do_mlock is used to check permission and input
>> parameters.
> 
> 
> Can you document what the syscall does?

Will add related document in Documentation/vm.

> 
> Userfaultfd is an fd because one program controls another.  Is mempinfd like this?

We use mempinfd like: (see patch 2/2)

fd = mempinfd();
va = malloc(size);
struct mem_pin_address addr;
addr.va = va;
addr.size = size;
ioctl(fd, MEM_CMD_PIN, &addr);
ioctl(fd, MEM_CMD_UNPIN, &addr);
close(fd);

Best,
Zhou

> .
>
Zhou Wang Feb. 9, 2021, 9:27 a.m. UTC | #20
On 2021/2/8 5:51, Arnd Bergmann wrote:
> On Sun, Feb 7, 2021 at 9:18 AM Zhou Wang <wangzhou1@hisilicon.com> wrote:
> 
>> diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
>> index cccfbbe..3f49529 100644
>> --- a/arch/arm64/include/asm/unistd32.h
>> +++ b/arch/arm64/include/asm/unistd32.h
>> @@ -891,6 +891,8 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2)
>>  __SYSCALL(__NR_process_madvise, sys_process_madvise)
>>  #define __NR_epoll_pwait2 441
>>  __SYSCALL(__NR_epoll_pwait2, compat_sys_epoll_pwait2)
>> +#define __NR_mempinfd 442
>> +__SYSCALL(__NR_mempinfd, sys_mempinfd)
> 
> This adds a compat syscall for 32-bit tasks running on arm64 without adding
> the same for the native arch/arm syscall table. Those two need to always
> stay synchronized. In fact, new system call should ideally get assigned
> on all architectures at the same time, with the same number (or +110
> on arch/alpha).

Thank for pointing out this. I use an ARM64 machine to test, so
currently only add it for ARM64 :)

Best,
Zhou

> 
>          Arnd
> 
> .
>
Greg Kroah-Hartman Feb. 9, 2021, 9:37 a.m. UTC | #21
On Tue, Feb 09, 2021 at 05:17:46PM +0800, Zhou Wang wrote:
> On 2021/2/8 6:02, Andy Lutomirski wrote:
> > 
> > 
> >> On Feb 7, 2021, at 12:31 AM, Zhou Wang <wangzhou1@hisilicon.com> wrote:
> >>
> >> SVA(share virtual address) offers a way for device to share process virtual
> >> address space safely, which makes more convenient for user space device
> >> driver coding. However, IO page faults may happen when doing DMA
> >> operations. As the latency of IO page fault is relatively big, DMA
> >> performance will be affected severely when there are IO page faults.
> >> From a long term view, DMA performance will be not stable.
> >>
> >> In high-performance I/O cases, accelerators might want to perform
> >> I/O on a memory without IO page faults which can result in dramatically
> >> increased latency. Current memory related APIs could not achieve this
> >> requirement, e.g. mlock can only avoid memory to swap to backup device,
> >> page migration can still trigger IO page fault.
> >>
> >> Various drivers working under traditional non-SVA mode are using
> >> their own specific ioctl to do pin. Such ioctl can be seen in v4l2,
> >> gpu, infiniband, media, vfio, etc. Drivers are usually doing dma
> >> mapping while doing pin.
> >>
> >> But, in SVA mode, pin could be a common need which isn't necessarily
> >> bound with any drivers, and neither is dma mapping needed by drivers
> >> since devices are using the virtual address of CPU. Thus, It is better
> >> to introduce a new common syscall for it.
> >>
> >> This patch leverages the design of userfaultfd and adds mempinfd for pin
> >> to avoid messing up mm_struct. A fd will be got by mempinfd, then user
> >> space can do pin/unpin pages by ioctls of this fd, all pinned pages under
> >> one file will be unpinned in file release process. Like pin page cases in
> >> other places, can_do_mlock is used to check permission and input
> >> parameters.
> > 
> > 
> > Can you document what the syscall does?
> 
> Will add related document in Documentation/vm.

A manpage is always good, and will be required eventually :)

thanks,

greg k-h
Zhou Wang Feb. 9, 2021, 11:58 a.m. UTC | #22
On 2021/2/9 17:37, Greg KH wrote:
> On Tue, Feb 09, 2021 at 05:17:46PM +0800, Zhou Wang wrote:
>> On 2021/2/8 6:02, Andy Lutomirski wrote:
>>>
>>>
>>>> On Feb 7, 2021, at 12:31 AM, Zhou Wang <wangzhou1@hisilicon.com> wrote:
>>>>
>>>> SVA(share virtual address) offers a way for device to share process virtual
>>>> address space safely, which makes more convenient for user space device
>>>> driver coding. However, IO page faults may happen when doing DMA
>>>> operations. As the latency of IO page fault is relatively big, DMA
>>>> performance will be affected severely when there are IO page faults.
>>>> From a long term view, DMA performance will be not stable.
>>>>
>>>> In high-performance I/O cases, accelerators might want to perform
>>>> I/O on a memory without IO page faults which can result in dramatically
>>>> increased latency. Current memory related APIs could not achieve this
>>>> requirement, e.g. mlock can only avoid memory to swap to backup device,
>>>> page migration can still trigger IO page fault.
>>>>
>>>> Various drivers working under traditional non-SVA mode are using
>>>> their own specific ioctl to do pin. Such ioctl can be seen in v4l2,
>>>> gpu, infiniband, media, vfio, etc. Drivers are usually doing dma
>>>> mapping while doing pin.
>>>>
>>>> But, in SVA mode, pin could be a common need which isn't necessarily
>>>> bound with any drivers, and neither is dma mapping needed by drivers
>>>> since devices are using the virtual address of CPU. Thus, It is better
>>>> to introduce a new common syscall for it.
>>>>
>>>> This patch leverages the design of userfaultfd and adds mempinfd for pin
>>>> to avoid messing up mm_struct. A fd will be got by mempinfd, then user
>>>> space can do pin/unpin pages by ioctls of this fd, all pinned pages under
>>>> one file will be unpinned in file release process. Like pin page cases in
>>>> other places, can_do_mlock is used to check permission and input
>>>> parameters.
>>>
>>>
>>> Can you document what the syscall does?
>>
>> Will add related document in Documentation/vm.
> 
> A manpage is always good, and will be required eventually :)

manpage is maintained in another repo. Do you mean add a manpage
patch in this series?

Best,
Zhou

> 
> thanks,
> 
> greg k-h
> 
> .
>
Greg Kroah-Hartman Feb. 9, 2021, 12:01 p.m. UTC | #23
On Tue, Feb 09, 2021 at 07:58:15PM +0800, Zhou Wang wrote:
> On 2021/2/9 17:37, Greg KH wrote:
> > On Tue, Feb 09, 2021 at 05:17:46PM +0800, Zhou Wang wrote:
> >> On 2021/2/8 6:02, Andy Lutomirski wrote:
> >>>
> >>>
> >>>> On Feb 7, 2021, at 12:31 AM, Zhou Wang <wangzhou1@hisilicon.com> wrote:
> >>>>
> >>>> SVA(share virtual address) offers a way for device to share process virtual
> >>>> address space safely, which makes more convenient for user space device
> >>>> driver coding. However, IO page faults may happen when doing DMA
> >>>> operations. As the latency of IO page fault is relatively big, DMA
> >>>> performance will be affected severely when there are IO page faults.
> >>>> From a long term view, DMA performance will be not stable.
> >>>>
> >>>> In high-performance I/O cases, accelerators might want to perform
> >>>> I/O on a memory without IO page faults which can result in dramatically
> >>>> increased latency. Current memory related APIs could not achieve this
> >>>> requirement, e.g. mlock can only avoid memory to swap to backup device,
> >>>> page migration can still trigger IO page fault.
> >>>>
> >>>> Various drivers working under traditional non-SVA mode are using
> >>>> their own specific ioctl to do pin. Such ioctl can be seen in v4l2,
> >>>> gpu, infiniband, media, vfio, etc. Drivers are usually doing dma
> >>>> mapping while doing pin.
> >>>>
> >>>> But, in SVA mode, pin could be a common need which isn't necessarily
> >>>> bound with any drivers, and neither is dma mapping needed by drivers
> >>>> since devices are using the virtual address of CPU. Thus, It is better
> >>>> to introduce a new common syscall for it.
> >>>>
> >>>> This patch leverages the design of userfaultfd and adds mempinfd for pin
> >>>> to avoid messing up mm_struct. A fd will be got by mempinfd, then user
> >>>> space can do pin/unpin pages by ioctls of this fd, all pinned pages under
> >>>> one file will be unpinned in file release process. Like pin page cases in
> >>>> other places, can_do_mlock is used to check permission and input
> >>>> parameters.
> >>>
> >>>
> >>> Can you document what the syscall does?
> >>
> >> Will add related document in Documentation/vm.
> > 
> > A manpage is always good, and will be required eventually :)
> 
> manpage is maintained in another repo. Do you mean add a manpage
> patch in this series?

It's good to show how it will be used, don't you think?

thanks,

greg k-h
Zhou Wang Feb. 9, 2021, 12:20 p.m. UTC | #24
On 2021/2/9 20:01, Greg KH wrote:
> On Tue, Feb 09, 2021 at 07:58:15PM +0800, Zhou Wang wrote:
>> On 2021/2/9 17:37, Greg KH wrote:
>>> On Tue, Feb 09, 2021 at 05:17:46PM +0800, Zhou Wang wrote:
>>>> On 2021/2/8 6:02, Andy Lutomirski wrote:
>>>>>
>>>>>
>>>>>> On Feb 7, 2021, at 12:31 AM, Zhou Wang <wangzhou1@hisilicon.com> wrote:
>>>>>>
>>>>>> SVA(share virtual address) offers a way for device to share process virtual
>>>>>> address space safely, which makes more convenient for user space device
>>>>>> driver coding. However, IO page faults may happen when doing DMA
>>>>>> operations. As the latency of IO page fault is relatively big, DMA
>>>>>> performance will be affected severely when there are IO page faults.
>>>>>> From a long term view, DMA performance will be not stable.
>>>>>>
>>>>>> In high-performance I/O cases, accelerators might want to perform
>>>>>> I/O on a memory without IO page faults which can result in dramatically
>>>>>> increased latency. Current memory related APIs could not achieve this
>>>>>> requirement, e.g. mlock can only avoid memory to swap to backup device,
>>>>>> page migration can still trigger IO page fault.
>>>>>>
>>>>>> Various drivers working under traditional non-SVA mode are using
>>>>>> their own specific ioctl to do pin. Such ioctl can be seen in v4l2,
>>>>>> gpu, infiniband, media, vfio, etc. Drivers are usually doing dma
>>>>>> mapping while doing pin.
>>>>>>
>>>>>> But, in SVA mode, pin could be a common need which isn't necessarily
>>>>>> bound with any drivers, and neither is dma mapping needed by drivers
>>>>>> since devices are using the virtual address of CPU. Thus, It is better
>>>>>> to introduce a new common syscall for it.
>>>>>>
>>>>>> This patch leverages the design of userfaultfd and adds mempinfd for pin
>>>>>> to avoid messing up mm_struct. A fd will be got by mempinfd, then user
>>>>>> space can do pin/unpin pages by ioctls of this fd, all pinned pages under
>>>>>> one file will be unpinned in file release process. Like pin page cases in
>>>>>> other places, can_do_mlock is used to check permission and input
>>>>>> parameters.
>>>>>
>>>>>
>>>>> Can you document what the syscall does?
>>>>
>>>> Will add related document in Documentation/vm.
>>>
>>> A manpage is always good, and will be required eventually :)
>>
>> manpage is maintained in another repo. Do you mean add a manpage
>> patch in this series?
> 
> It's good to show how it will be used, don't you think?

Agree, will add it in next version.

Thanks,
Zhou

> 
> thanks,
> 
> greg k-h
> 
> .
>
Jason Gunthorpe Feb. 9, 2021, 1:53 p.m. UTC | #25
On Tue, Feb 09, 2021 at 03:01:42AM +0000, Song Bao Hua (Barry Song) wrote:

> On the other hand, wouldn't it be the benefit of hardware accelerators
> to have a lower and more stable latency zip/encryption than CPU?

No, I don't think so.

If this is an important problem then it should apply equally to CPU
and IO jitter.

Honestly I find the idea that occasional migration jitters CPU and DMA
to not be very compelling. Such specialized applications should
allocate special pages to avoid this, not adding an API to be able to
lock down any page
 
Jason
Song Bao Hua (Barry Song) Feb. 9, 2021, 10:22 p.m. UTC | #26
> -----Original Message-----
> From: Jason Gunthorpe [mailto:jgg@ziepe.ca]
> Sent: Wednesday, February 10, 2021 2:54 AM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> Cc: David Hildenbrand <david@redhat.com>; Wangzhou (B)
> <wangzhou1@hisilicon.com>; linux-kernel@vger.kernel.org;
> iommu@lists.linux-foundation.org; linux-mm@kvack.org;
> linux-arm-kernel@lists.infradead.org; linux-api@vger.kernel.org; Andrew
> Morton <akpm@linux-foundation.org>; Alexander Viro <viro@zeniv.linux.org.uk>;
> gregkh@linuxfoundation.org; kevin.tian@intel.com; jean-philippe@linaro.org;
> eric.auger@redhat.com; Liguozhu (Kenneth) <liguozhu@hisilicon.com>;
> zhangfei.gao@linaro.org; chensihang (A) <chensihang1@hisilicon.com>
> Subject: Re: [RFC PATCH v3 1/2] mempinfd: Add new syscall to provide memory
> pin
> 
> On Tue, Feb 09, 2021 at 03:01:42AM +0000, Song Bao Hua (Barry Song) wrote:
> 
> > On the other hand, wouldn't it be the benefit of hardware accelerators
> > to have a lower and more stable latency zip/encryption than CPU?
> 
> No, I don't think so.

Fortunately or unfortunately, I think my people have this target to have
a lower-latency and more stable zip/encryption by using accelerators,
otherwise, they are going to use CPU directly if there is no advantage
of accelerators.

> 
> If this is an important problem then it should apply equally to CPU
> and IO jitter.
> 
> Honestly I find the idea that occasional migration jitters CPU and DMA
> to not be very compelling. Such specialized applications should
> allocate special pages to avoid this, not adding an API to be able to
> lock down any page

That is exactly what we have done to provide a hugeTLB pool so that
applications can allocate memory from this pool.

+-------------------------------------------+
 |                                           |
 |applications using accelerators            |
 +-------------------------------------------+


     alloc from pool             free to pool
           +                      ++
           |                       |
           |                       |
           |                       |
           |                       |
           |                       |
           |                       |
           |                       |
+----------+-----------------------+---------+
|                                            |
|                                            |
|      HugeTLB memory pool                   |
|                                            |
|                                            |
+--------------------------------------------+

The problem is that SVA declares we can use any memory of a process
to do I/O. And in real scenarios, we are unable to customize most
applications to make them use the pool. So we are looking for some
extension generically for applications such as Nginx, Ceph.

I am also thinking about leveraging vm.compact_unevictable_allowed
which David suggested and making an extension on it, for example,
permit users to disable compaction and numa balancing on unevictable
pages of SVA process,  which might be a smaller deal.

> 
> Jason

Thanks
Barry
Jason Gunthorpe Feb. 10, 2021, 6:04 p.m. UTC | #27
On Tue, Feb 09, 2021 at 10:22:47PM +0000, Song Bao Hua (Barry Song) wrote:

> The problem is that SVA declares we can use any memory of a process
> to do I/O. And in real scenarios, we are unable to customize most
> applications to make them use the pool. So we are looking for some
> extension generically for applications such as Nginx, Ceph.

But those applications will suffer jitter even if their are using CPU
to do the same work. I fail to see why adding an accelerator suddenly
means the application owner will care about jitter introduced by
migration/etc.

Again in proper SVA it should be quite unlikely to take a fault caused
by something like migration, on the same likelyhood as the CPU. If
things are faulting so much this is a problem then I think it is a
system level problem with doing too much page motion.

Jason
Matthew Wilcox Feb. 10, 2021, 6:50 p.m. UTC | #28
On Tue, Feb 09, 2021 at 08:20:18PM +0800, Zhou Wang wrote:
> Agree, will add it in next version.

No, don't do another version.  Jason is right, this approach is wrong.
The point of SVA is that it doesn't require the application to do
anything special.  If jitter from too-frequent page migration is actually
a problem, then fix the frequency of page migration.  Don't pretend that
this particular application is so important that it prevents the kernel
from doing its housekeeping.
Song Bao Hua (Barry Song) Feb. 10, 2021, 9:39 p.m. UTC | #29
> -----Original Message-----
> From: Jason Gunthorpe [mailto:jgg@ziepe.ca]
> Sent: Thursday, February 11, 2021 7:04 AM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> Cc: David Hildenbrand <david@redhat.com>; Wangzhou (B)
> <wangzhou1@hisilicon.com>; linux-kernel@vger.kernel.org;
> iommu@lists.linux-foundation.org; linux-mm@kvack.org;
> linux-arm-kernel@lists.infradead.org; linux-api@vger.kernel.org; Andrew
> Morton <akpm@linux-foundation.org>; Alexander Viro <viro@zeniv.linux.org.uk>;
> gregkh@linuxfoundation.org; kevin.tian@intel.com; jean-philippe@linaro.org;
> eric.auger@redhat.com; Liguozhu (Kenneth) <liguozhu@hisilicon.com>;
> zhangfei.gao@linaro.org; chensihang (A) <chensihang1@hisilicon.com>
> Subject: Re: [RFC PATCH v3 1/2] mempinfd: Add new syscall to provide memory
> pin
> 
> On Tue, Feb 09, 2021 at 10:22:47PM +0000, Song Bao Hua (Barry Song) wrote:
> 
> > The problem is that SVA declares we can use any memory of a process
> > to do I/O. And in real scenarios, we are unable to customize most
> > applications to make them use the pool. So we are looking for some
> > extension generically for applications such as Nginx, Ceph.
> 
> But those applications will suffer jitter even if their are using CPU
> to do the same work. I fail to see why adding an accelerator suddenly
> means the application owner will care about jitter introduced by
> migration/etc.

The only point for this is that when migration occurs on the accelerator,
the impact/jitter is much bigger than it does on CPU. Then the accelerator
might be unhelpful.

> 
> Again in proper SVA it should be quite unlikely to take a fault caused
> by something like migration, on the same likelyhood as the CPU. If
> things are faulting so much this is a problem then I think it is a
> system level problem with doing too much page motion.

My point is that single one SVA application shouldn't require system
to make global changes, such as disabling numa balancing, disabling
THP, to decrease page fault frequency by affecting other applications.

Anyway, guys are in lunar new year. Hopefully, we are getting more
real benchmark data afterwards to make the discussion more targeted.

> 
> Jason

Thanks
Barry
David Hildenbrand Feb. 11, 2021, 10:28 a.m. UTC | #30
>>
>> Again in proper SVA it should be quite unlikely to take a fault caused
>> by something like migration, on the same likelyhood as the CPU. If
>> things are faulting so much this is a problem then I think it is a
>> system level problem with doing too much page motion.
> 
> My point is that single one SVA application shouldn't require system
> to make global changes, such as disabling numa balancing, disabling
> THP, to decrease page fault frequency by affecting other applications.
> 
> Anyway, guys are in lunar new year. Hopefully, we are getting more
> real benchmark data afterwards to make the discussion more targeted.

Right, but I think functionality as proposed in this patch is highly 
unlikely to make it into the kernel. I'd be interested in approaches to 
mitigate this per process. E.g., temporarily stop the kernel from 
messing with THP of this specific process.

But even then, why should some random library make such decisions for a 
whole process? Just as, why should some random library pin pages never 
allocated by it and stop THP from being created or NUMA layout from 
getting optimized? This looks like a clear layer violation to me.

I fully agree with Jason: Why do the events happen that often such that 
your use cases are affected that heavily, such that we even need such 
ugly handling?

What mempinfd does is exposing dangerous functionality that we don't 
want 99.99996% of all user space to ever use via a syscall to generic 
users to fix broken* hw.

*broken might be over-stressing the situation, but the HW (SVA) 
obviously seems to perform way worse than ordinary CPUs.
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 86a9d7b3..949788f 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@ 
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		442
+#define __NR_compat_syscalls		443
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index cccfbbe..3f49529 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -891,6 +891,8 @@  __SYSCALL(__NR_faccessat2, sys_faccessat2)
 __SYSCALL(__NR_process_madvise, sys_process_madvise)
 #define __NR_epoll_pwait2 441
 __SYSCALL(__NR_epoll_pwait2, compat_sys_epoll_pwait2)
+#define __NR_mempinfd 442
+__SYSCALL(__NR_mempinfd, sys_mempinfd)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/fs/Makefile b/fs/Makefile
index 999d1a2..e1cbf12 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -54,6 +54,7 @@  obj-$(CONFIG_COREDUMP)		+= coredump.o
 obj-$(CONFIG_SYSCTL)		+= drop_caches.o
 
 obj-$(CONFIG_FHANDLE)		+= fhandle.o
+obj-$(CONFIG_MEMPINFD)		+= mempinfd.o
 obj-y				+= iomap/
 
 obj-y				+= quota/
diff --git a/fs/mempinfd.c b/fs/mempinfd.c
new file mode 100644
index 0000000..23d3911
--- /dev/null
+++ b/fs/mempinfd.c
@@ -0,0 +1,199 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2021 HiSilicon Limited. */
+#include <linux/anon_inodes.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/xarray.h>
+#include <uapi/linux/mempinfd.h>
+
+struct mem_pin_container {
+	struct xarray array;
+	struct mutex lock;
+};
+
+struct pin_pages {
+	unsigned long first;
+	unsigned long nr_pages;
+	struct page **pages;
+};
+
+static int mempinfd_release(struct inode *inode, struct file *file)
+{
+	struct mem_pin_container *priv = file->private_data;
+	struct pin_pages *p;
+	unsigned long idx;
+
+	xa_for_each(&priv->array, idx, p) {
+		unpin_user_pages(p->pages, p->nr_pages);
+		xa_erase(&priv->array, p->first);
+		vfree(p->pages);
+		kfree(p);
+	}
+
+	mutex_destroy(&priv->lock);
+	xa_destroy(&priv->array);
+	kfree(priv);
+
+	return 0;
+}
+
+static int mempinfd_input_check(u64 addr, u64 size)
+{
+	if (!size || addr + size < addr)
+		return -EINVAL;
+
+	if (!can_do_mlock())
+		return -EPERM;
+
+	return 0;
+}
+
+static int mem_pin_page(struct mem_pin_container *priv,
+			struct mem_pin_address *addr)
+{
+	unsigned int flags = FOLL_FORCE | FOLL_WRITE;
+	unsigned long first, last, nr_pages;
+	struct page **pages;
+	struct pin_pages *p;
+	int ret;
+
+	if (mempinfd_input_check(addr->addr, addr->size))
+		return -EINVAL;
+
+	first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
+	last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
+	nr_pages = last - first + 1;
+
+	pages = vmalloc(nr_pages * sizeof(struct page *));
+	if (!pages)
+		return -ENOMEM;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p) {
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	mutex_lock(&priv->lock);
+
+	ret = pin_user_pages_fast(addr->addr & PAGE_MASK, nr_pages,
+				  flags | FOLL_LONGTERM, pages);
+	if (ret != nr_pages) {
+		pr_err("mempinfd: Failed to pin page\n");
+		goto unlock;
+	}
+	p->first = first;
+	p->nr_pages = nr_pages;
+	p->pages = pages;
+
+	ret = xa_insert(&priv->array, p->first, p, GFP_KERNEL);
+	if (ret)
+		goto unpin_pages;
+
+	mutex_unlock(&priv->lock);
+
+	return 0;
+
+unpin_pages:
+	unpin_user_pages(pages, nr_pages);
+unlock:
+	mutex_unlock(&priv->lock);
+	kfree(p);
+free:
+	vfree(pages);
+	return ret;
+}
+
+static int mem_unpin_page(struct mem_pin_container *priv,
+			  struct mem_pin_address *addr)
+{
+	unsigned long first, last, nr_pages;
+	struct pin_pages *p;
+	int ret;
+
+	first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
+	last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
+	nr_pages = last - first + 1;
+
+	mutex_lock(&priv->lock);
+
+	p = xa_load(&priv->array, first);
+	if (!p) {
+		ret = -ENODEV;
+		goto unlock;
+	}
+
+	if (p->nr_pages != nr_pages) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	unpin_user_pages(p->pages, p->nr_pages);
+	xa_erase(&priv->array, first);
+
+	mutex_unlock(&priv->lock);
+
+	vfree(p->pages);
+	kfree(p);
+
+	return 0;
+
+unlock:
+	mutex_unlock(&priv->lock);
+	return ret;
+}
+
+static long mempinfd_ioctl(struct file *file, unsigned int cmd,
+			   unsigned long arg)
+{
+	struct mem_pin_container *p = file->private_data;
+	struct mem_pin_address addr;
+
+	if (copy_from_user(&addr, (void __user *)arg,
+			   sizeof(struct mem_pin_address)))
+		return -EFAULT;
+
+	switch (cmd) {
+	case MEM_CMD_PIN:
+		return mem_pin_page(p, &addr);
+
+	case MEM_CMD_UNPIN:
+		return mem_unpin_page(p, &addr);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+static const struct file_operations mempinfd_fops = {
+	.release	= mempinfd_release,
+	.unlocked_ioctl = mempinfd_ioctl,
+};
+
+SYSCALL_DEFINE0(mempinfd)
+{
+	struct mem_pin_container *p;
+	int fd;
+
+	WARN_ON(!current->mm);
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+	xa_init(&p->array);
+	mutex_init(&p->lock);
+
+	fd = anon_inode_getfd("[mempinfd]", &mempinfd_fops, p, O_CLOEXEC);
+	if (fd < 0) {
+		mutex_destroy(&p->lock);
+		xa_destroy(&p->array);
+		kfree(p);
+	}
+
+	return fd;
+}
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 7688bc9..0960257 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1005,6 +1005,7 @@  asmlinkage long sys_execveat(int dfd, const char __user *filename,
 			const char __user *const __user *argv,
 			const char __user *const __user *envp, int flags);
 asmlinkage long sys_userfaultfd(int flags);
+asmlinkage long sys_mempinfd(void);
 asmlinkage long sys_membarrier(int cmd, unsigned int flags, int cpu_id);
 asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
 asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 7287529..2625b62 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -861,9 +861,11 @@  __SYSCALL(__NR_faccessat2, sys_faccessat2)
 __SYSCALL(__NR_process_madvise, sys_process_madvise)
 #define __NR_epoll_pwait2 441
 __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2)
+#define __NR_mempinfd 442
+__SYSCALL(__NR_mempinfd, sys_mempinfd)
 
 #undef __NR_syscalls
-#define __NR_syscalls 442
+#define __NR_syscalls 443
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/mempinfd.h b/include/uapi/linux/mempinfd.h
new file mode 100644
index 0000000..ade3e15
--- /dev/null
+++ b/include/uapi/linux/mempinfd.h
@@ -0,0 +1,23 @@ 
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _LINUX_MEMPINFD_H
+#define _LINUX_MEMPINFD_H
+
+#include <linux/types.h>
+
+/**
+ * struct mem_pin_address - Expected pin user space address and size
+ * @addr: Address to pin
+ * @size: Size of pin address
+ */
+struct mem_pin_address {
+	__u64 addr;
+	__u64 size;
+};
+
+/* MEM_CMD_PIN: Pin a range of memory */
+#define MEM_CMD_PIN		_IOW('M', 0, struct mem_pin_address)
+
+/* UACCE_CMD_UNPIN: Unpin a range of memory */
+#define MEM_CMD_UNPIN		_IOW('M', 1, struct mem_pin_address)
+
+#endif /* _LINUX_MEMPINFD_H */
diff --git a/init/Kconfig b/init/Kconfig
index b77c60f..5f2ba55 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1731,6 +1731,12 @@  config USERFAULTFD
 	  Enable the userfaultfd() system call that allows to intercept and
 	  handle page faults in userland.
 
+config MEMPINFD
+	bool "Enable mempinfd() system call"
+	help
+	  Enable the mempinfd() system call that allows to pin/unpin memory
+	  in userland.
+
 config ARCH_HAS_MEMBARRIER_CALLBACKS
 	bool