diff mbox series

[RFC,14/15] PCI/P2PDMA: Introduce pci_mmap_p2pmem()

Message ID 20201106170036.18713-15-logang@deltatee.com (mailing list archive)
State New, archived
Headers show
Series Userspace P2PDMA with O_DIRECT NVMe devices | expand

Commit Message

Logan Gunthorpe Nov. 6, 2020, 5 p.m. UTC
Introduce pci_mmap_p2pmem() which is a helper to allocate and mmap
a hunk of p2pmem into userspace.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
---
 drivers/pci/p2pdma.c       | 104 +++++++++++++++++++++++++++++++++++++
 include/linux/pci-p2pdma.h |   6 +++
 2 files changed, 110 insertions(+)

Comments

Jason Gunthorpe Nov. 6, 2020, 5:22 p.m. UTC | #1
On Fri, Nov 06, 2020 at 10:00:35AM -0700, Logan Gunthorpe wrote:
> Introduce pci_mmap_p2pmem() which is a helper to allocate and mmap
> a hunk of p2pmem into userspace.
> 
> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
>  drivers/pci/p2pdma.c       | 104 +++++++++++++++++++++++++++++++++++++
>  include/linux/pci-p2pdma.h |   6 +++
>  2 files changed, 110 insertions(+)
> 
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index 9961e779f430..8eab53ac59ae 100644
> +++ b/drivers/pci/p2pdma.c
> @@ -16,6 +16,7 @@
>  #include <linux/genalloc.h>
>  #include <linux/memremap.h>
>  #include <linux/percpu-refcount.h>
> +#include <linux/pfn_t.h>
>  #include <linux/random.h>
>  #include <linux/seq_buf.h>
>  #include <linux/xarray.h>
> @@ -1055,3 +1056,106 @@ ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
>  	return sprintf(page, "%s\n", pci_name(p2p_dev));
>  }
>  EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show);
> +
> +struct pci_p2pdma_map {
> +	struct kref ref;
> +	struct pci_dev *pdev;
> +	void *kaddr;
> +	size_t len;
> +};

Why have this at all? Nothing uses it and no vm_operations ops are
implemented?

This is very inflexable, it would be better if this is designed like
io_remap_pfn where it just preps and fills the VMA, doesn't take
ownership of the entire VMA

Jason
Logan Gunthorpe Nov. 6, 2020, 5:28 p.m. UTC | #2
On 2020-11-06 10:22 a.m., Jason Gunthorpe wrote:
> On Fri, Nov 06, 2020 at 10:00:35AM -0700, Logan Gunthorpe wrote:
>> Introduce pci_mmap_p2pmem() which is a helper to allocate and mmap
>> a hunk of p2pmem into userspace.
>>
>> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
>>  drivers/pci/p2pdma.c       | 104 +++++++++++++++++++++++++++++++++++++
>>  include/linux/pci-p2pdma.h |   6 +++
>>  2 files changed, 110 insertions(+)
>>
>> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
>> index 9961e779f430..8eab53ac59ae 100644
>> +++ b/drivers/pci/p2pdma.c
>> @@ -16,6 +16,7 @@
>>  #include <linux/genalloc.h>
>>  #include <linux/memremap.h>
>>  #include <linux/percpu-refcount.h>
>> +#include <linux/pfn_t.h>
>>  #include <linux/random.h>
>>  #include <linux/seq_buf.h>
>>  #include <linux/xarray.h>
>> @@ -1055,3 +1056,106 @@ ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
>>  	return sprintf(page, "%s\n", pci_name(p2p_dev));
>>  }
>>  EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show);
>> +
>> +struct pci_p2pdma_map {
>> +	struct kref ref;
>> +	struct pci_dev *pdev;
>> +	void *kaddr;
>> +	size_t len;
>> +};
> 
> Why have this at all? Nothing uses it and no vm_operations ops are
> implemented?

It's necessary to free the allocated p2pmem when the mapping is torn down.

> This is very inflexable, it would be better if this is designed like
> io_remap_pfn where it just preps and fills the VMA, doesn't take
> ownership of the entire VMA

If someone wants to manage their own P2P memory and create their own
userspace mapping with vmf_insert_mixed they are free to do so. But this
helper is specifically for users of pci_p2pdma_map_alloc(). I know you
don't intend to use that but it doesn't make it any less valid.

Logan
Jason Gunthorpe Nov. 6, 2020, 5:42 p.m. UTC | #3
On Fri, Nov 06, 2020 at 10:28:00AM -0700, Logan Gunthorpe wrote:
> 
> 
> On 2020-11-06 10:22 a.m., Jason Gunthorpe wrote:
> > On Fri, Nov 06, 2020 at 10:00:35AM -0700, Logan Gunthorpe wrote:
> >> Introduce pci_mmap_p2pmem() which is a helper to allocate and mmap
> >> a hunk of p2pmem into userspace.
> >>
> >> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
> >>  drivers/pci/p2pdma.c       | 104 +++++++++++++++++++++++++++++++++++++
> >>  include/linux/pci-p2pdma.h |   6 +++
> >>  2 files changed, 110 insertions(+)
> >>
> >> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> >> index 9961e779f430..8eab53ac59ae 100644
> >> +++ b/drivers/pci/p2pdma.c
> >> @@ -16,6 +16,7 @@
> >>  #include <linux/genalloc.h>
> >>  #include <linux/memremap.h>
> >>  #include <linux/percpu-refcount.h>
> >> +#include <linux/pfn_t.h>
> >>  #include <linux/random.h>
> >>  #include <linux/seq_buf.h>
> >>  #include <linux/xarray.h>
> >> @@ -1055,3 +1056,106 @@ ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
> >>  	return sprintf(page, "%s\n", pci_name(p2p_dev));
> >>  }
> >>  EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show);
> >> +
> >> +struct pci_p2pdma_map {
> >> +	struct kref ref;
> >> +	struct pci_dev *pdev;
> >> +	void *kaddr;
> >> +	size_t len;
> >> +};
> > 
> > Why have this at all? Nothing uses it and no vm_operations ops are
> > implemented?
> 
> It's necessary to free the allocated p2pmem when the mapping is torn down.

That's suspicious.. Once in a VMA the lifetime of the page must be
controlled by the page refcount, it can't be put back into the genpool
just because the vma was destroed.

Jason
Logan Gunthorpe Nov. 6, 2020, 5:53 p.m. UTC | #4
On 2020-11-06 10:42 a.m., Jason Gunthorpe wrote:
> On Fri, Nov 06, 2020 at 10:28:00AM -0700, Logan Gunthorpe wrote:
>>
>>
>> On 2020-11-06 10:22 a.m., Jason Gunthorpe wrote:
>>> On Fri, Nov 06, 2020 at 10:00:35AM -0700, Logan Gunthorpe wrote:
>>>> Introduce pci_mmap_p2pmem() which is a helper to allocate and mmap
>>>> a hunk of p2pmem into userspace.
>>>>
>>>> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
>>>>  drivers/pci/p2pdma.c       | 104 +++++++++++++++++++++++++++++++++++++
>>>>  include/linux/pci-p2pdma.h |   6 +++
>>>>  2 files changed, 110 insertions(+)
>>>>
>>>> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
>>>> index 9961e779f430..8eab53ac59ae 100644
>>>> +++ b/drivers/pci/p2pdma.c
>>>> @@ -16,6 +16,7 @@
>>>>  #include <linux/genalloc.h>
>>>>  #include <linux/memremap.h>
>>>>  #include <linux/percpu-refcount.h>
>>>> +#include <linux/pfn_t.h>
>>>>  #include <linux/random.h>
>>>>  #include <linux/seq_buf.h>
>>>>  #include <linux/xarray.h>
>>>> @@ -1055,3 +1056,106 @@ ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
>>>>  	return sprintf(page, "%s\n", pci_name(p2p_dev));
>>>>  }
>>>>  EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show);
>>>> +
>>>> +struct pci_p2pdma_map {
>>>> +	struct kref ref;
>>>> +	struct pci_dev *pdev;
>>>> +	void *kaddr;
>>>> +	size_t len;
>>>> +};
>>>
>>> Why have this at all? Nothing uses it and no vm_operations ops are
>>> implemented?
>>
>> It's necessary to free the allocated p2pmem when the mapping is torn down.
> 
> That's suspicious.. Once in a VMA the lifetime of the page must be
> controlled by the page refcount, it can't be put back into the genpool
> just because the vma was destroed.

Ah, hmm, yes. I guess the pages have to be hooked and returned to the
genalloc through free_devmap_managed_page(). Seems like it might be
doable... but it will complicate things for users that don't want to use
the genpool (though no such users exist upstream).

Logan
Jason Gunthorpe Nov. 6, 2020, 6:09 p.m. UTC | #5
On Fri, Nov 06, 2020 at 10:53:45AM -0700, Logan Gunthorpe wrote:
> 
> 
> On 2020-11-06 10:42 a.m., Jason Gunthorpe wrote:
> > On Fri, Nov 06, 2020 at 10:28:00AM -0700, Logan Gunthorpe wrote:
> >>
> >>
> >> On 2020-11-06 10:22 a.m., Jason Gunthorpe wrote:
> >>> On Fri, Nov 06, 2020 at 10:00:35AM -0700, Logan Gunthorpe wrote:
> >>>> Introduce pci_mmap_p2pmem() which is a helper to allocate and mmap
> >>>> a hunk of p2pmem into userspace.
> >>>>
> >>>> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
> >>>>  drivers/pci/p2pdma.c       | 104 +++++++++++++++++++++++++++++++++++++
> >>>>  include/linux/pci-p2pdma.h |   6 +++
> >>>>  2 files changed, 110 insertions(+)
> >>>>
> >>>> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> >>>> index 9961e779f430..8eab53ac59ae 100644
> >>>> +++ b/drivers/pci/p2pdma.c
> >>>> @@ -16,6 +16,7 @@
> >>>>  #include <linux/genalloc.h>
> >>>>  #include <linux/memremap.h>
> >>>>  #include <linux/percpu-refcount.h>
> >>>> +#include <linux/pfn_t.h>
> >>>>  #include <linux/random.h>
> >>>>  #include <linux/seq_buf.h>
> >>>>  #include <linux/xarray.h>
> >>>> @@ -1055,3 +1056,106 @@ ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
> >>>>  	return sprintf(page, "%s\n", pci_name(p2p_dev));
> >>>>  }
> >>>>  EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show);
> >>>> +
> >>>> +struct pci_p2pdma_map {
> >>>> +	struct kref ref;
> >>>> +	struct pci_dev *pdev;
> >>>> +	void *kaddr;
> >>>> +	size_t len;
> >>>> +};
> >>>
> >>> Why have this at all? Nothing uses it and no vm_operations ops are
> >>> implemented?
> >>
> >> It's necessary to free the allocated p2pmem when the mapping is torn down.
> > 
> > That's suspicious.. Once in a VMA the lifetime of the page must be
> > controlled by the page refcount, it can't be put back into the genpool
> > just because the vma was destroed.
> 
> Ah, hmm, yes. I guess the pages have to be hooked and returned to the
> genalloc through free_devmap_managed_page(). 

That sounds about right, but in this case it doesn't need the VMA
operations.

> Seems like it might be doable... but it will complicate things for
> users that don't want to use the genpool (though no such users exist
> upstream).

I would like to use this stuff in RDMA pretty much immediately and the
genpool is harmful for those cases, so please don't make decisions
that are tying thing to genpool

Jason
Logan Gunthorpe Nov. 6, 2020, 6:20 p.m. UTC | #6
On 2020-11-06 11:09 a.m., Jason Gunthorpe wrote:
>> Ah, hmm, yes. I guess the pages have to be hooked and returned to the
>> genalloc through free_devmap_managed_page(). 
> 
> That sounds about right, but in this case it doesn't need the VMA
> operations.
> 
>> Seems like it might be doable... but it will complicate things for
>> users that don't want to use the genpool (though no such users exist
>> upstream).
> 
> I would like to use this stuff in RDMA pretty much immediately and the
> genpool is harmful for those cases, so please don't make decisions
> that are tying thing to genpool

I certainly can't make decisions for code that isn't currently upstream.
So you will almost certainly have to make changes for the code you want
to add, as is the standard procedure. I can't and should not export APIs
that you might need that have no upstream users, but you are certainly
free to send patches that create them when you add the use case.

Ultimately, if you aren't using the genpool you will have to implement
your own mmap operation that somehow allocates the pages and your own
page_free hook. The latter can be accommodated for by a patch that
splits off pci_p2pdma_add_resource() into a function that doesn't use
the genpool (I've already seen two independent developers create a
similar patch for this but with no upstream user, they couldn't be taken
upstream).

I also don't expect this to be going upstream in the near term so don't
get too excited about using it.

Logan
Jason Gunthorpe Nov. 6, 2020, 7:30 p.m. UTC | #7
On Fri, Nov 06, 2020 at 11:20:05AM -0700, Logan Gunthorpe wrote:
> 
> 
> On 2020-11-06 11:09 a.m., Jason Gunthorpe wrote:
> >> Ah, hmm, yes. I guess the pages have to be hooked and returned to the
> >> genalloc through free_devmap_managed_page(). 
> > 
> > That sounds about right, but in this case it doesn't need the VMA
> > operations.
> > 
> >> Seems like it might be doable... but it will complicate things for
> >> users that don't want to use the genpool (though no such users exist
> >> upstream).
> > 
> > I would like to use this stuff in RDMA pretty much immediately and the
> > genpool is harmful for those cases, so please don't make decisions
> > that are tying thing to genpool
> 
> I certainly can't make decisions for code that isn't currently
> upstream.

The rdma drivers are all upstream, what are you thinking about?

> Ultimately, if you aren't using the genpool you will have to implement
> your own mmap operation that somehow allocates the pages and your own
> page_free hook. 

Sure, the mlx5 driver already has a specialized alloctor for it's BAR
pages.

> I also don't expect this to be going upstream in the near term so don't
> get too excited about using it.

I don't know, it is actually not that horrible, the GUP and IOMMU
related changes are simpler than I expected

Jason
Logan Gunthorpe Nov. 6, 2020, 7:44 p.m. UTC | #8
On 2020-11-06 12:30 p.m., Jason Gunthorpe wrote:
>> I certainly can't make decisions for code that isn't currently
>> upstream.
> 
> The rdma drivers are all upstream, what are you thinking about?

Really? I feel like you should know what I mean here...

I mean upstream code that actually uses the APIs that I'd have to
introduce. I can't say here's an API feature that no code uses but the
already upstream rdma driver might use eventually. It's fairly easy to
send patches that make the necessary changes when someone adds a use of
those changes inside the rdma code.

>> Ultimately, if you aren't using the genpool you will have to implement
>> your own mmap operation that somehow allocates the pages and your own
>> page_free hook. 
> 
> Sure, the mlx5 driver already has a specialized alloctor for it's BAR
> pages.

So it *might* make sense to carve out a common helper to setup a VMA for
P2PDMA to do the vm_flags check and set VM_MIXEDMAP... but besides that,
there's no code that would be common to the two cases.

>> I also don't expect this to be going upstream in the near term so don't
>> get too excited about using it.
> 
> I don't know, it is actually not that horrible, the GUP and IOMMU
> related changes are simpler than I expected

I think the deal breaker is the SGL hack and the fact that there are
important IOMMU implementations that won't have support.

Logan
Jason Gunthorpe Nov. 6, 2020, 7:53 p.m. UTC | #9
On Fri, Nov 06, 2020 at 12:44:59PM -0700, Logan Gunthorpe wrote:
> 
> 
> On 2020-11-06 12:30 p.m., Jason Gunthorpe wrote:
> >> I certainly can't make decisions for code that isn't currently
> >> upstream.
> > 
> > The rdma drivers are all upstream, what are you thinking about?
> 
> Really? I feel like you should know what I mean here...
> 
> I mean upstream code that actually uses the APIs that I'd have to
> introduce. I can't say here's an API feature that no code uses but the
> already upstream rdma driver might use eventually. It's fairly easy to
> send patches that make the necessary changes when someone adds a use of
> those changes inside the rdma code.

Sure, but that doesn't mean you have to actively design things to be
unusable beyond this narrow case. The RDMA drivers are all there, all
upstream, if this work is accepted then the changes to insert P2P
pages into their existing mmap flows is a reasonable usecase to
consider at this point when building core code APIs.

You shouldn't add dead code, but at least have a design in mind for
what it needs to look like and some allowance.

> >> Ultimately, if you aren't using the genpool you will have to implement
> >> your own mmap operation that somehow allocates the pages and your own
> >> page_free hook. 
> > 
> > Sure, the mlx5 driver already has a specialized alloctor for it's BAR
> > pages.
> 
> So it *might* make sense to carve out a common helper to setup a VMA for
> P2PDMA to do the vm_flags check and set VM_MIXEDMAP... but besides that,
> there's no code that would be common to the two cases.

I think the whole insertion of P2PDMA pages into a VMA should be
similar to io_remap_pfn() so all the VM flags, pgprot_decrypted and
other subtle details are all in one place. (also it needs a
pgprot_decrypted before doing vmf_insert, I just learned that this
month)

> >> I also don't expect this to be going upstream in the near term so don't
> >> get too excited about using it.
> > 
> > I don't know, it is actually not that horrible, the GUP and IOMMU
> > related changes are simpler than I expected
> 
> I think the deal breaker is the SGL hack and the fact that there are
> important IOMMU implementations that won't have support.

Yes, that is pretty hacky, maybe someone will have a better idea

Jason
Logan Gunthorpe Nov. 6, 2020, 8:03 p.m. UTC | #10
On 2020-11-06 12:53 p.m., Jason Gunthorpe wrote:
> On Fri, Nov 06, 2020 at 12:44:59PM -0700, Logan Gunthorpe wrote:
>>
>>
>> On 2020-11-06 12:30 p.m., Jason Gunthorpe wrote:
>>>> I certainly can't make decisions for code that isn't currently
>>>> upstream.
>>>
>>> The rdma drivers are all upstream, what are you thinking about?
>>
>> Really? I feel like you should know what I mean here...
>>
>> I mean upstream code that actually uses the APIs that I'd have to
>> introduce. I can't say here's an API feature that no code uses but the
>> already upstream rdma driver might use eventually. It's fairly easy to
>> send patches that make the necessary changes when someone adds a use of
>> those changes inside the rdma code.
> 
> Sure, but that doesn't mean you have to actively design things to be
> unusable beyond this narrow case. The RDMA drivers are all there, all
> upstream, if this work is accepted then the changes to insert P2P
> pages into their existing mmap flows is a reasonable usecase to
> consider at this point when building core code APIs.
> 
> You shouldn't add dead code, but at least have a design in mind for
> what it needs to look like and some allowance.

I don't see anything I've done that's at odds with that. You will still
need to make changes to the p2pdma code to implement your use case.

>>>> Ultimately, if you aren't using the genpool you will have to implement
>>>> your own mmap operation that somehow allocates the pages and your own
>>>> page_free hook. 
>>>
>>> Sure, the mlx5 driver already has a specialized alloctor for it's BAR
>>> pages.
>>
>> So it *might* make sense to carve out a common helper to setup a VMA for
>> P2PDMA to do the vm_flags check and set VM_MIXEDMAP... but besides that,
>> there's no code that would be common to the two cases.
> 
> I think the whole insertion of P2PDMA pages into a VMA should be
> similar to io_remap_pfn() so all the VM flags, pgprot_decrypted and
> other subtle details are all in one place. (also it needs a
> pgprot_decrypted before doing vmf_insert, I just learned that this
> month)

I don't think a function like that will work for the p2pmem use case. In
order to implement proper page freeing I expect I'll need a loop around
the allocator and vm_insert_mixed()... Something roughly like:

for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
        vaddr = pci_alloc_p2pmem(pdev, PAGE_SIZE);
	ret = vmf_insert_mixed(vma, addr,
  		       __pfn_to_pfn_t(virt_to_pfn(vaddr), PFN_DEV | PFN_MAP));
}

That way we can call pci_free_p2pmem() when a page's ref count goes to
zero. I suspect your use case will need to do something similar.

Logan
Jason Gunthorpe Nov. 7, 2020, 12:14 a.m. UTC | #11
On Fri, Nov 06, 2020 at 01:03:26PM -0700, Logan Gunthorpe wrote:
> I don't think a function like that will work for the p2pmem use case. In
> order to implement proper page freeing I expect I'll need a loop around
> the allocator and vm_insert_mixed()... Something roughly like:
> 
> for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
>         vaddr = pci_alloc_p2pmem(pdev, PAGE_SIZE);
> 	ret = vmf_insert_mixed(vma, addr,
>   		       __pfn_to_pfn_t(virt_to_pfn(vaddr), PFN_DEV | PFN_MAP));
> }
> 
> That way we can call pci_free_p2pmem() when a page's ref count goes to
> zero. I suspect your use case will need to do something similar.

Yes, but I would say the pci_alloc_p2pmem() layer should be able to
free pages on a page-by-page basis so you don't have to do stuff like
the above.

There is often a lot of value in having physical contiguous addresses,
so allocating page by page as well seems poor.

Jason
Logan Gunthorpe Nov. 7, 2020, 2:50 a.m. UTC | #12
On 2020-11-06 5:14 p.m., Jason Gunthorpe wrote:
> On Fri, Nov 06, 2020 at 01:03:26PM -0700, Logan Gunthorpe wrote:
>> I don't think a function like that will work for the p2pmem use case. In
>> order to implement proper page freeing I expect I'll need a loop around
>> the allocator and vm_insert_mixed()... Something roughly like:
>>
>> for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
>>         vaddr = pci_alloc_p2pmem(pdev, PAGE_SIZE);
>> 	ret = vmf_insert_mixed(vma, addr,
>>   		       __pfn_to_pfn_t(virt_to_pfn(vaddr), PFN_DEV | PFN_MAP));
>> }
>>
>> That way we can call pci_free_p2pmem() when a page's ref count goes to
>> zero. I suspect your use case will need to do something similar.
> 
> Yes, but I would say the pci_alloc_p2pmem() layer should be able to
> free pages on a page-by-page basis so you don't have to do stuff like
> the above.
> 
> There is often a lot of value in having physical contiguous addresses,
> so allocating page by page as well seems poor.

Agreed. But I'll have to dig to see if genalloc supports freeing blocks
in different sizes than the allocations.

Logan
diff mbox series

Patch

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 9961e779f430..8eab53ac59ae 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -16,6 +16,7 @@ 
 #include <linux/genalloc.h>
 #include <linux/memremap.h>
 #include <linux/percpu-refcount.h>
+#include <linux/pfn_t.h>
 #include <linux/random.h>
 #include <linux/seq_buf.h>
 #include <linux/xarray.h>
@@ -1055,3 +1056,106 @@  ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
 	return sprintf(page, "%s\n", pci_name(p2p_dev));
 }
 EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show);
+
+struct pci_p2pdma_map {
+	struct kref ref;
+	struct pci_dev *pdev;
+	void *kaddr;
+	size_t len;
+};
+
+static struct pci_p2pdma_map *pci_p2pdma_map_alloc(struct pci_dev *pdev,
+						   size_t len)
+{
+	struct pci_p2pdma_map *pmap;
+
+	pmap = kzalloc(sizeof(*pmap), GFP_KERNEL);
+	if (!pmap)
+		return NULL;
+
+	kref_init(&pmap->ref);
+	pmap->pdev = pdev;
+	pmap->len = len;
+
+	pmap->kaddr = pci_alloc_p2pmem(pdev, len);
+	if (!pmap->kaddr) {
+		kfree(pmap);
+		return NULL;
+	}
+
+	return pmap;
+}
+
+static void pci_p2pdma_map_free(struct kref *ref)
+{
+	struct pci_p2pdma_map *pmap =
+		container_of(ref, struct pci_p2pdma_map, ref);
+
+	pci_free_p2pmem(pmap->pdev, pmap->kaddr, pmap->len);
+	kfree(pmap);
+}
+
+static void pci_p2pdma_vma_open(struct vm_area_struct *vma)
+{
+	struct pci_p2pdma_map *pmap = vma->vm_private_data;
+
+	kref_get(&pmap->ref);
+}
+
+static void pci_p2pdma_vma_close(struct vm_area_struct *vma)
+{
+	struct pci_p2pdma_map *pmap = vma->vm_private_data;
+
+	kref_put(&pmap->ref, pci_p2pdma_map_free);
+}
+
+const struct vm_operations_struct pci_p2pdma_vmops = {
+	.open = pci_p2pdma_vma_open,
+	.close = pci_p2pdma_vma_close,
+};
+
+/**
+ * pci_mmap_p2pmem - allocate peer-to-peer DMA memory
+ * @pdev: the device to allocate memory from
+ * @vma: the userspace vma to map the memory to
+ *
+ * Returns 0 on success, or a negative error code on failure
+ */
+int pci_mmap_p2pmem(struct pci_dev *pdev, struct vm_area_struct *vma)
+{
+	struct pci_p2pdma_map *pmap;
+	unsigned long addr, pfn;
+	vm_fault_t ret;
+
+	/* prevent private mappings from being established */
+	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
+		pci_info_ratelimited(pdev,
+				     "%s: fail, attempted private mapping\n",
+				     current->comm);
+		return -EINVAL;
+	}
+
+	pmap = pci_p2pdma_map_alloc(pdev, vma->vm_end - vma->vm_start);
+	if (!pmap)
+		return -ENOMEM;
+
+	vma->vm_flags |= VM_MIXEDMAP;
+	vma->vm_private_data = pmap;
+	vma->vm_ops = &pci_p2pdma_vmops;
+
+	pfn = virt_to_phys(pmap->kaddr) >> PAGE_SHIFT;
+	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
+		ret = vmf_insert_mixed(vma, addr,
+				       __pfn_to_pfn_t(pfn, PFN_DEV | PFN_MAP));
+		if (ret & VM_FAULT_ERROR)
+			goto out_error;
+		pfn++;
+	}
+
+	return 0;
+
+out_error:
+	kref_put(&pmap->ref, pci_p2pdma_map_free);
+	return -EFAULT;
+}
+EXPORT_SYMBOL_GPL(pci_mmap_p2pmem);
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index fc5de47eeac4..26fe40363d1c 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -40,6 +40,7 @@  int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
 			    bool *use_p2pdma);
 ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
 			       bool use_p2pdma);
+int pci_mmap_p2pmem(struct pci_dev *pdev, struct vm_area_struct *vma);
 #else /* CONFIG_PCI_P2PDMA */
 static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar,
 		size_t size, u64 offset)
@@ -116,6 +117,11 @@  static inline ssize_t pci_p2pdma_enable_show(char *page,
 {
 	return sprintf(page, "none\n");
 }
+static inline int pci_mmap_p2pmem(struct pci_dev *pdev,
+				  struct vm_area_struct *vma)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_PCI_P2PDMA */