diff mbox series

[RFC,rdma-next,01/10] RDMA: mr: Introduce is_pmem

Message ID 20211228080717.10666-2-lizhijian@cn.fujitsu.com (mailing list archive)
State Superseded
Headers show
Series RDMA/rxe: Add RDMA FLUSH operation | expand

Commit Message

Li Zhijian Dec. 28, 2021, 8:07 a.m. UTC
We can use it to indicate whether the registering mr is associated with
a pmem/nvdimm or not.

Currently, we only assign it in rxe driver, for other device/drivers,
they should implement it if needed.

RDMA FLUSH will support the persistence feature for a pmem/nvdimm.

Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
---
 drivers/infiniband/sw/rxe/rxe_mr.c | 47 ++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h            |  1 +
 2 files changed, 48 insertions(+)

Comments

Jason Gunthorpe Jan. 6, 2022, 12:21 a.m. UTC | #1
On Tue, Dec 28, 2021 at 04:07:08PM +0800, Li Zhijian wrote:
> We can use it to indicate whether the registering mr is associated with
> a pmem/nvdimm or not.
> 
> Currently, we only assign it in rxe driver, for other device/drivers,
> they should implement it if needed.
> 
> RDMA FLUSH will support the persistence feature for a pmem/nvdimm.
> 
> Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
>  drivers/infiniband/sw/rxe/rxe_mr.c | 47 ++++++++++++++++++++++++++++++
>  include/rdma/ib_verbs.h            |  1 +
>  2 files changed, 48 insertions(+)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
> index 7c4cd19a9db2..bcd5e7afa475 100644
> +++ b/drivers/infiniband/sw/rxe/rxe_mr.c
> @@ -162,6 +162,50 @@ void rxe_mr_init_dma(struct rxe_pd *pd, int access, struct rxe_mr *mr)
>  	mr->type = IB_MR_TYPE_DMA;
>  }
>  
> +// XXX: the logic is similar with mm/memory-failure.c
> +static bool page_in_dev_pagemap(struct page *page)
> +{
> +	unsigned long pfn;
> +	struct page *p;
> +	struct dev_pagemap *pgmap = NULL;
> +
> +	pfn = page_to_pfn(page);
> +	if (!pfn) {
> +		pr_err("no such pfn for page %p\n", page);
> +		return false;
> +	}
> +
> +	p = pfn_to_online_page(pfn);
> +	if (!p) {
> +		if (pfn_valid(pfn)) {
> +			pgmap = get_dev_pagemap(pfn, NULL);
> +			if (pgmap)
> +				put_dev_pagemap(pgmap);
> +		}
> +	}
> +
> +	return !!pgmap;

You need to get Dan to check this out, but I'm pretty sure this should
be more like this:

if (is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_FS_DAX)


> +static bool iova_in_pmem(struct rxe_mr *mr, u64 iova, int length)
> +{
> +	struct page *page = NULL;
> +	char *vaddr = iova_to_vaddr(mr, iova, length);
> +
> +	if (!vaddr) {
> +		pr_err("not a valid iova %llu\n", iova);
> +		return false;
> +	}
> +
> +	page = virt_to_page(vaddr);

And obviously this isn't uniform for the entire umem, so I don't even
know what this is supposed to mean.

> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 6e9ad656ecb7..822ebb3425dc 100644
> +++ b/include/rdma/ib_verbs.h
> @@ -1807,6 +1807,7 @@ struct ib_mr {
>  	unsigned int	   page_size;
>  	enum ib_mr_type	   type;
>  	bool		   need_inval;
> +	bool		   is_pmem;

Or why it is being stored in the global struct?

Jason
Zhijian Li (Fujitsu) Jan. 6, 2022, 6:12 a.m. UTC | #2
Add Dan to the party :)

May i know whether there is any existing APIs to check whether
a va/page backs to a nvdimm/pmem ?



On 06/01/2022 08:21, Jason Gunthorpe wrote:
> On Tue, Dec 28, 2021 at 04:07:08PM +0800, Li Zhijian wrote:
>> We can use it to indicate whether the registering mr is associated with
>> a pmem/nvdimm or not.
>>
>> Currently, we only assign it in rxe driver, for other device/drivers,
>> they should implement it if needed.
>>
>> RDMA FLUSH will support the persistence feature for a pmem/nvdimm.
>>
>> Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
>>   drivers/infiniband/sw/rxe/rxe_mr.c | 47 ++++++++++++++++++++++++++++++
>>   include/rdma/ib_verbs.h            |  1 +
>>   2 files changed, 48 insertions(+)
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
>> index 7c4cd19a9db2..bcd5e7afa475 100644
>> +++ b/drivers/infiniband/sw/rxe/rxe_mr.c
>> @@ -162,6 +162,50 @@ void rxe_mr_init_dma(struct rxe_pd *pd, int access, struct rxe_mr *mr)
>>   	mr->type = IB_MR_TYPE_DMA;
>>   }
>>   
>> +// XXX: the logic is similar with mm/memory-failure.c
>> +static bool page_in_dev_pagemap(struct page *page)
>> +{
>> +	unsigned long pfn;
>> +	struct page *p;
>> +	struct dev_pagemap *pgmap = NULL;
>> +
>> +	pfn = page_to_pfn(page);
>> +	if (!pfn) {
>> +		pr_err("no such pfn for page %p\n", page);
>> +		return false;
>> +	}
>> +
>> +	p = pfn_to_online_page(pfn);
>> +	if (!p) {
>> +		if (pfn_valid(pfn)) {
>> +			pgmap = get_dev_pagemap(pfn, NULL);
>> +			if (pgmap)
>> +				put_dev_pagemap(pgmap);
>> +		}
>> +	}
>> +
>> +	return !!pgmap;
> You need to get Dan to check this out, but I'm pretty sure this should
> be more like this:
>
> if (is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_FS_DAX)

Great, i have added him.



>
>
>> +static bool iova_in_pmem(struct rxe_mr *mr, u64 iova, int length)
>> +{
>> +	struct page *page = NULL;
>> +	char *vaddr = iova_to_vaddr(mr, iova, length);
>> +
>> +	if (!vaddr) {
>> +		pr_err("not a valid iova %llu\n", iova);
>> +		return false;
>> +	}
>> +
>> +	page = virt_to_page(vaddr);
> And obviously this isn't uniform for the entire umem, so I don't even
> know what this is supposed to mean.

My intention is to check if a memory region belongs to a nvdimm/pmem.
The approach is like that:
iova(user space)-+                     +-> page -> page_in_dev_pagemap()
                  |                     |
                  +-> va(kernel space) -+
Since current MR's va is associated with map_set where it record the relations
between iova and va and page. Do do you mean we should travel map_set to
get its page ? or by any other ways.

  





>> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
>> index 6e9ad656ecb7..822ebb3425dc 100644
>> +++ b/include/rdma/ib_verbs.h
>> @@ -1807,6 +1807,7 @@ struct ib_mr {
>>   	unsigned int	   page_size;
>>   	enum ib_mr_type	   type;
>>   	bool		   need_inval;
>> +	bool		   is_pmem;
> Or why it is being stored in the global struct?

Indeed, it's not strong necessary. but i think is_pmem should belongs to a ib_mr
so that it can be checked by other general code when they needed even though no
one does such checking so far.


Thanks
Zhijian



>
> Jason
Li Zhijian Jan. 14, 2022, 8:10 a.m. UTC | #3
Copied to nvdimm list

Thanks

Zhijian


on 2022/1/6 14:12, Li Zhijian wrote:
>
> Add Dan to the party :)
>
> May i know whether there is any existing APIs to check whether
> a va/page backs to a nvdimm/pmem ?
>
>
>
> On 06/01/2022 08:21, Jason Gunthorpe wrote:
>> On Tue, Dec 28, 2021 at 04:07:08PM +0800, Li Zhijian wrote:
>>> We can use it to indicate whether the registering mr is associated with
>>> a pmem/nvdimm or not.
>>>
>>> Currently, we only assign it in rxe driver, for other device/drivers,
>>> they should implement it if needed.
>>>
>>> RDMA FLUSH will support the persistence feature for a pmem/nvdimm.
>>>
>>> Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
>>>   drivers/infiniband/sw/rxe/rxe_mr.c | 47 
>>> ++++++++++++++++++++++++++++++
>>>   include/rdma/ib_verbs.h            |  1 +
>>>   2 files changed, 48 insertions(+)
>>>
>>> diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c 
>>> b/drivers/infiniband/sw/rxe/rxe_mr.c
>>> index 7c4cd19a9db2..bcd5e7afa475 100644
>>> +++ b/drivers/infiniband/sw/rxe/rxe_mr.c
>>> @@ -162,6 +162,50 @@ void rxe_mr_init_dma(struct rxe_pd *pd, int 
>>> access, struct rxe_mr *mr)
>>>       mr->type = IB_MR_TYPE_DMA;
>>>   }
>>>   +// XXX: the logic is similar with mm/memory-failure.c
>>> +static bool page_in_dev_pagemap(struct page *page)
>>> +{
>>> +    unsigned long pfn;
>>> +    struct page *p;
>>> +    struct dev_pagemap *pgmap = NULL;
>>> +
>>> +    pfn = page_to_pfn(page);
>>> +    if (!pfn) {
>>> +        pr_err("no such pfn for page %p\n", page);
>>> +        return false;
>>> +    }
>>> +
>>> +    p = pfn_to_online_page(pfn);
>>> +    if (!p) {
>>> +        if (pfn_valid(pfn)) {
>>> +            pgmap = get_dev_pagemap(pfn, NULL);
>>> +            if (pgmap)
>>> +                put_dev_pagemap(pgmap);
>>> +        }
>>> +    }
>>> +
>>> +    return !!pgmap;
>> You need to get Dan to check this out, but I'm pretty sure this should
>> be more like this:
>>
>> if (is_zone_device_page(page) && page->pgmap->type == 
>> MEMORY_DEVICE_FS_DAX)
>
> Great, i have added him.
>
>
>
>>
>>
>>> +static bool iova_in_pmem(struct rxe_mr *mr, u64 iova, int length)
>>> +{
>>> +    struct page *page = NULL;
>>> +    char *vaddr = iova_to_vaddr(mr, iova, length);
>>> +
>>> +    if (!vaddr) {
>>> +        pr_err("not a valid iova %llu\n", iova);
>>> +        return false;
>>> +    }
>>> +
>>> +    page = virt_to_page(vaddr);
>> And obviously this isn't uniform for the entire umem, so I don't even
>> know what this is supposed to mean.
>
> My intention is to check if a memory region belongs to a nvdimm/pmem.
> The approach is like that:
> iova(user space)-+                     +-> page -> page_in_dev_pagemap()
>                  |                     |
>                  +-> va(kernel space) -+
> Since current MR's va is associated with map_set where it record the 
> relations
> between iova and va and page. Do do you mean we should travel map_set to
> get its page ? or by any other ways.
>
>
>
>
>
>
>
>>> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
>>> index 6e9ad656ecb7..822ebb3425dc 100644
>>> +++ b/include/rdma/ib_verbs.h
>>> @@ -1807,6 +1807,7 @@ struct ib_mr {
>>>       unsigned int       page_size;
>>>       enum ib_mr_type       type;
>>>       bool           need_inval;
>>> +    bool           is_pmem;
>> Or why it is being stored in the global struct?
>
> Indeed, it's not strong necessary. but i think is_pmem should belongs 
> to a ib_mr
> so that it can be checked by other general code when they needed even 
> though no
> one does such checking so far.
>
>
> Thanks
> Zhijian
>
>
>
>>
>> Jason
>
Dan Williams Jan. 16, 2022, 6:11 p.m. UTC | #4
On Wed, Jan 5, 2022 at 10:13 PM lizhijian@fujitsu.com
<lizhijian@fujitsu.com> wrote:
>
>
> Add Dan to the party :)
>
> May i know whether there is any existing APIs to check whether
> a va/page backs to a nvdimm/pmem ?
>
>
>
> On 06/01/2022 08:21, Jason Gunthorpe wrote:
> > On Tue, Dec 28, 2021 at 04:07:08PM +0800, Li Zhijian wrote:
> >> We can use it to indicate whether the registering mr is associated with
> >> a pmem/nvdimm or not.
> >>
> >> Currently, we only assign it in rxe driver, for other device/drivers,
> >> they should implement it if needed.
> >>
> >> RDMA FLUSH will support the persistence feature for a pmem/nvdimm.
> >>
> >> Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
> >>   drivers/infiniband/sw/rxe/rxe_mr.c | 47 ++++++++++++++++++++++++++++++
> >>   include/rdma/ib_verbs.h            |  1 +
> >>   2 files changed, 48 insertions(+)
> >>
> >> diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
> >> index 7c4cd19a9db2..bcd5e7afa475 100644
> >> +++ b/drivers/infiniband/sw/rxe/rxe_mr.c
> >> @@ -162,6 +162,50 @@ void rxe_mr_init_dma(struct rxe_pd *pd, int access, struct rxe_mr *mr)
> >>      mr->type = IB_MR_TYPE_DMA;
> >>   }
> >>
> >> +// XXX: the logic is similar with mm/memory-failure.c
> >> +static bool page_in_dev_pagemap(struct page *page)
> >> +{
> >> +    unsigned long pfn;
> >> +    struct page *p;
> >> +    struct dev_pagemap *pgmap = NULL;
> >> +
> >> +    pfn = page_to_pfn(page);
> >> +    if (!pfn) {
> >> +            pr_err("no such pfn for page %p\n", page);
> >> +            return false;
> >> +    }
> >> +
> >> +    p = pfn_to_online_page(pfn);
> >> +    if (!p) {
> >> +            if (pfn_valid(pfn)) {
> >> +                    pgmap = get_dev_pagemap(pfn, NULL);
> >> +                    if (pgmap)
> >> +                            put_dev_pagemap(pgmap);
> >> +            }
> >> +    }
> >> +
> >> +    return !!pgmap;
> > You need to get Dan to check this out, but I'm pretty sure this should
> > be more like this:
> >
> > if (is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_FS_DAX)
>
> Great, i have added him.
>
>
>
> >
> >
> >> +static bool iova_in_pmem(struct rxe_mr *mr, u64 iova, int length)
> >> +{
> >> +    struct page *page = NULL;
> >> +    char *vaddr = iova_to_vaddr(mr, iova, length);
> >> +
> >> +    if (!vaddr) {
> >> +            pr_err("not a valid iova %llu\n", iova);
> >> +            return false;
> >> +    }
> >> +
> >> +    page = virt_to_page(vaddr);
> > And obviously this isn't uniform for the entire umem, so I don't even
> > know what this is supposed to mean.
>
> My intention is to check if a memory region belongs to a nvdimm/pmem.
> The approach is like that:
> iova(user space)-+                     +-> page -> page_in_dev_pagemap()
>                   |                     |
>                   +-> va(kernel space) -+
> Since current MR's va is associated with map_set where it record the relations
> between iova and va and page. Do do you mean we should travel map_set to
> get its page ? or by any other ways.

Apologies for the delay in responding.

The Subject line of this patch is confusing, if you want to know if a
pfn is in persistent memory the only mechanism for that is:

region_intersects(addr, length, IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY)

...there is otherwise nothing pmem specific about the dev_pagemap
infrastructure. Yes, pmem is the primary user, but it is also used for
mapping "soft-reserved" memory (See: the EFI_MEMORY_SP) attribute, and
other users.

Can you clarify the intent? I am missing some context.
Zhijian Li (Fujitsu) Jan. 18, 2022, 8:55 a.m. UTC | #5
On 17/01/2022 02:11, Dan Williams wrote:
> On Wed, Jan 5, 2022 at 10:13 PM lizhijian@fujitsu.com
> <lizhijian@fujitsu.com> wrote:
>>
>> Add Dan to the party :)
>>
>> May i know whether there is any existing APIs to check whether
>> a va/page backs to a nvdimm/pmem ?
>>
>>
>>
>> On 06/01/2022 08:21, Jason Gunthorpe wrote:
>>> On Tue, Dec 28, 2021 at 04:07:08PM +0800, Li Zhijian wrote:
>>>> We can use it to indicate whether the registering mr is associated with
>>>> a pmem/nvdimm or not.
>>>>
>>>> Currently, we only assign it in rxe driver, for other device/drivers,
>>>> they should implement it if needed.
>>>>
>>>> RDMA FLUSH will support the persistence feature for a pmem/nvdimm.
>>>>
>>>> Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
>>>>    drivers/infiniband/sw/rxe/rxe_mr.c | 47 ++++++++++++++++++++++++++++++
>>>>    include/rdma/ib_verbs.h            |  1 +
>>>>    2 files changed, 48 insertions(+)
>>>>
>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
>>>> index 7c4cd19a9db2..bcd5e7afa475 100644
>>>> +++ b/drivers/infiniband/sw/rxe/rxe_mr.c
>>>> @@ -162,6 +162,50 @@ void rxe_mr_init_dma(struct rxe_pd *pd, int access, struct rxe_mr *mr)
>>>>       mr->type = IB_MR_TYPE_DMA;
>>>>    }
>>>>
>>>> +// XXX: the logic is similar with mm/memory-failure.c
>>>> +static bool page_in_dev_pagemap(struct page *page)
>>>> +{
>>>> +    unsigned long pfn;
>>>> +    struct page *p;
>>>> +    struct dev_pagemap *pgmap = NULL;
>>>> +
>>>> +    pfn = page_to_pfn(page);
>>>> +    if (!pfn) {
>>>> +            pr_err("no such pfn for page %p\n", page);
>>>> +            return false;
>>>> +    }
>>>> +
>>>> +    p = pfn_to_online_page(pfn);
>>>> +    if (!p) {
>>>> +            if (pfn_valid(pfn)) {
>>>> +                    pgmap = get_dev_pagemap(pfn, NULL);
>>>> +                    if (pgmap)
>>>> +                            put_dev_pagemap(pgmap);
>>>> +            }
>>>> +    }
>>>> +
>>>> +    return !!pgmap;
>>> You need to get Dan to check this out, but I'm pretty sure this should
>>> be more like this:
>>>
>>> if (is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_FS_DAX)
>> Great, i have added him.
>>
>>
>>
>>>
>>>> +static bool iova_in_pmem(struct rxe_mr *mr, u64 iova, int length)
>>>> +{
>>>> +    struct page *page = NULL;
>>>> +    char *vaddr = iova_to_vaddr(mr, iova, length);
>>>> +
>>>> +    if (!vaddr) {
>>>> +            pr_err("not a valid iova %llu\n", iova);
>>>> +            return false;
>>>> +    }
>>>> +
>>>> +    page = virt_to_page(vaddr);
>>> And obviously this isn't uniform for the entire umem, so I don't even
>>> know what this is supposed to mean.
>> My intention is to check if a memory region belongs to a nvdimm/pmem.
>> The approach is like that:
>> iova(user space)-+                     +-> page -> page_in_dev_pagemap()
>>                    |                     |
>>                    +-> va(kernel space) -+
>> Since current MR's va is associated with map_set where it record the relations
>> between iova and va and page. Do do you mean we should travel map_set to
>> get its page ? or by any other ways.
> Apologies for the delay in responding.
>
> The Subject line of this patch is confusing, if you want to know if a
> pfn is in persistent memory the only mechanism for that is:
>
> region_intersects(addr, length, IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY)
>
> ...there is otherwise nothing pmem specific about the dev_pagemap
> infrastructure. Yes, pmem is the primary user, but it is also used for
> mapping "soft-reserved" memory (See: the EFI_MEMORY_SP) attribute, and
> other users.
>
> Can you clarify the intent? I am missing some context.

thanks for your help @Dan

I'm going to implement a new ibvers called RDMA FLUSH, it will support global visibility
and persistence placement type.

In my first design, only pmem can support persistence placement type, so i need to introduce
new attribute is_pmem to RDMA memory region where it associates to a user space address iova
so that i can reject a persistence placement type to DRAM(non-pmem).

Thanks
Zhijian
Dan Williams Jan. 18, 2022, 3:28 p.m. UTC | #6
On Tue, Jan 18, 2022 at 12:55 AM lizhijian@fujitsu.com
<lizhijian@fujitsu.com> wrote:
>
>
>
> On 17/01/2022 02:11, Dan Williams wrote:
> > On Wed, Jan 5, 2022 at 10:13 PM lizhijian@fujitsu.com
> > <lizhijian@fujitsu.com> wrote:
> >>
> >> Add Dan to the party :)
> >>
> >> May i know whether there is any existing APIs to check whether
> >> a va/page backs to a nvdimm/pmem ?
> >>
> >>
> >>
> >> On 06/01/2022 08:21, Jason Gunthorpe wrote:
> >>> On Tue, Dec 28, 2021 at 04:07:08PM +0800, Li Zhijian wrote:
> >>>> We can use it to indicate whether the registering mr is associated with
> >>>> a pmem/nvdimm or not.
> >>>>
> >>>> Currently, we only assign it in rxe driver, for other device/drivers,
> >>>> they should implement it if needed.
> >>>>
> >>>> RDMA FLUSH will support the persistence feature for a pmem/nvdimm.
> >>>>
> >>>> Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
> >>>>    drivers/infiniband/sw/rxe/rxe_mr.c | 47 ++++++++++++++++++++++++++++++
> >>>>    include/rdma/ib_verbs.h            |  1 +
> >>>>    2 files changed, 48 insertions(+)
> >>>>
> >>>> diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
> >>>> index 7c4cd19a9db2..bcd5e7afa475 100644
> >>>> +++ b/drivers/infiniband/sw/rxe/rxe_mr.c
> >>>> @@ -162,6 +162,50 @@ void rxe_mr_init_dma(struct rxe_pd *pd, int access, struct rxe_mr *mr)
> >>>>       mr->type = IB_MR_TYPE_DMA;
> >>>>    }
> >>>>
> >>>> +// XXX: the logic is similar with mm/memory-failure.c
> >>>> +static bool page_in_dev_pagemap(struct page *page)
> >>>> +{
> >>>> +    unsigned long pfn;
> >>>> +    struct page *p;
> >>>> +    struct dev_pagemap *pgmap = NULL;
> >>>> +
> >>>> +    pfn = page_to_pfn(page);
> >>>> +    if (!pfn) {
> >>>> +            pr_err("no such pfn for page %p\n", page);
> >>>> +            return false;
> >>>> +    }
> >>>> +
> >>>> +    p = pfn_to_online_page(pfn);
> >>>> +    if (!p) {
> >>>> +            if (pfn_valid(pfn)) {
> >>>> +                    pgmap = get_dev_pagemap(pfn, NULL);
> >>>> +                    if (pgmap)
> >>>> +                            put_dev_pagemap(pgmap);
> >>>> +            }
> >>>> +    }
> >>>> +
> >>>> +    return !!pgmap;
> >>> You need to get Dan to check this out, but I'm pretty sure this should
> >>> be more like this:
> >>>
> >>> if (is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_FS_DAX)
> >> Great, i have added him.
> >>
> >>
> >>
> >>>
> >>>> +static bool iova_in_pmem(struct rxe_mr *mr, u64 iova, int length)
> >>>> +{
> >>>> +    struct page *page = NULL;
> >>>> +    char *vaddr = iova_to_vaddr(mr, iova, length);
> >>>> +
> >>>> +    if (!vaddr) {
> >>>> +            pr_err("not a valid iova %llu\n", iova);
> >>>> +            return false;
> >>>> +    }
> >>>> +
> >>>> +    page = virt_to_page(vaddr);
> >>> And obviously this isn't uniform for the entire umem, so I don't even
> >>> know what this is supposed to mean.
> >> My intention is to check if a memory region belongs to a nvdimm/pmem.
> >> The approach is like that:
> >> iova(user space)-+                     +-> page -> page_in_dev_pagemap()
> >>                    |                     |
> >>                    +-> va(kernel space) -+
> >> Since current MR's va is associated with map_set where it record the relations
> >> between iova and va and page. Do do you mean we should travel map_set to
> >> get its page ? or by any other ways.
> > Apologies for the delay in responding.
> >
> > The Subject line of this patch is confusing, if you want to know if a
> > pfn is in persistent memory the only mechanism for that is:
> >
> > region_intersects(addr, length, IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY)
> >
> > ...there is otherwise nothing pmem specific about the dev_pagemap
> > infrastructure. Yes, pmem is the primary user, but it is also used for
> > mapping "soft-reserved" memory (See: the EFI_MEMORY_SP) attribute, and
> > other users.
> >
> > Can you clarify the intent? I am missing some context.
>
> thanks for your help @Dan
>
> I'm going to implement a new ibvers called RDMA FLUSH, it will support global visibility
> and persistence placement type.
>
> In my first design, only pmem can support persistence placement type, so i need to introduce
> new attribute is_pmem to RDMA memory region where it associates to a user space address iova
> so that i can reject a persistence placement type to DRAM(non-pmem).

Ok, I think for that case you are better served using the
IORES_DESC_PERSISTENT_MEMORY designation as the gate for attempting
the flush. The dev_pagemap is otherwise only indicating the presence
of device-backed memory pages, not persistence.
Zhijian Li (Fujitsu) Jan. 19, 2022, 2:01 a.m. UTC | #7
On 18/01/2022 23:28, Dan Williams wrote:
> On Tue, Jan 18, 2022 at 12:55 AM lizhijian@fujitsu.com
> <lizhijian@fujitsu.com> wrote:
>>
>>
>> On 17/01/2022 02:11, Dan Williams wrote:
>>> On Wed, Jan 5, 2022 at 10:13 PM lizhijian@fujitsu.com
>>> <lizhijian@fujitsu.com> wrote:
>>>> Add Dan to the party :)
>>>>
>>>> May i know whether there is any existing APIs to check whether
>>>> a va/page backs to a nvdimm/pmem ?
>>>>
>>>>
>>>>
>>>> On 06/01/2022 08:21, Jason Gunthorpe wrote:
>>>>> On Tue, Dec 28, 2021 at 04:07:08PM +0800, Li Zhijian wrote:
>>>>>> We can use it to indicate whether the registering mr is associated with
>>>>>> a pmem/nvdimm or not.
>>>>>>
>>>>>> Currently, we only assign it in rxe driver, for other device/drivers,
>>>>>> they should implement it if needed.
>>>>>>
>>>>>> RDMA FLUSH will support the persistence feature for a pmem/nvdimm.
>>>>>>
>>>>>> Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
>>>>>>     drivers/infiniband/sw/rxe/rxe_mr.c | 47 ++++++++++++++++++++++++++++++
>>>>>>     include/rdma/ib_verbs.h            |  1 +
>>>>>>     2 files changed, 48 insertions(+)
>>>>>>
>>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
>>>>>> index 7c4cd19a9db2..bcd5e7afa475 100644
>>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_mr.c
>>>>>> @@ -162,6 +162,50 @@ void rxe_mr_init_dma(struct rxe_pd *pd, int access, struct rxe_mr *mr)
>>>>>>        mr->type = IB_MR_TYPE_DMA;
>>>>>>     }
>>>>>>
>>>>>> +// XXX: the logic is similar with mm/memory-failure.c
>>>>>> +static bool page_in_dev_pagemap(struct page *page)
>>>>>> +{
>>>>>> +    unsigned long pfn;
>>>>>> +    struct page *p;
>>>>>> +    struct dev_pagemap *pgmap = NULL;
>>>>>> +
>>>>>> +    pfn = page_to_pfn(page);
>>>>>> +    if (!pfn) {
>>>>>> +            pr_err("no such pfn for page %p\n", page);
>>>>>> +            return false;
>>>>>> +    }
>>>>>> +
>>>>>> +    p = pfn_to_online_page(pfn);
>>>>>> +    if (!p) {
>>>>>> +            if (pfn_valid(pfn)) {
>>>>>> +                    pgmap = get_dev_pagemap(pfn, NULL);
>>>>>> +                    if (pgmap)
>>>>>> +                            put_dev_pagemap(pgmap);
>>>>>> +            }
>>>>>> +    }
>>>>>> +
>>>>>> +    return !!pgmap;
>>>>> You need to get Dan to check this out, but I'm pretty sure this should
>>>>> be more like this:
>>>>>
>>>>> if (is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_FS_DAX)
>>>> Great, i have added him.
>>>>
>>>>
>>>>
>>>>>> +static bool iova_in_pmem(struct rxe_mr *mr, u64 iova, int length)
>>>>>> +{
>>>>>> +    struct page *page = NULL;
>>>>>> +    char *vaddr = iova_to_vaddr(mr, iova, length);
>>>>>> +
>>>>>> +    if (!vaddr) {
>>>>>> +            pr_err("not a valid iova %llu\n", iova);
>>>>>> +            return false;
>>>>>> +    }
>>>>>> +
>>>>>> +    page = virt_to_page(vaddr);
>>>>> And obviously this isn't uniform for the entire umem, so I don't even
>>>>> know what this is supposed to mean.
>>>> My intention is to check if a memory region belongs to a nvdimm/pmem.
>>>> The approach is like that:
>>>> iova(user space)-+                     +-> page -> page_in_dev_pagemap()
>>>>                     |                     |
>>>>                     +-> va(kernel space) -+
>>>> Since current MR's va is associated with map_set where it record the relations
>>>> between iova and va and page. Do do you mean we should travel map_set to
>>>> get its page ? or by any other ways.
>>> Apologies for the delay in responding.
>>>
>>> The Subject line of this patch is confusing, if you want to know if a
>>> pfn is in persistent memory the only mechanism for that is:
>>>
>>> region_intersects(addr, length, IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY)
>>>
>>> ...there is otherwise nothing pmem specific about the dev_pagemap
>>> infrastructure. Yes, pmem is the primary user, but it is also used for
>>> mapping "soft-reserved" memory (See: the EFI_MEMORY_SP) attribute, and
>>> other users.
>>>
>>> Can you clarify the intent? I am missing some context.
>> thanks for your help @Dan
>>
>> I'm going to implement a new ibvers called RDMA FLUSH, it will support global visibility
>> and persistence placement type.
>>
>> In my first design, only pmem can support persistence placement type, so i need to introduce
>> new attribute is_pmem to RDMA memory region where it associates to a user space address iova
>> so that i can reject a persistence placement type to DRAM(non-pmem).
> Ok, I think for that case you are better served using the
> IORES_DESC_PERSISTENT_MEMORY designation as the gate for attempting
> the flush. The dev_pagemap is otherwise only indicating the presence
> of device-backed memory pages, not persistence.

Thanks a million for your kindly suggestion.

Thanks
Zhijian
Jeff Moyer Jan. 27, 2022, 10:30 p.m. UTC | #8
"Li, Zhijian" <lizhijian@cn.fujitsu.com> writes:

> Copied to nvdimm list
>
> Thanks
>
> Zhijian
>
>
> on 2022/1/6 14:12, Li Zhijian wrote:
>>
>> Add Dan to the party :)
>>
>> May i know whether there is any existing APIs to check whether
>> a va/page backs to a nvdimm/pmem ?

I don't know of one.  You could try walk_system_ram_range looking for
IORES_DESC_PERSISTENT_MEMORY, but that's not very efficient.

>>> You need to get Dan to check this out, but I'm pretty sure this should
>>> be more like this:
>>>
>>> if (is_zone_device_page(page) && page->pgmap->type ==
>>> MEMORY_DEVICE_FS_DAX)

You forgot MEMORY_DEVICE_GENERIC.  However, this doesn't guarantee the
memory belongs to persistent memory, only that it is direct access
capable.

Dan, any ideas?

-Jeff
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 7c4cd19a9db2..bcd5e7afa475 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -162,6 +162,50 @@  void rxe_mr_init_dma(struct rxe_pd *pd, int access, struct rxe_mr *mr)
 	mr->type = IB_MR_TYPE_DMA;
 }
 
+// XXX: the logic is similar with mm/memory-failure.c
+static bool page_in_dev_pagemap(struct page *page)
+{
+	unsigned long pfn;
+	struct page *p;
+	struct dev_pagemap *pgmap = NULL;
+
+	pfn = page_to_pfn(page);
+	if (!pfn) {
+		pr_err("no such pfn for page %p\n", page);
+		return false;
+	}
+
+	p = pfn_to_online_page(pfn);
+	if (!p) {
+		if (pfn_valid(pfn)) {
+			pgmap = get_dev_pagemap(pfn, NULL);
+			if (pgmap)
+				put_dev_pagemap(pgmap);
+		}
+	}
+
+	return !!pgmap;
+}
+
+static bool iova_in_pmem(struct rxe_mr *mr, u64 iova, int length)
+{
+	struct page *page = NULL;
+	char *vaddr = iova_to_vaddr(mr, iova, length);
+
+	if (!vaddr) {
+		pr_err("not a valid iova %llu\n", iova);
+		return false;
+	}
+
+	page = virt_to_page(vaddr);
+	if (!page) {
+		pr_err("no such page for vaddr %p\n", vaddr);
+		return false;
+	}
+
+	return page_in_dev_pagemap(page);
+}
+
 int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova,
 		     int access, struct rxe_mr *mr)
 {
@@ -236,6 +280,9 @@  int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova,
 	set->va = start;
 	set->offset = ib_umem_offset(umem);
 
+	// iova_in_pmem must be called after set is updated
+	mr->ibmr.is_pmem = iova_in_pmem(mr, iova, length);
+
 	return 0;
 
 err_release_umem:
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 6e9ad656ecb7..822ebb3425dc 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1807,6 +1807,7 @@  struct ib_mr {
 	unsigned int	   page_size;
 	enum ib_mr_type	   type;
 	bool		   need_inval;
+	bool		   is_pmem;
 	union {
 		struct ib_uobject	*uobject;	/* user */
 		struct list_head	qp_entry;	/* FR */