Message ID: 20220927055337.22630-6-lizhijian@fujitsu.com (mailing list archive)
State: Superseded
Delegated to: Jason Gunthorpe
Series: RDMA/rxe: Add RDMA FLUSH operation
On Tue, Sep 27, 2022 at 01:53:31PM +0800, Li Zhijian wrote:

> @@ -122,6 +129,7 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
>  	int num_buf;
>  	void *vaddr;
>  	int err;
> +	bool is_pmem = false;
>  	int i;
> 
>  	umem = ib_umem_get(&rxe->ib_dev, start, length, access);
> @@ -149,6 +157,7 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
>  	num_buf = 0;
>  	map = mr->map;
>  	if (length > 0) {
> +		is_pmem = true;
>  		buf = map[0]->buf;
> 
>  		for_each_sgtable_page (&umem->sgt_append.sgt, &sg_iter, 0) {
> @@ -166,6 +175,10 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
>  				goto err_cleanup_map;
>  			}
> 
> +			/* True only if the *whole* MR is pmem */
> +			if (is_pmem)
> +				is_pmem = vaddr_in_pmem(vaddr);
> +

I'm not so keen on this use of resources, but this should be written more
like

	phys = page_to_phys(sg_page_iter_page(&sg_iter))
	region_intersects(phys + sg_iter->offset, sg_iter->length, ...)

And you understand this will make memory registration of every RXE
user a bit slower?

And actual pmem will be painfully slow.

It seems like we are doing something wrong here..

> @@ -174,6 +187,12 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
>  		}
>  	}
> 
> +	if (!is_pmem && access & IB_ACCESS_FLUSH_PERSISTENT) {
> +		pr_warn("Cannot register IB_ACCESS_FLUSH_PERSISTENT for non-pmem memory\n");
> +		err = -EINVAL;
> +		goto err_release_umem;
> +	}

Do not pr_warn on syscall paths

Jason
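For reference, a minimal sketch of the per-page check Jason is suggesting, built on region_intersects(); the helper name rxe_page_is_pmem() is illustrative and not part of the series:

	#include <linux/ioport.h>
	#include <linux/io.h>

	/* Sketch: does this page's physical range lie entirely in
	 * persistent memory? One region_intersects() query per page.
	 */
	static bool rxe_page_is_pmem(struct page *pg)
	{
		phys_addr_t phys = page_to_phys(pg);

		return region_intersects(phys, PAGE_SIZE, IORESOURCE_MEM,
					 IORES_DESC_PERSISTENT_MEMORY) ==
		       REGION_INTERSECTS;
	}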
On 29/10/2022 01:53, Jason Gunthorpe wrote:
> On Tue, Sep 27, 2022 at 01:53:31PM +0800, Li Zhijian wrote:
>> @@ -122,6 +129,7 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
>>  	int num_buf;
>>  	void *vaddr;
>>  	int err;
>> +	bool is_pmem = false;
>>  	int i;
>> 
>>  	umem = ib_umem_get(&rxe->ib_dev, start, length, access);
>> @@ -149,6 +157,7 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
>>  	num_buf = 0;
>>  	map = mr->map;
>>  	if (length > 0) {
>> +		is_pmem = true;
>>  		buf = map[0]->buf;
>> 
>>  		for_each_sgtable_page (&umem->sgt_append.sgt, &sg_iter, 0) {
>> @@ -166,6 +175,10 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
>>  				goto err_cleanup_map;
>>  			}
>> 
>> +			/* True only if the *whole* MR is pmem */
>> +			if (is_pmem)
>> +				is_pmem = vaddr_in_pmem(vaddr);
>> +
> I'm not so keen on this use of resources, but this should be written more
> like
>
>	phys = page_to_phys(sg_page_iter_page(&sg_iter))
>	region_intersects(phys + sg_iter->offset, sg_iter->length, ...)
>
> And you understand this will make memory registration of every RXE
> user a bit slower?

Good catch, I missed this before.

I tested it in a QEMU guest in which the pmem is backed by a normal file on
the host. In that setup the check adds ~9% overhead (1.2s -> 1.3s) for a 1G
MR; most of the time is taken by gup. A real pmem environment will be tested
later.

To minimize the side effect, I updated the code to do the pmem MR check only
when require_pmem is true.

> region_intersects(phys + sg_iter->offset, sg_iter->length, ...)

I haven't fully applied this suggestion, since I think my assumption holds
that a page can only belong to a unique/same memory zone, so I only check 1
byte of each page:

index 5d014cef916e..e4e7c180fa0d 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -112,6 +112,13 @@ void rxe_mr_init_dma(int access, struct rxe_mr *mr)
 	mr->ibmr.type = IB_MR_TYPE_DMA;
 }
 
+static bool paddr_in_pmem(unsigned long paddr)
+{
+	return REGION_INTERSECTS ==
+	       region_intersects(paddr, 1, IORESOURCE_MEM,
+				 IORES_DESC_PERSISTENT_MEMORY);
+}
+
 int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
 		     int access, struct rxe_mr *mr)
 {
@@ -122,6 +129,7 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
 	int num_buf;
 	void *vaddr;
 	int err;
+	bool require_pmem = access & IB_ACCESS_FLUSH_PERSISTENT;
 
 	umem = ib_umem_get(&rxe->ib_dev, start, length, access);
 	if (IS_ERR(umem)) {
@@ -149,6 +157,7 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
 	num_buf = 0;
 	map = mr->map;
 	if (length > 0) {
+		struct page *pg;
 		buf = map[0]->buf;
 
 		for_each_sgtable_page (&umem->sgt_append.sgt, &sg_iter, 0) {
@@ -158,13 +167,20 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
 				num_buf = 0;
 			}
 
-			vaddr = page_address(sg_page_iter_page(&sg_iter));
+			pg = sg_page_iter_page(&sg_iter);
+			vaddr = page_address(pg);
 			if (!vaddr) {
 				pr_warn("%s: Unable to get virtual address\n",
 					__func__);
 				err = -ENOMEM;
 				goto err_release_umem;
 			}
+
+			if (require_pmem && !paddr_in_pmem(page_to_phys(pg))) {
+				err = -EINVAL;
+				goto err_release_umem;
+			}
+
 			buf->addr = (uintptr_t)vaddr;
 			num_buf++;
 			buf++;

> And actual pmem will be painfully slow.
>
> It seems like we are doing something wrong here..

Do you think we don't need this patch?
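For context, region_intersects() can return three results, so the single-byte paddr_in_pmem() check above leans on the stated assumption that one page never straddles a pmem boundary. A hedged sketch of the full-range form (helper name illustrative, not from the patch):

	#include <linux/ioport.h>

	/* Sketch: classify a physical range against the pmem resource
	 * tree. A 1-byte query can only be INTERSECTS or DISJOINT;
	 * REGION_MIXED needs a range straddling a resource boundary.
	 */
	static bool range_is_all_pmem(resource_size_t paddr, size_t len)
	{
		switch (region_intersects(paddr, len, IORESOURCE_MEM,
					  IORES_DESC_PERSISTENT_MEMORY)) {
		case REGION_INTERSECTS:	/* whole range is pmem */
			return true;
		case REGION_MIXED:	/* part pmem, part not */
		case REGION_DISJOINT:	/* no byte is pmem */
		default:
			return false;
		}
	}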
>> @@ -174,6 +187,12 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
>>  		}
>>  	}
>> 
>> +	if (!is_pmem && access & IB_ACCESS_FLUSH_PERSISTENT) {
>> +		pr_warn("Cannot register IB_ACCESS_FLUSH_PERSISTENT for non-pmem memory\n");
>> +		err = -EINVAL;
>> +		goto err_release_umem;
>> +	}
> Do not pr_warn on syscall paths

Got it, will remove it.

Thanks
Zhijian

>
> Jason
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 74a38d06332f..1da3ad5eba64 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -112,6 +112,13 @@ void rxe_mr_init_dma(int access, struct rxe_mr *mr)
 	mr->type = IB_MR_TYPE_DMA;
 }
 
+static bool vaddr_in_pmem(char *vaddr)
+{
+	return REGION_INTERSECTS ==
+	       region_intersects(virt_to_phys(vaddr), 1, IORESOURCE_MEM,
+				 IORES_DESC_PERSISTENT_MEMORY);
+}
+
 int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
 		     int access, struct rxe_mr *mr)
 {
@@ -122,6 +129,7 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
 	int num_buf;
 	void *vaddr;
 	int err;
+	bool is_pmem = false;
 	int i;
 
 	umem = ib_umem_get(&rxe->ib_dev, start, length, access);
@@ -149,6 +157,7 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
 	num_buf = 0;
 	map = mr->map;
 	if (length > 0) {
+		is_pmem = true;
 		buf = map[0]->buf;
 
 		for_each_sgtable_page (&umem->sgt_append.sgt, &sg_iter, 0) {
@@ -166,6 +175,10 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
 				goto err_cleanup_map;
 			}
 
+			/* True only if the *whole* MR is pmem */
+			if (is_pmem)
+				is_pmem = vaddr_in_pmem(vaddr);
+
 			buf->addr = (uintptr_t)vaddr;
 			buf->size = PAGE_SIZE;
 			num_buf++;
@@ -174,6 +187,12 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
 		}
 	}
 
+	if (!is_pmem && access & IB_ACCESS_FLUSH_PERSISTENT) {
+		pr_warn("Cannot register IB_ACCESS_FLUSH_PERSISTENT for non-pmem memory\n");
+		err = -EINVAL;
+		goto err_release_umem;
+	}
+
 	mr->umem = umem;
 	mr->access = access;
 	mr->offset = ib_umem_offset(umem);
A memory region can support at most 2 flush access flags:
IB_ACCESS_FLUSH_PERSISTENT and IB_ACCESS_FLUSH_GLOBAL.

But we only allow the user to register the persistent flush flag on a pmem
MR, which has the ability to persist data across power cycles. So
registering a persistent access flag on a non-pmem MR will be rejected.

CC: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
---
V5: make sure the whole MR is pmem
V4: set is_pmem more simply
V2: new scheme to check is_pmem # Dan
    update commit message, get rid of confusing ib_check_flush_access_flags() # Tom
---
 drivers/infiniband/sw/rxe/rxe_mr.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
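For illustration, here is how the rejection would surface to a userspace consumer. This is a sketch, not part of the patch; it assumes the rdma-core counterpart flag IBV_ACCESS_FLUSH_PERSISTENT proposed alongside this series, and the helper name is hypothetical:

	#include <errno.h>
	#include <stdio.h>
	#include <infiniband/verbs.h>

	/* Sketch: registering a persistent-flush MR over ordinary DRAM
	 * is expected to fail with EINVAL under this patch.
	 */
	static struct ibv_mr *reg_flush_mr(struct ibv_pd *pd, void *addr,
					   size_t len)
	{
		struct ibv_mr *mr = ibv_reg_mr(pd, addr, len,
					       IBV_ACCESS_LOCAL_WRITE |
					       IBV_ACCESS_REMOTE_WRITE |
					       IBV_ACCESS_FLUSH_PERSISTENT);

		if (!mr && errno == EINVAL)
			fprintf(stderr, "not pmem: persistent flush rejected\n");
		return mr;
	}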