| Message ID | 20250318094932.2643614-2-matsuda-daisuke@fujitsu.com |
|---|---|
| State | Superseded |
| Series | RDMA/rxe: RDMA FLUSH and ATOMIC WRITE with ODP |
Hi Matsuda-san

Thanks for your ODP patches. It looks good to me.

Reviewed-by: Li Zhijian <lizhijian@fujitsu.com>

However, I find myself harboring a hint of hesitation: I'm wondering if we really need to remap a page back from the back-end memory/pmem device just to do a flush operation.

I am uncertain about the circumstances under which ODP might occur. Does it possibly include these scenarios?
1) where a page has not yet been mapped
2) where a page, once mapped, is subsequently swapped out

For a pmem page:
- in case 1), it's meaningless to do the flush
- in case 2), would a pmem page be swapped out to a swap partition without flushing?

Thanks
Zhijian

On 18/03/2025 17:49, Daisuke Matsuda wrote:
> For persistent memories, add rxe_odp_flush_pmem_iova() so that ODP-specific
> steps are executed. Otherwise, no additional consideration is required.
>
> Signed-off-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
> ---
>  drivers/infiniband/sw/rxe/rxe.c      |  1 +
>  drivers/infiniband/sw/rxe/rxe_loc.h  |  7 ++++
>  drivers/infiniband/sw/rxe/rxe_mr.c   | 36 ++++++++++------
>  drivers/infiniband/sw/rxe/rxe_odp.c  | 62 ++++++++++++++++++++++++++--
>  drivers/infiniband/sw/rxe/rxe_resp.c |  4 --
>  include/rdma/ib_verbs.h              |  1 +
>  6 files changed, 91 insertions(+), 20 deletions(-)
>
> [full diff snipped; it is reproduced in the patch below]
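For context on the question above: the responder can only flush memory the CPU can address, because arch_wb_cache_pmem() takes a kernel virtual address. The sketch below is not kernel code; it is a rough illustration of what that call boils down to on x86, assuming 64-byte cache lines and clwb support (other architectures differ). This is why a present, kmap-able page is a precondition for servicing a FLUSH.

```c
/*
 * Rough sketch (not from the patch) of what arch_wb_cache_pmem()
 * amounts to on x86: write back every CPU cache line covering the
 * range. Assumes 64-byte cache lines and clwb support; a trailing
 * sfence makes the writebacks visible to the persistence domain.
 */
static void wb_cache_range_sketch(void *addr, unsigned long size)
{
	unsigned long line = 64;	/* assumed cache-line size */
	char *p = (char *)((unsigned long)addr & ~(line - 1));
	char *end = (char *)addr + size;

	for (; p < end; p += line)
		asm volatile("clwb %0" : "+m" (*(volatile char *)p));
	asm volatile("sfence" ::: "memory");
}
```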
On Thu, Mar 20, 2025 3:59 PM Li, Zhijian wrote:
> Hi Matsuda-san
>
> Thanks for your ODP patches. It looks good to me.
>
> Reviewed-by: Li Zhijian <lizhijian@fujitsu.com>

Hi,

Thanks for the review.

> However, I find myself harboring a hint of hesitation: I'm wondering if we
> really need to remap a page back from the back-end memory/pmem device just
> to do a flush operation.

That is a difficult question, but I think there are two reasons we should invoke the page fault in this case.

1) Even if the pages are surely mapped, it is possible that the target MR is truncate(2)-ed without the kernel/HW being notified of the metadata update. This could result in illegal memory access, and ODP can prevent that by updating the driver/HW-side page table.
Cf. https://lore.kernel.org/lkml/Y3UmaJil5slosqjA@unreal/T/

2) The behavior we are discussing is likely not strictly defined, so it is better to choose the safer way, since there is no penalty other than performance.

> I am uncertain about the circumstances under which ODP might occur. Does it
> possibly include these scenarios?
> 1) where a page has not yet been mapped
> 2) where a page, once mapped, is subsequently swapped out
>
> For a pmem page:
> - in case 1), it's meaningless to do the flush
> - in case 2), would a pmem page be swapped out to a swap partition without flushing?

Assuming the pmem is in fs-dax mode, I think the answer is no. We do not use the page cache, so page swap will not occur.

Regards,
Daisuke

> Thanks
> Zhijian
>
> On 18/03/2025 17:49, Daisuke Matsuda wrote:
> > For persistent memories, add rxe_odp_flush_pmem_iova() so that ODP-specific
> > steps are executed. Otherwise, no additional consideration is required.
> >
> > Signed-off-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
> > ---
> >  drivers/infiniband/sw/rxe/rxe.c      |  1 +
> >  drivers/infiniband/sw/rxe/rxe_loc.h  |  7 ++++
> >  drivers/infiniband/sw/rxe/rxe_mr.c   | 36 ++++++++++------
> >  drivers/infiniband/sw/rxe/rxe_odp.c  | 62 ++++++++++++++++++++++++++--
> >  drivers/infiniband/sw/rxe/rxe_resp.c |  4 --
> >  include/rdma/ib_verbs.h              |  1 +
> >  6 files changed, 91 insertions(+), 20 deletions(-)
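The truncate(2) hazard described in 1) can be made concrete with a short user-space sketch. This is a hypothetical illustration, not part of the patch: error handling is omitted, and the IBV_ACCESS_FLUSH_PERSISTENT flag assumes a libibverbs recent enough to ship RDMA FLUSH support.

```c
/*
 * Hypothetical reproducer for the truncate(2) scenario. An ODP MR is
 * registered over an mmap()ed fs-dax file, then the file is shrunk.
 * The MMU notifier invalidates the unmapped tail in the driver's page
 * table, so a later RDMA FLUSH must fault the range back in (or fail)
 * rather than write back stale pmem pages.
 */
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <infiniband/verbs.h>

static struct ibv_mr *odp_mr_over_fsdax(struct ibv_pd *pd, const char *path,
					 size_t len)
{
	int fd = open(path, O_RDWR);
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	/* IBV_ACCESS_ON_DEMAND requests an ODP MR; the FLUSH access flag
	 * assumes a libibverbs with FLUSH support. */
	struct ibv_mr *mr = ibv_reg_mr(pd, buf, len,
				       IBV_ACCESS_LOCAL_WRITE |
				       IBV_ACCESS_REMOTE_WRITE |
				       IBV_ACCESS_FLUSH_PERSISTENT |
				       IBV_ACCESS_ON_DEMAND);

	ftruncate(fd, len / 2);	/* second half of the MR loses its backing */
	return mr;
}
```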
```diff
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 4e56a371deb5..df66f8f9efa1 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -109,6 +109,7 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
 		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
 		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
 		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
+		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH;
 	}
 }
 
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index feb386d98d1d..0012bebe96ef 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -194,6 +194,8 @@ int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
 		    enum rxe_mr_copy_dir dir);
 int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
 		      u64 compare, u64 swap_add, u64 *orig_val);
+int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
+			    unsigned int length);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline int
 rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
@@ -212,6 +214,11 @@ rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
 {
 	return RESPST_ERR_UNSUPPORTED_OPCODE;
 }
+static inline int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
+					  unsigned int length)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
 #endif /* RXE_LOC_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 868d2f0b74e9..93e4b5acd3ac 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -424,7 +424,7 @@ int copy_data(
 	return err;
 }
 
-int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
+static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
 {
 	unsigned int page_offset;
 	unsigned long index;
@@ -433,16 +433,6 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
 	int err;
 	u8 *va;
 
-	/* mr must be valid even if length is zero */
-	if (WARN_ON(!mr))
-		return -EINVAL;
-
-	if (length == 0)
-		return 0;
-
-	if (mr->ibmr.type == IB_MR_TYPE_DMA)
-		return -EFAULT;
-
 	err = mr_check_range(mr, iova, length);
 	if (err)
 		return err;
@@ -454,7 +444,7 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
 		if (!page)
 			return -EFAULT;
 		bytes = min_t(unsigned int, length,
-				mr_page_size(mr) - page_offset);
+			      mr_page_size(mr) - page_offset);
 
 		va = kmap_local_page(page);
 		arch_wb_cache_pmem(va + page_offset, bytes);
@@ -468,6 +458,28 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
 	return 0;
 }
 
+int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 start, unsigned int length)
+{
+	int err;
+
+	/* mr must be valid even if length is zero */
+	if (WARN_ON(!mr))
+		return -EINVAL;
+
+	if (length == 0)
+		return 0;
+
+	if (mr->ibmr.type == IB_MR_TYPE_DMA)
+		return -EFAULT;
+
+	if (mr->umem->is_odp)
+		err = rxe_odp_flush_pmem_iova(mr, start, length);
+	else
+		err = rxe_mr_flush_pmem_iova(mr, start, length);
+
+	return err;
+}
+
 /* Guarantee atomicity of atomic operations at the machine level. */
 DEFINE_SPINLOCK(atomic_ops_lock);
 
diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
index 9f6e2bb2a269..9a9aae967486 100644
--- a/drivers/infiniband/sw/rxe/rxe_odp.c
+++ b/drivers/infiniband/sw/rxe/rxe_odp.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/hmm.h>
+#include <linux/libnvdimm.h>
 
 #include <rdma/ib_umem_odp.h>
 
@@ -147,6 +148,16 @@ static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp,
 	return need_fault;
 }
 
+static unsigned long rxe_odp_iova_to_index(struct ib_umem_odp *umem_odp, u64 iova)
+{
+	return (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
+}
+
+static unsigned long rxe_odp_iova_to_page_offset(struct ib_umem_odp *umem_odp, u64 iova)
+{
+	return iova & (BIT(umem_odp->page_shift) - 1);
+}
+
 static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u32 flags)
 {
 	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
@@ -190,8 +201,8 @@ static int __rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
 	size_t offset;
 	u8 *user_va;
 
-	idx = (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
-	offset = iova & (BIT(umem_odp->page_shift) - 1);
+	idx = rxe_odp_iova_to_index(umem_odp, iova);
+	offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
 
 	while (length > 0) {
 		u8 *src, *dest;
@@ -277,8 +288,8 @@ static int rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
 		return RESPST_ERR_RKEY_VIOLATION;
 	}
 
-	idx = (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
-	page_offset = iova & (BIT(umem_odp->page_shift) - 1);
+	idx = rxe_odp_iova_to_index(umem_odp, iova);
+	page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
 	page = hmm_pfn_to_page(umem_odp->pfn_list[idx]);
 	if (!page)
 		return RESPST_ERR_RKEY_VIOLATION;
@@ -324,3 +335,46 @@ int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
 
 	return err;
 }
+
+int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
+			    unsigned int length)
+{
+	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+	unsigned int page_offset;
+	unsigned long index;
+	struct page *page;
+	unsigned int bytes;
+	int err;
+	u8 *va;
+
+	err = rxe_odp_map_range_and_lock(mr, iova, length,
+					 RXE_PAGEFAULT_DEFAULT);
+	if (err)
+		return err;
+
+	while (length > 0) {
+		index = rxe_odp_iova_to_index(umem_odp, iova);
+		page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
+
+		page = hmm_pfn_to_page(umem_odp->pfn_list[index]);
+		if (!page) {
+			mutex_unlock(&umem_odp->umem_mutex);
+			return -EFAULT;
+		}
+
+		bytes = min_t(unsigned int, length,
+			      mr_page_size(mr) - page_offset);
+
+		va = kmap_local_page(page);
+		arch_wb_cache_pmem(va + page_offset, bytes);
+		kunmap_local(va);
+
+		length -= bytes;
+		iova += bytes;
+		page_offset = 0;
+	}
+
+	mutex_unlock(&umem_odp->umem_mutex);
+
+	return 0;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index 54ba9ee1acc5..304e3de740ad 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -649,10 +649,6 @@ static enum resp_states process_flush(struct rxe_qp *qp,
 	struct rxe_mr *mr = qp->resp.mr;
 	struct resp_res *res = qp->resp.res;
 
-	/* ODP is not supported right now. WIP. */
-	if (mr->umem->is_odp)
-		return RESPST_ERR_UNSUPPORTED_OPCODE;
-
 	/* oA19-14, oA19-15 */
 	if (res && res->replay)
 		return RESPST_ACKNOWLEDGE;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 9941f4185c79..da07d3e2db1d 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -325,6 +325,7 @@ enum ib_odp_transport_cap_bits {
 	IB_ODP_SUPPORT_READ = 1 << 3,
 	IB_ODP_SUPPORT_ATOMIC = 1 << 4,
 	IB_ODP_SUPPORT_SRQ_RECV = 1 << 5,
+	IB_ODP_SUPPORT_FLUSH = 1 << 6,
 };
 
 struct ib_odp_caps {
```
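The two helpers factored out in rxe_odp.c above, rxe_odp_iova_to_index() and rxe_odp_iova_to_page_offset(), are plain shift/mask arithmetic. A self-contained check with made-up numbers (a 4 KiB page size and an ODP umem starting at 0x10000; the standalone variables stand in for the umem fields):

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t umem_start = 0x10000, iova = 0x13468;
	unsigned int page_shift = 12;	/* 4 KiB pages */

	/* same logic as the rxe_odp_iova_to_* helpers */
	uint64_t index  = (iova - umem_start) >> page_shift;
	uint64_t offset = iova & ((1ULL << page_shift) - 1);

	/* 0x13468 lies 0x3468 into the umem: fourth page, byte 0x468 */
	assert(index == 3 && offset == 0x468);
	return 0;
}
```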
For persistent memories, add rxe_odp_flush_pmem_iova() so that ODP-specific
steps are executed. Otherwise, no additional consideration is required.

Signed-off-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
---
 drivers/infiniband/sw/rxe/rxe.c      |  1 +
 drivers/infiniband/sw/rxe/rxe_loc.h  |  7 ++++
 drivers/infiniband/sw/rxe/rxe_mr.c   | 36 ++++++++++------
 drivers/infiniband/sw/rxe/rxe_odp.c  | 62 ++++++++++++++++++++++++++--
 drivers/infiniband/sw/rxe/rxe_resp.c |  4 --
 include/rdma/ib_verbs.h              |  1 +
 6 files changed, 91 insertions(+), 20 deletions(-)
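As a usage note, user space could probe the new capability roughly as follows. This is a sketch: whether the kernel's IB_ODP_SUPPORT_FLUSH (1 << 6) is mirrored into libibverbs at the same bit position is an assumption here, so the raw bit value is used instead of a named constant.

```c
#include <infiniband/verbs.h>

/* Returns nonzero if the device advertises ODP FLUSH support on RC QPs,
 * assuming the kernel bit (1 << 6) is exposed unchanged to user space. */
static int rc_odp_flush_supported(struct ibv_context *ctx)
{
	struct ibv_device_attr_ex attr;

	if (ibv_query_device_ex(ctx, NULL, &attr))
		return 0;

	return !!(attr.odp_caps.per_transport_caps.rc_odp_caps & (1 << 6));
}
```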