Message ID | 20250314081056.3496708-3-matsuda-daisuke@fujitsu.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | RDMA/rxe: RDMA FLUSH and ATOMIC WRITE with ODP | expand |
在 2025/3/14 9:10, Daisuke Matsuda 写道: > Add rxe_odp_do_atomic_write() so that ODP specific steps are applied to > ATOMIC WRITE requests. > > Signed-off-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com> Thanks a lot. It is better if the perftest results are also attached. Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev> Zhu Yanjun > --- > drivers/infiniband/sw/rxe/rxe.c | 1 + > drivers/infiniband/sw/rxe/rxe_loc.h | 5 +++ > drivers/infiniband/sw/rxe/rxe_mr.c | 4 -- > drivers/infiniband/sw/rxe/rxe_odp.c | 59 ++++++++++++++++++++++++++++ > drivers/infiniband/sw/rxe/rxe_resp.c | 5 ++- > include/rdma/ib_verbs.h | 1 + > 6 files changed, 70 insertions(+), 5 deletions(-) > > diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c > index df66f8f9efa1..21ce2d876b42 100644 > --- a/drivers/infiniband/sw/rxe/rxe.c > +++ b/drivers/infiniband/sw/rxe/rxe.c > @@ -110,6 +110,7 @@ static void rxe_init_device_param(struct rxe_dev *rxe) > rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; > rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; > rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH; > + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE; > } > } > > diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h > index 0012bebe96ef..8b1517c0894c 100644 > --- a/drivers/infiniband/sw/rxe/rxe_loc.h > +++ b/drivers/infiniband/sw/rxe/rxe_loc.h > @@ -196,6 +196,7 @@ int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, > u64 compare, u64 swap_add, u64 *orig_val); > int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, > unsigned int length); > +int rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); > #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ > static inline int > rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, > @@ -219,6 +220,10 @@ static inline int 
rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, > { > return -EOPNOTSUPP; > } > +static inline int rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) > +{ > + return RESPST_ERR_UNSUPPORTED_OPCODE; > +} > #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ > > #endif /* RXE_LOC_H */ > diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c > index 868d2f0b74e9..3aecb5be26d9 100644 > --- a/drivers/infiniband/sw/rxe/rxe_mr.c > +++ b/drivers/infiniband/sw/rxe/rxe_mr.c > @@ -535,10 +535,6 @@ int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) > struct page *page; > u64 *va; > > - /* ODP is not supported right now. WIP. */ > - if (mr->umem->is_odp) > - return RESPST_ERR_UNSUPPORTED_OPCODE; > - > /* See IBA oA19-28 */ > if (unlikely(mr->state != RXE_MR_STATE_VALID)) { > rxe_dbg_mr(mr, "mr not in valid state\n"); > diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c > index c1671e5efd70..79ef5fe41f8e 100644 > --- a/drivers/infiniband/sw/rxe/rxe_odp.c > +++ b/drivers/infiniband/sw/rxe/rxe_odp.c > @@ -389,3 +389,62 @@ int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, > > return 0; > } > + > +#if defined CONFIG_64BIT > +/* only implemented or called for 64 bit architectures */ > +int rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) > +{ > + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); > + unsigned int page_offset; > + unsigned long index; > + struct page *page; > + int err; > + u64 *va; > + > + /* See IBA oA19-28 */ > + if (unlikely(mr->state != RXE_MR_STATE_VALID)) { > + rxe_dbg_mr(mr, "mr not in valid state\n"); > + return RESPST_ERR_RKEY_VIOLATION; > + } > + > + /* See IBA oA19-28 */ > + err = mr_check_range(mr, iova, sizeof(value)); > + if (unlikely(err)) { > + rxe_dbg_mr(mr, "iova out of range\n"); > + return RESPST_ERR_RKEY_VIOLATION; > + } > + > + err = rxe_odp_map_range_and_lock(mr, iova, sizeof(value), > + RXE_PAGEFAULT_DEFAULT); > + if (err) 
> + return RESPST_ERR_RKEY_VIOLATION; > + > + page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); > + index = rxe_odp_iova_to_index(umem_odp, iova); > + page = hmm_pfn_to_page(umem_odp->pfn_list[index]); > + if (!page) { > + mutex_unlock(&umem_odp->umem_mutex); > + return RESPST_ERR_RKEY_VIOLATION; > + } > + /* See IBA A19.4.2 */ > + if (unlikely(page_offset & 0x7)) { > + mutex_unlock(&umem_odp->umem_mutex); > + rxe_dbg_mr(mr, "misaligned address\n"); > + return RESPST_ERR_MISALIGNED_ATOMIC; > + } > + > + va = kmap_local_page(page); > + /* Do atomic write after all prior operations have completed */ > + smp_store_release(&va[page_offset >> 3], value); > + kunmap_local(va); > + > + mutex_unlock(&umem_odp->umem_mutex); > + > + return 0; > +} > +#else > +int rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) > +{ > + return RESPST_ERR_UNSUPPORTED_OPCODE; > +} > +#endif > diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c > index dd65a8872111..1505d933c09b 100644 > --- a/drivers/infiniband/sw/rxe/rxe_resp.c > +++ b/drivers/infiniband/sw/rxe/rxe_resp.c > @@ -754,7 +754,10 @@ static enum resp_states atomic_write_reply(struct rxe_qp *qp, > value = *(u64 *)payload_addr(pkt); > iova = qp->resp.va + qp->resp.offset; > > - err = rxe_mr_do_atomic_write(mr, iova, value); > + if (mr->umem->is_odp) > + err = rxe_odp_do_atomic_write(mr, iova, value); > + else > + err = rxe_mr_do_atomic_write(mr, iova, value); > if (err) > return err; > > diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h > index da07d3e2db1d..bfa1bff3c720 100644 > --- a/include/rdma/ib_verbs.h > +++ b/include/rdma/ib_verbs.h > @@ -326,6 +326,7 @@ enum ib_odp_transport_cap_bits { > IB_ODP_SUPPORT_ATOMIC = 1 << 4, > IB_ODP_SUPPORT_SRQ_RECV = 1 << 5, > IB_ODP_SUPPORT_FLUSH = 1 << 6, > + IB_ODP_SUPPORT_ATOMIC_WRITE = 1 << 7, > }; > > struct ib_odp_caps {
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index df66f8f9efa1..21ce2d876b42 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -110,6 +110,7 @@ static void rxe_init_device_param(struct rxe_dev *rxe) rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE; } } diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 0012bebe96ef..8b1517c0894c 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -196,6 +196,7 @@ int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, u64 compare, u64 swap_add, u64 *orig_val); int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length); +int rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, @@ -219,6 +220,10 @@ static inline int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, { return -EOPNOTSUPP; } +static inline int rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) +{ + return RESPST_ERR_UNSUPPORTED_OPCODE; +} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ #endif /* RXE_LOC_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 868d2f0b74e9..3aecb5be26d9 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -535,10 +535,6 @@ int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) struct page *page; u64 *va; - /* ODP is not supported right now. WIP. 
*/ - if (mr->umem->is_odp) - return RESPST_ERR_UNSUPPORTED_OPCODE; - /* See IBA oA19-28 */ if (unlikely(mr->state != RXE_MR_STATE_VALID)) { rxe_dbg_mr(mr, "mr not in valid state\n"); diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c index c1671e5efd70..79ef5fe41f8e 100644 --- a/drivers/infiniband/sw/rxe/rxe_odp.c +++ b/drivers/infiniband/sw/rxe/rxe_odp.c @@ -389,3 +389,62 @@ int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, return 0; } + +#if defined CONFIG_64BIT +/* only implemented or called for 64 bit architectures */ +int rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + unsigned int page_offset; + unsigned long index; + struct page *page; + int err; + u64 *va; + + /* See IBA oA19-28 */ + if (unlikely(mr->state != RXE_MR_STATE_VALID)) { + rxe_dbg_mr(mr, "mr not in valid state\n"); + return RESPST_ERR_RKEY_VIOLATION; + } + + /* See IBA oA19-28 */ + err = mr_check_range(mr, iova, sizeof(value)); + if (unlikely(err)) { + rxe_dbg_mr(mr, "iova out of range\n"); + return RESPST_ERR_RKEY_VIOLATION; + } + + err = rxe_odp_map_range_and_lock(mr, iova, sizeof(value), + RXE_PAGEFAULT_DEFAULT); + if (err) + return RESPST_ERR_RKEY_VIOLATION; + + page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); + index = rxe_odp_iova_to_index(umem_odp, iova); + page = hmm_pfn_to_page(umem_odp->pfn_list[index]); + if (!page) { + mutex_unlock(&umem_odp->umem_mutex); + return RESPST_ERR_RKEY_VIOLATION; + } + /* See IBA A19.4.2 */ + if (unlikely(page_offset & 0x7)) { + mutex_unlock(&umem_odp->umem_mutex); + rxe_dbg_mr(mr, "misaligned address\n"); + return RESPST_ERR_MISALIGNED_ATOMIC; + } + + va = kmap_local_page(page); + /* Do atomic write after all prior operations have completed */ + smp_store_release(&va[page_offset >> 3], value); + kunmap_local(va); + + mutex_unlock(&umem_odp->umem_mutex); + + return 0; +} +#else +int rxe_odp_do_atomic_write(struct rxe_mr *mr, 
u64 iova, u64 value) +{ + return RESPST_ERR_UNSUPPORTED_OPCODE; +} +#endif diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index dd65a8872111..1505d933c09b 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -754,7 +754,10 @@ static enum resp_states atomic_write_reply(struct rxe_qp *qp, value = *(u64 *)payload_addr(pkt); iova = qp->resp.va + qp->resp.offset; - err = rxe_mr_do_atomic_write(mr, iova, value); + if (mr->umem->is_odp) + err = rxe_odp_do_atomic_write(mr, iova, value); + else + err = rxe_mr_do_atomic_write(mr, iova, value); if (err) return err; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index da07d3e2db1d..bfa1bff3c720 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -326,6 +326,7 @@ enum ib_odp_transport_cap_bits { IB_ODP_SUPPORT_ATOMIC = 1 << 4, IB_ODP_SUPPORT_SRQ_RECV = 1 << 5, IB_ODP_SUPPORT_FLUSH = 1 << 6, + IB_ODP_SUPPORT_ATOMIC_WRITE = 1 << 7, }; struct ib_odp_caps {
Add rxe_odp_do_atomic_write() so that ODP specific steps are applied to
ATOMIC WRITE requests.

Signed-off-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
---
 drivers/infiniband/sw/rxe/rxe.c      |  1 +
 drivers/infiniband/sw/rxe/rxe_loc.h  |  5 +++
 drivers/infiniband/sw/rxe/rxe_mr.c   |  4 --
 drivers/infiniband/sw/rxe/rxe_odp.c  | 59 ++++++++++++++++++++++++++++
 drivers/infiniband/sw/rxe/rxe_resp.c |  5 ++-
 include/rdma/ib_verbs.h              |  1 +
 6 files changed, 70 insertions(+), 5 deletions(-)