diff mbox series

[for-next,v6,5/7] RDMA/rxe: Allow registering MRs for On-Demand Paging

Message ID 3fb02f58aa660d2d4a01bb187ce683eee23a138f.1694153251.git.matsuda-daisuke@fujitsu.com (mailing list archive)
State Superseded
Headers show
Series On-Demand Paging on SoftRoCE | expand

Commit Message

Daisuke Matsuda (Fujitsu) Sept. 8, 2023, 6:26 a.m. UTC
Allow userspace to register an ODP-enabled MR, in which case the flag
IB_ACCESS_ON_DEMAND is passed to rxe_reg_user_mr(). However, no RDMA
operations are enabled on such MRs yet; they will be supported in the
subsequent two patches.

rxe_odp_do_pagefault_and_lock() is called to initialize an ODP-enabled MR.
When called with the RXE_PAGEFAULT_SNAPSHOT flag, it syncs the process
address space from the CPU page table to the driver page table
(dma_list/pfn_list in umem_odp). Additionally, it can be used to trigger a
page fault when the pages being accessed are not present or do not have
proper read/write permissions, and possibly to prefetch pages in the future.

Signed-off-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
---
 drivers/infiniband/sw/rxe/rxe.c       |   7 ++
 drivers/infiniband/sw/rxe/rxe_loc.h   |  14 +++
 drivers/infiniband/sw/rxe/rxe_mr.c    |   9 +-
 drivers/infiniband/sw/rxe/rxe_odp.c   | 122 ++++++++++++++++++++++++++
 drivers/infiniband/sw/rxe/rxe_resp.c  |  15 +++-
 drivers/infiniband/sw/rxe/rxe_verbs.c |   5 +-
 drivers/infiniband/sw/rxe/rxe_verbs.h |   1 +
 7 files changed, 167 insertions(+), 6 deletions(-)

Comments

Jason Gunthorpe Sept. 8, 2023, 2:24 p.m. UTC | #1
On Fri, Sep 08, 2023 at 03:26:46PM +0900, Daisuke Matsuda wrote:
> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
> index 834fb1a84800..713bef9161e3 100644
> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
> @@ -32,6 +32,31 @@ static void rxe_mr_unset_xarray(struct rxe_mr *mr, unsigned long start,
>  	xas_unlock(&xas);
>  }
>  
> +static void rxe_mr_set_xarray(struct rxe_mr *mr, unsigned long start,
> +			      unsigned long end, unsigned long *pfn_list)
> +{
> +	unsigned long lower = rxe_mr_iova_to_index(mr, start);
> +	unsigned long upper = rxe_mr_iova_to_index(mr, end - 1);
> +	struct page *page;
> +	void *entry;
> +
> +	XA_STATE(xas, &mr->page_list, lower);
> +
> +	/* ib_umem_odp_unmap_dma_pages() ensures pages are HMM_PFN_VALID */
> +	xas_lock(&xas);
> +	while (true) {
> +		page = hmm_pfn_to_page(pfn_list[xas.xa_index]);
> +		xas_store(&xas, page);
> +
> +		entry = xas_next(&xas);
> +		if (xas_retry(&xas, entry) || (xas.xa_index <= upper))
> +			continue;
> +
> +		break;
> +	}

while (xas.xa_index <= upper) {
      xas_store(&xas, hmm_pfn_to_page(pfn_list[xas.xa_index]));
      xas_next(&xas);
}

Again, there is no need for retries here, since the loop runs with the xas lock held.

Jason
Zhu Yanjun Sept. 10, 2023, 8:32 a.m. UTC | #2
在 2023/9/8 14:26, Daisuke Matsuda 写道:
> Allow userspace to register an ODP-enabled MR, in which case the flag
> IB_ACCESS_ON_DEMAND is passed to rxe_reg_user_mr(). However, no RDMA
> operations are enabled on such MRs yet; they will be supported in the
> subsequent two patches.
> 
> rxe_odp_do_pagefault_and_lock() is called to initialize an ODP-enabled MR.
> When called with the RXE_PAGEFAULT_SNAPSHOT flag, it syncs the process
> address space from the CPU page table to the driver page table
> (dma_list/pfn_list in umem_odp). Additionally, it can be used to trigger a
> page fault when the pages being accessed are not present or do not have
> proper read/write permissions, and possibly to prefetch pages in the future.
> 
> Signed-off-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
> ---
>   drivers/infiniband/sw/rxe/rxe.c       |   7 ++
>   drivers/infiniband/sw/rxe/rxe_loc.h   |  14 +++
>   drivers/infiniband/sw/rxe/rxe_mr.c    |   9 +-
>   drivers/infiniband/sw/rxe/rxe_odp.c   | 122 ++++++++++++++++++++++++++
>   drivers/infiniband/sw/rxe/rxe_resp.c  |  15 +++-
>   drivers/infiniband/sw/rxe/rxe_verbs.c |   5 +-
>   drivers/infiniband/sw/rxe/rxe_verbs.h |   1 +
>   7 files changed, 167 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index 54c723a6edda..f2284d27229b 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -73,6 +73,13 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
>   			rxe->ndev->dev_addr);
>   
>   	rxe->max_ucontext			= RXE_MAX_UCONTEXT;
> +
> +	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
> +		rxe->attr.kernel_cap_flags |= IBK_ON_DEMAND_PAGING;
> +
> +		/* IB_ODP_SUPPORT_IMPLICIT is not supported right now. */
> +		rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT;
> +	}
>   }
>   
>   /* initialize port attributes */
> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
> index eb867f7d0d36..4bda154a0248 100644
> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
> @@ -188,4 +188,18 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp)
>   	return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type];
>   }
>   
> +/* rxe_odp.c */
> +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
> +int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
> +			 u64 iova, int access_flags, struct rxe_mr *mr);
> +#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
> +static inline int
> +rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
> +		     int access_flags, struct rxe_mr *mr)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
> +
>   #endif /* RXE_LOC_H */
> diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
> index 86b1908d304b..384cb4ba1f2d 100644
> --- a/drivers/infiniband/sw/rxe/rxe_mr.c
> +++ b/drivers/infiniband/sw/rxe/rxe_mr.c
> @@ -318,7 +318,10 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
>   		return err;
>   	}
>   
> -	return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
> +	if (mr->umem->is_odp)
> +		return -EOPNOTSUPP;
> +	else
> +		return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
>   }
>   
>   /* copy data in or out of a wqe, i.e. sg list
> @@ -527,6 +530,10 @@ int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
>   	struct page *page;
>   	u64 *va;
>   
> +	/* ODP is not supported right now. WIP. */
> +	if (mr->umem->is_odp)
> +		return RESPST_ERR_UNSUPPORTED_OPCODE;
> +
>   	/* See IBA oA19-28 */
>   	if (unlikely(mr->state != RXE_MR_STATE_VALID)) {
>   		rxe_dbg_mr(mr, "mr not in valid state");
> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
> index 834fb1a84800..713bef9161e3 100644
> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
> @@ -32,6 +32,31 @@ static void rxe_mr_unset_xarray(struct rxe_mr *mr, unsigned long start,
>   	xas_unlock(&xas);
>   }
>   
> +static void rxe_mr_set_xarray(struct rxe_mr *mr, unsigned long start,
> +			      unsigned long end, unsigned long *pfn_list)
> +{
> +	unsigned long lower = rxe_mr_iova_to_index(mr, start);
> +	unsigned long upper = rxe_mr_iova_to_index(mr, end - 1);
> +	struct page *page;
> +	void *entry;

Local variable declarations should follow reverse Christmas tree order (longest declaration line first, shortest last).

> +
> +	XA_STATE(xas, &mr->page_list, lower);
> +
> +	/* ib_umem_odp_unmap_dma_pages() ensures pages are HMM_PFN_VALID */
> +	xas_lock(&xas);
> +	while (true) {
> +		page = hmm_pfn_to_page(pfn_list[xas.xa_index]);
> +		xas_store(&xas, page);
> +
> +		entry = xas_next(&xas);
> +		if (xas_retry(&xas, entry) || (xas.xa_index <= upper))
> +			continue;
> +
> +		break;
> +	}
> +	xas_unlock(&xas);
> +}
> +
>   static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni,
>   				    const struct mmu_notifier_range *range,
>   				    unsigned long cur_seq)
> @@ -62,3 +87,100 @@ static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni,
>   const struct mmu_interval_notifier_ops rxe_mn_ops = {
>   	.invalidate = rxe_ib_invalidate_range,
>   };
> +
> +#define RXE_PAGEFAULT_RDONLY BIT(1)
> +#define RXE_PAGEFAULT_SNAPSHOT BIT(2)
> +static int rxe_odp_do_pagefault_and_lock(struct rxe_mr *mr, u64 user_va, int bcnt, u32 flags)
> +{
> +	int np;
> +	u64 access_mask;
> +	bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT);
> +	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);

ditto

> +
> +	access_mask = ODP_READ_ALLOWED_BIT;
> +	if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY))
> +		access_mask |= ODP_WRITE_ALLOWED_BIT;
> +
> +	/*
> +	 * ib_umem_odp_map_dma_and_lock() locks umem_mutex on success.
> +	 * Callers must release the lock later to let invalidation handler
> +	 * do its work again.
> +	 */
> +	np = ib_umem_odp_map_dma_and_lock(umem_odp, user_va, bcnt,
> +					  access_mask, fault);
> +	if (np < 0)
> +		return np;
> +
> +	/*
> +	 * umem_mutex is still locked here, so we can use hmm_pfn_to_page()
> +	 * safely to fetch pages in the range.
> +	 */
> +	rxe_mr_set_xarray(mr, user_va, user_va + bcnt, umem_odp->pfn_list);
> +
> +	return np;
> +}
> +
> +static int rxe_odp_init_pages(struct rxe_mr *mr)
> +{
> +	int ret;
> +	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);

ditto

> +
> +	ret = rxe_odp_do_pagefault_and_lock(mr, mr->umem->address,
> +					    mr->umem->length,
> +					    RXE_PAGEFAULT_SNAPSHOT);
> +
> +	if (ret >= 0)
> +		mutex_unlock(&umem_odp->umem_mutex);
> +
> +	return ret >= 0 ? 0 : ret;
> +}
> +
> +int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
> +			 u64 iova, int access_flags, struct rxe_mr *mr)
> +{
> +	int err;
> +	struct ib_umem_odp *umem_odp;

ditto

Zhu Yanjun
> +
> +	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
> +		return -EOPNOTSUPP;
> +
> +	rxe_mr_init(access_flags, mr);
> +
> +	xa_init(&mr->page_list);
> +
> +	if (!start && length == U64_MAX) {
> +		if (iova != 0)
> +			return -EINVAL;
> +		if (!(rxe->attr.odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
> +			return -EINVAL;
> +
> +		/* Never reach here, for implicit ODP is not implemented. */
> +	}
> +
> +	umem_odp = ib_umem_odp_get(&rxe->ib_dev, start, length, access_flags,
> +				   &rxe_mn_ops);
> +	if (IS_ERR(umem_odp)) {
> +		rxe_dbg_mr(mr, "Unable to create umem_odp err = %d\n",
> +			   (int)PTR_ERR(umem_odp));
> +		return PTR_ERR(umem_odp);
> +	}
> +
> +	umem_odp->private = mr;
> +
> +	mr->umem = &umem_odp->umem;
> +	mr->access = access_flags;
> +	mr->ibmr.length = length;
> +	mr->ibmr.iova = iova;
> +	mr->page_offset = ib_umem_offset(&umem_odp->umem);
> +
> +	err = rxe_odp_init_pages(mr);
> +	if (err) {
> +		ib_umem_odp_release(umem_odp);
> +		return err;
> +	}
> +
> +	mr->state = RXE_MR_STATE_VALID;
> +	mr->ibmr.type = IB_MR_TYPE_USER;
> +
> +	return err;
> +}
> diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
> index 969e057bbfd1..9159f1bdfc6f 100644
> --- a/drivers/infiniband/sw/rxe/rxe_resp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_resp.c
> @@ -635,6 +635,10 @@ static enum resp_states process_flush(struct rxe_qp *qp,
>   	struct rxe_mr *mr = qp->resp.mr;
>   	struct resp_res *res = qp->resp.res;
>   
> +	/* ODP is not supported right now. WIP. */
> +	if (mr->umem->is_odp)
> +		return RESPST_ERR_UNSUPPORTED_OPCODE;
> +
>   	/* oA19-14, oA19-15 */
>   	if (res && res->replay)
>   		return RESPST_ACKNOWLEDGE;
> @@ -688,10 +692,13 @@ static enum resp_states atomic_reply(struct rxe_qp *qp,
>   	if (!res->replay) {
>   		u64 iova = qp->resp.va + qp->resp.offset;
>   
> -		err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode,
> -					  atmeth_comp(pkt),
> -					  atmeth_swap_add(pkt),
> -					  &res->atomic.orig_val);
> +		if (mr->umem->is_odp)
> +			err = RESPST_ERR_UNSUPPORTED_OPCODE;
> +		else
> +			err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode,
> +						  atmeth_comp(pkt),
> +						  atmeth_swap_add(pkt),
> +						  &res->atomic.orig_val);
>   		if (err)
>   			return err;
>   
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
> index 48f86839d36a..192ad835c712 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.c
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
> @@ -1278,7 +1278,10 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start,
>   	mr->ibmr.pd = ibpd;
>   	mr->ibmr.device = ibpd->device;
>   
> -	err = rxe_mr_init_user(rxe, start, length, iova, access, mr);
> +	if (access & IB_ACCESS_ON_DEMAND)
> +		err = rxe_odp_mr_init_user(rxe, start, length, iova, access, mr);
> +	else
> +		err = rxe_mr_init_user(rxe, start, length, iova, access, mr);
>   	if (err) {
>   		rxe_dbg_mr(mr, "reg_user_mr failed, err = %d", err);
>   		goto err_cleanup;
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
> index 1058b5de8920..24dd747586e0 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> @@ -298,6 +298,7 @@ enum {
>   				| IB_ACCESS_LOCAL_WRITE
>   				| IB_ACCESS_MW_BIND
>   				| IB_ACCESS_ON_DEMAND
> +				| IB_ACCESS_HUGETLB
>   				| IB_ACCESS_FLUSH_GLOBAL
>   				| IB_ACCESS_FLUSH_PERSISTENT
>   				| IB_ACCESS_OPTIONAL,
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 54c723a6edda..f2284d27229b 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -73,6 +73,13 @@  static void rxe_init_device_param(struct rxe_dev *rxe)
 			rxe->ndev->dev_addr);
 
 	rxe->max_ucontext			= RXE_MAX_UCONTEXT;
+
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+		rxe->attr.kernel_cap_flags |= IBK_ON_DEMAND_PAGING;
+
+		/* IB_ODP_SUPPORT_IMPLICIT is not supported right now. */
+		rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT;
+	}
 }
 
 /* initialize port attributes */
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index eb867f7d0d36..4bda154a0248 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -188,4 +188,18 @@  static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp)
 	return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type];
 }
 
+/* rxe_odp.c */
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
+			 u64 iova, int access_flags, struct rxe_mr *mr);
+#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+static inline int
+rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
+		     int access_flags, struct rxe_mr *mr)
+{
+	return -EOPNOTSUPP;
+}
+
+#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+
 #endif /* RXE_LOC_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 86b1908d304b..384cb4ba1f2d 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -318,7 +318,10 @@  int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
 		return err;
 	}
 
-	return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
+	if (mr->umem->is_odp)
+		return -EOPNOTSUPP;
+	else
+		return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
 }
 
 /* copy data in or out of a wqe, i.e. sg list
@@ -527,6 +530,10 @@  int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
 	struct page *page;
 	u64 *va;
 
+	/* ODP is not supported right now. WIP. */
+	if (mr->umem->is_odp)
+		return RESPST_ERR_UNSUPPORTED_OPCODE;
+
 	/* See IBA oA19-28 */
 	if (unlikely(mr->state != RXE_MR_STATE_VALID)) {
 		rxe_dbg_mr(mr, "mr not in valid state");
diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
index 834fb1a84800..713bef9161e3 100644
--- a/drivers/infiniband/sw/rxe/rxe_odp.c
+++ b/drivers/infiniband/sw/rxe/rxe_odp.c
@@ -32,6 +32,31 @@  static void rxe_mr_unset_xarray(struct rxe_mr *mr, unsigned long start,
 	xas_unlock(&xas);
 }
 
+static void rxe_mr_set_xarray(struct rxe_mr *mr, unsigned long start,
+			      unsigned long end, unsigned long *pfn_list)
+{
+	unsigned long lower = rxe_mr_iova_to_index(mr, start);
+	unsigned long upper = rxe_mr_iova_to_index(mr, end - 1);
+	struct page *page;
+	void *entry;
+
+	XA_STATE(xas, &mr->page_list, lower);
+
+	/* ib_umem_odp_unmap_dma_pages() ensures pages are HMM_PFN_VALID */
+	xas_lock(&xas);
+	while (true) {
+		page = hmm_pfn_to_page(pfn_list[xas.xa_index]);
+		xas_store(&xas, page);
+
+		entry = xas_next(&xas);
+		if (xas_retry(&xas, entry) || (xas.xa_index <= upper))
+			continue;
+
+		break;
+	}
+	xas_unlock(&xas);
+}
+
 static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni,
 				    const struct mmu_notifier_range *range,
 				    unsigned long cur_seq)
@@ -62,3 +87,100 @@  static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni,
 const struct mmu_interval_notifier_ops rxe_mn_ops = {
 	.invalidate = rxe_ib_invalidate_range,
 };
+
+#define RXE_PAGEFAULT_RDONLY BIT(1)
+#define RXE_PAGEFAULT_SNAPSHOT BIT(2)
+static int rxe_odp_do_pagefault_and_lock(struct rxe_mr *mr, u64 user_va, int bcnt, u32 flags)
+{
+	int np;
+	u64 access_mask;
+	bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT);
+	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+
+	access_mask = ODP_READ_ALLOWED_BIT;
+	if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY))
+		access_mask |= ODP_WRITE_ALLOWED_BIT;
+
+	/*
+	 * ib_umem_odp_map_dma_and_lock() locks umem_mutex on success.
+	 * Callers must release the lock later to let invalidation handler
+	 * do its work again.
+	 */
+	np = ib_umem_odp_map_dma_and_lock(umem_odp, user_va, bcnt,
+					  access_mask, fault);
+	if (np < 0)
+		return np;
+
+	/*
+	 * umem_mutex is still locked here, so we can use hmm_pfn_to_page()
+	 * safely to fetch pages in the range.
+	 */
+	rxe_mr_set_xarray(mr, user_va, user_va + bcnt, umem_odp->pfn_list);
+
+	return np;
+}
+
+static int rxe_odp_init_pages(struct rxe_mr *mr)
+{
+	int ret;
+	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+
+	ret = rxe_odp_do_pagefault_and_lock(mr, mr->umem->address,
+					    mr->umem->length,
+					    RXE_PAGEFAULT_SNAPSHOT);
+
+	if (ret >= 0)
+		mutex_unlock(&umem_odp->umem_mutex);
+
+	return ret >= 0 ? 0 : ret;
+}
+
+int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
+			 u64 iova, int access_flags, struct rxe_mr *mr)
+{
+	int err;
+	struct ib_umem_odp *umem_odp;
+
+	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+		return -EOPNOTSUPP;
+
+	rxe_mr_init(access_flags, mr);
+
+	xa_init(&mr->page_list);
+
+	if (!start && length == U64_MAX) {
+		if (iova != 0)
+			return -EINVAL;
+		if (!(rxe->attr.odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
+			return -EINVAL;
+
+		/* Never reach here, for implicit ODP is not implemented. */
+	}
+
+	umem_odp = ib_umem_odp_get(&rxe->ib_dev, start, length, access_flags,
+				   &rxe_mn_ops);
+	if (IS_ERR(umem_odp)) {
+		rxe_dbg_mr(mr, "Unable to create umem_odp err = %d\n",
+			   (int)PTR_ERR(umem_odp));
+		return PTR_ERR(umem_odp);
+	}
+
+	umem_odp->private = mr;
+
+	mr->umem = &umem_odp->umem;
+	mr->access = access_flags;
+	mr->ibmr.length = length;
+	mr->ibmr.iova = iova;
+	mr->page_offset = ib_umem_offset(&umem_odp->umem);
+
+	err = rxe_odp_init_pages(mr);
+	if (err) {
+		ib_umem_odp_release(umem_odp);
+		return err;
+	}
+
+	mr->state = RXE_MR_STATE_VALID;
+	mr->ibmr.type = IB_MR_TYPE_USER;
+
+	return err;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index 969e057bbfd1..9159f1bdfc6f 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -635,6 +635,10 @@  static enum resp_states process_flush(struct rxe_qp *qp,
 	struct rxe_mr *mr = qp->resp.mr;
 	struct resp_res *res = qp->resp.res;
 
+	/* ODP is not supported right now. WIP. */
+	if (mr->umem->is_odp)
+		return RESPST_ERR_UNSUPPORTED_OPCODE;
+
 	/* oA19-14, oA19-15 */
 	if (res && res->replay)
 		return RESPST_ACKNOWLEDGE;
@@ -688,10 +692,13 @@  static enum resp_states atomic_reply(struct rxe_qp *qp,
 	if (!res->replay) {
 		u64 iova = qp->resp.va + qp->resp.offset;
 
-		err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode,
-					  atmeth_comp(pkt),
-					  atmeth_swap_add(pkt),
-					  &res->atomic.orig_val);
+		if (mr->umem->is_odp)
+			err = RESPST_ERR_UNSUPPORTED_OPCODE;
+		else
+			err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode,
+						  atmeth_comp(pkt),
+						  atmeth_swap_add(pkt),
+						  &res->atomic.orig_val);
 		if (err)
 			return err;
 
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 48f86839d36a..192ad835c712 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1278,7 +1278,10 @@  static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start,
 	mr->ibmr.pd = ibpd;
 	mr->ibmr.device = ibpd->device;
 
-	err = rxe_mr_init_user(rxe, start, length, iova, access, mr);
+	if (access & IB_ACCESS_ON_DEMAND)
+		err = rxe_odp_mr_init_user(rxe, start, length, iova, access, mr);
+	else
+		err = rxe_mr_init_user(rxe, start, length, iova, access, mr);
 	if (err) {
 		rxe_dbg_mr(mr, "reg_user_mr failed, err = %d", err);
 		goto err_cleanup;
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index 1058b5de8920..24dd747586e0 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -298,6 +298,7 @@  enum {
 				| IB_ACCESS_LOCAL_WRITE
 				| IB_ACCESS_MW_BIND
 				| IB_ACCESS_ON_DEMAND
+				| IB_ACCESS_HUGETLB
 				| IB_ACCESS_FLUSH_GLOBAL
 				| IB_ACCESS_FLUSH_PERSISTENT
 				| IB_ACCESS_OPTIONAL,