[11/15] RDMA/mlx5: Avoid double lookups on the pagefault path

Message ID 20191009160934.3143-12-jgg@ziepe.ca (mailing list archive)
State Accepted
Delegated to: Jason Gunthorpe
Series Rework the locking and datastructures for mlx5 implicit ODP

Commit Message

Jason Gunthorpe Oct. 9, 2019, 4:09 p.m. UTC
From: Jason Gunthorpe <jgg@mellanox.com>

Now that the locking is simplified, combine pagefault_implicit_mr() with
implicit_mr_get_data() so that we sweep over the idx range only once and
do a single xlt update at the end, after the child umems are set up.

This avoids the double iteration and duplicate xa_load()s, as well as the
sketchy failure path taken if an xa_load() fails.

Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/odp.c | 186 +++++++++++++------------------
 1 file changed, 80 insertions(+), 106 deletions(-)
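
A minimal standalone sketch of the single-pass pattern the commit message
describes may help while reading the diff below. All names here
(fault_range(), get_child(), fault_child(), update_table(), NCHILDREN and
the CHILD_* constants) are invented stand-ins, not the mlx5 driver API;
the point is only the shape of the loop: each child index is looked up or
created and faulted in one sweep, the span of newly created children is
tracked as the loop runs, and exactly one translation-table update is
issued at the end, including on the error path.

/*
 * Hypothetical sketch, not mlx5 code: one sweep over the child index
 * range, creating and faulting children as needed, followed by a
 * single deferred table update covering every newly created child.
 */
#include <stddef.h>
#include <stdio.h>

#define NCHILDREN	8			/* children in the fake implicit MR */
#define CHILD_SHIFT	12			/* each child covers 4 KiB here */
#define CHILD_SIZE	(1UL << CHILD_SHIFT)

static int children[NCHILDREN];			/* 0 = missing, 1 = present */

/* Stand-in for implicit_get_child_mr(): create the missing child. */
static int get_child(unsigned long idx)
{
	children[idx] = 1;
	return 0;
}

/* Stand-in for pagefault_real_mr(): fault the child's pages. */
static int fault_child(unsigned long idx)
{
	(void)idx;
	return 0;
}

/* Stand-in for the single mlx5_ib_update_xlt() call at the end. */
static int update_table(unsigned long start, unsigned long len)
{
	printf("one table update: start idx %lu, len %lu\n", start, len);
	return 0;
}

static int fault_range(unsigned long va, size_t bcnt)
{
	unsigned long end_idx = (va + bcnt - 1) >> CHILD_SHIFT;
	unsigned long upd_start = end_idx + 1;	/* "nothing created yet" marker */
	unsigned long upd_len = 0;
	int ret = 0;

	/* Mirrors the patch's -EFAULT range check against the implicit MR size. */
	if (end_idx >= NCHILDREN)
		return -1;

	while (bcnt) {
		unsigned long idx = va >> CHILD_SHIFT;
		size_t len;

		if (!children[idx]) {
			ret = get_child(idx);
			if (ret)
				goto out;
			/* Grow the span the final table update must cover. */
			if (idx < upd_start)
				upd_start = idx;
			upd_len = idx - upd_start + 1;
		}

		ret = fault_child(idx);
		if (ret)
			goto out;

		/* Bytes left in this child, clamped to the request. */
		len = CHILD_SIZE - (va & (CHILD_SIZE - 1));
		if (len > bcnt)
			len = bcnt;
		va += len;
		bcnt -= len;
	}

out:
	/* Newly created children must reach the table even on an error. */
	if (upd_len) {
		int err = update_table(upd_start, upd_len);
		if (err)
			return err;
	}
	return ret;
}

int main(void)
{
	/* Fault two children's worth of bytes starting inside child 3. */
	return fault_range(3 * CHILD_SIZE + 100, 2 * CHILD_SIZE);
}

Initializing upd_start to end_idx + 1 is the same sentinel the patch uses:
if no child had to be created, upd_len stays zero and the table update is
skipped entirely.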

Patch

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index e8413fd1b8c73b..1897ce6a25f693 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -419,68 +419,6 @@  static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	return ret;
 }
 
-static struct mlx5_ib_mr *implicit_mr_get_data(struct mlx5_ib_mr *imr,
-						u64 io_virt, size_t bcnt)
-{
-	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
-	unsigned long end_idx = (io_virt + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
-	unsigned long idx = io_virt >> MLX5_IMR_MTT_SHIFT;
-	unsigned long inv_start_idx = end_idx + 1;
-	unsigned long inv_len = 0;
-	struct mlx5_ib_mr *result = NULL;
-	int ret;
-
-	lockdep_assert_held(&imr->dev->odp_srcu);
-
-	for (idx = idx; idx <= end_idx; idx++) {
-		struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
-
-		if (unlikely(!mtt)) {
-			mtt = implicit_get_child_mr(imr, idx);
-			if (IS_ERR(mtt)) {
-				result = mtt;
-				goto out;
-			}
-			inv_start_idx = min(inv_start_idx, idx);
-			inv_len = idx - inv_start_idx + 1;
-		}
-
-		/* Return first odp if region not covered by single one */
-		if (likely(!result))
-			result = mtt;
-	}
-
-	/*
-	 * Any time the implicit_children are changed we must perform an
-	 * update of the xlt before exiting to ensure the HW and the
-	 * implicit_children remains synchronized.
-	 */
-out:
-	if (likely(!inv_len))
-		return result;
-
-	/*
-	 * Notice this is not strictly ordered right, the KSM is updated after
-	 * the implicit_leaves is updated, so a parallel page fault could see
-	 * a MR that is not yet visible in the KSM.  This is similar to a
-	 * parallel page fault seeing a MR that is being concurrently removed
-	 * from the KSM. Both of these improbable situations are resolved
-	 * safely by resuming the HW and then taking another page fault. The
-	 * next pagefault handler will see the new information.
-	 */
-	mutex_lock(&odp_imr->umem_mutex);
-	ret = mlx5_ib_update_xlt(imr, inv_start_idx, inv_len, 0,
-				 MLX5_IB_UPD_XLT_INDIRECT |
-					 MLX5_IB_UPD_XLT_ATOMIC);
-	mutex_unlock(&odp_imr->umem_mutex);
-	if (ret) {
-		mlx5_ib_err(to_mdev(imr->ibmr.pd->device),
-			    "Failed to update PAS\n");
-		return ERR_PTR(ret);
-	}
-	return result;
-}
-
 struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 					     struct ib_udata *udata,
 					     int access_flags)
@@ -647,6 +585,84 @@  static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
 	return ret;
 }
 
+static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
+				 struct ib_umem_odp *odp_imr, u64 user_va,
+				 size_t bcnt, u32 *bytes_mapped, u32 flags)
+{
+	unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
+	unsigned long upd_start_idx = end_idx + 1;
+	unsigned long upd_len = 0;
+	unsigned long npages = 0;
+	int err;
+	int ret;
+
+	if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
+		     mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
+		return -EFAULT;
+
+	/* Fault each child mr that intersects with our interval. */
+	while (bcnt) {
+		unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
+		struct ib_umem_odp *umem_odp;
+		struct mlx5_ib_mr *mtt;
+		u64 len;
+
+		mtt = xa_load(&imr->implicit_children, idx);
+		if (unlikely(!mtt)) {
+			mtt = implicit_get_child_mr(imr, idx);
+			if (IS_ERR(mtt)) {
+				ret = PTR_ERR(mtt);
+				goto out;
+			}
+			upd_start_idx = min(upd_start_idx, idx);
+			upd_len = idx - upd_start_idx + 1;
+		}
+
+		umem_odp = to_ib_umem_odp(mtt->umem);
+		len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) -
+		      user_va;
+
+		ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
+					bytes_mapped, flags);
+		if (ret < 0)
+			goto out;
+		user_va += len;
+		bcnt -= len;
+		npages += ret;
+	}
+
+	ret = npages;
+
+	/*
+	 * Any time the implicit_children are changed we must perform an
+	 * update of the xlt before exiting to ensure the HW and the
+	 * implicit_children remains synchronized.
+	 */
+out:
+	if (likely(!upd_len))
+		return ret;
+
+	/*
+	 * Notice this is not strictly ordered right, the KSM is updated after
+	 * the implicit_children is updated, so a parallel page fault could
+	 * see a MR that is not yet visible in the KSM.  This is similar to a
+	 * parallel page fault seeing a MR that is being concurrently removed
+	 * from the KSM. Both of these improbable situations are resolved
+	 * safely by resuming the HW and then taking another page fault. The
+	 * next pagefault handler will see the new information.
+	 */
+	mutex_lock(&odp_imr->umem_mutex);
+	err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0,
+				 MLX5_IB_UPD_XLT_INDIRECT |
+					 MLX5_IB_UPD_XLT_ATOMIC);
+	mutex_unlock(&odp_imr->umem_mutex);
+	if (err) {
+		mlx5_ib_err(imr->dev, "Failed to update PAS\n");
+		return err;
+	}
+	return ret;
+}
+
 /*
  * Returns:
  *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
@@ -660,8 +676,6 @@  static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
 			u32 *bytes_mapped, u32 flags)
 {
 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
-	struct mlx5_ib_mr *mtt;
-	int npages = 0;
 
 	if (!odp->is_implicit_odp) {
 		if (unlikely(io_virt < ib_umem_start(odp) ||
@@ -670,48 +684,8 @@  static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
 		return pagefault_real_mr(mr, odp, io_virt, bcnt, bytes_mapped,
 					 flags);
 	}
-
-	if (unlikely(io_virt >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
-		     mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - io_virt < bcnt))
-		return -EFAULT;
-
-	mtt = implicit_mr_get_data(mr, io_virt, bcnt);
-	if (IS_ERR(mtt))
-		return PTR_ERR(mtt);
-
-	/* Fault each child mr that intersects with our interval. */
-	while (bcnt) {
-		struct ib_umem_odp *umem_odp = to_ib_umem_odp(mtt->umem);
-		u64 end = min_t(u64, io_virt + bcnt, ib_umem_end(umem_odp));
-		u64 len = end - io_virt;
-		int ret;
-
-		ret = pagefault_real_mr(mtt, umem_odp, io_virt, len,
-					bytes_mapped, flags);
-		if (ret < 0)
-			return ret;
-		io_virt += len;
-		bcnt -= len;
-		npages += ret;
-
-		if (unlikely(bcnt)) {
-			mtt = xa_load(&mr->implicit_children,
-				      io_virt >> MLX5_IMR_MTT_SHIFT);
-
-			/*
-			 * implicit_mr_get_data sets up all the leaves, this
-			 * means they got invalidated before we got to them.
-			 */
-			if (!mtt) {
-				mlx5_ib_dbg(
-					mr->dev,
-					"next implicit leaf removed at 0x%llx.\n",
-					io_virt);
-				return -EAGAIN;
-			}
-		}
-	}
-	return npages;
+	return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
+				     flags);
 }
 
 struct pf_frame {