[09/15] RDMA/mlx5: Use an xarray for the children of an implicit ODP

Message ID	20191009160934.3143-10-jgg@ziepe.ca (mailing list archive)
State	Accepted
Delegated to:	Jason Gunthorpe
Headers	show Return-Path: <SRS0=5pZl=YC=vger.kernel.org=linux-rdma-owner@kernel.org> From: Jason Gunthorpe <jgg@ziepe.ca> To: linux-rdma@vger.kernel.org Cc: Jason Gunthorpe <jgg@mellanox.com>, Artemy Kovalyov <artemyko@mellanox.com> Subject: [PATCH 09/15] RDMA/mlx5: Use an xarray for the children of an implicit ODP Date: Wed, 9 Oct 2019 13:09:29 -0300 Message-Id: <20191009160934.3143-10-jgg@ziepe.ca> In-Reply-To: <20191009160934.3143-1-jgg@ziepe.ca> References: <20191009160934.3143-1-jgg@ziepe.ca> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Sender: linux-rdma-owner@vger.kernel.org Precedence: bulk
Series	Rework the locking and datastructures for mlx5 implicit ODP \| expand [00/15] Rework the locking and datastructures for mlx5 implicit ODP [01/15] RDMA/mlx5: Use SRCU properly in ODP prefetch [02/15] RDMA/mlx5: Split sig_err MR data into its own xarray [03/15] RDMA/mlx5: Use a dedicated mkey xarray for ODP [04/15] RDMA/mlx5: Delete struct mlx5_priv->mkey_table [05/15] RDMA/mlx5: Rework implicit_mr_get_data [06/15] RDMA/mlx5: Lift implicit_mr_alloc() into the two routines that call it [07/15] RDMA/mlx5: Set the HW IOVA of the child MRs to their place in the tree [08/15] RDMA/mlx5: Split implicit handling from pagefault_mr [09/15] RDMA/mlx5: Use an xarray for the children of an implicit ODP [10/15] RDMA/mlx5: Reduce locking in implicit_mr_get_data() [11/15] RDMA/mlx5: Avoid double lookups on the pagefault path [12/15] RDMA/mlx5: Rework implicit ODP destroy [13/15] RDMA/mlx5: Do not store implicit children in the odp_mkeys xarray [14/15] RDMA/mlx5: Do not race with mlx5_ib_invalidate_range during create and destroy [15/15] RDMA/odp: Remove broken debugging call to invalidate_range

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 2cf91db6a36f5f..88769fcffb5a10 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -617,10 +617,13 @@ struct mlx5_ib_mr { u64 data_iova; u64 pi_iova; + /* For ODP and implicit */ atomic_t num_leaf_free; wait_queue_head_t q_leaf_free; - struct mlx5_async_work cb_work; atomic_t num_pending_prefetch; + struct xarray implicit_children; + + struct mlx5_async_work cb_work; }; static inline bool is_odp_mr(struct mlx5_ib_mr *mr) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index aba4f17c235467..d70cf02343a79f 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -93,132 +93,54 @@ struct mlx5_pagefault { static u64 mlx5_imr_ksm_entries; -static int check_parent(struct ib_umem_odp *odp, - struct mlx5_ib_mr *parent) +void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, + struct mlx5_ib_mr *imr, int flags) { - struct mlx5_ib_mr *mr = odp->private; - - return mr && mr->parent == parent && !odp->dying; -} - -static struct ib_ucontext_per_mm *mr_to_per_mm(struct mlx5_ib_mr *mr) -{ - if (WARN_ON(!mr || !is_odp_mr(mr))) - return NULL; - - return to_ib_umem_odp(mr->umem)->per_mm; -} - -static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) -{ - struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; - struct ib_ucontext_per_mm *per_mm = odp->per_mm; - struct rb_node *rb; - - down_read(&per_mm->umem_rwsem); - while (1) { - rb = rb_next(&odp->interval_tree.rb); - if (!rb) - goto not_found; - odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); - if (check_parent(odp, parent)) - goto end; - } -not_found: - odp = NULL; -end: - up_read(&per_mm->umem_rwsem); - return odp; -} - -static struct ib_umem_odp *odp_lookup(u64 start, u64 length, - struct mlx5_ib_mr *parent) -{ - struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(parent); - struct ib_umem_odp *odp; - struct rb_node *rb; - - down_read(&per_mm->umem_rwsem); - odp = rbt_ib_umem_lookup(&per_mm->umem_tree, start, length); - if (!odp) - goto end; - - while (1) { - if (check_parent(odp, parent)) - goto end; - rb = rb_next(&odp->interval_tree.rb); - if (!rb) - goto not_found; - odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); - if (ib_umem_start(odp) > start + length) - goto not_found; - } -not_found: - odp = NULL; -end: - up_read(&per_mm->umem_rwsem); - return odp; -} - -void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, - size_t nentries, struct mlx5_ib_mr *mr, int flags) -{ - struct ib_pd *pd = mr->ibmr.pd; - struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct ib_umem_odp *odp; - unsigned long va; - int i; + struct mlx5_klm *end = pklm + nentries; if (flags & MLX5_IB_UPD_XLT_ZAP) { - for (i = 0; i < nentries; i++, pklm++) { + for (; pklm != end; pklm++, idx++) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - pklm->key = cpu_to_be32(dev->null_mkey); + pklm->key = cpu_to_be32(imr->dev->null_mkey); pklm->va = 0; } return; } /* - * The locking here is pretty subtle. Ideally the implicit children - * list would be protected by the umem_mutex, however that is not + * The locking here is pretty subtle. Ideally the implicit_children + * xarray would be protected by the umem_mutex, however that is not * possible. Instead this uses a weaker update-then-lock pattern: * * srcu_read_lock() - * <change children list> + * xa_store() * mutex_lock(umem_mutex) * mlx5_ib_update_xlt() * mutex_unlock(umem_mutex) * destroy lkey * - * ie any change the children list must be followed by the locked - * update_xlt before destroying. + * ie any change the xarray must be followed by the locked update_xlt + * before destroying. * * The umem_mutex provides the acquire/release semantic needed to make - * the children list visible to a racing thread. While SRCU is not + * the xa_store() visible to a racing thread. While SRCU is not * technically required, using it gives consistent use of the SRCU - * locking around the children list. + * locking around the xarray. */ - lockdep_assert_held(&to_ib_umem_odp(mr->umem)->umem_mutex); - lockdep_assert_held(&mr->dev->odp_srcu); + lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex); + lockdep_assert_held(&imr->dev->odp_srcu); - odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE, - nentries * MLX5_IMR_MTT_SIZE, mr); + for (; pklm != end; pklm++, idx++) { + struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); - for (i = 0; i < nentries; i++, pklm++) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - va = (offset + i) * MLX5_IMR_MTT_SIZE; - if (odp && ib_umem_start(odp) == va) { - struct mlx5_ib_mr *mtt = odp->private; - + if (mtt) { pklm->key = cpu_to_be32(mtt->ibmr.lkey); - pklm->va = cpu_to_be64(va); - odp = odp_next(odp); + pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE); } else { - pklm->key = cpu_to_be32(dev->null_mkey); + pklm->key = cpu_to_be32(imr->dev->null_mkey); pklm->va = 0; } - mlx5_ib_dbg(dev, "[%d] va %lx key %x\n", - i, va, be32_to_cpu(pklm->key)); } } @@ -320,6 +242,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, if (unlikely(!umem_odp->npages && mr->parent && !umem_odp->dying)) { + xa_erase(&mr->parent->implicit_children, + ib_umem_start(umem_odp) >> MLX5_IMR_MTT_SHIFT); xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); umem_odp->dying = 1; atomic_inc(&mr->parent->num_leaf_free); @@ -464,6 +388,16 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, goto out_release; } + /* + * Once the store to either xarray completes any error unwind has to + * use synchronize_srcu(). Avoid this with xa_reserve() + */ + err = xa_err(xa_store(&imr->implicit_children, idx, mr, GFP_KERNEL)); + if (err) { + ret = ERR_PTR(err); + goto out_release; + } + xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), &mr->mmkey, GFP_ATOMIC); @@ -479,7 +413,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, return ret; } -static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *imr, +static struct mlx5_ib_mr *implicit_mr_get_data(struct mlx5_ib_mr *imr, u64 io_virt, size_t bcnt) { struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); @@ -487,39 +421,32 @@ static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *imr, unsigned long idx = io_virt >> MLX5_IMR_MTT_SHIFT; unsigned long inv_start_idx = end_idx + 1; unsigned long inv_len = 0; - struct ib_umem_odp *result = NULL; - struct ib_umem_odp *odp; + struct mlx5_ib_mr *result = NULL; int ret; mutex_lock(&odp_imr->umem_mutex); - odp = odp_lookup(idx * MLX5_IMR_MTT_SIZE, 1, imr); for (idx = idx; idx <= end_idx; idx++) { - if (unlikely(!odp)) { - struct mlx5_ib_mr *mtt; + struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); + if (unlikely(!mtt)) { mtt = implicit_get_child_mr(imr, idx); if (IS_ERR(mtt)) { - result = ERR_CAST(mtt); + result = mtt; goto out; } - odp = to_ib_umem_odp(mtt->umem); inv_start_idx = min(inv_start_idx, idx); inv_len = idx - inv_start_idx + 1; } /* Return first odp if region not covered by single one */ if (likely(!result)) - result = odp; - - odp = odp_next(odp); - if (odp && ib_umem_start(odp) != idx * MLX5_IMR_MTT_SIZE) - odp = NULL; + result = mtt; } /* - * Any time the children in the interval tree are changed we must - * perform an update of the xlt before exiting to ensure the HW and - * the tree remains synchronized. + * Any time the implicit_children are changed we must perform an + * update of the xlt before exiting to ensure the HW and the + * implicit_children remains synchronized. */ out: if (likely(!inv_len)) @@ -569,6 +496,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, init_waitqueue_head(&imr->q_leaf_free); atomic_set(&imr->num_leaf_free, 0); atomic_set(&imr->num_pending_prefetch, 0); + xa_init(&imr->implicit_children); err = mlx5_ib_update_xlt(imr, 0, mlx5_imr_ksm_entries, @@ -596,18 +524,15 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) { - struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr); - struct rb_node *node; + struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); + struct mlx5_ib_mr *mtt; + unsigned long idx; - down_read(&per_mm->umem_rwsem); - for (node = rb_first_cached(&per_mm->umem_tree); node; - node = rb_next(node)) { - struct ib_umem_odp *umem_odp = - rb_entry(node, struct ib_umem_odp, interval_tree.rb); - struct mlx5_ib_mr *mr = umem_odp->private; + mutex_lock(&odp_imr->umem_mutex); + xa_for_each (&imr->implicit_children, idx, mtt) { + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mtt->umem); - if (mr->parent != imr) - continue; + xa_erase(&imr->implicit_children, idx); mutex_lock(&umem_odp->umem_mutex); ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), @@ -623,9 +548,12 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) schedule_work(&umem_odp->work); mutex_unlock(&umem_odp->umem_mutex); } - up_read(&per_mm->umem_rwsem); + mutex_unlock(&odp_imr->umem_mutex); wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); + WARN_ON(!xa_empty(&imr->implicit_children)); + /* Remove any left over reserved elements */ + xa_destroy(&imr->implicit_children); } #define MLX5_PF_FLAGS_DOWNGRADE BIT(1) @@ -718,7 +646,7 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, u32 *bytes_mapped, u32 flags) { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); - struct ib_umem_odp *child; + struct mlx5_ib_mr *mtt; int npages = 0; if (!odp->is_implicit_odp) { @@ -733,17 +661,18 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - io_virt < bcnt)) return -EFAULT; - child = implicit_mr_get_data(mr, io_virt, bcnt); - if (IS_ERR(child)) - return PTR_ERR(child); + mtt = implicit_mr_get_data(mr, io_virt, bcnt); + if (IS_ERR(mtt)) + return PTR_ERR(mtt); /* Fault each child mr that intersects with our interval. */ while (bcnt) { - u64 end = min_t(u64, io_virt + bcnt, ib_umem_end(child)); + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mtt->umem); + u64 end = min_t(u64, io_virt + bcnt, ib_umem_end(umem_odp)); u64 len = end - io_virt; int ret; - ret = pagefault_real_mr(child->private, child, io_virt, len, + ret = pagefault_real_mr(mtt, umem_odp, io_virt, len, bytes_mapped, flags); if (ret < 0) return ret; @@ -752,12 +681,14 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, npages += ret; if (unlikely(bcnt)) { - child = odp_next(child); + mtt = xa_load(&mr->implicit_children, + io_virt >> MLX5_IMR_MTT_SHIFT); + /* * implicit_mr_get_data sets up all the leaves, this * means they got invalidated before we got to them. */ - if (!child || ib_umem_start(child) != io_virt) { + if (!mtt) { mlx5_ib_dbg( mr->dev, "next implicit leaf removed at 0x%llx.\n", diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 253df1a1fa5406..28078efc38339f 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -156,22 +156,6 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, umem_call_back cb, bool blockable, void *cookie); -/* - * Find first region intersecting with address range. - * Return NULL if not found - */ -static inline struct ib_umem_odp * -rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length) -{ - struct interval_tree_node *node; - - node = interval_tree_iter_first(root, addr, addr + length - 1); - if (!node) - return NULL; - return container_of(node, struct ib_umem_odp, interval_tree); - -} - static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, unsigned long mmu_seq) {

[09/15] RDMA/mlx5: Use an xarray for the children of an implicit ODP

Commit Message

Patch