Message ID: c96b8645a81085abff739e6b06e286a350d1283d.1737274283.git.leon@kernel.org (mailing list archive)
State: Accepted
Delegated to: Jason Gunthorpe
Series: [rdma-next,v1] RDMA/mlx5: Fix implicit ODP use after free
On Sun, Jan 19, 2025 at 10:21:41AM +0200, Leon Romanovsky wrote:
> Fixes: 5256edcb98a1 ("RDMA/mlx5: Rework implicit ODP destroy")

Cc: stable

Fixes a user triggerable oops

> -	if (!refcount_inc_not_zero(&imr->mmkey.usecount))
> +	xa_lock(&imr->implicit_children);
> +	if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_KERNEL) !=
> +	    mr) {
> +		xa_unlock(&imr->implicit_children);
> 		return;
> +	}
>
> -	xa_erase(&imr->implicit_children, idx);
> 	if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault))
> -		xa_erase(&mr_to_mdev(mr)->odp_mkeys,
> -			 mlx5_base_mkey(mr->mmkey.key));
> +		__xa_erase(&mr_to_mdev(mr)->odp_mkeys,
> +			   mlx5_base_mkey(mr->mmkey.key));
> +	xa_unlock(&imr->implicit_children);
> +
> +	if (!refcount_inc_not_zero(&imr->mmkey.usecount))
> +		return;

It seems the refcount must be done first:

	/*
	 * If userspace is racing freeing the parent implicit ODP MR
	 * then we can lose the race with parent destruction. In this
	 * case mlx5_ib_free_odp_mr() will free everything in the
	 * implicit_children xarray so NOP is fine. This child MR
	 * cannot be destroyed here because we are under its umem_mutex.
	 */
	if (!refcount_inc_not_zero(&imr->mmkey.usecount))
		return;

What we must not do is remove something from the xarray and then fail
to free it.

Jason
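Putting Jason's suggested ordering together with the posted hunk, the function would read roughly as sketched below. This is only a sketch of the ordering he describes, not the applied patch (the posted diff is at the bottom of this page); in particular, dropping the parent reference via mlx5r_deref_odp_mkey() on the bail-out path is an assumption about how the reference taken up front would be released.

static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
	struct mlx5_ib_mr *imr = mr->parent;

	/*
	 * Take the parent reference before touching the xarray: if this
	 * fails, mlx5_ib_free_odp_mr() is already tearing down the
	 * implicit_children xarray and will free this child, so simply
	 * returning leaves nothing half-removed.
	 */
	if (!refcount_inc_not_zero(&imr->mmkey.usecount))
		return;

	/* Claim the child exactly once; a racing caller sees a mismatch. */
	xa_lock(&imr->implicit_children);
	if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL,
			 GFP_KERNEL) != mr) {
		xa_unlock(&imr->implicit_children);
		/* Assumed: drop the parent reference taken above. */
		mlx5r_deref_odp_mkey(&imr->mmkey);
		return;
	}

	if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault))
		__xa_erase(&mr_to_mdev(mr)->odp_mkeys,
			   mlx5_base_mkey(mr->mmkey.key));
	xa_unlock(&imr->implicit_children);

	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
	queue_work(system_unbound_wq, &mr->odp_destroy.work);
}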
On Mon, Jan 20, 2025 at 02:49:22PM -0400, Jason Gunthorpe wrote:
> On Sun, Jan 19, 2025 at 10:21:41AM +0200, Leon Romanovsky wrote:
>
> > Fixes: 5256edcb98a1 ("RDMA/mlx5: Rework implicit ODP destroy")
>
> Cc: stable
>
> Fixes a user triggerable oops
>
> > -	if (!refcount_inc_not_zero(&imr->mmkey.usecount))
> > +	xa_lock(&imr->implicit_children);
> > +	if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_KERNEL) !=
> > +	    mr) {
> > +		xa_unlock(&imr->implicit_children);
> > 		return;
> > +	}
> >
> > -	xa_erase(&imr->implicit_children, idx);
> > 	if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault))
> > -		xa_erase(&mr_to_mdev(mr)->odp_mkeys,
> > -			 mlx5_base_mkey(mr->mmkey.key));
> > +		__xa_erase(&mr_to_mdev(mr)->odp_mkeys,
> > +			   mlx5_base_mkey(mr->mmkey.key));
> > +	xa_unlock(&imr->implicit_children);
> > +
> > +	if (!refcount_inc_not_zero(&imr->mmkey.usecount))
> > +		return;
>
> It seems the refcount must be done first:
>
> 	/*
> 	 * If userspace is racing freeing the parent implicit ODP MR
> 	 * then we can lose the race with parent destruction. In this
> 	 * case mlx5_ib_free_odp_mr() will free everything in the
> 	 * implicit_children xarray so NOP is fine. This child MR
> 	 * cannot be destroyed here because we are under its umem_mutex.
> 	 */
> 	if (!refcount_inc_not_zero(&imr->mmkey.usecount))
> 		return;
>
> What we must not do is remove something from the xarray and then fail
> to free it.

Yes, like it was before.

> Jason
On Sun, Jan 19, 2025 at 10:21:41AM +0200, Leon Romanovsky wrote:
> From: Patrisious Haddad <phaddad@nvidia.com>
>
> Prevent double queueing of implicit ODP mr destroy work by using
> __xa_cmpxchg() to make sure this is the first and last time we are
> destroying this specific mr.
>
> Without this change, we could try to invalidate this mr twice, which in
> turn could result in queueing the MR destroy work twice, and eventually
> the second work could execute after the MR was freed by the first work,
> causing a use after free and the trace below.
>
>  drivers/infiniband/hw/mlx5/odp.c | 25 ++++++++++++++++---------
>  1 file changed, 16 insertions(+), 9 deletions(-)

Applied to for-next with the change I noted

Thanks,
Jason
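The race described in the commit message can be pictured as two invalidations of the same child MR racing with each other; the timeline below is a reconstruction from that description, not a trace from the report:

/*
 * Before the fix:
 *
 *   CPU 0                                  CPU 1
 *   -----                                  -----
 *   invalidate child mr
 *     refcount_inc_not_zero(imr) -> ok
 *     xa_erase(implicit_children, idx)
 *     queue destroy work
 *                                          invalidate the same child mr
 *                                            refcount_inc_not_zero(imr) -> ok
 *                                            xa_erase(...)  (already gone, NOP)
 *   work runs, frees mr                      queue destroy work again
 *                                          second work touches freed mr
 *                                            -> use after free
 *
 * With __xa_cmpxchg(), only the caller that actually swaps mr out of
 * implicit_children goes on to queue the work; the loser sees a
 * mismatch and returns early.
 */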
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index da7b4d6ad2c3..f4cd50a3693c 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -240,13 +240,20 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
 	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
 	struct mlx5_ib_mr *imr = mr->parent;
 
-	if (!refcount_inc_not_zero(&imr->mmkey.usecount))
+	xa_lock(&imr->implicit_children);
+	if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_KERNEL) !=
+	    mr) {
+		xa_unlock(&imr->implicit_children);
 		return;
+	}
 
-	xa_erase(&imr->implicit_children, idx);
 	if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault))
-		xa_erase(&mr_to_mdev(mr)->odp_mkeys,
-			 mlx5_base_mkey(mr->mmkey.key));
+		__xa_erase(&mr_to_mdev(mr)->odp_mkeys,
+			   mlx5_base_mkey(mr->mmkey.key));
+	xa_unlock(&imr->implicit_children);
+
+	if (!refcount_inc_not_zero(&imr->mmkey.usecount))
+		return;
 
 	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
 	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
@@ -513,18 +520,18 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 		refcount_inc(&ret->mmkey.usecount);
 		goto out_lock;
 	}
-	xa_unlock(&imr->implicit_children);
 
 	if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) {
-		ret = xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
-			       &mr->mmkey, GFP_KERNEL);
+		ret = __xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
+				 &mr->mmkey, GFP_KERNEL);
 		if (xa_is_err(ret)) {
 			ret = ERR_PTR(xa_err(ret));
-			xa_erase(&imr->implicit_children, idx);
-			goto out_mr;
+			__xa_erase(&imr->implicit_children, idx);
+			goto out_lock;
 		}
 		mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD;
 	}
+	xa_unlock(&imr->implicit_children);
 
 	mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
 	return mr;
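For readers less familiar with the xarray API: the switch from xa_store()/xa_erase() to the double-underscore variants is required once the function takes xa_lock() itself, because the plain calls are self-locking wrappers around the __xa_*() forms, which expect the lock to be held on entry. The wrapper in include/linux/xarray.h looks essentially like this (paraphrased from the kernel header):

static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index,
			       void *old, void *entry, gfp_t gfp)
{
	void *curr;

	xa_lock(xa);
	curr = __xa_cmpxchg(xa, index, old, entry, gfp);
	xa_unlock(xa);

	return curr;
}

This is also why the error path in the second hunk changes from "goto out_mr" to "goto out_lock": with xa_unlock(&imr->implicit_children) moved below the odp_mkeys store, the failure path still holds the lock, so it branches to the label that drops it (out_lock, the same label used by the earlier found-in-cache path).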