diff mbox series

[rdma-rc] Revert "RDMA/mlx5: Fix releasing unallocated memory in dereg MR flow"

Message ID 20211222101312.1358616-1-maorg@nvidia.com (mailing list archive)
State Accepted
Delegated to: Jason Gunthorpe
Headers show
Series [rdma-rc] Revert "RDMA/mlx5: Fix releasing unallocated memory in dereg MR flow" | expand

Commit Message

Maor Gottlieb Dec. 22, 2021, 10:13 a.m. UTC
This patch is not the full fix and still causes to call traces
during mlx5_ib_dereg_mr().

This reverts commit f0ae4afe3d35e67db042c58a52909e06262b740f.

Fixes: f0ae4afe3d35 ("RDMA/mlx5: Fix releasing unallocated memory in dereg MR flow")
Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  6 +++---
 drivers/infiniband/hw/mlx5/mr.c      | 26 ++++++++++++++------------
 2 files changed, 17 insertions(+), 15 deletions(-)

Comments

Leon Romanovsky Dec. 23, 2021, 9:36 a.m. UTC | #1
On Wed, Dec 22, 2021 at 12:13:12PM +0200, Maor Gottlieb wrote:
> This patch is not the full fix and still causes to call traces
> during mlx5_ib_dereg_mr().
> 
> This reverts commit f0ae4afe3d35e67db042c58a52909e06262b740f.
> 
> Fixes: f0ae4afe3d35 ("RDMA/mlx5: Fix releasing unallocated memory in dereg MR flow")
> Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
> ---
>  drivers/infiniband/hw/mlx5/mlx5_ib.h |  6 +++---
>  drivers/infiniband/hw/mlx5/mr.c      | 26 ++++++++++++++------------
>  2 files changed, 17 insertions(+), 15 deletions(-)
> 

Thanks,
Acked-by: Leon Romanovsky <leonro@nvidia.com>
Jason Gunthorpe Dec. 24, 2021, 2:51 a.m. UTC | #2
On Wed, Dec 22, 2021 at 12:13:12PM +0200, Maor Gottlieb wrote:
> This patch is not the full fix and still causes to call traces
> during mlx5_ib_dereg_mr().
> 
> This reverts commit f0ae4afe3d35e67db042c58a52909e06262b740f.
> 
> Fixes: f0ae4afe3d35 ("RDMA/mlx5: Fix releasing unallocated memory in dereg MR flow")
> Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
> ---
>  drivers/infiniband/hw/mlx5/mlx5_ib.h |  6 +++---
>  drivers/infiniband/hw/mlx5/mr.c      | 26 ++++++++++++++------------
>  2 files changed, 17 insertions(+), 15 deletions(-)

Huh? Why is this such a problem to fix the missing udatas?

Jason
Leon Romanovsky Dec. 24, 2021, 6 a.m. UTC | #3
On Thu, Dec 23, 2021 at 10:51:45PM -0400, Jason Gunthorpe wrote:
> On Wed, Dec 22, 2021 at 12:13:12PM +0200, Maor Gottlieb wrote:
> > This patch is not the full fix and still causes to call traces
> > during mlx5_ib_dereg_mr().
> > 
> > This reverts commit f0ae4afe3d35e67db042c58a52909e06262b740f.
> > 
> > Fixes: f0ae4afe3d35 ("RDMA/mlx5: Fix releasing unallocated memory in dereg MR flow")
> > Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
> > ---
> >  drivers/infiniband/hw/mlx5/mlx5_ib.h |  6 +++---
> >  drivers/infiniband/hw/mlx5/mr.c      | 26 ++++++++++++++------------
> >  2 files changed, 17 insertions(+), 15 deletions(-)
> 
> Huh? Why is this such a problem to fix the missing udatas?

Because of internal structure of Nvidia/Mellanox, we are having very
hard time to properly test any fix in ULP area.

In addition, the vacation season doesn't help either.

Feel free to approach me or Maor offline and we will explain you the
non-technical details behind this revert.

Thanks

> 
> Jason
Timo Rothenpieler Dec. 29, 2021, 1:10 p.m. UTC | #4
On 22.12.2021 11:13, Maor Gottlieb wrote:
> This patch is not the full fix and still causes to call traces
> during mlx5_ib_dereg_mr().
> 
> This reverts commit f0ae4afe3d35e67db042c58a52909e06262b740f.

What is the status of this in the latest stable Kernel?
I see that f0ae4afe3d35e67db042c58a52909e06262b740f was backported into 
5.15, but since then, no fix or revert made it into 5.15.

I wanted to upgrade our cluster to 5.15 soon, but I'm a bit concerned 
about reports of issues surrounding this patch.
Leon Romanovsky Dec. 30, 2021, 11:08 a.m. UTC | #5
On Wed, Dec 29, 2021 at 02:10:31PM +0100, Timo Rothenpieler wrote:
> On 22.12.2021 11:13, Maor Gottlieb wrote:
> > This patch is not the full fix and still causes to call traces
> > during mlx5_ib_dereg_mr().
> > 
> > This reverts commit f0ae4afe3d35e67db042c58a52909e06262b740f.
> 
> What is the status of this in the latest stable Kernel?

The revert is not accepted yet.

> I see that f0ae4afe3d35e67db042c58a52909e06262b740f was backported into
> 5.15, but since then, no fix or revert made it into 5.15.
> 
> I wanted to upgrade our cluster to 5.15 soon, but I'm a bit concerned about
> reports of issues surrounding this patch.
Jason Gunthorpe Jan. 5, 2022, 5 p.m. UTC | #6
On Wed, Dec 22, 2021 at 12:13:12PM +0200, Maor Gottlieb wrote:
> This patch is not the full fix and still causes to call traces
> during mlx5_ib_dereg_mr().
> 
> This reverts commit f0ae4afe3d35e67db042c58a52909e06262b740f.
> 
> Fixes: f0ae4afe3d35 ("RDMA/mlx5: Fix releasing unallocated memory in dereg MR flow")
> Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
> Acked-by: Leon Romanovsky <leonro@nvidia.com>
> ---
>  drivers/infiniband/hw/mlx5/mlx5_ib.h |  6 +++---
>  drivers/infiniband/hw/mlx5/mr.c      | 26 ++++++++++++++------------
>  2 files changed, 17 insertions(+), 15 deletions(-)

Applied to for-rc, lets get the correct fix for the next cycle.

Thanks
Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 4a7a56ed740b..e636e954f6bf 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -664,6 +664,7 @@  struct mlx5_ib_mr {
 
 	/* User MR data */
 	struct mlx5_cache_ent *cache_ent;
+	struct ib_umem *umem;
 
 	/* This is zero'd when the MR is allocated */
 	union {
@@ -675,7 +676,7 @@  struct mlx5_ib_mr {
 			struct list_head list;
 		};
 
-		/* Used only by kernel MRs */
+		/* Used only by kernel MRs (umem == NULL) */
 		struct {
 			void *descs;
 			void *descs_alloc;
@@ -696,9 +697,8 @@  struct mlx5_ib_mr {
 			int data_length;
 		};
 
-		/* Used only by User MRs */
+		/* Used only by User MRs (umem != NULL) */
 		struct {
-			struct ib_umem *umem;
 			unsigned int page_shift;
 			/* Current access_flags */
 			int access_flags;
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 63e2129f1142..157d862fb864 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1904,18 +1904,19 @@  mlx5_alloc_priv_descs(struct ib_device *device,
 	return ret;
 }
 
-static void mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
+static void
+mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
 {
-	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
-	int size = mr->max_descs * mr->desc_size;
-
-	if (!mr->descs)
-		return;
+	if (!mr->umem && mr->descs) {
+		struct ib_device *device = mr->ibmr.device;
+		int size = mr->max_descs * mr->desc_size;
+		struct mlx5_ib_dev *dev = to_mdev(device);
 
-	dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
-			 DMA_TO_DEVICE);
-	kfree(mr->descs_alloc);
-	mr->descs = NULL;
+		dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
+				 DMA_TO_DEVICE);
+		kfree(mr->descs_alloc);
+		mr->descs = NULL;
+	}
 }
 
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
@@ -1991,8 +1992,7 @@  int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 	if (mr->cache_ent) {
 		mlx5_mr_cache_free(dev, mr);
 	} else {
-		if (!udata)
-			mlx5_free_priv_descs(mr);
+		mlx5_free_priv_descs(mr);
 		kfree(mr);
 	}
 	return 0;
@@ -2079,6 +2079,7 @@  static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
 	if (err)
 		goto err_free_in;
 
+	mr->umem = NULL;
 	kfree(in);
 
 	return mr;
@@ -2205,6 +2206,7 @@  static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
 	}
 
 	mr->ibmr.device = pd->device;
+	mr->umem = NULL;
 
 	switch (mr_type) {
 	case IB_MR_TYPE_MEM_REG: