[34/36] IB/mlx5/hmm: add mlx5 HMM device initialization and callback.

Message ID 1432236233-4035-35-git-send-email-j.glisse@gmail.com (mailing list archive)

Commit Message

Jerome Glisse May 21, 2015, 7:23 p.m. UTC
From: Jérôme Glisse <jglisse@redhat.com>

This adds the core HMM callbacks for the mlx5 device driver and initializes
the HMM device for the mlx5 InfiniBand device driver.

Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: John Hubbard <jhubbard@nvidia.com>
cc: <linux-rdma@vger.kernel.org>
---
 drivers/infiniband/core/umem_odp.c   |  12 ++-
 drivers/infiniband/hw/mlx5/main.c    |   5 +
 drivers/infiniband/hw/mlx5/mem.c     |  36 ++++++-
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  19 +++-
 drivers/infiniband/hw/mlx5/mr.c      |   9 +-
 drivers/infiniband/hw/mlx5/odp.c     | 177 ++++++++++++++++++++++++++++++++++-
 include/rdma/ib_umem_odp.h           |  20 +++-
 7 files changed, 268 insertions(+), 10 deletions(-)
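
For orientation, here is a condensed sketch of the wiring this patch adds. It only restates code from the odp.c and main.c hunks below; the hmm_device_register()/hmm_device_ops interface and the struct ib_device fields (hmm_dev, hmm_ready, ib_mirrors, hmm_mutex) come from earlier patches in this series.

/* Sketch only -- condensed from the diff below, not a drop-in replacement. */

static const struct hmm_device_ops mlx5_hmm_ops = {
	.release	= &mlx5_hmm_release,	/* mm teardown: invalidate every mirrored range */
	.update		= &mlx5_hmm_update,	/* CPU page-table event: invalidate or dirty a range */
};

/* Called from mlx5_ib_add() before ib_register_device(); on failure
 * hmm_ready stays false and the device simply comes up without ODP.
 */
void mlx5_dev_init_odp_hmm(struct ib_device *ib_device, struct device *dev)
{
	INIT_LIST_HEAD(&ib_device->ib_mirrors);
	ib_device->hmm_dev.dev = dev;
	ib_device->hmm_dev.ops = &mlx5_hmm_ops;
	ib_device->hmm_ready = !hmm_device_register(&ib_device->hmm_dev);
	mutex_init(&ib_device->hmm_mutex);
}

/* Called from mlx5_ib_remove() after ib_unregister_device(). */
void mlx5_dev_fini_odp_hmm(struct ib_device *ib_device)
{
	if (!ib_device->hmm_ready)
		return;
	hmm_device_unregister(&ib_device->hmm_dev);
}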

Patch

diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index d5d57a8..559542d 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -132,7 +132,7 @@  int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
 			return -ENOMEM;
 		}
 		kref_init(&ib_mirror->kref);
-		init_rwsem(&ib_mirror->hmm_mr_rwsem);
+		init_rwsem(&ib_mirror->umem_rwsem);
 		ib_mirror->umem_tree = RB_ROOT;
 		ib_mirror->ib_device = ib_device;
 
@@ -149,10 +149,11 @@  int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
 		context->ib_mirror = ib_mirror_ref(ib_mirror);
 	}
 	mutex_unlock(&ib_device->hmm_mutex);
-	umem->odp_data.ib_mirror = ib_mirror;
+	umem->odp_data->ib_mirror = ib_mirror;
 
 	down_write(&ib_mirror->umem_rwsem);
-	rbt_ib_umem_insert(&umem->odp_data->interval_tree, &mirror->umem_tree);
+	rbt_ib_umem_insert(&umem->odp_data->interval_tree,
+			   &ib_mirror->umem_tree);
 	up_write(&ib_mirror->umem_rwsem);
 
 	mmput(mm);
@@ -161,7 +162,7 @@  int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
 
 void ib_umem_odp_release(struct ib_umem *umem)
 {
-	struct ib_mirror *ib_mirror = umem->odp_data;
+	struct ib_mirror *ib_mirror = umem->odp_data->ib_mirror;
 
 	/*
 	 * Ensure that no more pages are mapped in the umem.
@@ -178,7 +179,8 @@  void ib_umem_odp_release(struct ib_umem *umem)
 	 * range covered by one and only one umem while holding the umem rwsem.
 	 */
 	down_write(&ib_mirror->umem_rwsem);
-	rbt_ib_umem_remove(&umem->odp_data->interval_tree, &mirror->umem_tree);
+	rbt_ib_umem_remove(&umem->odp_data->interval_tree,
+			   &ib_mirror->umem_tree);
 	up_write(&ib_mirror->umem_rwsem);
 
 	ib_mirror_unref(ib_mirror);
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index d553f90..eddabf0 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1316,6 +1316,9 @@  static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	if (err)
 		goto err_rsrc;
 
+	/* If HMM initialization fails we just do not enable odp. */
+	mlx5_dev_init_odp_hmm(&dev->ib_dev, &mdev->pdev->dev);
+
 	err = ib_register_device(&dev->ib_dev, NULL);
 	if (err)
 		goto err_odp;
@@ -1340,6 +1343,7 @@  err_umrc:
 
 err_dev:
 	ib_unregister_device(&dev->ib_dev);
+	mlx5_dev_fini_odp_hmm(&dev->ib_dev);
 
 err_odp:
 	mlx5_ib_odp_remove_one(dev);
@@ -1359,6 +1363,7 @@  static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
 
 	ib_unregister_device(&dev->ib_dev);
 	destroy_umrc_res(dev);
+	mlx5_dev_fini_odp_hmm(&dev->ib_dev);
 	mlx5_ib_odp_remove_one(dev);
 	destroy_dev_resources(&dev->devr);
 	ib_dealloc_device(&dev->ib_dev);
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index 21084c7..f150825 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -164,7 +164,41 @@  void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 	int entry;
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM
-#error "CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM not supported at this stage !"
+	if (umem->odp_data) {
+		struct ib_mirror *ib_mirror = umem->odp_data->ib_mirror;
+		struct hmm_mirror *mirror = &ib_mirror->base;
+		struct hmm_pt_iter *iter = data, local_iter;
+		unsigned long addr;
+
+		if (iter == NULL) {
+			iter = &local_iter;
+			hmm_pt_iter_init(iter);
+		}
+
+		for (i = 0, addr = ib_umem_start(umem) + (offset << PAGE_SHIFT);
+		     i < num_pages; ++i, addr += PAGE_SIZE) {
+			dma_addr_t *ptep, pte;
+
+			/* Get and lock pointer to mirror page table. */
+			ptep = hmm_pt_iter_update(iter, &mirror->pt, addr);
+			pte = ptep ? *ptep : 0;
+			/* HMM will not have any page table entries set up if
+			 * this function is called before page faults have
+			 * happened on the MR. In that case, we don't have PAs
+			 * yet, so just set each entry to zero and continue on.
+			 * The hw will trigger a page fault.
+			 */
+			if (hmm_pte_test_valid_dma(&pte))
+				pas[i] = cpu_to_be64(umem_dma_to_mtt(pte));
+			else
+				pas[i] = (__be64)0;
+		}
+
+		if (iter == &local_iter)
+			hmm_pt_iter_fini(iter, &mirror->pt);
+
+		return;
+	}
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
 	const bool odp = umem->odp_data != NULL;
 
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index a6d62be..f1bafd4 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -615,6 +615,7 @@  int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
 			    struct ib_mr_status *mr_status);
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+
 extern struct workqueue_struct *mlx5_ib_page_fault_wq;
 
 int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev);
@@ -629,13 +630,18 @@  void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp);
 void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp);
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM
-#error "CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM not supported at this stage !"
+void mlx5_dev_init_odp_hmm(struct ib_device *ib_dev, struct device *dev);
+void mlx5_dev_fini_odp_hmm(struct ib_device *ib_dev);
+int mlx5_ib_umem_invalidate(struct ib_umem *umem, u64 start,
+			    u64 end, void *cookie);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
 void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 			      unsigned long end);
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
 
+
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+
 static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
 {
 	return 0;
@@ -671,4 +677,15 @@  static inline u8 convert_access(int acc)
 #define MLX5_MAX_UMR_SHIFT 16
 #define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT)
 
+#ifndef CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM
+static inline void mlx5_dev_init_odp_hmm(struct ib_device *ib_dev,
+					 struct device *dev)
+{
+}
+
+static inline void mlx5_dev_fini_odp_hmm(struct ib_device *ib_dev)
+{
+}
+#endif /* ! CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
+
 #endif /* MLX5_IB_H */
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 23cd123..7b2d84a 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1210,7 +1210,14 @@  int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
 		/* Wait for all running page-fault handlers to finish. */
 		synchronize_srcu(&dev->mr_srcu);
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM
-#error "CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM not supported at this stage !"
+		if (mlx5_ib_umem_invalidate(umem, ib_umem_start(umem),
+					    ib_umem_end(umem), NULL))
+			/*
+			 * FIXME do something to kill all mr and umem
+			 * in use by this process.
+			 */
+			pr_err("killing all mr with odp due to "
+			       "mtt update failure\n");
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
 		/* Destroy all page mappings */
 		mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 1de4d13..bd29155 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -52,8 +52,183 @@  static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
 	return container_of(mmr, struct mlx5_ib_mr, mmr);
 }
 
+
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM
-#error "CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM not supported at this stage !"
+
+
+int mlx5_ib_umem_invalidate(struct ib_umem *umem, u64 start,
+			    u64 end, void *cookie)
+{
+	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1;
+	u64 idx = 0, blk_start_idx = 0;
+	struct hmm_pt_iter iter;
+	struct mlx5_ib_mr *mlx5_ib_mr;
+	struct hmm_mirror *mirror;
+	int in_block = 0;
+	u64 addr;
+	int ret = 0;
+
+	if (!umem || !umem->odp_data) {
+		pr_err("invalidation called on NULL umem or non-ODP umem\n");
+		return -EINVAL;
+	}
+
+	/* Is this ib_mr active and registered yet? */
+	if (umem->odp_data->private == NULL)
+		return 0;
+
+	mlx5_ib_mr = umem->odp_data->private;
+	if (!mlx5_ib_mr->ibmr.pd)
+		return 0;
+
+	start = max_t(u64, ib_umem_start(umem), start);
+	end = min_t(u64, ib_umem_end(umem), end);
+	hmm_pt_iter_init(&iter);
+	mirror = &umem->odp_data->ib_mirror->base;
+
+	/*
+	 * Iteration one - zap the HW's MTTs. HMM ensures that while we are
+	 * doing the invalidation, no page fault will attempt to overwrite the
+	 * same MTTs. Concurrent invalidations might race us, but they will
+	 * write 0s as well, so no difference in the end result.
+	 */
+	for (addr = start; addr < end; addr += (u64)umem->page_size) {
+		dma_addr_t *ptep;
+
+		/* Needs to happen before the ptep lookup, as ptep might break
+		 * the loop and idx might be used outside the loop.
+		 */
+		idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
+
+		/* Get and lock pointer to mirror page table. */
+		ptep = hmm_pt_iter_update(&iter, &mirror->pt, addr);
+		if (!ptep) {
+			addr = hmm_pt_iter_next(&iter, &mirror->pt, addr, end);
+			continue;
+		}
+
+		/*
+		 * Strive to write the MTTs in chunks, but avoid overwriting
+		 * non-existing MTTs. The heuristic here can be improved to
+		 * estimate the cost of another UMR vs. the cost of a bigger
+		 * UMR.
+		 */
+		if ((*ptep) & (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+			if ((*ptep) & ODP_WRITE_ALLOWED_BIT)
+				hmm_pte_set_dirty(ptep);
+			/*
+			 * Because there cannot be concurrent overlapping
+			 * munmap, page migration or page write-protection,
+			 * it is safe to clear those bits here.
+			 */
+			hmm_pte_clear_bit(ptep, ODP_READ_ALLOWED_SHIFT);
+			hmm_pte_clear_bit(ptep, ODP_WRITE_ALLOWED_SHIFT);
+			if (!in_block) {
+				blk_start_idx = idx;
+				in_block = 1;
+			}
+		} else {
+			u64 umr_offset = idx & umr_block_mask;
+
+			if (in_block && umr_offset == 0) {
+				ret = mlx5_ib_update_mtt(mlx5_ib_mr,
+							 blk_start_idx,
+							 idx - blk_start_idx, 1,
+							 &iter) || ret;
+				in_block = 0;
+			}
+		}
+	}
+	if (in_block)
+		ret = mlx5_ib_update_mtt(mlx5_ib_mr, blk_start_idx,
+					 idx - blk_start_idx + 1, 1,
+					 &iter) || ret;
+	hmm_pt_iter_fini(&iter, &mirror->pt);
+	return ret;
+}
+
+static int mlx5_hmm_invalidate_range(struct hmm_mirror *mirror,
+				     unsigned long start,
+				     unsigned long end)
+{
+	struct ib_mirror *ib_mirror;
+	int ret;
+
+	ib_mirror = container_of(mirror, struct ib_mirror, base);
+
+	/* Go over all memory regions and invalidate them. */
+	down_read(&ib_mirror->umem_rwsem);
+	ret = rbt_ib_umem_for_each_in_range(&ib_mirror->umem_tree, start, end,
+					    mlx5_ib_umem_invalidate, NULL);
+	up_read(&ib_mirror->umem_rwsem);
+	return ret;
+}
+
+static void mlx5_hmm_release(struct hmm_mirror *mirror)
+{
+	struct ib_mirror *ib_mirror;
+
+	ib_mirror = container_of(mirror, struct ib_mirror, base);
+
+	/* Go over all memory regions and invalidate them. */
+	mlx5_hmm_invalidate_range(mirror, 0, ULLONG_MAX);
+}
+
+static int mlx5_hmm_update(struct hmm_mirror *mirror,
+			   const struct hmm_event *event)
+{
+	struct device *device = mirror->device->dev;
+	int ret = 0;
+
+	switch (event->etype) {
+	case HMM_DEVICE_RFAULT:
+	case HMM_DEVICE_WFAULT:
+		/* FIXME implement. */
+		break;
+	case HMM_ISDIRTY:
+		hmm_mirror_range_dirty(mirror, event->start, event->end);
+		break;
+	case HMM_NONE:
+	default:
+		dev_warn(device, "Warning: unhandled HMM event (%d), "
+			 "defaulting to invalidation\n", event->etype);
+		/* Fallthrough. */
+	/* For write protect and fork we could invalidate only writable MRs. */
+	case HMM_WRITE_PROTECT:
+	case HMM_MIGRATE:
+	case HMM_MUNMAP:
+	case HMM_FORK:
+		ret = mlx5_hmm_invalidate_range(mirror,
+						event->start,
+						event->end);
+		break;
+	}
+
+	return ret;
+}
+
+static const struct hmm_device_ops mlx5_hmm_ops = {
+	.release		= &mlx5_hmm_release,
+	.update			= &mlx5_hmm_update,
+};
+
+void mlx5_dev_init_odp_hmm(struct ib_device *ib_device, struct device *dev)
+{
+	INIT_LIST_HEAD(&ib_device->ib_mirrors);
+	ib_device->hmm_dev.dev = dev;
+	ib_device->hmm_dev.ops = &mlx5_hmm_ops;
+	ib_device->hmm_ready = !hmm_device_register(&ib_device->hmm_dev);
+	mutex_init(&ib_device->hmm_mutex);
+}
+
+void mlx5_dev_fini_odp_hmm(struct ib_device *ib_device)
+{
+	if (!ib_device->hmm_ready)
+		return;
+	hmm_device_unregister(&ib_device->hmm_dev);
+}
+
+
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
 
 
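To make the per-page bookkeeping easier to follow, here is an illustrative helper (not part of the patch; the name zap_one_mirror_pte is made up) showing what the loop in mlx5_ib_umem_invalidate() above does to a single mirror page-table entry, assuming the hmm_pte_* helpers from the HMM core patches and the ODP_* bits defined in the ib_umem_odp.h hunk below.

/* Illustration only -- not part of the patch. */
static void zap_one_mirror_pte(dma_addr_t *ptep)
{
	/* Entry was never mapped for the HCA: nothing to tear down. */
	if (!(*ptep & (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)))
		return;

	/* The HCA may have written to the page, so propagate dirtiness. */
	if (*ptep & ODP_WRITE_ALLOWED_BIT)
		hmm_pte_set_dirty(ptep);

	/* Safe without further locking: no concurrent overlapping munmap,
	 * migration or write-protect can run on this range.
	 */
	hmm_pte_clear_bit(ptep, ODP_READ_ALLOWED_SHIFT);
	hmm_pte_clear_bit(ptep, ODP_WRITE_ALLOWED_SHIFT);

	/* The hardware MTTs themselves are zeroed in bulk afterwards via
	 * mlx5_ib_update_mtt().
	 */
}
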
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index c7c2670..e982fd3 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -133,7 +133,25 @@  struct umem_odp_node *rbt_ib_umem_iter_next(struct umem_odp_node *node,
 
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM
-#error "CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM not supported at this stage !"
+
+
+/*
+ * HMM reserves a few bits for hardware-specific use inside the mirror page
+ * table. For IB we record the per-page mapping protection there.
+ */
+#define ODP_READ_ALLOWED_SHIFT	(HMM_PTE_HW_SHIFT + 0)
+#define ODP_WRITE_ALLOWED_SHIFT	(HMM_PTE_HW_SHIFT + 1)
+#define ODP_READ_ALLOWED_BIT	(1 << ODP_READ_ALLOWED_SHIFT)
+#define ODP_WRITE_ALLOWED_BIT	(1 << ODP_WRITE_ALLOWED_SHIFT)
+
+/* Make sure we are not overwriting valid address bits on the target arch. */
+#if (HMM_PTE_HW_SHIFT + 2) > PAGE_SHIFT
+#error (HMM_PTE_HW_SHIFT + 2) > PAGE_SHIFT
+#endif
+
+#define ODP_DMA_ADDR_MASK HMM_PTE_DMA_MASK
+
+
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */