diff mbox series

[05/16] iommupt: Add unmap_pages op

Message ID 5-v1-01fa10580981+1d-iommu_pt_jgg@nvidia.com (mailing list archive)
State New
Headers show
Series Consolidate iommu page table implementations | expand

Commit Message

Jason Gunthorpe Aug. 15, 2024, 3:11 p.m. UTC
unmap_pages removes mappings and any fully contained interior tables from
the given range. This follows the strict iommu_domain API definition where
it does not split up larger page sizes into smaller ones. The caller must
perform unmap only on ranges created by map or it must have somehow
otherwise determined safe cut points (e.g. iommufd/vfio use iova_to_phys to
scan for them).

A following patch will provide 'cut' which explicitly does the page size
split if the HW can support it.

unmap is implemented with a recursive descent of the tree. It has an
additional cost of checking that the entire VA range is mapped. If the
caller provides a VA range that spans an entire table item then the table
can be freed as well.

Cache incoherent HW is handled by keeping track of which table memory
ranges need CPU cache invalidation at each level and performing that
invalidation once when ascending from that level.

Currently, the only user I know of for partial unmap is VFIO type 1 v1.0.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/generic_pt/iommu_pt.h | 143 ++++++++++++++++++++++++++++
 include/linux/generic_pt/iommu.h    |  24 +++++
 2 files changed, 167 insertions(+)
diff mbox series

Patch

diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
index 835c84ea716093..6d1c59b33d02f3 100644
--- a/drivers/iommu/generic_pt/iommu_pt.h
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -14,6 +14,63 @@ 
 
 #include <linux/iommu.h>
 #include <linux/export.h>
+#include <linux/cleanup.h>
+#include <linux/dma-mapping.h>
+
+/*
+ * Keep track of what table items are being written to during mutation
+ * operations. When the HW is DMA Incoherent these have to be cache flushed
+ * before they are visible. The write_log batches flushes together and uses a C
+ * cleanup to make sure the table memory is flushed before walking concludes
+ * with that table.
+ *
+ * There are two notable cases that need special flushing:
+ *  1) Installing a table entry requires the new table memory (and all of its
+ *     children) are flushed.
+ *  2) Installing a shared table requires that other threads using the shared
+ *     table ensure it is flushed before they attempt to use it.
+ */
+struct iommu_write_log {
+	struct pt_range *range;
+	struct pt_table_p *table;
+	unsigned int start_idx;
+	unsigned int last_idx;
+};
+
+static void record_write(struct iommu_write_log *wlog,
+			 const struct pt_state *pts,
+			 unsigned int index_count_lg2)
+{
+	if (!(PT_SUPPORTED_FEATURES & BIT(PT_FEAT_DMA_INCOHERENT)))
+		return;
+
+	if (!wlog->table) {
+		wlog->table = pts->table;
+		wlog->start_idx = pts->index;
+	}
+	wlog->last_idx =
+		max(wlog->last_idx,
+		    log2_set_mod(pts->index + log2_to_int(index_count_lg2), 0,
+				 index_count_lg2));
+}
+
+static void done_writes(struct iommu_write_log *wlog)
+{
+	struct pt_iommu *iommu_table = iommu_from_common(wlog->range->common);
+	dma_addr_t dma;
+
+	if (!pt_feature(wlog->range->common, PT_FEAT_DMA_INCOHERENT) ||
+	    !wlog->table)
+		return;
+
+	dma = virt_to_phys(wlog->table);
+	dma_sync_single_for_device(iommu_table->iommu_device,
+				   dma + wlog->start_idx * PT_ENTRY_WORD_SIZE,
+				   (wlog->last_idx - wlog->start_idx + 1) *
+					   PT_ENTRY_WORD_SIZE,
+				   DMA_TO_DEVICE);
+	wlog->table = NULL;
+}
 
 static int make_range(struct pt_common *common, struct pt_range *range,
 		      dma_addr_t iova, dma_addr_t len)
@@ -102,6 +159,91 @@  static int __collect_tables(struct pt_range *range, void *arg,
 	return 0;
 }
 
+struct pt_unmap_args {
+	struct pt_radix_list_head free_list;
+	pt_vaddr_t unmapped;
+};
+
+static int __unmap_pages(struct pt_range *range, void *arg, unsigned int level,
+			 struct pt_table_p *table)
+{
+	struct iommu_write_log wlog __cleanup(done_writes) = { .range = range };
+	struct pt_state pts = pt_init(range, level, table);
+	struct pt_unmap_args *unmap = arg;
+	int ret;
+
+	for_each_pt_level_item(&pts) {
+		switch (pts.type) {
+		case PT_ENTRY_TABLE: {
+			/* descend will change va */
+			bool fully_covered = pt_entry_fully_covered(
+				&pts, pt_table_item_lg2sz(&pts));
+
+			ret = pt_descend(&pts, arg, __unmap_pages);
+			if (ret)
+				return ret;
+
+			/*
+			 * If the unmapping range fully covers the table then we
+			 * can free it as well. The clear is delayed until we
+			 * succeed in clearing the lower table levels.
+			 */
+			if (fully_covered) {
+				pt_radix_add_list(&unmap->free_list,
+						  pts.table_lower);
+				record_write(&wlog, &pts, ilog2(1));
+				pt_clear_entry(&pts, ilog2(1));
+			}
+			break;
+		}
+		case PT_ENTRY_EMPTY:
+			return -EFAULT;
+		case PT_ENTRY_OA:
+			/*
+			 * The IOMMU API does not require drivers to support
+			 * unmapping parts of pages. Only legacy VFIO type 1 v1
+			 * will attempt it after probing for "fine-grained
+			 * superpages" support. There it allows the v1 version
+			 * of VFIO (that nobody uses) to pass more than
+			 * PAGE_SIZE to map.
+			 */
+			if (!pt_entry_fully_covered(&pts,
+						    pt_entry_oa_lg2sz(&pts)))
+				return -EADDRINUSE;
+			unmap->unmapped += log2_to_int(pt_entry_oa_lg2sz(&pts));
+			record_write(&wlog, &pts,
+				     pt_entry_num_contig_lg2(&pts));
+			pt_clear_entry(&pts, pt_entry_num_contig_lg2(&pts));
+			break;
+		}
+	}
+	return 0;
+}
+
+static size_t NS(unmap_pages)(struct pt_iommu *iommu_table, dma_addr_t iova,
+			      dma_addr_t len,
+			      struct iommu_iotlb_gather *iotlb_gather)
+{
+	struct pt_common *common = common_from_iommu(iommu_table);
+	struct pt_unmap_args unmap = {};
+	struct pt_range range;
+	int ret;
+
+	ret = make_range(common_from_iommu(iommu_table), &range, iova, len);
+	if (ret)
+		return ret;
+
+	pt_walk_range(&range, __unmap_pages, &unmap);
+
+	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+		pt_radix_stop_incoherent_list(&unmap.free_list,
+					      iommu_table->iommu_device);
+
+	/* FIXME into gather */
+	pt_radix_free_list_rcu(&unmap.free_list);
+	return unmap.unmapped;
+}
+
 static void NS(get_info)(struct pt_iommu *iommu_table,
 			 struct pt_iommu_info *info)
 {
@@ -143,6 +285,7 @@  static void NS(deinit)(struct pt_iommu *iommu_table)
 }
 
 static const struct pt_iommu_ops NS(ops) = {
+	.unmap_pages = NS(unmap_pages),
 	.iova_to_phys = NS(iova_to_phys),
 	.get_info = NS(get_info),
 	.deinit = NS(deinit),
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index 5cd56eac14b41d..bdb6bf2c2ebe85 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -8,6 +8,7 @@ 
 #include <linux/generic_pt/common.h>
 #include <linux/mm_types.h>
 
+struct iommu_iotlb_gather;
 struct pt_iommu_ops;
 
 /**
@@ -60,6 +61,29 @@  struct pt_iommu_info {
 
 /* See the function comments in iommu_pt.c for kdocs */
 struct pt_iommu_ops {
+	/**
+	 * unmap_pages() - Make a range of IOVA empty/not present
+	 * @iommu_table: Table to manipulate
+	 * @iova: IO virtual address to start
+	 * @len: Length of the range starting from @iova
+	 * @iotlb_gather: Gather struct that must be flushed on return
+	 *
+	 * unmap_pages() will remove translation created by map_pages().
+	 * It cannot subdivide a mapping created by map_pages(),
+	 * so it should be called with IOVA ranges that match those passed
+	 * to map_pages. The IOVA range can aggregate contiguous map_pages() calls
+	 * so long as no individual range is split.
+	 *
+	 * Context: The caller must hold a write range lock that includes
+	 * the whole range.
+	 *
+	 * Returns: Number of bytes of VA unmapped. @iova plus the return value is
+	 * the point where unmapping stopped.
+	 */
+	size_t (*unmap_pages)(struct pt_iommu *iommu_table, dma_addr_t iova,
+			      dma_addr_t len,
+			      struct iommu_iotlb_gather *iotlb_gather);
+
 	/**
 	 * iova_to_phys() - Return the output address for the given IOVA
 	 * @iommu_table: Table to query