@@ -159,6 +159,342 @@ static int __collect_tables(struct pt_range *range, void *arg,
return 0;
}
+/*
+ * Allocate a table, the empty table will be ready to be installed.
+ *
+ * @common: Page table the new table memory is allocated for
+ * @lg2sz: log2 of the table size, passed through to pt_radix_alloc()
+ * @gfp: GFP flags for the allocation
+ * @no_incoherent_start: When true pt_radix_start_incoherent() is not called
+ *	even if PT_FEAT_DMA_INCOHERENT is set
+ *
+ * Returns the table memory, or an ERR_PTR() on failure.
+ */
+static inline struct pt_table_p *_table_alloc(struct pt_common *common,
+					      size_t lg2sz, gfp_t gfp,
+					      bool no_incoherent_start)
+{
+	struct pt_iommu *iommu_table = iommu_from_common(common);
+	struct pt_table_p *table_mem;
+
+	table_mem = pt_radix_alloc(common, iommu_table->nid, lg2sz, gfp);
+	/*
+	 * A failed allocation must never reach pt_radix_start_incoherent().
+	 * Callers only test IS_ERR(), so a NULL is reported as -ENOMEM.
+	 */
+	if (!table_mem)
+		return ERR_PTR(-ENOMEM);
+	if (IS_ERR(table_mem))
+		return table_mem;
+
+	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
+	    !no_incoherent_start) {
+		int ret = pt_radix_start_incoherent(
+			table_mem, iommu_table->iommu_device, true);
+
+		if (ret) {
+			pt_radix_free(table_mem);
+			return ERR_PTR(ret);
+		}
+	}
+	return table_mem;
+}
+
+/*
+ * Allocate the memory for a top level table. The size is taken from
+ * pt_top_memsize_lg2() for top_of_table but is never smaller than a page.
+ */
+static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
+						 uintptr_t top_of_table,
+						 gfp_t gfp,
+						 bool no_incoherent_start)
+{
+	/*
+	 * FIXME top is special it doesn't need RCU or the list, and it might be
+	 * small. For now just waste a page on it regardless.
+	 */
+	size_t top_lg2sz =
+		max(pt_top_memsize_lg2(common, top_of_table), PAGE_SHIFT);
+
+	return _table_alloc(common, top_lg2sz, gfp, no_incoherent_start);
+}
+
+/*
+ * Allocate an interior table sized for the level described by pts: the log2
+ * number of items plus the log2 size of one entry word.
+ */
+static inline struct pt_table_p *table_alloc(struct pt_state *pts, gfp_t gfp,
+					     bool no_incoherent_start)
+{
+	size_t table_lg2sz =
+		pt_num_items_lg2(pts) + ilog2(PT_ENTRY_WORD_SIZE);
+
+	return _table_alloc(pts->range->common, table_lg2sz, gfp,
+			    no_incoherent_start);
+}
+
+/*
+ * Allocate a new lower table and install it into the item pts points at.
+ *
+ * Returns 0 with pts->table_lower set to the new table, -ENXIO if the item
+ * cannot hold a table, the allocation error from table_alloc(), or -EAGAIN
+ * when pt_install_table() did not install the table (the new table is freed).
+ */
+static inline int pt_iommu_new_table(struct pt_state *pts,
+				     struct pt_write_attrs *attrs,
+				     bool no_incoherent_start)
+{
+	struct pt_table_p *new_table;
+
+	/* Given PA/VA/length can't be represented */
+	if (unlikely(!pt_can_have_table(pts)))
+		return -ENXIO;
+
+	new_table = table_alloc(pts, attrs->gfp, no_incoherent_start);
+	if (IS_ERR(new_table))
+		return PTR_ERR(new_table);
+
+	if (pt_install_table(pts, virt_to_phys(new_table), attrs)) {
+		pts->table_lower = new_table;
+		return 0;
+	}
+
+	/*
+	 * The table was not installed, discard it and let the caller retry.
+	 * NOTE(review): _table_alloc() may have called
+	 * pt_radix_start_incoherent() on this table; confirm pt_radix_free()
+	 * alone is enough to undo that.
+	 */
+	pt_radix_free(new_table);
+	return -EAGAIN;
+}
+
+/* State shared by every level of the __map_pages() walk */
+struct pt_iommu_map_args {
+ /* Tables accumulated by pt_iommu_clear_contig() that new entries replace */
+ struct pt_radix_list_head free_list;
+ /* Passed to pt_install_leaf_entry() and pt_iommu_new_table() */
+ struct pt_write_attrs attrs;
+ /* Next output address to install, advanced after each leaf is written */
+ pt_oaddr_t oa;
+};
+
+/*
+ * Check that the items in a contiguous block are all empty. This will
+ * recursively check any tables in the block to validate they are empty and
+ * accumulate them on the free list. Makes no change on failure. On success
+ * caller must fill the items.
+ *
+ * @start_pts: First item of the block, already loaded
+ * @map: Map arguments; map->free_list is only updated on success
+ * @wlog: Not referenced by this function
+ * @pgsize_lg2: log2 size of the block to check
+ *
+ * Returns 0 on success, -EADDRINUSE if an item is neither empty nor a table,
+ * or the error returned by pt_walk_child_all().
+ */
+static int pt_iommu_clear_contig(const struct pt_state *start_pts,
+ struct pt_iommu_map_args *map,
+ struct iommu_write_log *wlog,
+ unsigned int pgsize_lg2)
+{
+ /* start_pts is const, iterate with a private copy of the range and state */
+ struct pt_range range = *start_pts->range;
+ struct pt_state pts =
+ pt_init(&range, start_pts->level, start_pts->table);
+ /* Work on a copy of the list head so map->free_list changes only on success */
+ struct pt_iommu_collect_args collect = {
+ .free_list = map->free_list,
+ };
+ int ret;
+
+ pts.index = start_pts->index;
+ pts.table_lower = start_pts->table_lower;
+ /* The block covers pgsize_lg2 worth of items starting at index */
+ pts.end_index = start_pts->index +
+ log2_to_int(pgsize_lg2 - pt_table_item_lg2sz(&pts));
+ pts.type = start_pts->type;
+ pts.entry = start_pts->entry;
+ while (true) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ ret = pt_walk_child_all(&pts, __collect_tables,
+ &collect);
+ if (ret)
+ return ret;
+ pt_radix_add_list(&collect.free_list,
+ pt_table_ptr(&pts));
+ } else if (pts.type != PT_ENTRY_EMPTY) {
+ return -EADDRINUSE;
+ }
+
+ /* Step one item; only load the entry if it is still inside the block */
+ _pt_advance(&pts, ilog2(1));
+ if (pts.index == pts.end_index)
+ break;
+ pt_load_entry(&pts);
+ }
+ map->free_list = collect.free_list;
+ return 0;
+}
+
+/*
+ * Walker callback used with pt_walk_range() and pt_descend(). Installs leaf
+ * entries for the items of one table covered by range, taking the output
+ * address from map->oa and advancing it, and descends into (possibly newly
+ * allocated) child tables where no page size fits at this level.
+ *
+ * @range: Range being mapped
+ * @arg: struct pt_iommu_map_args
+ * @level: Level of @table
+ * @table: Table to operate on
+ *
+ * Returns 0 on success, -EADDRINUSE if an item is already in use, or an error
+ * from pt_iommu_clear_contig(), pt_iommu_new_table() or the child walk.
+ */
+static int __map_pages(struct pt_range *range, void *arg, unsigned int level,
+ struct pt_table_p *table)
+{
+ struct iommu_write_log wlog __cleanup(done_writes) = { .range = range };
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_map_args *map = arg;
+ int ret;
+
+again:
+ for_each_pt_level_item(&pts) {
+ /*
+ * FIXME: This allows us to segment on our own, but there is
+ * probably a better performing way to implement it.
+ */
+ unsigned int pgsize_lg2 = pt_compute_best_pgsize(&pts, map->oa);
+
+ /*
+ * Our mapping fully covers this page size of items starting
+ * here
+ */
+ if (pgsize_lg2) {
+ /*
+ * A single empty item can be written directly, anything
+ * else must first be validated as empty.
+ */
+ if (pgsize_lg2 != pt_table_item_lg2sz(&pts) ||
+ pts.type != PT_ENTRY_EMPTY) {
+ ret = pt_iommu_clear_contig(&pts, map, &wlog,
+ pgsize_lg2);
+ if (ret)
+ return ret;
+ }
+
+ record_write(&wlog, &pts, pgsize_lg2);
+ pt_install_leaf_entry(&pts, map->oa, pgsize_lg2,
+ &map->attrs);
+ pts.type = PT_ENTRY_OA;
+ map->oa += log2_to_int(pgsize_lg2);
+ continue;
+ }
+
+ /* Otherwise we need to descend to a child table */
+
+ if (pts.type == PT_ENTRY_EMPTY) {
+ record_write(&wlog, &pts, ilog2(1));
+ ret = pt_iommu_new_table(&pts, &map->attrs, false);
+ if (ret) {
+ /*
+ * Racing with another thread installing a table
+ */
+ if (ret == -EAGAIN)
+ goto again;
+ return ret;
+ }
+ /*
+ * NOTE(review): done_writes() is also the __cleanup handler of
+ * wlog, so it runs again on return; confirm that is safe.
+ */
+ if (pts_feature(&pts, PT_FEAT_DMA_INCOHERENT)) {
+ done_writes(&wlog);
+ pt_radix_done_incoherent_flush(pts.table_lower);
+ }
+ } else if (pts.type == PT_ENTRY_TABLE) {
+ /*
+ * Racing with a shared pt_iommu_new_table()? The other
+ * thread is still flushing the cache, so we have to
+ * also flush it to ensure that when our thread's map
+ * completes our mapping is working.
+ *
+ * Using the folio memory means we don't have to rely on
+ * an available PTE bit to keep track.
+ *
+ */
+ if (pts_feature(&pts, PT_FEAT_DMA_INCOHERENT) &&
+ pt_radix_incoherent_still_flushing(pts.table_lower))
+ record_write(&wlog, &pts, ilog2(1));
+ } else {
+ return -EADDRINUSE;
+ }
+
+ /*
+ * Notice the already present table can possibly be shared with
+ * another concurrent map.
+ */
+ ret = pt_descend(&pts, arg, __map_pages);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * Add a table to the top, increasing the top level as much as necessary to
+ * encompass range.
+ *
+ * @iommu_table: Table to grow
+ * @range: VA range that must become representable. On success it is rebuilt
+ *	with pt_make_range() against the new top.
+ * @attrs: Supplies the gfp flags and the attributes for pt_install_table()
+ *
+ * Returns 0 on success, -ERANGE if the level limit or max_vasz_lg2 is reached
+ * before the range fits, -EAGAIN if top_of_table changed while the new levels
+ * were being built (everything is thrown away, the caller should retry), or
+ * the error from table_alloc_top() / pt_radix_start_incoherent_list(). On any
+ * failure all tables allocated here are freed.
+ */
+static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range,
+			struct pt_write_attrs *attrs)
+{
+	struct pt_common *common = common_from_iommu(iommu_table);
+	uintptr_t top_of_table = READ_ONCE(common->top_of_table);
+	uintptr_t new_top_of_table = top_of_table;
+	struct pt_radix_list_head free_list = {};
+	unsigned long flags;
+	int ret;
+
+	while (true) {
+		struct pt_range top_range =
+			_pt_top_range(common, new_top_of_table);
+		struct pt_state pts = pt_init_top(&top_range);
+		struct pt_table_p *table_mem;
+
+		top_range.va = range->va;
+		top_range.last_va = range->last_va;
+
+		/* Stop once the requested range fits under the candidate top */
+		if (!pt_check_range(&top_range))
+			break;
+
+		pts.level++;
+		if (pts.level > PT_MAX_TOP_LEVEL ||
+		    pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) {
+			ret = -ERANGE;
+			goto err_free;
+		}
+
+		table_mem = table_alloc_top(
+			common, _pt_top_set(NULL, pts.level), attrs->gfp, true);
+		if (IS_ERR(table_mem)) {
+			/*
+			 * Tables from earlier iterations are only reachable
+			 * through free_list, they must be released too.
+			 */
+			ret = PTR_ERR(table_mem);
+			goto err_free;
+		}
+		pt_radix_add_list(&free_list, table_mem);
+
+		/* The new table links to the lower table always at index 0 */
+		top_range.va = 0;
+		pts.table_lower = pts.table;
+		pts.table = table_mem;
+		pt_load_single_entry(&pts);
+		PT_WARN_ON(pts.index != 0);
+		pt_install_table(&pts, virt_to_phys(pts.table_lower), attrs);
+		new_top_of_table = _pt_top_set(pts.table, pts.level);
+	}
+
+	/*
+	 * The tables were allocated with no_incoherent_start set, start them
+	 * all at once now that the chain is built.
+	 */
+	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) {
+		ret = pt_radix_start_incoherent_list(
+			&free_list, iommu_from_common(common)->iommu_device);
+		if (ret)
+			goto err_free;
+	}
+
+	/*
+	 * top_of_table is write locked by the spinlock, but readers can use
+	 * READ_ONCE() to get the value. Since we encode both the level and the
+	 * pointer in one quantum the lockless reader will always see something
+	 * valid. The HW must be updated to the new level under the spinlock
+	 * before top_of_table is updated so that concurrent readers don't map
+	 * into the new level until it is fully functional. If another thread
+	 * already updated it while we were working then throw everything away
+	 * and try again.
+	 */
+	spin_lock_irqsave(&iommu_table->table_lock, flags);
+	if (common->top_of_table != top_of_table) {
+		spin_unlock_irqrestore(&iommu_table->table_lock, flags);
+		ret = -EAGAIN;
+		goto err_free;
+	}
+
+	/* FIXME update the HW here */
+	WRITE_ONCE(common->top_of_table, new_top_of_table);
+	spin_unlock_irqrestore(&iommu_table->table_lock, flags);
+
+	*range = pt_make_range(common, range->va, range->last_va);
+	PT_WARN_ON(pt_check_range(range));
+	return 0;
+
+err_free:
+	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+		pt_radix_stop_incoherent_list(
+			&free_list, iommu_from_common(common)->iommu_device);
+	pt_radix_free_list(&free_list);
+	return ret;
+}
+
+/*
+ * Install translation for an IOVA range, see map_pages() in
+ * struct pt_iommu_ops for the full contract.
+ *
+ * Returns 0 on success or -ERRNO. -EINVAL if prot has neither IOMMU_READ nor
+ * IOMMU_WRITE, -ERANGE if paddr or the IOVA range cannot be represented. The
+ * bytes that were mapped are added to *mapped on both success and failure of
+ * the walk.
+ */
+static int NS(map_pages)(struct pt_iommu *iommu_table, dma_addr_t iova,
+			 phys_addr_t paddr, dma_addr_t len, unsigned int prot,
+			 gfp_t gfp, size_t *mapped,
+			 struct iommu_iotlb_gather *iotlb_gather)
+{
+	struct pt_common *common = common_from_iommu(iommu_table);
+	struct pt_iommu_map_args map = { .oa = paddr };
+	struct pt_range range;
+	int ret;
+
+	if (WARN_ON(!(prot & (IOMMU_READ | IOMMU_WRITE))))
+		return -EINVAL;
+
+	/*
+	 * Check the paddr doesn't exceed what the table can store. The first
+	 * test only matters when pt_oaddr_t is narrower than phys_addr_t,
+	 * otherwise every paddr already fits in the type.
+	 */
+	if ((sizeof(pt_oaddr_t) < sizeof(paddr) && paddr > PT_VADDR_MAX) ||
+	    (common->max_oasz_lg2 != PT_VADDR_MAX_LG2 &&
+	     oalog2_div(paddr, common->max_oasz_lg2)))
+		return -ERANGE;
+
+	ret = pt_iommu_set_prot(common, &map.attrs, prot);
+	if (ret)
+		return ret;
+	map.attrs.gfp = gfp;
+
+again:
+	ret = make_range(common, &range, iova, len);
+	if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) && ret == -ERANGE) {
+		ret = increase_top(iommu_table, &range, &map.attrs);
+		if (ret) {
+			/* Another thread changed the top, start over */
+			if (ret == -EAGAIN)
+				goto again;
+			return ret;
+		}
+	}
+	if (ret)
+		return ret;
+
+	ret = pt_walk_range(&range, __map_pages, &map);
+
+	/*
+	 * NOTE(review): map.free_list can hold tables collected by
+	 * pt_iommu_clear_contig(), but nothing here releases them and
+	 * iotlb_gather is not used; confirm where they get flushed and freed.
+	 */
+
+	/* Bytes successfully mapped */
+	*mapped += map.oa - paddr;
+	return ret;
+}
+
struct pt_unmap_args {
struct pt_radix_list_head free_list;
pt_vaddr_t unmapped;
@@ -285,6 +621,7 @@ static void NS(deinit)(struct pt_iommu *iommu_table)
}
static const struct pt_iommu_ops NS(ops) = {
+ .map_pages = NS(map_pages),
.unmap_pages = NS(unmap_pages),
.iova_to_phys = NS(iova_to_phys),
.get_info = NS(get_info),
@@ -61,6 +61,35 @@ struct pt_iommu_info {
/* See the function comments in iommu_pt.c for kdocs */
struct pt_iommu_ops {
+ /**
+ * map_pages() - Install translation for an IOVA range
+ * @iommu_table: Table to manipulate
+ * @iova: IO virtual address to start
+ * @paddr: Physical/Output address to start
+ * @len: Length of the range starting from @iova
+ * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO
+ * @gfp: GFP flags for any memory allocations
+ * @mapped: Incremented by the number of bytes of VA that were mapped
+ * @iotlb_gather: Gather struct that must be flushed on return
+ *
+ * The range starting at IOVA will have paddr installed into it. The
+ * range is automatically segmented into optimally sized table entries,
+ * and can have any valid alignment.
+ *
+ * On error the caller will probably want to invoke unmap on the range
+ * from iova up to the amount indicated by @mapped to return the table
+ * back to an unchanged state.
+ *
+ * Context: The caller must hold a write range lock that includes
+ * the whole range.
+ *
+ * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA
+ * that were mapped are added to @mapped, @mapped is not zeroed first.
+ */
+ int (*map_pages)(struct pt_iommu *iommu_table, dma_addr_t iova,
+ phys_addr_t paddr, dma_addr_t len, unsigned int prot,
+ gfp_t gfp, size_t *mapped,
+ struct iommu_iotlb_gather *iotlb_gather);
+
/**
* unmap_pages() - Make a range of IOVA empty/not present
* @iommu_table: Table to manipulate
Implement a self-segmenting algorithm for map_pages. This can handle any valid input VA/length and will automatically break it up into appropriately sized table entries using a recursive descent algorithm. The appropriate page size is computed each step using some bitwise calculations. map is slightly complicated because it has to handle a number of special edge cases: - Overmapping a previously shared table with an OA - requires validating and discarding the possibly empty tables - Doing the above across an entire to-be-created contiguous entry. - Installing a new table concurrently with another thread - Racing table installation with CPU cache flushing - Expanding the table by adding more top levels on the fly Managing the table installation race is done using a flag in the folio. When the shared table entry is possibly unflushed the flag will be set. This works for all pagetable formats but is less efficient than the io-pgtable-arm-lpae approach of using a SW table bit. It may be interesting to provide the latter as an option. Table expansion is a unique feature of AMDv1, this version is quite similar except we handle racing concurrent lockless map. The table top pointer and starting level are encoded in a single uintptr_t which ensures we can READ_ONCE() without tearing. Any op will do the READ_ONCE() and use that fixed point as its starting point. Concurrent expansion is handled with a table global spinlock. When inserting a new table entry map checks that the portion of the table is empty. This includes removing any empty interior tables. The approach here is atomic per entry. Either the new entry is written, or no change is made to the table. This is done by keeping a list of interior tables to free and only progressing once the entire space is checked to be empty. 
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> --- drivers/iommu/generic_pt/iommu_pt.h | 337 ++++++++++++++++++++++++++++ include/linux/generic_pt/iommu.h | 29 +++ 2 files changed, 366 insertions(+)