[RFC,07/12] iommufd: Data structure to provide IOVA to PFN mapping

Message ID 7-v1-e79cd8d168e8+6-iommufd_jgg@nvidia.com (mailing list archive)
State New, archived
Series IOMMUFD Generic interface

Commit Message

Jason Gunthorpe March 18, 2022, 5:27 p.m. UTC
This is the remainder of the IOAS data structure. Provide an object called
an io_pagetable that is composed of iopt_areas pointing at iopt_pages,
along with a list of iommu_domains that mirror the IOVA to PFN map.

At the top this is a simple interval tree of iopt_areas indicating the map
of IOVA to iopt_pages. An xarray keeps track of a list of domains. Based
on the attached domains there is a minimum alignment for areas (which may
be smaller than PAGE_SIZE) and an interval tree of reserved IOVA that
can't be mapped.

The concept of a 'user' refers to something like a VFIO mdev that is
accessing the IOVA and using a 'struct page *' for CPU based access.

Externally an API is provided that matches the requirements of the IOCTL
interface for map/unmap and domain attachment.

The API provides a 'copy' primitive to establish a new IOVA map in a
different IOAS from an existing mapping.

This is designed to support a pre-registration flow where userspace would
set up a dummy IOAS with no domains, map in memory and then establish a
user to pin all PFNs into the xarray.

Copy can then be used to create new IOVA mappings in a different IOAS,
with iommu_domains attached. Upon copy the PFNs will be read out of the
xarray and mapped into the iommu_domains, avoiding any pin_user_pages()
overheads.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/Makefile          |   1 +
 drivers/iommu/iommufd/io_pagetable.c    | 890 ++++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h |  35 +
 3 files changed, 926 insertions(+)
 create mode 100644 drivers/iommu/iommufd/io_pagetable.c
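
For orientation, here is a rough sketch of the two core objects as they are used throughout the code quoted below. The fields are reconstructed from the code in this thread; the authoritative definitions live in the patch's io_pagetable.h and iommufd_private.h hunks, which are not quoted here in full.

/* Sketch only -- fields inferred from the code quoted in this thread */
#include <linux/interval_tree.h>
#include <linux/rwsem.h>
#include <linux/xarray.h>

struct io_pagetable {
	struct rw_semaphore domains_rwsem;
	struct xarray domains;			/* iommu_domains mirroring the IOVA map */
	unsigned int next_domain_id;

	struct rw_semaphore iova_rwsem;
	struct rb_root_cached area_itree;	/* interval tree of iopt_area by IOVA */
	struct rb_root_cached reserved_iova_itree; /* IOVA that can never be mapped */
	unsigned long iova_alignment;		/* minimum alignment, driven by attached domains */
};

struct iopt_area {
	struct interval_tree_node node;		/* IOVA range, lives in iopt->area_itree */
	struct interval_tree_node pages_node;	/* page-index slice of the iopt_pages */
	struct io_pagetable *iopt;
	struct iopt_pages *pages;		/* NULL until the area is fully initialized */
	struct iommu_domain *storage_domain;
	atomic_t num_users;			/* in-kernel 'users' currently holding the pages */
	int iommu_prot;
	unsigned int page_offset;		/* byte offset into the first page */
};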

Comments

Alex Williamson March 22, 2022, 10:15 p.m. UTC | #1
On Fri, 18 Mar 2022 14:27:32 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:
> +/*
> + * The area takes a slice of the pages from start_bytes to start_byte + length
> + */
> +static struct iopt_area *
> +iopt_alloc_area(struct io_pagetable *iopt, struct iopt_pages *pages,
> +		unsigned long iova, unsigned long start_byte,
> +		unsigned long length, int iommu_prot, unsigned int flags)
> +{
> +	struct iopt_area *area;
> +	int rc;
> +
> +	area = kzalloc(sizeof(*area), GFP_KERNEL);
> +	if (!area)
> +		return ERR_PTR(-ENOMEM);
> +
> +	area->iopt = iopt;
> +	area->iommu_prot = iommu_prot;
> +	area->page_offset = start_byte % PAGE_SIZE;
> +	area->pages_node.start = start_byte / PAGE_SIZE;
> +	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
> +		return ERR_PTR(-EOVERFLOW);
> +	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
> +	if (WARN_ON(area->pages_node.last >= pages->npages))
> +		return ERR_PTR(-EOVERFLOW);

@area leaked in the above two error cases.

> +
> +	down_write(&iopt->iova_rwsem);
> +	if (flags & IOPT_ALLOC_IOVA) {
> +		rc = iopt_alloc_iova(iopt, &iova,
> +				     (uintptr_t)pages->uptr + start_byte,
> +				     length);
> +		if (rc)
> +			goto out_unlock;
> +	}
> +
> +	if (check_add_overflow(iova, length - 1, &area->node.last)) {
> +		rc = -EOVERFLOW;
> +		goto out_unlock;
> +	}
> +
> +	if (!(flags & IOPT_ALLOC_IOVA)) {
> +		if ((iova & (iopt->iova_alignment - 1)) ||
> +		    (length & (iopt->iova_alignment - 1)) || !length) {
> +			rc = -EINVAL;
> +			goto out_unlock;
> +		}
> +
> +		/* No reserved IOVA intersects the range */
> +		if (interval_tree_iter_first(&iopt->reserved_iova_itree, iova,
> +					     area->node.last)) {
> +			rc = -ENOENT;
> +			goto out_unlock;
> +		}
> +
> +		/* Check that there is not already a mapping in the range */
> +		if (iopt_area_iter_first(iopt, iova, area->node.last)) {
> +			rc = -EADDRINUSE;
> +			goto out_unlock;
> +		}
> +	}
> +
> +	/*
> +	 * The area is inserted with a NULL pages indicating it is not fully
> +	 * initialized yet.
> +	 */
> +	area->node.start = iova;
> +	interval_tree_insert(&area->node, &area->iopt->area_itree);
> +	up_write(&iopt->iova_rwsem);
> +	return area;
> +
> +out_unlock:
> +	up_write(&iopt->iova_rwsem);
> +	kfree(area);
> +	return ERR_PTR(rc);
> +}
...
> +/**
> + * iopt_access_pages() - Return a list of pages under the iova
> + * @iopt: io_pagetable to act on
> + * @iova: Starting IOVA
> + * @length: Number of bytes to access
> + * @out_pages: Output page list
> + * @write: True if access is for writing
> + *
> + * Reads @npages starting at iova and returns the struct page * pointers. These
> + * can be kmap'd by the caller for CPU access.
> + *
> + * The caller must perform iopt_unaccess_pages() when done to balance this.
> + *
> + * iova can be unaligned from PAGE_SIZE. The first returned byte starts at
> + * page_to_phys(out_pages[0]) + (iova % PAGE_SIZE). The caller promises not to
> + * touch memory outside the requested iova slice.
> + *
> + * FIXME: callers that need a DMA mapping via a sgl should create another
> + * interface to build the SGL efficiently
> + */
> +int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova,
> +		      unsigned long length, struct page **out_pages, bool write)
> +{
> +	unsigned long cur_iova = iova;
> +	unsigned long last_iova;
> +	struct iopt_area *area;
> +	int rc;
> +
> +	if (!length)
> +		return -EINVAL;
> +	if (check_add_overflow(iova, length - 1, &last_iova))
> +		return -EOVERFLOW;
> +
> +	down_read(&iopt->iova_rwsem);
> +	for (area = iopt_area_iter_first(iopt, iova, last_iova); area;
> +	     area = iopt_area_iter_next(area, iova, last_iova)) {
> +		unsigned long last = min(last_iova, iopt_area_last_iova(area));
> +		unsigned long last_index;
> +		unsigned long index;
> +
> +		/* Need contiguous areas in the access */
> +		if (iopt_area_iova(area) < cur_iova || !area->pages) {
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Should this be (cur_iova != iova && iopt_area_iova(area) < cur_iova)?

I can't see how we'd require in-kernel page users to know the iopt_area
alignment from userspace, so I think this needs to skip the first
iteration.  Thanks,

Alex

> +			rc = -EINVAL;
> +			goto out_remove;
> +		}
> +
> +		index = iopt_area_iova_to_index(area, cur_iova);
> +		last_index = iopt_area_iova_to_index(area, last);
> +		rc = iopt_pages_add_user(area->pages, index, last_index,
> +					 out_pages, write);
> +		if (rc)
> +			goto out_remove;
> +		if (last == last_iova)
> +			break;
> +		/*
> +		 * Can't cross areas that are not aligned to the system page
> +		 * size with this API.
> +		 */
> +		if (cur_iova % PAGE_SIZE) {
> +			rc = -EINVAL;
> +			goto out_remove;
> +		}
> +		cur_iova = last + 1;
> +		out_pages += last_index - index;
> +		atomic_inc(&area->num_users);
> +	}
> +
> +	up_read(&iopt->iova_rwsem);
> +	return 0;
> +
> +out_remove:
> +	if (cur_iova != iova)
> +		iopt_unaccess_pages(iopt, iova, cur_iova - iova);
> +	up_read(&iopt->iova_rwsem);
> +	return rc;
> +}
Jason Gunthorpe March 23, 2022, 6:15 p.m. UTC | #2
On Tue, Mar 22, 2022 at 04:15:44PM -0600, Alex Williamson wrote:

> > +static struct iopt_area *
> > +iopt_alloc_area(struct io_pagetable *iopt, struct iopt_pages *pages,
> > +		unsigned long iova, unsigned long start_byte,
> > +		unsigned long length, int iommu_prot, unsigned int flags)
> > +{
> > +	struct iopt_area *area;
> > +	int rc;
> > +
> > +	area = kzalloc(sizeof(*area), GFP_KERNEL);
> > +	if (!area)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	area->iopt = iopt;
> > +	area->iommu_prot = iommu_prot;
> > +	area->page_offset = start_byte % PAGE_SIZE;
> > +	area->pages_node.start = start_byte / PAGE_SIZE;
> > +	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
> > +		return ERR_PTR(-EOVERFLOW);
> > +	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
> > +	if (WARN_ON(area->pages_node.last >= pages->npages))
> > +		return ERR_PTR(-EOVERFLOW);
> 
> @area leaked in the above two error cases.

Yes, thanks
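
Something like freeing the area on those early exits, e.g. (sketch, the
actual respin may do it differently):

	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last)) {
		kfree(area);
		return ERR_PTR(-EOVERFLOW);
	}
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages)) {
		kfree(area);
		return ERR_PTR(-EOVERFLOW);
	}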

> > +int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova,
> > +		      unsigned long length, struct page **out_pages, bool write)
> > +{
> > +	unsigned long cur_iova = iova;
> > +	unsigned long last_iova;
> > +	struct iopt_area *area;
> > +	int rc;
> > +
> > +	if (!length)
> > +		return -EINVAL;
> > +	if (check_add_overflow(iova, length - 1, &last_iova))
> > +		return -EOVERFLOW;
> > +
> > +	down_read(&iopt->iova_rwsem);
> > +	for (area = iopt_area_iter_first(iopt, iova, last_iova); area;
> > +	     area = iopt_area_iter_next(area, iova, last_iova)) {
> > +		unsigned long last = min(last_iova, iopt_area_last_iova(area));
> > +		unsigned long last_index;
> > +		unsigned long index;
> > +
> > +		/* Need contiguous areas in the access */
> > +		if (iopt_area_iova(area) < cur_iova || !area->pages) {
>                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> Should this be (cur_iova != iova && iopt_area_iova(area) < cur_iova)?

Oh good eye

That is a typo < should be >:

		if (iopt_area_iova(area) > cur_iova || !area->pages) {

There are three boundary conditions here to worry about
 - interval trees return any nodes that intersect the queried range
   so the first found node can start after iova

 - There can be a gap in the intervals

 - The final area can end short of last_iova

The last one is botched too and needs this:
        for ... { ...
	}
+	if (cur_iova != last_iova)
+		goto out_remove;

The test suite isn't covering the boundary cases here yet, I added a
FIXME for this for now.
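
To picture the three cases for an access over [iova, last_iova] (sketch):

/*
 *  iova                                           last_iova
 *  |---------------------- access --------------------|
 *        [ area A ]       gap        [ area B ]
 *  ^^^^^^                                       ^^^^^^
 *  (1) first area may start after iova          (3) last area may end short
 *                   (2) hole between two areas
 */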

Thanks,
Jason
Tian, Kevin March 24, 2022, 3:09 a.m. UTC | #3
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Thursday, March 24, 2022 2:16 AM
> 
> On Tue, Mar 22, 2022 at 04:15:44PM -0600, Alex Williamson wrote:
> 
> > > +int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova,
> > > +		      unsigned long length, struct page **out_pages, bool write)
> > > +{
> > > +	unsigned long cur_iova = iova;
> > > +	unsigned long last_iova;
> > > +	struct iopt_area *area;
> > > +	int rc;
> > > +
> > > +	if (!length)
> > > +		return -EINVAL;
> > > +	if (check_add_overflow(iova, length - 1, &last_iova))
> > > +		return -EOVERFLOW;
> > > +
> > > +	down_read(&iopt->iova_rwsem);
> > > +	for (area = iopt_area_iter_first(iopt, iova, last_iova); area;
> > > +	     area = iopt_area_iter_next(area, iova, last_iova)) {
> > > +		unsigned long last = min(last_iova, iopt_area_last_iova(area));
> > > +		unsigned long last_index;
> > > +		unsigned long index;
> > > +
> > > +		/* Need contiguous areas in the access */
> > > +		if (iopt_area_iova(area) < cur_iova || !area->pages) {
> >                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > Should this be (cur_iova != iova && iopt_area_iova(area) < cur_iova)?
> 
> Oh good eye
> 
> That is a typo < should be >:
> 
> 		if (iopt_area_iova(area) > cur_iova || !area->pages) {
> 
> There are three boundary conditions here to worry about
>  - interval trees return any nodes that intersect the queried range
>    so the first found node can start after iova
> 
>  - There can be a gap in the intervals
> 
>  - The final area can end short of last_iova
> 
> The last one is botched too and needs this:
>         for ... { ...
> 	}
> +	if (cur_iova != last_iova)
> +		goto out_remove;
> 
> The test suite isn't covering the boundary cases here yet, I added a
> FIXME for this for now.
> 

Another nit about below:

+		/*
+		 * Can't cross areas that are not aligned to the system page
+		 * size with this API.
+		 */
+		if (cur_iova % PAGE_SIZE) {
+			rc = -EINVAL;
+			goto out_remove;
+		}

Currently it's done after iopt_pages_add_user() but before cur_iova 
is adjusted, which implies the last add_user() will not be reverted in
case of failed check here.

I suppose this should be checked at the start of the loop.

Thanks
Kevin
Jason Gunthorpe March 24, 2022, 12:46 p.m. UTC | #4
On Thu, Mar 24, 2022 at 03:09:46AM +0000, Tian, Kevin wrote:
> +		/*
> +		 * Can't cross areas that are not aligned to the system page
> +		 * size with this API.
> +		 */
> +		if (cur_iova % PAGE_SIZE) {
> +			rc = -EINVAL;
> +			goto out_remove;
> +		}
> 
> Currently it's done after iopt_pages_add_user() but before cur_iova 
> is adjusted, which implies the last add_user() will not be reverted in
> case of failed check here.

Oh, yes that's right too..

The check above is wrong even beyond that; it never got fixed when
page_offset was added.

So more like this:

diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 1c08ae9b848fcf..9505f119df982e 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -23,7 +23,7 @@ static unsigned long iopt_area_iova_to_index(struct iopt_area *area,
 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
 		WARN_ON(iova < iopt_area_iova(area) ||
 			iova > iopt_area_last_iova(area));
-	return (iova - (iopt_area_iova(area) & PAGE_MASK)) / PAGE_SIZE;
+	return (iova - (iopt_area_iova(area) - area->page_offset)) / PAGE_SIZE;
 }
 
 static struct iopt_area *iopt_find_exact_area(struct io_pagetable *iopt,
@@ -436,31 +436,45 @@ int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova,
 		unsigned long index;
 
 		/* Need contiguous areas in the access */
-		if (iopt_area_iova(area) < cur_iova || !area->pages) {
+		if (iopt_area_iova(area) > cur_iova || !area->pages) {
 			rc = -EINVAL;
 			goto out_remove;
 		}
 
 		index = iopt_area_iova_to_index(area, cur_iova);
 		last_index = iopt_area_iova_to_index(area, last);
+
+		/*
+		 * The API can only return aligned pages, so the starting point
+		 * must be at a page boundary.
+		 */
+		if ((cur_iova - (iopt_area_iova(area) - area->page_offset)) %
+		    PAGE_SIZE) {
+			rc = -EINVAL;
+			goto out_remove;
+		}
+
+		/*
+		 * and an interior ending point must be at a page boundary
+		 */
+		if (last != last_iova &&
+		    (iopt_area_last_iova(area) - cur_iova + 1) % PAGE_SIZE) {
+			rc = -EINVAL;
+			goto out_remove;
+		}
+
 		rc = iopt_pages_add_user(area->pages, index, last_index,
 					 out_pages, write);
 		if (rc)
 			goto out_remove;
 		if (last == last_iova)
 			break;
-		/*
-		 * Can't cross areas that are not aligned to the system page
-		 * size with this API.
-		 */
-		if (cur_iova % PAGE_SIZE) {
-			rc = -EINVAL;
-			goto out_remove;
-		}
 		cur_iova = last + 1;
 		out_pages += last_index - index;
 		atomic_inc(&area->num_users);
 	}
+	if (cur_iova != last_iova)
+		goto out_remove;
 
 	up_read(&iopt->iova_rwsem);
 	return 0;
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 5c47d706ed9449..a46e0f0ae82553 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -1221,5 +1221,6 @@ TEST_F(vfio_compat_mock_domain, get_info)
 /* FIXME test VFIO_IOMMU_MAP_DMA */
 /* FIXME test VFIO_IOMMU_UNMAP_DMA */
 /* FIXME test 2k iova alignment */
+/* FIXME cover boundary cases for iopt_access_pages()  */
 
 TEST_HARNESS_MAIN
zhangfei.gao@foxmail.com March 25, 2022, 1:34 p.m. UTC | #5
Hi, Jason

On 2022/3/19 1:27 AM, Jason Gunthorpe via iommu wrote:
> This is the remainder of the IOAS data structure. Provide an object called
> an io_pagetable that is composed of iopt_areas pointing at iopt_pages,
> along with a list of iommu_domains that mirror the IOVA to PFN map.
>
> At the top this is a simple interval tree of iopt_areas indicating the map
> of IOVA to iopt_pages. An xarray keeps track of a list of domains. Based
> on the attached domains there is a minimum alignment for areas (which may
> be smaller than PAGE_SIZE) and an interval tree of reserved IOVA that
> can't be mapped.
>
> The concept of a 'user' refers to something like a VFIO mdev that is
> accessing the IOVA and using a 'struct page *' for CPU based access.
>
> Externally an API is provided that matches the requirements of the IOCTL
> interface for map/unmap and domain attachment.
>
> The API provides a 'copy' primitive to establish a new IOVA map in a
> different IOAS from an existing mapping.
>
> This is designed to support a pre-registration flow where userspace would
> set up a dummy IOAS with no domains, map in memory and then establish a
> user to pin all PFNs into the xarray.
>
> Copy can then be used to create new IOVA mappings in a different IOAS,
> with iommu_domains attached. Upon copy the PFNs will be read out of the
> xarray and mapped into the iommu_domains, avoiding any pin_user_pages()
> overheads.
>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>   drivers/iommu/iommufd/Makefile          |   1 +
>   drivers/iommu/iommufd/io_pagetable.c    | 890 ++++++++++++++++++++++++
>   drivers/iommu/iommufd/iommufd_private.h |  35 +
>   3 files changed, 926 insertions(+)
>   create mode 100644 drivers/iommu/iommufd/io_pagetable.c
>
> diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
> index 05a0e91e30afad..b66a8c47ff55ec 100644
> --- a/drivers/iommu/iommufd/Makefile
> +++ b/drivers/iommu/iommufd/Makefile
> @@ -1,5 +1,6 @@
>   # SPDX-License-Identifier: GPL-2.0-only
>   iommufd-y := \
> +	io_pagetable.o \
>   	main.o \
>   	pages.o
>   
> diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
> new file mode 100644
> index 00000000000000..f9f3b06946bfb9
> --- /dev/null
> +++ b/drivers/iommu/iommufd/io_pagetable.c
> @@ -0,0 +1,890 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
> + *
> + * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The
> + * PFNs can be placed into an iommu_domain, or returned to the caller as a page
> + * list for access by an in-kernel user.
> + *
> + * The datastructure uses the iopt_pages to optimize the storage of the PFNs
> + * between the domains and xarray.
> + */
> +#include <linux/lockdep.h>
> +#include <linux/iommu.h>
> +#include <linux/sched/mm.h>
> +#include <linux/err.h>
> +#include <linux/slab.h>
> +#include <linux/errno.h>
> +
> +#include "io_pagetable.h"
> +
> +static unsigned long iopt_area_iova_to_index(struct iopt_area *area,
> +					     unsigned long iova)
> +{
> +	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
> +		WARN_ON(iova < iopt_area_iova(area) ||
> +			iova > iopt_area_last_iova(area));
> +	return (iova - (iopt_area_iova(area) & PAGE_MASK)) / PAGE_SIZE;
> +}
> +
> +static struct iopt_area *iopt_find_exact_area(struct io_pagetable *iopt,
> +					      unsigned long iova,
> +					      unsigned long last_iova)
> +{
> +	struct iopt_area *area;
> +
> +	area = iopt_area_iter_first(iopt, iova, last_iova);
> +	if (!area || !area->pages || iopt_area_iova(area) != iova ||
> +	    iopt_area_last_iova(area) != last_iova)
> +		return NULL;
> +	return area;
> +}
> +
> +static bool __alloc_iova_check_hole(struct interval_tree_span_iter *span,
> +				    unsigned long length,
> +				    unsigned long iova_alignment,
> +				    unsigned long page_offset)
> +{
> +	if (!span->is_hole || span->last_hole - span->start_hole < length - 1)
> +		return false;
> +
> +	span->start_hole =
> +		ALIGN(span->start_hole, iova_alignment) | page_offset;
> +	if (span->start_hole > span->last_hole ||
> +	    span->last_hole - span->start_hole < length - 1)
> +		return false;
> +	return true;
> +}
> +
> +/*
> + * Automatically find a block of IOVA that is not being used and not reserved.
> + * Does not return a 0 IOVA even if it is valid.
> + */
> +static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
> +			   unsigned long uptr, unsigned long length)
> +{
> +	struct interval_tree_span_iter reserved_span;
> +	unsigned long page_offset = uptr % PAGE_SIZE;
> +	struct interval_tree_span_iter area_span;
> +	unsigned long iova_alignment;
> +
> +	lockdep_assert_held(&iopt->iova_rwsem);
> +
> +	/* Protect roundup_pow-of_two() from overflow */
> +	if (length == 0 || length >= ULONG_MAX / 2)
> +		return -EOVERFLOW;
> +
> +	/*
> +	 * Keep alignment present in the uptr when building the IOVA, this
> +	 * increases the chance we can map a THP.
> +	 */
> +	if (!uptr)
> +		iova_alignment = roundup_pow_of_two(length);
> +	else
> +		iova_alignment =
> +			min_t(unsigned long, roundup_pow_of_two(length),
> +			      1UL << __ffs64(uptr));
> +
> +	if (iova_alignment < iopt->iova_alignment)
> +		return -EINVAL;
> +	for (interval_tree_span_iter_first(&area_span, &iopt->area_itree,
> +					   PAGE_SIZE, ULONG_MAX - PAGE_SIZE);
> +	     !interval_tree_span_iter_done(&area_span);
> +	     interval_tree_span_iter_next(&area_span)) {
> +		if (!__alloc_iova_check_hole(&area_span, length, iova_alignment,
> +					     page_offset))
> +			continue;
> +
> +		for (interval_tree_span_iter_first(
> +			     &reserved_span, &iopt->reserved_iova_itree,
> +			     area_span.start_hole, area_span.last_hole);
> +		     !interval_tree_span_iter_done(&reserved_span);
> +		     interval_tree_span_iter_next(&reserved_span)) {
> +			if (!__alloc_iova_check_hole(&reserved_span, length,
> +						     iova_alignment,
> +						     page_offset))
> +				continue;
> +
> +			*iova = reserved_span.start_hole;
> +			return 0;
> +		}
> +	}
> +	return -ENOSPC;
> +}
> +
> +/*
> + * The area takes a slice of the pages from start_bytes to start_byte + length
> + */
> +static struct iopt_area *
> +iopt_alloc_area(struct io_pagetable *iopt, struct iopt_pages *pages,
> +		unsigned long iova, unsigned long start_byte,
> +		unsigned long length, int iommu_prot, unsigned int flags)
> +{
> +	struct iopt_area *area;
> +	int rc;
> +
> +	area = kzalloc(sizeof(*area), GFP_KERNEL);
> +	if (!area)
> +		return ERR_PTR(-ENOMEM);
> +
> +	area->iopt = iopt;
> +	area->iommu_prot = iommu_prot;
> +	area->page_offset = start_byte % PAGE_SIZE;
> +	area->pages_node.start = start_byte / PAGE_SIZE;
> +	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
> +		return ERR_PTR(-EOVERFLOW);
> +	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
> +	if (WARN_ON(area->pages_node.last >= pages->npages))
> +		return ERR_PTR(-EOVERFLOW);
> +
> +	down_write(&iopt->iova_rwsem);
> +	if (flags & IOPT_ALLOC_IOVA) {
> +		rc = iopt_alloc_iova(iopt, &iova,
> +				     (uintptr_t)pages->uptr + start_byte,
> +				     length);
> +		if (rc)
> +			goto out_unlock;
> +	}
> +
> +	if (check_add_overflow(iova, length - 1, &area->node.last)) {
> +		rc = -EOVERFLOW;
> +		goto out_unlock;
> +	}
> +
> +	if (!(flags & IOPT_ALLOC_IOVA)) {
> +		if ((iova & (iopt->iova_alignment - 1)) ||
> +		    (length & (iopt->iova_alignment - 1)) || !length) {
> +			rc = -EINVAL;
> +			goto out_unlock;
> +		}
> +
> +		/* No reserved IOVA intersects the range */
> +		if (interval_tree_iter_first(&iopt->reserved_iova_itree, iova,
> +					     area->node.last)) {
> +			rc = -ENOENT;
> +			goto out_unlock;
> +		}
> +
> +		/* Check that there is not already a mapping in the range */
> +		if (iopt_area_iter_first(iopt, iova, area->node.last)) {
> +			rc = -EADDRINUSE;
> +			goto out_unlock;
> +		}
> +	}
> +
> +	/*
> +	 * The area is inserted with a NULL pages indicating it is not fully
> +	 * initialized yet.
> +	 */
> +	area->node.start = iova;
> +	interval_tree_insert(&area->node, &area->iopt->area_itree);
> +	up_write(&iopt->iova_rwsem);
> +	return area;
> +
> +out_unlock:
> +	up_write(&iopt->iova_rwsem);
> +	kfree(area);
> +	return ERR_PTR(rc);
> +}
> +
> +static void iopt_abort_area(struct iopt_area *area)
> +{
> +	down_write(&area->iopt->iova_rwsem);
> +	interval_tree_remove(&area->node, &area->iopt->area_itree);
> +	up_write(&area->iopt->iova_rwsem);
> +	kfree(area);
> +}
> +
> +static int iopt_finalize_area(struct iopt_area *area, struct iopt_pages *pages)
> +{
> +	int rc;
> +
> +	down_read(&area->iopt->domains_rwsem);
> +	rc = iopt_area_fill_domains(area, pages);
> +	if (!rc) {
> +		/*
> +		 * area->pages must be set inside the domains_rwsem to ensure
> +		 * any newly added domains will get filled. Moves the reference
> +		 * in from the caller
> +		 */
> +		down_write(&area->iopt->iova_rwsem);
> +		area->pages = pages;
> +		up_write(&area->iopt->iova_rwsem);
> +	}
> +	up_read(&area->iopt->domains_rwsem);
> +	return rc;
> +}
> +
> +int iopt_map_pages(struct io_pagetable *iopt, struct iopt_pages *pages,
> +		   unsigned long *dst_iova, unsigned long start_bytes,
> +		   unsigned long length, int iommu_prot, unsigned int flags)
> +{
> +	struct iopt_area *area;
> +	int rc;
> +
> +	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
> +		return -EPERM;
> +
> +	area = iopt_alloc_area(iopt, pages, *dst_iova, start_bytes, length,
> +			       iommu_prot, flags);
> +	if (IS_ERR(area))
> +		return PTR_ERR(area);
> +	*dst_iova = iopt_area_iova(area);
> +
> +	rc = iopt_finalize_area(area, pages);
> +	if (rc) {
> +		iopt_abort_area(area);
> +		return rc;
> +	}
> +	return 0;
> +}
> +
> +/**
> + * iopt_map_user_pages() - Map a user VA to an iova in the io page table
> + * @iopt: io_pagetable to act on
> + * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
> + *        the chosen iova on output. Otherwise is the iova to map to on input
> + * @uptr: User VA to map
> + * @length: Number of bytes to map
> + * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
> + * @flags: IOPT_ALLOC_IOVA or zero
> + *
> + * iova, uptr, and length must be aligned to iova_alignment. For domain backed
> + * page tables this will pin the pages and load them into the domain at iova.
> + * For non-domain page tables this will only setup a lazy reference and the
> + * caller must use iopt_access_pages() to touch them.
> + *
> + * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
> + * destroyed.
> + */
> +int iopt_map_user_pages(struct io_pagetable *iopt, unsigned long *iova,
> +			void __user *uptr, unsigned long length, int iommu_prot,
> +			unsigned int flags)
> +{
> +	struct iopt_pages *pages;
> +	int rc;
> +
> +	pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
> +	if (IS_ERR(pages))
> +		return PTR_ERR(pages);
> +
> +	rc = iopt_map_pages(iopt, pages, iova, uptr - pages->uptr, length,
> +			    iommu_prot, flags);
> +	if (rc) {
> +		iopt_put_pages(pages);
> +		return rc;
> +	}
> +	return 0;
> +}
> +
> +struct iopt_pages *iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
> +				  unsigned long *start_byte,
> +				  unsigned long length)
> +{
> +	unsigned long iova_end;
> +	struct iopt_pages *pages;
> +	struct iopt_area *area;
> +
> +	if (check_add_overflow(iova, length - 1, &iova_end))
> +		return ERR_PTR(-EOVERFLOW);
> +
> +	down_read(&iopt->iova_rwsem);
> +	area = iopt_find_exact_area(iopt, iova, iova_end);
> +	if (!area) {
> +		up_read(&iopt->iova_rwsem);
> +		return ERR_PTR(-ENOENT);
> +	}
> +	pages = area->pages;
> +	*start_byte = area->page_offset + iopt_area_index(area) * PAGE_SIZE;
> +	kref_get(&pages->kref);
> +	up_read(&iopt->iova_rwsem);
> +
> +	return pages;
> +}
> +
> +static int __iopt_unmap_iova(struct io_pagetable *iopt, struct iopt_area *area,
> +			     struct iopt_pages *pages)
> +{
> +	/* Drivers have to unpin on notification. */
> +	if (WARN_ON(atomic_read(&area->num_users)))
> +		return -EBUSY;
> +
> +	iopt_area_unfill_domains(area, pages);
> +	WARN_ON(atomic_read(&area->num_users));
> +	iopt_abort_area(area);
> +	iopt_put_pages(pages);
> +	return 0;
> +}
> +
> +/**
> + * iopt_unmap_iova() - Remove a range of iova
> + * @iopt: io_pagetable to act on
> + * @iova: Starting iova to unmap
> + * @length: Number of bytes to unmap
> + *
> + * The requested range must exactly match an existing range.
> + * Splitting/truncating IOVA mappings is not allowed.
> + */
> +int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
> +		    unsigned long length)
> +{
> +	struct iopt_pages *pages;
> +	struct iopt_area *area;
> +	unsigned long iova_end;
> +	int rc;
> +
> +	if (!length)
> +		return -EINVAL;
> +
> +	if (check_add_overflow(iova, length - 1, &iova_end))
> +		return -EOVERFLOW;
> +
> +	down_read(&iopt->domains_rwsem);
> +	down_write(&iopt->iova_rwsem);
> +	area = iopt_find_exact_area(iopt, iova, iova_end);
> +	if (!area) {
> +		up_write(&iopt->iova_rwsem);
> +		up_read(&iopt->domains_rwsem);
> +		return -ENOENT;
> +	}
> +	pages = area->pages;
> +	area->pages = NULL;
> +	up_write(&iopt->iova_rwsem);
> +
> +	rc = __iopt_unmap_iova(iopt, area, pages);
> +	up_read(&iopt->domains_rwsem);
> +	return rc;
> +}
> +
> +int iopt_unmap_all(struct io_pagetable *iopt)
> +{
> +	struct iopt_area *area;
> +	int rc;
> +
> +	down_read(&iopt->domains_rwsem);
> +	down_write(&iopt->iova_rwsem);
> +	while ((area = iopt_area_iter_first(iopt, 0, ULONG_MAX))) {
> +		struct iopt_pages *pages;
> +
> +		/* Userspace should not race unmap all and map */
> +		if (!area->pages) {
> +			rc = -EBUSY;
> +			goto out_unlock_iova;
> +		}
> +		pages = area->pages;
> +		area->pages = NULL;
> +		up_write(&iopt->iova_rwsem);
> +
> +		rc = __iopt_unmap_iova(iopt, area, pages);
> +		if (rc)
> +			goto out_unlock_domains;
> +
> +		down_write(&iopt->iova_rwsem);
> +	}
> +	rc = 0;
> +
> +out_unlock_iova:
> +	up_write(&iopt->iova_rwsem);
> +out_unlock_domains:
> +	up_read(&iopt->domains_rwsem);
> +	return rc;
> +}
> +
> +/**
> + * iopt_access_pages() - Return a list of pages under the iova
> + * @iopt: io_pagetable to act on
> + * @iova: Starting IOVA
> + * @length: Number of bytes to access
> + * @out_pages: Output page list
> + * @write: True if access is for writing
> + *
> + * Reads @npages starting at iova and returns the struct page * pointers. These
> + * can be kmap'd by the caller for CPU access.
> + *
> + * The caller must perform iopt_unaccess_pages() when done to balance this.
> + *
> + * iova can be unaligned from PAGE_SIZE. The first returned byte starts at
> + * page_to_phys(out_pages[0]) + (iova % PAGE_SIZE). The caller promises not to
> + * touch memory outside the requested iova slice.
> + *
> + * FIXME: callers that need a DMA mapping via a sgl should create another
> + * interface to build the SGL efficiently
> + */
> +int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova,
> +		      unsigned long length, struct page **out_pages, bool write)
> +{
> +	unsigned long cur_iova = iova;
> +	unsigned long last_iova;
> +	struct iopt_area *area;
> +	int rc;
> +
> +	if (!length)
> +		return -EINVAL;
> +	if (check_add_overflow(iova, length - 1, &last_iova))
> +		return -EOVERFLOW;
> +
> +	down_read(&iopt->iova_rwsem);
> +	for (area = iopt_area_iter_first(iopt, iova, last_iova); area;
> +	     area = iopt_area_iter_next(area, iova, last_iova)) {
> +		unsigned long last = min(last_iova, iopt_area_last_iova(area));
> +		unsigned long last_index;
> +		unsigned long index;
> +
> +		/* Need contiguous areas in the access */
> +		if (iopt_area_iova(area) < cur_iova || !area->pages) {
> +			rc = -EINVAL;
> +			goto out_remove;
> +		}
> +
> +		index = iopt_area_iova_to_index(area, cur_iova);
> +		last_index = iopt_area_iova_to_index(area, last);
> +		rc = iopt_pages_add_user(area->pages, index, last_index,
> +					 out_pages, write);
> +		if (rc)
> +			goto out_remove;
> +		if (last == last_iova)
> +			break;
> +		/*
> +		 * Can't cross areas that are not aligned to the system page
> +		 * size with this API.
> +		 */
> +		if (cur_iova % PAGE_SIZE) {
> +			rc = -EINVAL;
> +			goto out_remove;
> +		}
> +		cur_iova = last + 1;
> +		out_pages += last_index - index;
> +		atomic_inc(&area->num_users);
> +	}
> +
> +	up_read(&iopt->iova_rwsem);
> +	return 0;
> +
> +out_remove:
> +	if (cur_iova != iova)
> +		iopt_unaccess_pages(iopt, iova, cur_iova - iova);
> +	up_read(&iopt->iova_rwsem);
> +	return rc;
> +}
> +
> +/**
> + * iopt_unaccess_pages() - Undo iopt_access_pages
> + * @iopt: io_pagetable to act on
> + * @iova: Starting IOVA
> + * @length:- Number of bytes to access
> + *
> + * Return the struct page's. The caller must stop accessing them before calling
> + * this. The iova/length must exactly match the one provided to access_pages.
> + */
> +void iopt_unaccess_pages(struct io_pagetable *iopt, unsigned long iova,
> +			 size_t length)
> +{
> +	unsigned long cur_iova = iova;
> +	unsigned long last_iova;
> +	struct iopt_area *area;
> +
> +	if (WARN_ON(!length) ||
> +	    WARN_ON(check_add_overflow(iova, length - 1, &last_iova)))
> +		return;
> +
> +	down_read(&iopt->iova_rwsem);
> +	for (area = iopt_area_iter_first(iopt, iova, last_iova); area;
> +	     area = iopt_area_iter_next(area, iova, last_iova)) {
> +		unsigned long last = min(last_iova, iopt_area_last_iova(area));
> +		int num_users;
> +
> +		iopt_pages_remove_user(area->pages,
> +				       iopt_area_iova_to_index(area, cur_iova),
> +				       iopt_area_iova_to_index(area, last));
> +		if (last == last_iova)
> +			break;
> +		cur_iova = last + 1;
> +		num_users = atomic_dec_return(&area->num_users);
> +		WARN_ON(num_users < 0);
> +	}
> +	up_read(&iopt->iova_rwsem);
> +}
> +
> +struct iopt_reserved_iova {
> +	struct interval_tree_node node;
> +	void *owner;
> +};
> +
> +int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
> +		      unsigned long last, void *owner)
> +{
> +	struct iopt_reserved_iova *reserved;
> +
> +	lockdep_assert_held_write(&iopt->iova_rwsem);
> +
> +	if (iopt_area_iter_first(iopt, start, last))
> +		return -EADDRINUSE;
> +
> +	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL);
> +	if (!reserved)
> +		return -ENOMEM;
> +	reserved->node.start = start;
> +	reserved->node.last = last;
> +	reserved->owner = owner;
> +	interval_tree_insert(&reserved->node, &iopt->reserved_iova_itree);
> +	return 0;
> +}
> +
> +void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
> +{
> +
> +	struct interval_tree_node *node;
> +
> +	for (node = interval_tree_iter_first(&iopt->reserved_iova_itree, 0,
> +					     ULONG_MAX);
> +	     node;) {
> +		struct iopt_reserved_iova *reserved =
> +			container_of(node, struct iopt_reserved_iova, node);
> +
> +		node = interval_tree_iter_next(node, 0, ULONG_MAX);
> +
> +		if (reserved->owner == owner) {
> +			interval_tree_remove(&reserved->node,
> +					     &iopt->reserved_iova_itree);
> +			kfree(reserved);
> +		}
> +	}
> +}
> +
> +int iopt_init_table(struct io_pagetable *iopt)
> +{
> +	init_rwsem(&iopt->iova_rwsem);
> +	init_rwsem(&iopt->domains_rwsem);
> +	iopt->area_itree = RB_ROOT_CACHED;
> +	iopt->reserved_iova_itree = RB_ROOT_CACHED;
> +	xa_init(&iopt->domains);
> +
> +	/*
> +	 * iopt's start as SW tables that can use the entire size_t IOVA space
> +	 * due to the use of size_t in the APIs. They have no alignment
> +	 * restriction.
> +	 */
> +	iopt->iova_alignment = 1;
> +
> +	return 0;
> +}
> +
> +void iopt_destroy_table(struct io_pagetable *iopt)
> +{
> +	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
> +		iopt_remove_reserved_iova(iopt, NULL);
> +	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_iova_itree.rb_root));
> +	WARN_ON(!xa_empty(&iopt->domains));
> +	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
> +}
> +
> +/**
> + * iopt_unfill_domain() - Unfill a domain with PFNs
> + * @iopt: io_pagetable to act on
> + * @domain: domain to unfill
> + *
> + * This is used when removing a domain from the iopt. Every area in the iopt
> + * will be unmapped from the domain. The domain must already be removed from the
> + * domains xarray.
> + */
> +static void iopt_unfill_domain(struct io_pagetable *iopt,
> +			       struct iommu_domain *domain)
> +{
> +	struct iopt_area *area;
> +
> +	lockdep_assert_held(&iopt->iova_rwsem);
> +	lockdep_assert_held_write(&iopt->domains_rwsem);
> +
> +	/*
> +	 * Some other domain is holding all the pfns still, rapidly unmap this
> +	 * domain.
> +	 */
> +	if (iopt->next_domain_id != 0) {
> +		/* Pick an arbitrary remaining domain to act as storage */
> +		struct iommu_domain *storage_domain =
> +			xa_load(&iopt->domains, 0);
> +
> +		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
> +		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
> +			struct iopt_pages *pages = area->pages;
> +
> +			if (WARN_ON(!pages))
> +				continue;
> +
> +			mutex_lock(&pages->mutex);
> +			if (area->storage_domain != domain) {
> +				mutex_unlock(&pages->mutex);
> +				continue;
> +			}
> +			area->storage_domain = storage_domain;
> +			mutex_unlock(&pages->mutex);
> +		}
> +
> +
> +		iopt_unmap_domain(iopt, domain);
> +		return;
> +	}
> +
> +	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
> +	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
> +		struct iopt_pages *pages = area->pages;
> +
> +		if (WARN_ON(!pages))
> +			continue;
> +
> +		mutex_lock(&pages->mutex);
> +		interval_tree_remove(&area->pages_node,
> +				     &area->pages->domains_itree);
> +		WARN_ON(area->storage_domain != domain);
> +		area->storage_domain = NULL;
> +		iopt_area_unfill_domain(area, pages, domain);
> +		mutex_unlock(&pages->mutex);
> +	}
> +}
> +
> +/**
> + * iopt_fill_domain() - Fill a domain with PFNs
> + * @iopt: io_pagetable to act on
> + * @domain: domain to fill
> + *
> + * Fill the domain with PFNs from every area in the iopt. On failure the domain
> + * is left unchanged.
> + */
> +static int iopt_fill_domain(struct io_pagetable *iopt,
> +			    struct iommu_domain *domain)
> +{
> +	struct iopt_area *end_area;
> +	struct iopt_area *area;
> +	int rc;
> +
> +	lockdep_assert_held(&iopt->iova_rwsem);
> +	lockdep_assert_held_write(&iopt->domains_rwsem);
> +
> +	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
> +	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
> +		struct iopt_pages *pages = area->pages;
> +
> +		if (WARN_ON(!pages))
> +			continue;
> +
> +		mutex_lock(&pages->mutex);
> +		rc = iopt_area_fill_domain(area, domain);
> +		if (rc) {
> +			mutex_unlock(&pages->mutex);
> +			goto out_unfill;
> +		}
> +		if (!area->storage_domain) {
> +			WARN_ON(iopt->next_domain_id != 0);
> +			area->storage_domain = domain;
> +			interval_tree_insert(&area->pages_node,
> +					     &pages->domains_itree);
> +		}
> +		mutex_unlock(&pages->mutex);
> +	}
> +	return 0;
> +
> +out_unfill:
> +	end_area = area;
> +	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
> +	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
> +		struct iopt_pages *pages = area->pages;
> +
> +		if (area == end_area)
> +			break;
> +		if (WARN_ON(!pages))
> +			continue;
> +		mutex_lock(&pages->mutex);
> +		if (iopt->next_domain_id == 0) {
> +			interval_tree_remove(&area->pages_node,
> +					     &pages->domains_itree);
> +			area->storage_domain = NULL;
> +		}
> +		iopt_area_unfill_domain(area, pages, domain);
> +		mutex_unlock(&pages->mutex);
> +	}
> +	return rc;
> +}
> +
> +/* All existing area's conform to an increased page size */
> +static int iopt_check_iova_alignment(struct io_pagetable *iopt,
> +				     unsigned long new_iova_alignment)
> +{
> +	struct iopt_area *area;
> +
> +	lockdep_assert_held(&iopt->iova_rwsem);
> +
> +	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
> +	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
> +		if ((iopt_area_iova(area) % new_iova_alignment) ||
> +		    (iopt_area_length(area) % new_iova_alignment))
> +			return -EADDRINUSE;
> +	return 0;
> +}
> +
> +int iopt_table_add_domain(struct io_pagetable *iopt,
> +			  struct iommu_domain *domain)
> +{
> +	const struct iommu_domain_geometry *geometry = &domain->geometry;
> +	struct iommu_domain *iter_domain;
> +	unsigned int new_iova_alignment;
> +	unsigned long index;
> +	int rc;
> +
> +	down_write(&iopt->domains_rwsem);
> +	down_write(&iopt->iova_rwsem);
> +
> +	xa_for_each (&iopt->domains, index, iter_domain) {
> +		if (WARN_ON(iter_domain == domain)) {
> +			rc = -EEXIST;
> +			goto out_unlock;
> +		}
> +	}
> +
> +	/*
> +	 * The io page size drives the iova_alignment. Internally the iopt_pages
> +	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
> +	 * objects into the iommu_domain.
> +	 *
> +	 * A iommu_domain must always be able to accept PAGE_SIZE to be
> +	 * compatible as we can't guarantee higher contiguity.
> +	 */
> +	new_iova_alignment =
> +		max_t(unsigned long, 1UL << __ffs(domain->pgsize_bitmap),
> +		      iopt->iova_alignment);
> +	if (new_iova_alignment > PAGE_SIZE) {
> +		rc = -EINVAL;
> +		goto out_unlock;
> +	}
> +	if (new_iova_alignment != iopt->iova_alignment) {
> +		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
> +		if (rc)
> +			goto out_unlock;
> +	}
> +
> +	/* No area exists that is outside the allowed domain aperture */
> +	if (geometry->aperture_start != 0) {
> +		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
> +				       domain);
> +		if (rc)
> +			goto out_reserved;
> +	}
> +	if (geometry->aperture_end != ULONG_MAX) {
> +		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
> +				       ULONG_MAX, domain);
> +		if (rc)
> +			goto out_reserved;
> +	}
> +
> +	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
> +	if (rc)
> +		goto out_reserved;
> +
> +	rc = iopt_fill_domain(iopt, domain);
> +	if (rc)
> +		goto out_release;
> +
> +	iopt->iova_alignment = new_iova_alignment;
> +	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
> +	iopt->next_domain_id++;
I don't understand this part.

Do we get the domain with domain = xa_load(&iopt->domains, iopt->next_domain_id - 1)?
Then how do we get the domain after next_domain_id++?
For example, if iopt_table_add_domain() is called 3 times with 3 domains,
how do we know which next_domain_id is the correct one?

Thanks
Jason Gunthorpe March 25, 2022, 5:19 p.m. UTC | #6
On Fri, Mar 25, 2022 at 09:34:08PM +0800, zhangfei.gao@foxmail.com wrote:

> > +	iopt->iova_alignment = new_iova_alignment;
> > +	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
> > +	iopt->next_domain_id++;
> I don't understand this part.
> 
> Do we get the domain with domain = xa_load(&iopt->domains, iopt->next_domain_id - 1)?
> Then how do we get the domain after next_domain_id++?
> For example, if iopt_table_add_domain() is called 3 times with 3 domains,
> how do we know which next_domain_id is the correct one?

There is no "correct one"; this is just a simple list of domains. The
algorithms either need to pick any single domain or iterate over every
domain.

Basically this bit of code is building a vector with the operations
'push_back', 'front', 'erase' and 'for each'

Jason
Yi Liu April 13, 2022, 2:02 p.m. UTC | #7
Hi Jason,

On 2022/3/19 01:27, Jason Gunthorpe wrote:
> This is the remainder of the IOAS data structure. Provide an object called
> an io_pagetable that is composed of iopt_areas pointing at iopt_pages,
> along with a list of iommu_domains that mirror the IOVA to PFN map.
> 
> At the top this is a simple interval tree of iopt_areas indicating the map
> of IOVA to iopt_pages. An xarray keeps track of a list of domains. Based
> on the attached domains there is a minimum alignment for areas (which may
> be smaller than PAGE_SIZE) and an interval tree of reserved IOVA that
> can't be mapped.
> 
> The concept of a 'user' refers to something like a VFIO mdev that is
> accessing the IOVA and using a 'struct page *' for CPU based access.
> 
> Externally an API is provided that matches the requirements of the IOCTL
> interface for map/unmap and domain attachment.
> 
> The API provides a 'copy' primitive to establish a new IOVA map in a
> different IOAS from an existing mapping.
> 
> This is designed to support a pre-registration flow where userspace would
> set up a dummy IOAS with no domains, map in memory and then establish a
> user to pin all PFNs into the xarray.
> 
> Copy can then be used to create new IOVA mappings in a different IOAS,
> with iommu_domains attached. Upon copy the PFNs will be read out of the
> xarray and mapped into the iommu_domains, avoiding any pin_user_pages()
> overheads.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>   drivers/iommu/iommufd/Makefile          |   1 +
>   drivers/iommu/iommufd/io_pagetable.c    | 890 ++++++++++++++++++++++++
>   drivers/iommu/iommufd/iommufd_private.h |  35 +
>   3 files changed, 926 insertions(+)
>   create mode 100644 drivers/iommu/iommufd/io_pagetable.c
> 
> diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
> index 05a0e91e30afad..b66a8c47ff55ec 100644
> --- a/drivers/iommu/iommufd/Makefile
> +++ b/drivers/iommu/iommufd/Makefile
> @@ -1,5 +1,6 @@
>   # SPDX-License-Identifier: GPL-2.0-only
>   iommufd-y := \
> +	io_pagetable.o \
>   	main.o \
>   	pages.o
>   
> diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
> new file mode 100644
> index 00000000000000..f9f3b06946bfb9
> --- /dev/null
> +++ b/drivers/iommu/iommufd/io_pagetable.c
> @@ -0,0 +1,890 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
> + *
> + * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The
> + * PFNs can be placed into an iommu_domain, or returned to the caller as a page
> + * list for access by an in-kernel user.
> + *
> + * The datastructure uses the iopt_pages to optimize the storage of the PFNs
> + * between the domains and xarray.
> + */
> +#include <linux/lockdep.h>
> +#include <linux/iommu.h>
> +#include <linux/sched/mm.h>
> +#include <linux/err.h>
> +#include <linux/slab.h>
> +#include <linux/errno.h>
> +
> +#include "io_pagetable.h"
> +
> +static unsigned long iopt_area_iova_to_index(struct iopt_area *area,
> +					     unsigned long iova)
> +{
> +	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
> +		WARN_ON(iova < iopt_area_iova(area) ||
> +			iova > iopt_area_last_iova(area));
> +	return (iova - (iopt_area_iova(area) & PAGE_MASK)) / PAGE_SIZE;
> +}
> +
> +static struct iopt_area *iopt_find_exact_area(struct io_pagetable *iopt,
> +					      unsigned long iova,
> +					      unsigned long last_iova)
> +{
> +	struct iopt_area *area;
> +
> +	area = iopt_area_iter_first(iopt, iova, last_iova);
> +	if (!area || !area->pages || iopt_area_iova(area) != iova ||
> +	    iopt_area_last_iova(area) != last_iova)
> +		return NULL;
> +	return area;
> +}
> +
> +static bool __alloc_iova_check_hole(struct interval_tree_span_iter *span,
> +				    unsigned long length,
> +				    unsigned long iova_alignment,
> +				    unsigned long page_offset)
> +{
> +	if (!span->is_hole || span->last_hole - span->start_hole < length - 1)
> +		return false;
> +
> +	span->start_hole =
> +		ALIGN(span->start_hole, iova_alignment) | page_offset;
> +	if (span->start_hole > span->last_hole ||
> +	    span->last_hole - span->start_hole < length - 1)
> +		return false;
> +	return true;
> +}
> +
> +/*
> + * Automatically find a block of IOVA that is not being used and not reserved.
> + * Does not return a 0 IOVA even if it is valid.
> + */
> +static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
> +			   unsigned long uptr, unsigned long length)
> +{
> +	struct interval_tree_span_iter reserved_span;
> +	unsigned long page_offset = uptr % PAGE_SIZE;
> +	struct interval_tree_span_iter area_span;
> +	unsigned long iova_alignment;
> +
> +	lockdep_assert_held(&iopt->iova_rwsem);
> +
> +	/* Protect roundup_pow-of_two() from overflow */
> +	if (length == 0 || length >= ULONG_MAX / 2)
> +		return -EOVERFLOW;
> +
> +	/*
> +	 * Keep alignment present in the uptr when building the IOVA, this
> +	 * increases the chance we can map a THP.
> +	 */
> +	if (!uptr)
> +		iova_alignment = roundup_pow_of_two(length);
> +	else
> +		iova_alignment =
> +			min_t(unsigned long, roundup_pow_of_two(length),
> +			      1UL << __ffs64(uptr));
> +
> +	if (iova_alignment < iopt->iova_alignment)
> +		return -EINVAL;
> +	for (interval_tree_span_iter_first(&area_span, &iopt->area_itree,
> +					   PAGE_SIZE, ULONG_MAX - PAGE_SIZE);
> +	     !interval_tree_span_iter_done(&area_span);
> +	     interval_tree_span_iter_next(&area_span)) {
> +		if (!__alloc_iova_check_hole(&area_span, length, iova_alignment,
> +					     page_offset))
> +			continue;
> +
> +		for (interval_tree_span_iter_first(
> +			     &reserved_span, &iopt->reserved_iova_itree,
> +			     area_span.start_hole, area_span.last_hole);
> +		     !interval_tree_span_iter_done(&reserved_span);
> +		     interval_tree_span_iter_next(&reserved_span)) {
> +			if (!__alloc_iova_check_hole(&reserved_span, length,
> +						     iova_alignment,
> +						     page_offset))
> +				continue;
> +
> +			*iova = reserved_span.start_hole;
> +			return 0;
> +		}
> +	}
> +	return -ENOSPC;
> +}
> +
> +/*
> + * The area takes a slice of the pages from start_bytes to start_byte + length
> + */
> +static struct iopt_area *
> +iopt_alloc_area(struct io_pagetable *iopt, struct iopt_pages *pages,
> +		unsigned long iova, unsigned long start_byte,
> +		unsigned long length, int iommu_prot, unsigned int flags)
> +{
> +	struct iopt_area *area;
> +	int rc;
> +
> +	area = kzalloc(sizeof(*area), GFP_KERNEL);
> +	if (!area)
> +		return ERR_PTR(-ENOMEM);
> +
> +	area->iopt = iopt;
> +	area->iommu_prot = iommu_prot;
> +	area->page_offset = start_byte % PAGE_SIZE;
> +	area->pages_node.start = start_byte / PAGE_SIZE;
> +	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
> +		return ERR_PTR(-EOVERFLOW);
> +	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
> +	if (WARN_ON(area->pages_node.last >= pages->npages))
> +		return ERR_PTR(-EOVERFLOW);
> +
> +	down_write(&iopt->iova_rwsem);
> +	if (flags & IOPT_ALLOC_IOVA) {
> +		rc = iopt_alloc_iova(iopt, &iova,
> +				     (uintptr_t)pages->uptr + start_byte,
> +				     length);
> +		if (rc)
> +			goto out_unlock;
> +	}
> +
> +	if (check_add_overflow(iova, length - 1, &area->node.last)) {
> +		rc = -EOVERFLOW;
> +		goto out_unlock;
> +	}
> +
> +	if (!(flags & IOPT_ALLOC_IOVA)) {
> +		if ((iova & (iopt->iova_alignment - 1)) ||
> +		    (length & (iopt->iova_alignment - 1)) || !length) {
> +			rc = -EINVAL;
> +			goto out_unlock;
> +		}
> +
> +		/* No reserved IOVA intersects the range */
> +		if (interval_tree_iter_first(&iopt->reserved_iova_itree, iova,
> +					     area->node.last)) {
> +			rc = -ENOENT;
> +			goto out_unlock;
> +		}
> +
> +		/* Check that there is not already a mapping in the range */
> +		if (iopt_area_iter_first(iopt, iova, area->node.last)) {
> +			rc = -EADDRINUSE;
> +			goto out_unlock;
> +		}
> +	}
> +
> +	/*
> +	 * The area is inserted with a NULL pages indicating it is not fully
> +	 * initialized yet.
> +	 */
> +	area->node.start = iova;
> +	interval_tree_insert(&area->node, &area->iopt->area_itree);
> +	up_write(&iopt->iova_rwsem);
> +	return area;
> +
> +out_unlock:
> +	up_write(&iopt->iova_rwsem);
> +	kfree(area);
> +	return ERR_PTR(rc);
> +}
> +
> +static void iopt_abort_area(struct iopt_area *area)
> +{
> +	down_write(&area->iopt->iova_rwsem);
> +	interval_tree_remove(&area->node, &area->iopt->area_itree);
> +	up_write(&area->iopt->iova_rwsem);
> +	kfree(area);
> +}
> +
> +static int iopt_finalize_area(struct iopt_area *area, struct iopt_pages *pages)
> +{
> +	int rc;
> +
> +	down_read(&area->iopt->domains_rwsem);
> +	rc = iopt_area_fill_domains(area, pages);
> +	if (!rc) {
> +		/*
> +		 * area->pages must be set inside the domains_rwsem to ensure
> +		 * any newly added domains will get filled. Moves the reference
> +		 * in from the caller
> +		 */
> +		down_write(&area->iopt->iova_rwsem);
> +		area->pages = pages;
> +		up_write(&area->iopt->iova_rwsem);
> +	}
> +	up_read(&area->iopt->domains_rwsem);
> +	return rc;
> +}
> +
> +int iopt_map_pages(struct io_pagetable *iopt, struct iopt_pages *pages,
> +		   unsigned long *dst_iova, unsigned long start_bytes,
> +		   unsigned long length, int iommu_prot, unsigned int flags)
> +{
> +	struct iopt_area *area;
> +	int rc;
> +
> +	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
> +		return -EPERM;
> +
> +	area = iopt_alloc_area(iopt, pages, *dst_iova, start_bytes, length,
> +			       iommu_prot, flags);
> +	if (IS_ERR(area))
> +		return PTR_ERR(area);
> +	*dst_iova = iopt_area_iova(area);
> +
> +	rc = iopt_finalize_area(area, pages);
> +	if (rc) {
> +		iopt_abort_area(area);
> +		return rc;
> +	}
> +	return 0;
> +}
> +
> +/**
> + * iopt_map_user_pages() - Map a user VA to an iova in the io page table
> + * @iopt: io_pagetable to act on
> + * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
> + *        the chosen iova on output. Otherwise is the iova to map to on input
> + * @uptr: User VA to map
> + * @length: Number of bytes to map
> + * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
> + * @flags: IOPT_ALLOC_IOVA or zero
> + *
> + * iova, uptr, and length must be aligned to iova_alignment. For domain backed
> + * page tables this will pin the pages and load them into the domain at iova.
> + * For non-domain page tables this will only setup a lazy reference and the
> + * caller must use iopt_access_pages() to touch them.
> + *
> + * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
> + * destroyed.
> + */
> +int iopt_map_user_pages(struct io_pagetable *iopt, unsigned long *iova,
> +			void __user *uptr, unsigned long length, int iommu_prot,
> +			unsigned int flags)
> +{
> +	struct iopt_pages *pages;
> +	int rc;
> +
> +	pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
> +	if (IS_ERR(pages))
> +		return PTR_ERR(pages);
> +
> +	rc = iopt_map_pages(iopt, pages, iova, uptr - pages->uptr, length,
> +			    iommu_prot, flags);
> +	if (rc) {
> +		iopt_put_pages(pages);
> +		return rc;
> +	}
> +	return 0;
> +}
> +
> +struct iopt_pages *iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
> +				  unsigned long *start_byte,
> +				  unsigned long length)
> +{
> +	unsigned long iova_end;
> +	struct iopt_pages *pages;
> +	struct iopt_area *area;
> +
> +	if (check_add_overflow(iova, length - 1, &iova_end))
> +		return ERR_PTR(-EOVERFLOW);
> +
> +	down_read(&iopt->iova_rwsem);
> +	area = iopt_find_exact_area(iopt, iova, iova_end);
> +	if (!area) {
> +		up_read(&iopt->iova_rwsem);
> +		return ERR_PTR(-ENOENT);
> +	}
> +	pages = area->pages;
> +	*start_byte = area->page_offset + iopt_area_index(area) * PAGE_SIZE;
> +	kref_get(&pages->kref);
> +	up_read(&iopt->iova_rwsem);
> +
> +	return pages;
> +}
> +
> +static int __iopt_unmap_iova(struct io_pagetable *iopt, struct iopt_area *area,
> +			     struct iopt_pages *pages)
> +{
> +	/* Drivers have to unpin on notification. */
> +	if (WARN_ON(atomic_read(&area->num_users)))
> +		return -EBUSY;
> +
> +	iopt_area_unfill_domains(area, pages);
> +	WARN_ON(atomic_read(&area->num_users));
> +	iopt_abort_area(area);
> +	iopt_put_pages(pages);
> +	return 0;
> +}
> +
> +/**
> + * iopt_unmap_iova() - Remove a range of iova
> + * @iopt: io_pagetable to act on
> + * @iova: Starting iova to unmap
> + * @length: Number of bytes to unmap
> + *
> + * The requested range must exactly match an existing range.
> + * Splitting/truncating IOVA mappings is not allowed.
> + */
> +int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
> +		    unsigned long length)
> +{
> +	struct iopt_pages *pages;
> +	struct iopt_area *area;
> +	unsigned long iova_end;
> +	int rc;
> +
> +	if (!length)
> +		return -EINVAL;
> +
> +	if (check_add_overflow(iova, length - 1, &iova_end))
> +		return -EOVERFLOW;
> +
> +	down_read(&iopt->domains_rwsem);
> +	down_write(&iopt->iova_rwsem);
> +	area = iopt_find_exact_area(iopt, iova, iova_end);

When testing vIOMMU with QEMU using iommufd, I hit a problem as log #3
shows. QEMU failed when trying to map because an IOVA was still in use.
After debugging, the 0xfffff000 IOVA is mapped but never unmapped. Per
log #2, QEMU has issued an unmap with a larger range (0xff000000 -
0x100000000) which includes 0xfffff000, but iopt_find_exact_area()
doesn't find any area, so 0xfffff000 is not unmapped. Is this correct? The
same test passes with the vfio iommu type1 driver. Any idea?

#1
qemu-system-x86_64: 218 vfio_dma_map(0x55b99d7b7d40, 0xfffff000, 0x1000, 
0x7f79ee81f000) = 0

#2
qemu-system-x86_64: 232 vfio_dma_unmap(0x55b99d7b7d40, 0xff000000, 
0x1000000) = 0 (No such file or directory)
qemu-system-x86_64: IOMMU_IOAS_UNMAP failed: No such file or directory
qemu-system-x86_64: 241 vfio_dma_unmap(0x55b99d7b7d40, 0xff000000, 
0x1000000) = -2 (No such file or directory)
                                vtd_address_space_unmap, notify iommu 
start: ff000000, end: 100000000 - 2

#3
qemu-system-x86_64: IOMMU_IOAS_MAP failed: Address already in use
qemu-system-x86_64: vfio_container_dma_map(0x55b99d7b7d40, 0xfffc0000, 
0x40000, 0x7f7968c00000) = -98 (Address already in use)
qemu: hardware error: vfio: DMA mapping failed, unable to continue

#4 Kernel debug log:

[ 1042.662165] iopt_unmap_iova 338 iova: ff000000, length: 1000000
[ 1042.662339] iopt_unmap_iova 345 iova: ff000000, length: 1000000
[ 1042.662505] iopt_unmap_iova 348 iova: ff000000, length: 1000000
[ 1042.662736] iopt_find_exact_area, iova: ff000000, last_iova: ffffffff
[ 1042.662909] iopt_unmap_iova 350 iova: ff000000, length: 1000000
[ 1042.663084] iommufd_ioas_unmap 253 iova: ff000000 length: 1000000, rc: -2
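
For comparison, vfio type1 tears down every mapping that is fully contained
in the requested range rather than requiring an exact match. A hypothetical
range-walk here, modelled on iopt_unmap_all() above (sketch only, this
helper does not exist in the patch), would look roughly like:

static int iopt_unmap_contained(struct io_pagetable *iopt,
				unsigned long iova, unsigned long iova_end)
{
	struct iopt_area *area;
	int rc = 0;

	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	while ((area = iopt_area_iter_first(iopt, iova, iova_end))) {
		struct iopt_pages *pages = area->pages;

		/* Refuse to split an area that straddles the range */
		if (iopt_area_iova(area) < iova ||
		    iopt_area_last_iova(area) > iova_end || !pages) {
			rc = -EINVAL;
			break;
		}
		area->pages = NULL;
		up_write(&iopt->iova_rwsem);
		rc = __iopt_unmap_iova(iopt, area, pages);
		down_write(&iopt->iova_rwsem);
		if (rc)
			break;
	}
	up_write(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
	return rc;
}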

> +	if (!area) {
> +		up_write(&iopt->iova_rwsem);
> +		up_read(&iopt->domains_rwsem);
> +		return -ENOENT;
> +	}
> +	pages = area->pages;
> +	area->pages = NULL;
> +	up_write(&iopt->iova_rwsem);
> +
> +	rc = __iopt_unmap_iova(iopt, area, pages);
> +	up_read(&iopt->domains_rwsem);
> +	return rc;
> +}
> +
> +int iopt_unmap_all(struct io_pagetable *iopt)
> +{
> +	struct iopt_area *area;
> +	int rc;
> +
> +	down_read(&iopt->domains_rwsem);
> +	down_write(&iopt->iova_rwsem);
> +	while ((area = iopt_area_iter_first(iopt, 0, ULONG_MAX))) {
> +		struct iopt_pages *pages;
> +
> +		/* Userspace should not race unmap all and map */
> +		if (!area->pages) {
> +			rc = -EBUSY;
> +			goto out_unlock_iova;
> +		}
> +		pages = area->pages;
> +		area->pages = NULL;
> +		up_write(&iopt->iova_rwsem);
> +
> +		rc = __iopt_unmap_iova(iopt, area, pages);
> +		if (rc)
> +			goto out_unlock_domains;
> +
> +		down_write(&iopt->iova_rwsem);
> +	}
> +	rc = 0;
> +
> +out_unlock_iova:
> +	up_write(&iopt->iova_rwsem);
> +out_unlock_domains:
> +	up_read(&iopt->domains_rwsem);
> +	return rc;
> +}
> +
> +/**
> + * iopt_access_pages() - Return a list of pages under the iova
> + * @iopt: io_pagetable to act on
> + * @iova: Starting IOVA
> + * @length: Number of bytes to access
> + * @out_pages: Output page list
> + * @write: True if access is for writing
> + *
> + * Reads @length bytes starting at @iova and returns the struct page * pointers. These
> + * can be kmap'd by the caller for CPU access.
> + *
> + * The caller must perform iopt_unaccess_pages() when done to balance this.
> + *
> + * iova can be unaligned from PAGE_SIZE. The first returned byte starts at
> + * page_to_phys(out_pages[0]) + (iova % PAGE_SIZE). The caller promises not to
> + * touch memory outside the requested iova slice.
> + *
> + * FIXME: callers that need a DMA mapping via a sgl should create another
> + * interface to build the SGL efficiently
> + */
> +int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova,
> +		      unsigned long length, struct page **out_pages, bool write)
> +{
> +	unsigned long cur_iova = iova;
> +	unsigned long last_iova;
> +	struct iopt_area *area;
> +	int rc;
> +
> +	if (!length)
> +		return -EINVAL;
> +	if (check_add_overflow(iova, length - 1, &last_iova))
> +		return -EOVERFLOW;
> +
> +	down_read(&iopt->iova_rwsem);
> +	for (area = iopt_area_iter_first(iopt, iova, last_iova); area;
> +	     area = iopt_area_iter_next(area, iova, last_iova)) {
> +		unsigned long last = min(last_iova, iopt_area_last_iova(area));
> +		unsigned long last_index;
> +		unsigned long index;
> +
> +		/* Need contiguous areas in the access */
> +		if (iopt_area_iova(area) < cur_iova || !area->pages) {
> +			rc = -EINVAL;
> +			goto out_remove;
> +		}
> +
> +		index = iopt_area_iova_to_index(area, cur_iova);
> +		last_index = iopt_area_iova_to_index(area, last);
> +		rc = iopt_pages_add_user(area->pages, index, last_index,
> +					 out_pages, write);
> +		if (rc)
> +			goto out_remove;
> +		if (last == last_iova)
> +			break;
> +		/*
> +		 * Can't cross areas that are not aligned to the system page
> +		 * size with this API.
> +		 */
> +		if (cur_iova % PAGE_SIZE) {
> +			rc = -EINVAL;
> +			goto out_remove;
> +		}
> +		cur_iova = last + 1;
> +		out_pages += last_index - index;
> +		atomic_inc(&area->num_users);
> +	}
> +
> +	up_read(&iopt->iova_rwsem);
> +	return 0;
> +
> +out_remove:
> +	if (cur_iova != iova)
> +		iopt_unaccess_pages(iopt, iova, cur_iova - iova);
> +	up_read(&iopt->iova_rwsem);
> +	return rc;
> +}
> +
> +/**
> + * iopt_unaccess_pages() - Undo iopt_access_pages
> + * @iopt: io_pagetable to act on
> + * @iova: Starting IOVA
> + * @length: Number of bytes to stop accessing
> + *
> + * The caller must stop accessing the pages before calling this. The
> + * iova/length must exactly match the one provided to iopt_access_pages().
> + */
> +void iopt_unaccess_pages(struct io_pagetable *iopt, unsigned long iova,
> +			 size_t length)
> +{
> +	unsigned long cur_iova = iova;
> +	unsigned long last_iova;
> +	struct iopt_area *area;
> +
> +	if (WARN_ON(!length) ||
> +	    WARN_ON(check_add_overflow(iova, length - 1, &last_iova)))
> +		return;
> +
> +	down_read(&iopt->iova_rwsem);
> +	for (area = iopt_area_iter_first(iopt, iova, last_iova); area;
> +	     area = iopt_area_iter_next(area, iova, last_iova)) {
> +		unsigned long last = min(last_iova, iopt_area_last_iova(area));
> +		int num_users;
> +
> +		iopt_pages_remove_user(area->pages,
> +				       iopt_area_iova_to_index(area, cur_iova),
> +				       iopt_area_iova_to_index(area, last));
> +		if (last == last_iova)
> +			break;
> +		cur_iova = last + 1;
> +		num_users = atomic_dec_return(&area->num_users);
> +		WARN_ON(num_users < 0);
> +	}
> +	up_read(&iopt->iova_rwsem);
> +}
> +
> +struct iopt_reserved_iova {
> +	struct interval_tree_node node;
> +	void *owner;
> +};
> +
> +int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
> +		      unsigned long last, void *owner)
> +{
> +	struct iopt_reserved_iova *reserved;
> +
> +	lockdep_assert_held_write(&iopt->iova_rwsem);
> +
> +	if (iopt_area_iter_first(iopt, start, last))
> +		return -EADDRINUSE;
> +
> +	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL);
> +	if (!reserved)
> +		return -ENOMEM;
> +	reserved->node.start = start;
> +	reserved->node.last = last;
> +	reserved->owner = owner;
> +	interval_tree_insert(&reserved->node, &iopt->reserved_iova_itree);
> +	return 0;
> +}
> +
> +void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
> +{
> +
> +	struct interval_tree_node *node;
> +
> +	for (node = interval_tree_iter_first(&iopt->reserved_iova_itree, 0,
> +					     ULONG_MAX);
> +	     node;) {
> +		struct iopt_reserved_iova *reserved =
> +			container_of(node, struct iopt_reserved_iova, node);
> +
> +		node = interval_tree_iter_next(node, 0, ULONG_MAX);
> +
> +		if (reserved->owner == owner) {
> +			interval_tree_remove(&reserved->node,
> +					     &iopt->reserved_iova_itree);
> +			kfree(reserved);
> +		}
> +	}
> +}
> +
> +int iopt_init_table(struct io_pagetable *iopt)
> +{
> +	init_rwsem(&iopt->iova_rwsem);
> +	init_rwsem(&iopt->domains_rwsem);
> +	iopt->area_itree = RB_ROOT_CACHED;
> +	iopt->reserved_iova_itree = RB_ROOT_CACHED;
> +	xa_init(&iopt->domains);
> +
> +	/*
> +	 * iopt's start as SW tables that can use the entire size_t IOVA space
> +	 * due to the use of size_t in the APIs. They have no alignment
> +	 * restriction.
> +	 */
> +	iopt->iova_alignment = 1;
> +
> +	return 0;
> +}
> +
> +void iopt_destroy_table(struct io_pagetable *iopt)
> +{
> +	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
> +		iopt_remove_reserved_iova(iopt, NULL);
> +	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_iova_itree.rb_root));
> +	WARN_ON(!xa_empty(&iopt->domains));
> +	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
> +}
> +
> +/**
> + * iopt_unfill_domain() - Unfill a domain with PFNs
> + * @iopt: io_pagetable to act on
> + * @domain: domain to unfill
> + *
> + * This is used when removing a domain from the iopt. Every area in the iopt
> + * will be unmapped from the domain. The domain must already be removed from the
> + * domains xarray.
> + */
> +static void iopt_unfill_domain(struct io_pagetable *iopt,
> +			       struct iommu_domain *domain)
> +{
> +	struct iopt_area *area;
> +
> +	lockdep_assert_held(&iopt->iova_rwsem);
> +	lockdep_assert_held_write(&iopt->domains_rwsem);
> +
> +	/*
> +	 * Some other domain is holding all the pfns still, rapidly unmap this
> +	 * domain.
> +	 */
> +	if (iopt->next_domain_id != 0) {
> +		/* Pick an arbitrary remaining domain to act as storage */
> +		struct iommu_domain *storage_domain =
> +			xa_load(&iopt->domains, 0);
> +
> +		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
> +		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
> +			struct iopt_pages *pages = area->pages;
> +
> +			if (WARN_ON(!pages))
> +				continue;
> +
> +			mutex_lock(&pages->mutex);
> +			if (area->storage_domain != domain) {
> +				mutex_unlock(&pages->mutex);
> +				continue;
> +			}
> +			area->storage_domain = storage_domain;
> +			mutex_unlock(&pages->mutex);
> +		}
> +
> +
> +		iopt_unmap_domain(iopt, domain);
> +		return;
> +	}
> +
> +	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
> +	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
> +		struct iopt_pages *pages = area->pages;
> +
> +		if (WARN_ON(!pages))
> +			continue;
> +
> +		mutex_lock(&pages->mutex);
> +		interval_tree_remove(&area->pages_node,
> +				     &area->pages->domains_itree);
> +		WARN_ON(area->storage_domain != domain);
> +		area->storage_domain = NULL;
> +		iopt_area_unfill_domain(area, pages, domain);
> +		mutex_unlock(&pages->mutex);
> +	}
> +}
> +
> +/**
> + * iopt_fill_domain() - Fill a domain with PFNs
> + * @iopt: io_pagetable to act on
> + * @domain: domain to fill
> + *
> + * Fill the domain with PFNs from every area in the iopt. On failure the domain
> + * is left unchanged.
> + */
> +static int iopt_fill_domain(struct io_pagetable *iopt,
> +			    struct iommu_domain *domain)
> +{
> +	struct iopt_area *end_area;
> +	struct iopt_area *area;
> +	int rc;
> +
> +	lockdep_assert_held(&iopt->iova_rwsem);
> +	lockdep_assert_held_write(&iopt->domains_rwsem);
> +
> +	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
> +	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
> +		struct iopt_pages *pages = area->pages;
> +
> +		if (WARN_ON(!pages))
> +			continue;
> +
> +		mutex_lock(&pages->mutex);
> +		rc = iopt_area_fill_domain(area, domain);
> +		if (rc) {
> +			mutex_unlock(&pages->mutex);
> +			goto out_unfill;
> +		}
> +		if (!area->storage_domain) {
> +			WARN_ON(iopt->next_domain_id != 0);
> +			area->storage_domain = domain;
> +			interval_tree_insert(&area->pages_node,
> +					     &pages->domains_itree);
> +		}
> +		mutex_unlock(&pages->mutex);
> +	}
> +	return 0;
> +
> +out_unfill:
> +	end_area = area;
> +	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
> +	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
> +		struct iopt_pages *pages = area->pages;
> +
> +		if (area == end_area)
> +			break;
> +		if (WARN_ON(!pages))
> +			continue;
> +		mutex_lock(&pages->mutex);
> +		if (iopt->next_domain_id == 0) {
> +			interval_tree_remove(&area->pages_node,
> +					     &pages->domains_itree);
> +			area->storage_domain = NULL;
> +		}
> +		iopt_area_unfill_domain(area, pages, domain);
> +		mutex_unlock(&pages->mutex);
> +	}
> +	return rc;
> +}
> +
> +/* All existing area's conform to an increased page size */
> +static int iopt_check_iova_alignment(struct io_pagetable *iopt,
> +				     unsigned long new_iova_alignment)
> +{
> +	struct iopt_area *area;
> +
> +	lockdep_assert_held(&iopt->iova_rwsem);
> +
> +	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
> +	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
> +		if ((iopt_area_iova(area) % new_iova_alignment) ||
> +		    (iopt_area_length(area) % new_iova_alignment))
> +			return -EADDRINUSE;
> +	return 0;
> +}
> +
> +int iopt_table_add_domain(struct io_pagetable *iopt,
> +			  struct iommu_domain *domain)
> +{
> +	const struct iommu_domain_geometry *geometry = &domain->geometry;
> +	struct iommu_domain *iter_domain;
> +	unsigned int new_iova_alignment;
> +	unsigned long index;
> +	int rc;
> +
> +	down_write(&iopt->domains_rwsem);
> +	down_write(&iopt->iova_rwsem);
> +
> +	xa_for_each (&iopt->domains, index, iter_domain) {
> +		if (WARN_ON(iter_domain == domain)) {
> +			rc = -EEXIST;
> +			goto out_unlock;
> +		}
> +	}
> +
> +	/*
> +	 * The io page size drives the iova_alignment. Internally the iopt_pages
> +	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
> +	 * objects into the iommu_domain.
> +	 *
> +	 * A iommu_domain must always be able to accept PAGE_SIZE to be
> +	 * compatible as we can't guarantee higher contiguity.
> +	 */
> +	new_iova_alignment =
> +		max_t(unsigned long, 1UL << __ffs(domain->pgsize_bitmap),
> +		      iopt->iova_alignment);
> +	if (new_iova_alignment > PAGE_SIZE) {
> +		rc = -EINVAL;
> +		goto out_unlock;
> +	}
> +	if (new_iova_alignment != iopt->iova_alignment) {
> +		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
> +		if (rc)
> +			goto out_unlock;
> +	}
> +
> +	/* No area exists that is outside the allowed domain aperture */
> +	if (geometry->aperture_start != 0) {
> +		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
> +				       domain);
> +		if (rc)
> +			goto out_reserved;
> +	}
> +	if (geometry->aperture_end != ULONG_MAX) {
> +		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
> +				       ULONG_MAX, domain);
> +		if (rc)
> +			goto out_reserved;
> +	}
> +
> +	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
> +	if (rc)
> +		goto out_reserved;
> +
> +	rc = iopt_fill_domain(iopt, domain);
> +	if (rc)
> +		goto out_release;
> +
> +	iopt->iova_alignment = new_iova_alignment;
> +	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
> +	iopt->next_domain_id++;
> +	up_write(&iopt->iova_rwsem);
> +	up_write(&iopt->domains_rwsem);
> +	return 0;
> +out_release:
> +	xa_release(&iopt->domains, iopt->next_domain_id);
> +out_reserved:
> +	iopt_remove_reserved_iova(iopt, domain);
> +out_unlock:
> +	up_write(&iopt->iova_rwsem);
> +	up_write(&iopt->domains_rwsem);
> +	return rc;
> +}
> +
> +void iopt_table_remove_domain(struct io_pagetable *iopt,
> +			      struct iommu_domain *domain)
> +{
> +	struct iommu_domain *iter_domain = NULL;
> +	unsigned long new_iova_alignment;
> +	unsigned long index;
> +
> +	down_write(&iopt->domains_rwsem);
> +	down_write(&iopt->iova_rwsem);
> +
> +	xa_for_each (&iopt->domains, index, iter_domain)
> +		if (iter_domain == domain)
> +			break;
> +	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
> +		goto out_unlock;
> +
> +	/*
> +	 * Compress the xarray to keep it linear by swapping the entry to erase
> +	 * with the tail entry and shrinking the tail.
> +	 */
> +	iopt->next_domain_id--;
> +	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
> +	if (index != iopt->next_domain_id)
> +		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
> +
> +	iopt_unfill_domain(iopt, domain);
> +	iopt_remove_reserved_iova(iopt, domain);
> +
> +	/* Recalculate the iova alignment without the domain */
> +	new_iova_alignment = 1;
> +	xa_for_each (&iopt->domains, index, iter_domain)
> +		new_iova_alignment = max_t(unsigned long,
> +					   1UL << __ffs(domain->pgsize_bitmap),
> +					   new_iova_alignment);
> +	if (!WARN_ON(new_iova_alignment > iopt->iova_alignment))
> +		iopt->iova_alignment = new_iova_alignment;
> +
> +out_unlock:
> +	up_write(&iopt->iova_rwsem);
> +	up_write(&iopt->domains_rwsem);
> +}
> +
> +/* Narrow the valid_iova_itree to include reserved ranges from a group. */
> +int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt,
> +					  struct iommu_group *group,
> +					  phys_addr_t *sw_msi_start)
> +{
> +	struct iommu_resv_region *resv;
> +	struct iommu_resv_region *tmp;
> +	LIST_HEAD(group_resv_regions);
> +	int rc;
> +
> +	down_write(&iopt->iova_rwsem);
> +	rc = iommu_get_group_resv_regions(group, &group_resv_regions);
> +	if (rc)
> +		goto out_unlock;
> +
> +	list_for_each_entry (resv, &group_resv_regions, list) {
> +		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
> +			continue;
> +
> +		/*
> +		 * The presence of any 'real' MSI regions should take precedence
> +		 * over the software-managed one if the IOMMU driver happens to
> +		 * advertise both types.
> +		 */
> +		if (sw_msi_start && resv->type == IOMMU_RESV_MSI) {
> +			*sw_msi_start = 0;
> +			sw_msi_start = NULL;
> +		}
> +		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI)
> +			*sw_msi_start = resv->start;
> +
> +		rc = iopt_reserve_iova(iopt, resv->start,
> +				       resv->length - 1 + resv->start, group);
> +		if (rc)
> +			goto out_reserved;
> +	}
> +	rc = 0;
> +	goto out_free_resv;
> +
> +out_reserved:
> +	iopt_remove_reserved_iova(iopt, group);
> +out_free_resv:
> +	list_for_each_entry_safe (resv, tmp, &group_resv_regions, list)
> +		kfree(resv);
> +out_unlock:
> +	up_write(&iopt->iova_rwsem);
> +	return rc;
> +}
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index 2f1301d39bba7c..bcf08e61bc87e9 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -9,6 +9,9 @@
>   #include <linux/refcount.h>
>   #include <linux/uaccess.h>
>   
> +struct iommu_domain;
> +struct iommu_group;
> +
>   /*
>    * The IOVA to PFN map. The mapper automatically copies the PFNs into multiple
>    * domains and permits sharing of PFNs between io_pagetable instances. This
> @@ -27,8 +30,40 @@ struct io_pagetable {
>   	struct rw_semaphore iova_rwsem;
>   	struct rb_root_cached area_itree;
>   	struct rb_root_cached reserved_iova_itree;
> +	unsigned long iova_alignment;
>   };
>   
> +int iopt_init_table(struct io_pagetable *iopt);
> +void iopt_destroy_table(struct io_pagetable *iopt);
> +struct iopt_pages *iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
> +				  unsigned long *start_byte,
> +				  unsigned long length);
> +enum { IOPT_ALLOC_IOVA = 1 << 0 };
> +int iopt_map_user_pages(struct io_pagetable *iopt, unsigned long *iova,
> +			void __user *uptr, unsigned long length, int iommu_prot,
> +			unsigned int flags);
> +int iopt_map_pages(struct io_pagetable *iopt, struct iopt_pages *pages,
> +		   unsigned long *dst_iova, unsigned long start_byte,
> +		   unsigned long length, int iommu_prot, unsigned int flags);
> +int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
> +		    unsigned long length);
> +int iopt_unmap_all(struct io_pagetable *iopt);
> +
> +int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova,
> +		      unsigned long npages, struct page **out_pages, bool write);
> +void iopt_unaccess_pages(struct io_pagetable *iopt, unsigned long iova,
> +			 size_t npages);
> +int iopt_table_add_domain(struct io_pagetable *iopt,
> +			  struct iommu_domain *domain);
> +void iopt_table_remove_domain(struct io_pagetable *iopt,
> +			      struct iommu_domain *domain);
> +int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt,
> +					  struct iommu_group *group,
> +					  phys_addr_t *sw_msi_start);
> +int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
> +		      unsigned long last, void *owner);
> +void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner);
> +
>   struct iommufd_ctx {
>   	struct file *filp;
>   	struct xarray objects;
Jason Gunthorpe April 13, 2022, 2:36 p.m. UTC | #8
On Wed, Apr 13, 2022 at 10:02:58PM +0800, Yi Liu wrote:
> > +/**
> > + * iopt_unmap_iova() - Remove a range of iova
> > + * @iopt: io_pagetable to act on
> > + * @iova: Starting iova to unmap
> > + * @length: Number of bytes to unmap
> > + *
> > + * The requested range must exactly match an existing range.
> > + * Splitting/truncating IOVA mappings is not allowed.
> > + */
> > +int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
> > +		    unsigned long length)
> > +{
> > +	struct iopt_pages *pages;
> > +	struct iopt_area *area;
> > +	unsigned long iova_end;
> > +	int rc;
> > +
> > +	if (!length)
> > +		return -EINVAL;
> > +
> > +	if (check_add_overflow(iova, length - 1, &iova_end))
> > +		return -EOVERFLOW;
> > +
> > +	down_read(&iopt->domains_rwsem);
> > +	down_write(&iopt->iova_rwsem);
> > +	area = iopt_find_exact_area(iopt, iova, iova_end);
> 
> when testing vIOMMU with Qemu using iommufd, I hit a problem as log #3
> shows. Qemu failed when trying to do map due to an IOVA still in use.
> After debugging, the 0xfffff000 IOVA is mapped but not unmapped. But per log
> #2, Qemu has issued unmap with a larger range (0xff000000 -
> 0x100000000) which includes the 0xfffff000. But iopt_find_exact_area()
> doesn't find any area. So 0xfffff000 is not unmapped. Is this correct? Same
> test passed with vfio iommu type1 driver. any idea?

There are a couple of good reasons why iopt_unmap_iova() should
process any contiguous range of fully contained areas, so I would
consider this something worth fixing. Can you send a small patch and
test case and I'll fold it in?

Thanks,
Jason
Yi Liu April 13, 2022, 2:49 p.m. UTC | #9
On 2022/4/13 22:36, Jason Gunthorpe wrote:
> On Wed, Apr 13, 2022 at 10:02:58PM +0800, Yi Liu wrote:
>>> +/**
>>> + * iopt_unmap_iova() - Remove a range of iova
>>> + * @iopt: io_pagetable to act on
>>> + * @iova: Starting iova to unmap
>>> + * @length: Number of bytes to unmap
>>> + *
>>> + * The requested range must exactly match an existing range.
>>> + * Splitting/truncating IOVA mappings is not allowed.
>>> + */
>>> +int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
>>> +		    unsigned long length)
>>> +{
>>> +	struct iopt_pages *pages;
>>> +	struct iopt_area *area;
>>> +	unsigned long iova_end;
>>> +	int rc;
>>> +
>>> +	if (!length)
>>> +		return -EINVAL;
>>> +
>>> +	if (check_add_overflow(iova, length - 1, &iova_end))
>>> +		return -EOVERFLOW;
>>> +
>>> +	down_read(&iopt->domains_rwsem);
>>> +	down_write(&iopt->iova_rwsem);
>>> +	area = iopt_find_exact_area(iopt, iova, iova_end);
>>
>> when testing vIOMMU with Qemu using iommufd, I hit a problem as log #3
>> shows. Qemu failed when trying to do map due to an IOVA still in use.
>> After debugging, the 0xfffff000 IOVA is mapped but not unmapped. But per log
>> #2, Qemu has issued unmap with a larger range (0xff000000 -
>> 0x100000000) which includes the 0xfffff000. But iopt_find_exact_area()
>> doesn't find any area. So 0xfffff000 is not unmapped. Is this correct? Same
>> test passed with vfio iommu type1 driver. any idea?
> 
> There are a couple of good reasons why the iopt_unmap_iova() should
> proccess any contiguous range of fully contained areas, so I would
> consider this something worth fixing. can you send a small patch and
> test case and I'll fold it in?

Sure. I just spotted it, so I don't have a fix patch yet. I may work on
it tomorrow.
Yi Liu April 17, 2022, 2:56 p.m. UTC | #10
On 2022/4/13 22:49, Yi Liu wrote:
> On 2022/4/13 22:36, Jason Gunthorpe wrote:
>> On Wed, Apr 13, 2022 at 10:02:58PM +0800, Yi Liu wrote:
>>>> +/**
>>>> + * iopt_unmap_iova() - Remove a range of iova
>>>> + * @iopt: io_pagetable to act on
>>>> + * @iova: Starting iova to unmap
>>>> + * @length: Number of bytes to unmap
>>>> + *
>>>> + * The requested range must exactly match an existing range.
>>>> + * Splitting/truncating IOVA mappings is not allowed.
>>>> + */
>>>> +int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
>>>> +            unsigned long length)
>>>> +{
>>>> +    struct iopt_pages *pages;
>>>> +    struct iopt_area *area;
>>>> +    unsigned long iova_end;
>>>> +    int rc;
>>>> +
>>>> +    if (!length)
>>>> +        return -EINVAL;
>>>> +
>>>> +    if (check_add_overflow(iova, length - 1, &iova_end))
>>>> +        return -EOVERFLOW;
>>>> +
>>>> +    down_read(&iopt->domains_rwsem);
>>>> +    down_write(&iopt->iova_rwsem);
>>>> +    area = iopt_find_exact_area(iopt, iova, iova_end);
>>>
>>> when testing vIOMMU with Qemu using iommufd, I hit a problem as log #3
>>> shows. Qemu failed when trying to do map due to an IOVA still in use.
>>> After debugging, the 0xfffff000 IOVA is mapped but not unmapped. But per 
>>> log
>>> #2, Qemu has issued unmap with a larger range (0xff000000 -
>>> 0x100000000) which includes the 0xfffff000. But iopt_find_exact_area()
>>> doesn't find any area. So 0xfffff000 is not unmapped. Is this correct? Same
>>> test passed with vfio iommu type1 driver. any idea?
>>
>> There are a couple of good reasons why the iopt_unmap_iova() should
>> proccess any contiguous range of fully contained areas, so I would
>> consider this something worth fixing. can you send a small patch and
>> test case and I'll fold it in?
> 
> sure. just spotted it, so haven't got fix patch yet. I may work on
> it tomorrow.

Hi Jason,

Got the below patch for it. It is also pushed to the exploration branch.

https://github.com/luxis1999/iommufd/commit/d764f3288de0fd52c578684788a437701ec31b2d

 From 22a758c401a1c7f6656625013bb87204c9ea65fe Mon Sep 17 00:00:00 2001
From: Yi Liu <yi.l.liu@intel.com>
Date: Sun, 17 Apr 2022 07:39:03 -0700
Subject: [PATCH] iommufd/io_pagetable: Support unmap fully contained areas

Changes:
- return the unmapped bytes to the caller
- support unmapping fully contained contiguous areas
- add a test case in selftest

Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
  drivers/iommu/iommufd/io_pagetable.c    | 90 ++++++++++++-------------
  drivers/iommu/iommufd/ioas.c            |  8 ++-
  drivers/iommu/iommufd/iommufd_private.h |  4 +-
  drivers/iommu/iommufd/vfio_compat.c     |  8 ++-
  include/uapi/linux/iommufd.h            |  2 +-
  tools/testing/selftests/iommu/iommufd.c | 40 +++++++++++
  6 files changed, 99 insertions(+), 53 deletions(-)

diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index f9f3b06946bf..5142f797a812 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -315,61 +315,26 @@ static int __iopt_unmap_iova(struct io_pagetable *iopt, struct iopt_area *area,
  	return 0;
  }

-/**
- * iopt_unmap_iova() - Remove a range of iova
- * @iopt: io_pagetable to act on
- * @iova: Starting iova to unmap
- * @length: Number of bytes to unmap
- *
- * The requested range must exactly match an existing range.
- * Splitting/truncating IOVA mappings is not allowed.
- */
-int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
-		    unsigned long length)
-{
-	struct iopt_pages *pages;
-	struct iopt_area *area;
-	unsigned long iova_end;
-	int rc;
-
-	if (!length)
-		return -EINVAL;
-
-	if (check_add_overflow(iova, length - 1, &iova_end))
-		return -EOVERFLOW;
-
-	down_read(&iopt->domains_rwsem);
-	down_write(&iopt->iova_rwsem);
-	area = iopt_find_exact_area(iopt, iova, iova_end);
-	if (!area) {
-		up_write(&iopt->iova_rwsem);
-		up_read(&iopt->domains_rwsem);
-		return -ENOENT;
-	}
-	pages = area->pages;
-	area->pages = NULL;
-	up_write(&iopt->iova_rwsem);
-
-	rc = __iopt_unmap_iova(iopt, area, pages);
-	up_read(&iopt->domains_rwsem);
-	return rc;
-}
-
-int iopt_unmap_all(struct io_pagetable *iopt)
+static int __iopt_unmap_iova_range(struct io_pagetable *iopt,
+				   unsigned long start,
+				   unsigned long end,
+				   unsigned long *unmapped)
  {
  	struct iopt_area *area;
+	unsigned long unmapped_bytes = 0;
  	int rc;

  	down_read(&iopt->domains_rwsem);
  	down_write(&iopt->iova_rwsem);
-	while ((area = iopt_area_iter_first(iopt, 0, ULONG_MAX))) {
+	while ((area = iopt_area_iter_first(iopt, start, end))) {
  		struct iopt_pages *pages;

-		/* Userspace should not race unmap all and map */
-		if (!area->pages) {
-			rc = -EBUSY;
+		if (!area->pages || iopt_area_iova(area) < start ||
+		    iopt_area_last_iova(area) > end) {
+			rc = -ENOENT;
  			goto out_unlock_iova;
  		}
+
  		pages = area->pages;
  		area->pages = NULL;
  		up_write(&iopt->iova_rwsem);
@@ -378,6 +343,10 @@ int iopt_unmap_all(struct io_pagetable *iopt)
  		if (rc)
  			goto out_unlock_domains;

+		start = iopt_area_last_iova(area) + 1;
+		unmapped_bytes +=
+			iopt_area_last_iova(area) - iopt_area_iova(area) + 1;
+
  		down_write(&iopt->iova_rwsem);
  	}
  	rc = 0;
@@ -386,9 +355,40 @@ int iopt_unmap_all(struct io_pagetable *iopt)
  	up_write(&iopt->iova_rwsem);
  out_unlock_domains:
  	up_read(&iopt->domains_rwsem);
+	if (unmapped)
+		*unmapped = unmapped_bytes;
  	return rc;
  }

+/**
+ * iopt_unmap_iova() - Remove a range of iova
+ * @iopt: io_pagetable to act on
+ * @iova: Starting iova to unmap
+ * @length: Number of bytes to unmap
+ * @unmapped: Return number of bytes unmapped
+ *
+ * The requested range must exactly match an existing range.
+ * Splitting/truncating IOVA mappings is not allowed.
+ */
+int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
+		    unsigned long length, unsigned long *unmapped)
+{
+	unsigned long iova_end;
+
+	if (!length)
+		return -EINVAL;
+
+	if (check_add_overflow(iova, length - 1, &iova_end))
+		return -EOVERFLOW;
+
+	return __iopt_unmap_iova_range(iopt, iova, iova_end, unmapped);
+}
+
+int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
+{
+	return __iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
+}
+
  /**
   * iopt_access_pages() - Return a list of pages under the iova
   * @iopt: io_pagetable to act on
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index 48149988c84b..4e701d053ed6 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -14,7 +14,7 @@ void iommufd_ioas_destroy(struct iommufd_object *obj)
  	struct iommufd_ioas *ioas = container_of(obj, struct iommufd_ioas, obj);
  	int rc;

-	rc = iopt_unmap_all(&ioas->iopt);
+	rc = iopt_unmap_all(&ioas->iopt, NULL);
  	WARN_ON(rc);
  	iopt_destroy_table(&ioas->iopt);
  	mutex_destroy(&ioas->mutex);
@@ -230,6 +230,7 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
  {
  	struct iommu_ioas_unmap *cmd = ucmd->cmd;
  	struct iommufd_ioas *ioas;
+	unsigned long unmapped;
  	int rc;

  	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
@@ -237,16 +238,17 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
  		return PTR_ERR(ioas);

  	if (cmd->iova == 0 && cmd->length == U64_MAX) {
-		rc = iopt_unmap_all(&ioas->iopt);
+		rc = iopt_unmap_all(&ioas->iopt, &unmapped);
  	} else {
  		if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) {
  			rc = -EOVERFLOW;
  			goto out_put;
  		}
-		rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length);
+		rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length, &unmapped);
  	}

  out_put:
  	iommufd_put_object(&ioas->obj);
+	cmd->length = unmapped;
  	return rc;
  }
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index f55654278ac4..382704f4d698 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -46,8 +46,8 @@ int iopt_map_pages(struct io_pagetable *iopt, struct iopt_pages *pages,
  		   unsigned long *dst_iova, unsigned long start_byte,
  		   unsigned long length, int iommu_prot, unsigned int flags);
  int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
-		    unsigned long length);
-int iopt_unmap_all(struct io_pagetable *iopt);
+		    unsigned long length, unsigned long *unmapped);
+int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped);

  int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova,
  		      unsigned long npages, struct page **out_pages, bool write);
diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c
index 5b196de00ff9..4539ff45efd9 100644
--- a/drivers/iommu/iommufd/vfio_compat.c
+++ b/drivers/iommu/iommufd/vfio_compat.c
@@ -133,6 +133,7 @@ static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,
  	u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL;
  	struct vfio_iommu_type1_dma_unmap unmap;
  	struct iommufd_ioas *ioas;
+	unsigned long unmapped;
  	int rc;

  	if (copy_from_user(&unmap, arg, minsz))
@@ -146,10 +147,13 @@ static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,
  		return PTR_ERR(ioas);

  	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL)
-		rc = iopt_unmap_all(&ioas->iopt);
+		rc = iopt_unmap_all(&ioas->iopt, &unmapped);
  	else
-		rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size);
+		rc = iopt_unmap_iova(&ioas->iopt, unmap.iova,
+				     unmap.size, &unmapped);
  	iommufd_put_object(&ioas->obj);
+	unmap.size = unmapped;
+
  	return rc;
  }

diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 2c0f5ced4173..8cbc6a083156 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -172,7 +172,7 @@ struct iommu_ioas_copy {
   * @size: sizeof(struct iommu_ioas_copy)
   * @ioas_id: IOAS ID to change the mapping of
   * @iova: IOVA to start the unmapping at
- * @length: Number of bytes to unmap
+ * @length: Number of bytes to unmap, and return back the bytes unmapped
   *
   * Unmap an IOVA range. The iova/length must exactly match a range
   * used with IOMMU_IOAS_PAGETABLE_MAP, or be the values 0 & U64_MAX.
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 5c47d706ed94..42956acd2c04 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -357,6 +357,47 @@ TEST_F(iommufd_ioas, area)
  	ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_UNMAP, &unmap_cmd));
  }

+TEST_F(iommufd_ioas, unmap_fully_contained_area)
+{
+	struct iommu_ioas_map map_cmd = {
+		.size = sizeof(map_cmd),
+		.ioas_id = self->ioas_id,
+		.flags = IOMMU_IOAS_MAP_FIXED_IOVA,
+		.length = PAGE_SIZE,
+		.user_va = (uintptr_t)buffer,
+	};
+	struct iommu_ioas_unmap unmap_cmd = {
+		.size = sizeof(unmap_cmd),
+		.ioas_id = self->ioas_id,
+		.length = PAGE_SIZE,
+	};
+	int i;
+
+	for (i = 0; i != 4; i++) {
+		map_cmd.iova = self->base_iova + i * 16 * PAGE_SIZE;
+		map_cmd.length = 8 * PAGE_SIZE;
+		ASSERT_EQ(0,
+			  ioctl(self->fd, IOMMU_IOAS_MAP, &map_cmd));
+	}
+
+	/* Unmap not fully contained area doesn't work */
+	unmap_cmd.iova = self->base_iova - 4 * PAGE_SIZE;
+	unmap_cmd.length = 8 * PAGE_SIZE;
+	ASSERT_EQ(ENOENT,
+		  ioctl(self->fd, IOMMU_IOAS_UNMAP, &unmap_cmd));
+
+	unmap_cmd.iova = self->base_iova + 3 * 16 * PAGE_SIZE + 8 * PAGE_SIZE - 4 * PAGE_SIZE;
+	unmap_cmd.length = 8 * PAGE_SIZE;
+	ASSERT_EQ(ENOENT,
+		  ioctl(self->fd, IOMMU_IOAS_UNMAP, &unmap_cmd));
+
+	/* Unmap fully contained areas works */
+	unmap_cmd.iova = self->base_iova - 4 * PAGE_SIZE;
+	unmap_cmd.length = 3 * 16 * PAGE_SIZE + 8 * PAGE_SIZE + 4 * PAGE_SIZE;
+	ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_UNMAP, &unmap_cmd));
+	ASSERT_EQ(32, unmap_cmd.length);
+}
+
  TEST_F(iommufd_ioas, area_auto_iova)
  {
  	struct iommu_test_cmd test_cmd = {
Yi Liu April 18, 2022, 10:47 a.m. UTC | #11
Hi Jason,

On 2022/4/17 22:56, Yi Liu wrote:
> On 2022/4/13 22:49, Yi Liu wrote:
>> On 2022/4/13 22:36, Jason Gunthorpe wrote:
>>> On Wed, Apr 13, 2022 at 10:02:58PM +0800, Yi Liu wrote:
>>>>> +/**
>>>>> + * iopt_unmap_iova() - Remove a range of iova
>>>>> + * @iopt: io_pagetable to act on
>>>>> + * @iova: Starting iova to unmap
>>>>> + * @length: Number of bytes to unmap
>>>>> + *
>>>>> + * The requested range must exactly match an existing range.
>>>>> + * Splitting/truncating IOVA mappings is not allowed.
>>>>> + */
>>>>> +int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
>>>>> +            unsigned long length)
>>>>> +{
>>>>> +    struct iopt_pages *pages;
>>>>> +    struct iopt_area *area;
>>>>> +    unsigned long iova_end;
>>>>> +    int rc;
>>>>> +
>>>>> +    if (!length)
>>>>> +        return -EINVAL;
>>>>> +
>>>>> +    if (check_add_overflow(iova, length - 1, &iova_end))
>>>>> +        return -EOVERFLOW;
>>>>> +
>>>>> +    down_read(&iopt->domains_rwsem);
>>>>> +    down_write(&iopt->iova_rwsem);
>>>>> +    area = iopt_find_exact_area(iopt, iova, iova_end);
>>>>
>>>> when testing vIOMMU with Qemu using iommufd, I hit a problem as log #3
>>>> shows. Qemu failed when trying to do map due to an IOVA still in use.
>>>> After debugging, the 0xfffff000 IOVA is mapped but not unmapped. But 
>>>> per log
>>>> #2, Qemu has issued unmap with a larger range (0xff000000 -
>>>> 0x100000000) which includes the 0xfffff000. But iopt_find_exact_area()
>>>> doesn't find any area. So 0xfffff000 is not unmapped. Is this correct? 
>>>> Same
>>>> test passed with vfio iommu type1 driver. any idea?
>>>
>>> There are a couple of good reasons why the iopt_unmap_iova() should
>>> proccess any contiguous range of fully contained areas, so I would
>>> consider this something worth fixing. can you send a small patch and
>>> test case and I'll fold it in?
>>
>> sure. just spotted it, so haven't got fix patch yet. I may work on
>> it tomorrow.
> 
> Hi Jason,
> 
> Got below patch for it. Also pushed to the exploration branch.
> 
> https://github.com/luxis1999/iommufd/commit/d764f3288de0fd52c578684788a437701ec31b2d 

0-day reported a use-without-initialization issue to me, so I updated the
patch. Please take the change from the commit below. Sorry for the noise.

https://github.com/luxis1999/iommufd/commit/10674417c235cb4a4caf2202fffb078611441da2
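
For reference, the uninitialized use is presumably the new "unmapped" local
in the ioctl paths: in the version posted above, the -EOVERFLOW early exit
in iommufd_ioas_unmap() reaches "cmd->length = unmapped" before any unmap
helper has written it. Below is a minimal sketch of that function with the
obvious initialization added (the linked commit is the authoritative fix
and may differ):

int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
{
	struct iommu_ioas_unmap *cmd = ucmd->cmd;
	struct iommufd_ioas *ioas;
	unsigned long unmapped = 0;	/* early-exit paths now copy back 0 */
	int rc;

	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	if (cmd->iova == 0 && cmd->length == U64_MAX) {
		rc = iopt_unmap_all(&ioas->iopt, &unmapped);
	} else {
		if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) {
			rc = -EOVERFLOW;
			goto out_put;
		}
		rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length,
				     &unmapped);
	}

out_put:
	iommufd_put_object(&ioas->obj);
	cmd->length = unmapped;
	return rc;
}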

Patch

diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 05a0e91e30afad..b66a8c47ff55ec 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -1,5 +1,6 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 iommufd-y := \
+	io_pagetable.o \
 	main.o \
 	pages.o
 
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
new file mode 100644
index 00000000000000..f9f3b06946bfb9
--- /dev/null
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -0,0 +1,890 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ *
+ * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The
+ * PFNs can be placed into an iommu_domain, or returned to the caller as a page
+ * list for access by an in-kernel user.
+ *
+ * The datastructure uses the iopt_pages to optimize the storage of the PFNs
+ * between the domains and xarray.
+ */
+#include <linux/lockdep.h>
+#include <linux/iommu.h>
+#include <linux/sched/mm.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+
+#include "io_pagetable.h"
+
+static unsigned long iopt_area_iova_to_index(struct iopt_area *area,
+					     unsigned long iova)
+{
+	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+		WARN_ON(iova < iopt_area_iova(area) ||
+			iova > iopt_area_last_iova(area));
+	return (iova - (iopt_area_iova(area) & PAGE_MASK)) / PAGE_SIZE;
+}
+
+static struct iopt_area *iopt_find_exact_area(struct io_pagetable *iopt,
+					      unsigned long iova,
+					      unsigned long last_iova)
+{
+	struct iopt_area *area;
+
+	area = iopt_area_iter_first(iopt, iova, last_iova);
+	if (!area || !area->pages || iopt_area_iova(area) != iova ||
+	    iopt_area_last_iova(area) != last_iova)
+		return NULL;
+	return area;
+}
+
+static bool __alloc_iova_check_hole(struct interval_tree_span_iter *span,
+				    unsigned long length,
+				    unsigned long iova_alignment,
+				    unsigned long page_offset)
+{
+	if (!span->is_hole || span->last_hole - span->start_hole < length - 1)
+		return false;
+
+	span->start_hole =
+		ALIGN(span->start_hole, iova_alignment) | page_offset;
+	if (span->start_hole > span->last_hole ||
+	    span->last_hole - span->start_hole < length - 1)
+		return false;
+	return true;
+}
+
+/*
+ * Automatically find a block of IOVA that is not being used and not reserved.
+ * Does not return a 0 IOVA even if it is valid.
+ */
+static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
+			   unsigned long uptr, unsigned long length)
+{
+	struct interval_tree_span_iter reserved_span;
+	unsigned long page_offset = uptr % PAGE_SIZE;
+	struct interval_tree_span_iter area_span;
+	unsigned long iova_alignment;
+
+	lockdep_assert_held(&iopt->iova_rwsem);
+
+	/* Protect roundup_pow_of_two() from overflow */
+	if (length == 0 || length >= ULONG_MAX / 2)
+		return -EOVERFLOW;
+
+	/*
+	 * Keep alignment present in the uptr when building the IOVA, this
+	 * increases the chance we can map a THP.
+	 */
+	if (!uptr)
+		iova_alignment = roundup_pow_of_two(length);
+	else
+		iova_alignment =
+			min_t(unsigned long, roundup_pow_of_two(length),
+			      1UL << __ffs64(uptr));
+
+	if (iova_alignment < iopt->iova_alignment)
+		return -EINVAL;
+	for (interval_tree_span_iter_first(&area_span, &iopt->area_itree,
+					   PAGE_SIZE, ULONG_MAX - PAGE_SIZE);
+	     !interval_tree_span_iter_done(&area_span);
+	     interval_tree_span_iter_next(&area_span)) {
+		if (!__alloc_iova_check_hole(&area_span, length, iova_alignment,
+					     page_offset))
+			continue;
+
+		for (interval_tree_span_iter_first(
+			     &reserved_span, &iopt->reserved_iova_itree,
+			     area_span.start_hole, area_span.last_hole);
+		     !interval_tree_span_iter_done(&reserved_span);
+		     interval_tree_span_iter_next(&reserved_span)) {
+			if (!__alloc_iova_check_hole(&reserved_span, length,
+						     iova_alignment,
+						     page_offset))
+				continue;
+
+			*iova = reserved_span.start_hole;
+			return 0;
+		}
+	}
+	return -ENOSPC;
+}
+
+/*
+ * The area takes a slice of the pages from start_byte to start_byte + length
+ */
+static struct iopt_area *
+iopt_alloc_area(struct io_pagetable *iopt, struct iopt_pages *pages,
+		unsigned long iova, unsigned long start_byte,
+		unsigned long length, int iommu_prot, unsigned int flags)
+{
+	struct iopt_area *area;
+	int rc;
+
+	area = kzalloc(sizeof(*area), GFP_KERNEL);
+	if (!area)
+		return ERR_PTR(-ENOMEM);
+
+	area->iopt = iopt;
+	area->iommu_prot = iommu_prot;
+	area->page_offset = start_byte % PAGE_SIZE;
+	area->pages_node.start = start_byte / PAGE_SIZE;
+	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
+		return ERR_PTR(-EOVERFLOW);
+	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
+	if (WARN_ON(area->pages_node.last >= pages->npages))
+		return ERR_PTR(-EOVERFLOW);
+
+	down_write(&iopt->iova_rwsem);
+	if (flags & IOPT_ALLOC_IOVA) {
+		rc = iopt_alloc_iova(iopt, &iova,
+				     (uintptr_t)pages->uptr + start_byte,
+				     length);
+		if (rc)
+			goto out_unlock;
+	}
+
+	if (check_add_overflow(iova, length - 1, &area->node.last)) {
+		rc = -EOVERFLOW;
+		goto out_unlock;
+	}
+
+	if (!(flags & IOPT_ALLOC_IOVA)) {
+		if ((iova & (iopt->iova_alignment - 1)) ||
+		    (length & (iopt->iova_alignment - 1)) || !length) {
+			rc = -EINVAL;
+			goto out_unlock;
+		}
+
+		/* No reserved IOVA intersects the range */
+		if (interval_tree_iter_first(&iopt->reserved_iova_itree, iova,
+					     area->node.last)) {
+			rc = -ENOENT;
+			goto out_unlock;
+		}
+
+		/* Check that there is not already a mapping in the range */
+		if (iopt_area_iter_first(iopt, iova, area->node.last)) {
+			rc = -EADDRINUSE;
+			goto out_unlock;
+		}
+	}
+
+	/*
+	 * The area is inserted with a NULL pages indicating it is not fully
+	 * initialized yet.
+	 */
+	area->node.start = iova;
+	interval_tree_insert(&area->node, &area->iopt->area_itree);
+	up_write(&iopt->iova_rwsem);
+	return area;
+
+out_unlock:
+	up_write(&iopt->iova_rwsem);
+	kfree(area);
+	return ERR_PTR(rc);
+}
+
+static void iopt_abort_area(struct iopt_area *area)
+{
+	down_write(&area->iopt->iova_rwsem);
+	interval_tree_remove(&area->node, &area->iopt->area_itree);
+	up_write(&area->iopt->iova_rwsem);
+	kfree(area);
+}
+
+static int iopt_finalize_area(struct iopt_area *area, struct iopt_pages *pages)
+{
+	int rc;
+
+	down_read(&area->iopt->domains_rwsem);
+	rc = iopt_area_fill_domains(area, pages);
+	if (!rc) {
+		/*
+		 * area->pages must be set inside the domains_rwsem to ensure
+		 * any newly added domains will get filled. Moves the reference
+		 * in from the caller
+		 */
+		down_write(&area->iopt->iova_rwsem);
+		area->pages = pages;
+		up_write(&area->iopt->iova_rwsem);
+	}
+	up_read(&area->iopt->domains_rwsem);
+	return rc;
+}
+
+int iopt_map_pages(struct io_pagetable *iopt, struct iopt_pages *pages,
+		   unsigned long *dst_iova, unsigned long start_bytes,
+		   unsigned long length, int iommu_prot, unsigned int flags)
+{
+	struct iopt_area *area;
+	int rc;
+
+	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
+		return -EPERM;
+
+	area = iopt_alloc_area(iopt, pages, *dst_iova, start_bytes, length,
+			       iommu_prot, flags);
+	if (IS_ERR(area))
+		return PTR_ERR(area);
+	*dst_iova = iopt_area_iova(area);
+
+	rc = iopt_finalize_area(area, pages);
+	if (rc) {
+		iopt_abort_area(area);
+		return rc;
+	}
+	return 0;
+}
+
+/**
+ * iopt_map_user_pages() - Map a user VA to an iova in the io page table
+ * @iopt: io_pagetable to act on
+ * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
+ *        the chosen iova on output. Otherwise is the iova to map to on input
+ * @uptr: User VA to map
+ * @length: Number of bytes to map
+ * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
+ * @flags: IOPT_ALLOC_IOVA or zero
+ *
+ * iova, uptr, and length must be aligned to iova_alignment. For domain backed
+ * page tables this will pin the pages and load them into the domain at iova.
+ * For non-domain page tables this will only setup a lazy reference and the
+ * caller must use iopt_access_pages() to touch them.
+ *
+ * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
+ * destroyed.
+ */
+int iopt_map_user_pages(struct io_pagetable *iopt, unsigned long *iova,
+			void __user *uptr, unsigned long length, int iommu_prot,
+			unsigned int flags)
+{
+	struct iopt_pages *pages;
+	int rc;
+
+	pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	rc = iopt_map_pages(iopt, pages, iova, uptr - pages->uptr, length,
+			    iommu_prot, flags);
+	if (rc) {
+		iopt_put_pages(pages);
+		return rc;
+	}
+	return 0;
+}
+
+struct iopt_pages *iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
+				  unsigned long *start_byte,
+				  unsigned long length)
+{
+	unsigned long iova_end;
+	struct iopt_pages *pages;
+	struct iopt_area *area;
+
+	if (check_add_overflow(iova, length - 1, &iova_end))
+		return ERR_PTR(-EOVERFLOW);
+
+	down_read(&iopt->iova_rwsem);
+	area = iopt_find_exact_area(iopt, iova, iova_end);
+	if (!area) {
+		up_read(&iopt->iova_rwsem);
+		return ERR_PTR(-ENOENT);
+	}
+	pages = area->pages;
+	*start_byte = area->page_offset + iopt_area_index(area) * PAGE_SIZE;
+	kref_get(&pages->kref);
+	up_read(&iopt->iova_rwsem);
+
+	return pages;
+}
+
+static int __iopt_unmap_iova(struct io_pagetable *iopt, struct iopt_area *area,
+			     struct iopt_pages *pages)
+{
+	/* Drivers have to unpin on notification. */
+	if (WARN_ON(atomic_read(&area->num_users)))
+		return -EBUSY;
+
+	iopt_area_unfill_domains(area, pages);
+	WARN_ON(atomic_read(&area->num_users));
+	iopt_abort_area(area);
+	iopt_put_pages(pages);
+	return 0;
+}
+
+/**
+ * iopt_unmap_iova() - Remove a range of iova
+ * @iopt: io_pagetable to act on
+ * @iova: Starting iova to unmap
+ * @length: Number of bytes to unmap
+ *
+ * The requested range must exactly match an existing range.
+ * Splitting/truncating IOVA mappings is not allowed.
+ */
+int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
+		    unsigned long length)
+{
+	struct iopt_pages *pages;
+	struct iopt_area *area;
+	unsigned long iova_end;
+	int rc;
+
+	if (!length)
+		return -EINVAL;
+
+	if (check_add_overflow(iova, length - 1, &iova_end))
+		return -EOVERFLOW;
+
+	down_read(&iopt->domains_rwsem);
+	down_write(&iopt->iova_rwsem);
+	area = iopt_find_exact_area(iopt, iova, iova_end);
+	if (!area) {
+		up_write(&iopt->iova_rwsem);
+		up_read(&iopt->domains_rwsem);
+		return -ENOENT;
+	}
+	pages = area->pages;
+	area->pages = NULL;
+	up_write(&iopt->iova_rwsem);
+
+	rc = __iopt_unmap_iova(iopt, area, pages);
+	up_read(&iopt->domains_rwsem);
+	return rc;
+}
+
+int iopt_unmap_all(struct io_pagetable *iopt)
+{
+	struct iopt_area *area;
+	int rc;
+
+	down_read(&iopt->domains_rwsem);
+	down_write(&iopt->iova_rwsem);
+	while ((area = iopt_area_iter_first(iopt, 0, ULONG_MAX))) {
+		struct iopt_pages *pages;
+
+		/* Userspace should not race unmap all and map */
+		if (!area->pages) {
+			rc = -EBUSY;
+			goto out_unlock_iova;
+		}
+		pages = area->pages;
+		area->pages = NULL;
+		up_write(&iopt->iova_rwsem);
+
+		rc = __iopt_unmap_iova(iopt, area, pages);
+		if (rc)
+			goto out_unlock_domains;
+
+		down_write(&iopt->iova_rwsem);
+	}
+	rc = 0;
+
+out_unlock_iova:
+	up_write(&iopt->iova_rwsem);
+out_unlock_domains:
+	up_read(&iopt->domains_rwsem);
+	return rc;
+}
+
+/**
+ * iopt_access_pages() - Return a list of pages under the iova
+ * @iopt: io_pagetable to act on
+ * @iova: Starting IOVA
+ * @length: Number of bytes to access
+ * @out_pages: Output page list
+ * @write: True if access is for writing
+ *
+ * Reads @length bytes starting at @iova and returns the struct page * pointers. These
+ * can be kmap'd by the caller for CPU access.
+ *
+ * The caller must perform iopt_unaccess_pages() when done to balance this.
+ *
+ * iova can be unaligned from PAGE_SIZE. The first returned byte starts at
+ * page_to_phys(out_pages[0]) + (iova % PAGE_SIZE). The caller promises not to
+ * touch memory outside the requested iova slice.
+ *
+ * FIXME: callers that need a DMA mapping via a sgl should create another
+ * interface to build the SGL efficiently
+ */
+int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova,
+		      unsigned long length, struct page **out_pages, bool write)
+{
+	unsigned long cur_iova = iova;
+	unsigned long last_iova;
+	struct iopt_area *area;
+	int rc;
+
+	if (!length)
+		return -EINVAL;
+	if (check_add_overflow(iova, length - 1, &last_iova))
+		return -EOVERFLOW;
+
+	down_read(&iopt->iova_rwsem);
+	for (area = iopt_area_iter_first(iopt, iova, last_iova); area;
+	     area = iopt_area_iter_next(area, iova, last_iova)) {
+		unsigned long last = min(last_iova, iopt_area_last_iova(area));
+		unsigned long last_index;
+		unsigned long index;
+
+		/* Need contiguous areas in the access */
+		if (iopt_area_iova(area) < cur_iova || !area->pages) {
+			rc = -EINVAL;
+			goto out_remove;
+		}
+
+		index = iopt_area_iova_to_index(area, cur_iova);
+		last_index = iopt_area_iova_to_index(area, last);
+		rc = iopt_pages_add_user(area->pages, index, last_index,
+					 out_pages, write);
+		if (rc)
+			goto out_remove;
+		if (last == last_iova)
+			break;
+		/*
+		 * Can't cross areas that are not aligned to the system page
+		 * size with this API.
+		 */
+		if (cur_iova % PAGE_SIZE) {
+			rc = -EINVAL;
+			goto out_remove;
+		}
+		cur_iova = last + 1;
+		out_pages += last_index - index;
+		atomic_inc(&area->num_users);
+	}
+
+	up_read(&iopt->iova_rwsem);
+	return 0;
+
+out_remove:
+	if (cur_iova != iova)
+		iopt_unaccess_pages(iopt, iova, cur_iova - iova);
+	up_read(&iopt->iova_rwsem);
+	return rc;
+}
+
+/**
+ * iopt_unaccess_pages() - Undo iopt_access_pages
+ * @iopt: io_pagetable to act on
+ * @iova: Starting IOVA
+ * @length: Number of bytes to stop accessing
+ *
+ * The caller must stop accessing the pages before calling this. The
+ * iova/length must exactly match the one provided to iopt_access_pages().
+ */
+void iopt_unaccess_pages(struct io_pagetable *iopt, unsigned long iova,
+			 size_t length)
+{
+	unsigned long cur_iova = iova;
+	unsigned long last_iova;
+	struct iopt_area *area;
+
+	if (WARN_ON(!length) ||
+	    WARN_ON(check_add_overflow(iova, length - 1, &last_iova)))
+		return;
+
+	down_read(&iopt->iova_rwsem);
+	for (area = iopt_area_iter_first(iopt, iova, last_iova); area;
+	     area = iopt_area_iter_next(area, iova, last_iova)) {
+		unsigned long last = min(last_iova, iopt_area_last_iova(area));
+		int num_users;
+
+		iopt_pages_remove_user(area->pages,
+				       iopt_area_iova_to_index(area, cur_iova),
+				       iopt_area_iova_to_index(area, last));
+		if (last == last_iova)
+			break;
+		cur_iova = last + 1;
+		num_users = atomic_dec_return(&area->num_users);
+		WARN_ON(num_users < 0);
+	}
+	up_read(&iopt->iova_rwsem);
+}
+
+struct iopt_reserved_iova {
+	struct interval_tree_node node;
+	void *owner;
+};
+
+int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
+		      unsigned long last, void *owner)
+{
+	struct iopt_reserved_iova *reserved;
+
+	lockdep_assert_held_write(&iopt->iova_rwsem);
+
+	if (iopt_area_iter_first(iopt, start, last))
+		return -EADDRINUSE;
+
+	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL);
+	if (!reserved)
+		return -ENOMEM;
+	reserved->node.start = start;
+	reserved->node.last = last;
+	reserved->owner = owner;
+	interval_tree_insert(&reserved->node, &iopt->reserved_iova_itree);
+	return 0;
+}
+
+void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
+{
+
+	struct interval_tree_node *node;
+
+	for (node = interval_tree_iter_first(&iopt->reserved_iova_itree, 0,
+					     ULONG_MAX);
+	     node;) {
+		struct iopt_reserved_iova *reserved =
+			container_of(node, struct iopt_reserved_iova, node);
+
+		node = interval_tree_iter_next(node, 0, ULONG_MAX);
+
+		if (reserved->owner == owner) {
+			interval_tree_remove(&reserved->node,
+					     &iopt->reserved_iova_itree);
+			kfree(reserved);
+		}
+	}
+}
+
+int iopt_init_table(struct io_pagetable *iopt)
+{
+	init_rwsem(&iopt->iova_rwsem);
+	init_rwsem(&iopt->domains_rwsem);
+	iopt->area_itree = RB_ROOT_CACHED;
+	iopt->reserved_iova_itree = RB_ROOT_CACHED;
+	xa_init(&iopt->domains);
+
+	/*
+	 * iopts start as SW tables that can use the entire size_t IOVA space
+	 * due to the use of size_t in the APIs. They have no alignment
+	 * restriction.
+	 */
+	iopt->iova_alignment = 1;
+
+	return 0;
+}
+
+void iopt_destroy_table(struct io_pagetable *iopt)
+{
+	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+		iopt_remove_reserved_iova(iopt, NULL);
+	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_iova_itree.rb_root));
+	WARN_ON(!xa_empty(&iopt->domains));
+	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
+}
+
+/**
+ * iopt_unfill_domain() - Unfill a domain with PFNs
+ * @iopt: io_pagetable to act on
+ * @domain: domain to unfill
+ *
+ * This is used when removing a domain from the iopt. Every area in the iopt
+ * will be unmapped from the domain. The domain must already be removed from the
+ * domains xarray.
+ */
+static void iopt_unfill_domain(struct io_pagetable *iopt,
+			       struct iommu_domain *domain)
+{
+	struct iopt_area *area;
+
+	lockdep_assert_held(&iopt->iova_rwsem);
+	lockdep_assert_held_write(&iopt->domains_rwsem);
+
+	/*
+	 * Some other domain is holding all the pfns still, rapidly unmap this
+	 * domain.
+	 */
+	if (iopt->next_domain_id != 0) {
+		/* Pick an arbitrary remaining domain to act as storage */
+		struct iommu_domain *storage_domain =
+			xa_load(&iopt->domains, 0);
+
+		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+			struct iopt_pages *pages = area->pages;
+
+			if (WARN_ON(!pages))
+				continue;
+
+			mutex_lock(&pages->mutex);
+			if (area->storage_domain != domain) {
+				mutex_unlock(&pages->mutex);
+				continue;
+			}
+			area->storage_domain = storage_domain;
+			mutex_unlock(&pages->mutex);
+		}
+
+
+		iopt_unmap_domain(iopt, domain);
+		return;
+	}
+
+	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+		struct iopt_pages *pages = area->pages;
+
+		if (WARN_ON(!pages))
+			continue;
+
+		mutex_lock(&pages->mutex);
+		interval_tree_remove(&area->pages_node,
+				     &area->pages->domains_itree);
+		WARN_ON(area->storage_domain != domain);
+		area->storage_domain = NULL;
+		iopt_area_unfill_domain(area, pages, domain);
+		mutex_unlock(&pages->mutex);
+	}
+}
+
+/**
+ * iopt_fill_domain() - Fill a domain with PFNs
+ * @iopt: io_pagetable to act on
+ * @domain: domain to fill
+ *
+ * Fill the domain with PFNs from every area in the iopt. On failure the domain
+ * is left unchanged.
+ */
+static int iopt_fill_domain(struct io_pagetable *iopt,
+			    struct iommu_domain *domain)
+{
+	struct iopt_area *end_area;
+	struct iopt_area *area;
+	int rc;
+
+	lockdep_assert_held(&iopt->iova_rwsem);
+	lockdep_assert_held_write(&iopt->domains_rwsem);
+
+	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+		struct iopt_pages *pages = area->pages;
+
+		if (WARN_ON(!pages))
+			continue;
+
+		mutex_lock(&pages->mutex);
+		rc = iopt_area_fill_domain(area, domain);
+		if (rc) {
+			mutex_unlock(&pages->mutex);
+			goto out_unfill;
+		}
+		if (!area->storage_domain) {
+			WARN_ON(iopt->next_domain_id != 0);
+			area->storage_domain = domain;
+			interval_tree_insert(&area->pages_node,
+					     &pages->domains_itree);
+		}
+		mutex_unlock(&pages->mutex);
+	}
+	return 0;
+
+out_unfill:
+	end_area = area;
+	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+		struct iopt_pages *pages = area->pages;
+
+		if (area == end_area)
+			break;
+		if (WARN_ON(!pages))
+			continue;
+		mutex_lock(&pages->mutex);
+		if (iopt->next_domain_id == 0) {
+			interval_tree_remove(&area->pages_node,
+					     &pages->domains_itree);
+			area->storage_domain = NULL;
+		}
+		iopt_area_unfill_domain(area, pages, domain);
+		mutex_unlock(&pages->mutex);
+	}
+	return rc;
+}
+
+/* All existing areas conform to an increased page size */
+static int iopt_check_iova_alignment(struct io_pagetable *iopt,
+				     unsigned long new_iova_alignment)
+{
+	struct iopt_area *area;
+
+	lockdep_assert_held(&iopt->iova_rwsem);
+
+	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
+		if ((iopt_area_iova(area) % new_iova_alignment) ||
+		    (iopt_area_length(area) % new_iova_alignment))
+			return -EADDRINUSE;
+	return 0;
+}
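
[Illustration only, not part of the patch: with no domain attached the alignment can be below PAGE_SIZE, so e.g. an area mapped at IOVA 0x800 is acceptable while iova_alignment is 2KiB, but a later attach of a domain whose smallest page size is 4KiB would fail here with -EADDRINUSE since 0x800 % 0x1000 != 0.]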
+
+int iopt_table_add_domain(struct io_pagetable *iopt,
+			  struct iommu_domain *domain)
+{
+	const struct iommu_domain_geometry *geometry = &domain->geometry;
+	struct iommu_domain *iter_domain;
+	unsigned int new_iova_alignment;
+	unsigned long index;
+	int rc;
+
+	down_write(&iopt->domains_rwsem);
+	down_write(&iopt->iova_rwsem);
+
+	xa_for_each (&iopt->domains, index, iter_domain) {
+		if (WARN_ON(iter_domain == domain)) {
+			rc = -EEXIST;
+			goto out_unlock;
+		}
+	}
+
+	/*
+	 * The io page size drives the iova_alignment. Internally the iopt_pages
+	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
+	 * objects into the iommu_domain.
+	 *
+	 * An iommu_domain must always be able to accept PAGE_SIZE to be
+	 * compatible, as we can't guarantee higher contiguity.
+	 */
+	new_iova_alignment =
+		max_t(unsigned long, 1UL << __ffs(domain->pgsize_bitmap),
+		      iopt->iova_alignment);
+	if (new_iova_alignment > PAGE_SIZE) {
+		rc = -EINVAL;
+		goto out_unlock;
+	}
+	if (new_iova_alignment != iopt->iova_alignment) {
+		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
+		if (rc)
+			goto out_unlock;
+	}
+
+	/* No area exists that is outside the allowed domain aperture */
+	if (geometry->aperture_start != 0) {
+		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
+				       domain);
+		if (rc)
+			goto out_reserved;
+	}
+	if (geometry->aperture_end != ULONG_MAX) {
+		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
+				       ULONG_MAX, domain);
+		if (rc)
+			goto out_reserved;
+	}
+
+	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
+	if (rc)
+		goto out_reserved;
+
+	rc = iopt_fill_domain(iopt, domain);
+	if (rc)
+		goto out_release;
+
+	iopt->iova_alignment = new_iova_alignment;
+	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
+	iopt->next_domain_id++;
+	up_write(&iopt->iova_rwsem);
+	up_write(&iopt->domains_rwsem);
+	return 0;
+out_release:
+	xa_release(&iopt->domains, iopt->next_domain_id);
+out_reserved:
+	iopt_remove_reserved_iova(iopt, domain);
+out_unlock:
+	up_write(&iopt->iova_rwsem);
+	up_write(&iopt->domains_rwsem);
+	return rc;
+}
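
[Illustration only, not part of the patch: the iova_alignment computed above is the smallest IO page size the domain supports, i.e. the lowest set bit of pgsize_bitmap. A minimal user-space sketch of that arithmetic, with a hypothetical bitmap value:

#include <stdio.h>

int main(void)
{
	/* Hypothetical pgsize_bitmap advertising 4K, 2M and 1G page sizes */
	unsigned long pgsize_bitmap = (1UL << 12) | (1UL << 21) | (1UL << 30);

	/* 1UL << __ffs(bitmap) is the lowest set bit: the smallest page size */
	unsigned long smallest = pgsize_bitmap & -pgsize_bitmap;

	/* Prints 0x1000; anything above PAGE_SIZE is rejected with -EINVAL */
	printf("candidate iova_alignment: 0x%lx\n", smallest);
	return 0;
}
]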
+
+void iopt_table_remove_domain(struct io_pagetable *iopt,
+			      struct iommu_domain *domain)
+{
+	struct iommu_domain *iter_domain = NULL;
+	unsigned long new_iova_alignment;
+	unsigned long index;
+
+	down_write(&iopt->domains_rwsem);
+	down_write(&iopt->iova_rwsem);
+
+	xa_for_each (&iopt->domains, index, iter_domain)
+		if (iter_domain == domain)
+			break;
+	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
+		goto out_unlock;
+
+	/*
+	 * Compress the xarray to keep it linear by swapping the entry to erase
+	 * with the tail entry and shrinking the tail.
+	 */
+	iopt->next_domain_id--;
+	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
+	if (index != iopt->next_domain_id)
+		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
+
+	iopt_unfill_domain(iopt, domain);
+	iopt_remove_reserved_iova(iopt, domain);
+
+	/* Recalculate the iova alignment without the domain */
+	new_iova_alignment = 1;
+	xa_for_each (&iopt->domains, index, iter_domain)
+		new_iova_alignment = max_t(unsigned long,
+					   1UL << __ffs(iter_domain->pgsize_bitmap),
+					   new_iova_alignment);
+	if (!WARN_ON(new_iova_alignment > iopt->iova_alignment))
+		iopt->iova_alignment = new_iova_alignment;
+
+out_unlock:
+	up_write(&iopt->iova_rwsem);
+	up_write(&iopt->domains_rwsem);
+}
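
[Illustration only, not part of the patch: the "swap with the tail and shrink" compression above keeps indices 0..next_domain_id-1 densely populated. The same idea on a plain array, as a minimal sketch:

#include <stdio.h>

/* Erase index 'idx' while keeping 0..count-1 dense: move the tail into the hole */
static void erase_keep_dense(int *vals, unsigned int *count, unsigned int idx)
{
	(*count)--;
	if (idx != *count)
		vals[idx] = vals[*count];
}

int main(void)
{
	int domains[] = { 10, 11, 12, 13 };
	unsigned int count = 4, i;

	erase_keep_dense(domains, &count, 1);	/* remove the entry at index 1 */
	for (i = 0; i < count; i++)
		printf("%d ", domains[i]);	/* prints: 10 13 12 */
	printf("\n");
	return 0;
}
]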
+
+/*
+ * Narrow the usable IOVA space by tracking a group's reserved ranges in the
+ * reserved_iova_itree.
+ */
+int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt,
+					  struct iommu_group *group,
+					  phys_addr_t *sw_msi_start)
+{
+	struct iommu_resv_region *resv;
+	struct iommu_resv_region *tmp;
+	LIST_HEAD(group_resv_regions);
+	int rc;
+
+	down_write(&iopt->iova_rwsem);
+	rc = iommu_get_group_resv_regions(group, &group_resv_regions);
+	if (rc)
+		goto out_unlock;
+
+	list_for_each_entry (resv, &group_resv_regions, list) {
+		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
+			continue;
+
+		/*
+		 * The presence of any 'real' MSI regions should take precedence
+		 * over the software-managed one if the IOMMU driver happens to
+		 * advertise both types.
+		 */
+		if (sw_msi_start && resv->type == IOMMU_RESV_MSI) {
+			*sw_msi_start = 0;
+			sw_msi_start = NULL;
+		}
+		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI)
+			*sw_msi_start = resv->start;
+
+		rc = iopt_reserve_iova(iopt, resv->start,
+				       resv->length - 1 + resv->start, group);
+		if (rc)
+			goto out_reserved;
+	}
+	rc = 0;
+	goto out_free_resv;
+
+out_reserved:
+	iopt_remove_reserved_iova(iopt, group);
+out_free_resv:
+	list_for_each_entry_safe (resv, tmp, &group_resv_regions, list)
+		kfree(resv);
+out_unlock:
+	up_write(&iopt->iova_rwsem);
+	return rc;
+}
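
[Illustration only, not part of this patch: a minimal sketch of how a caller might drive the external map/unmap and domain-attach API above, using the declarations added to iommufd_private.h below. example_attach_and_map() is hypothetical and error handling is abbreviated.

static int example_attach_and_map(struct io_pagetable *iopt,
				  struct iommu_domain *domain,
				  void __user *uptr, unsigned long length)
{
	unsigned long iova;
	int rc;

	rc = iopt_table_add_domain(iopt, domain);
	if (rc)
		return rc;

	/* Let the allocator pick the IOVA */
	rc = iopt_map_user_pages(iopt, &iova, uptr, length,
				 IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
	if (rc)
		goto out_remove_domain;

	/* ... DMA through the domain happens here ... */

	rc = iopt_unmap_iova(iopt, iova, length);
out_remove_domain:
	iopt_table_remove_domain(iopt, domain);
	return rc;
}
]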
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 2f1301d39bba7c..bcf08e61bc87e9 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -9,6 +9,9 @@ 
 #include <linux/refcount.h>
 #include <linux/uaccess.h>
 
+struct iommu_domain;
+struct iommu_group;
+
 /*
  * The IOVA to PFN map. The mapper automatically copies the PFNs into multiple
  * domains and permits sharing of PFNs between io_pagetable instances. This
@@ -27,8 +30,40 @@  struct io_pagetable {
 	struct rw_semaphore iova_rwsem;
 	struct rb_root_cached area_itree;
 	struct rb_root_cached reserved_iova_itree;
+	unsigned long iova_alignment;
 };
 
+int iopt_init_table(struct io_pagetable *iopt);
+void iopt_destroy_table(struct io_pagetable *iopt);
+struct iopt_pages *iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
+				  unsigned long *start_byte,
+				  unsigned long length);
+enum { IOPT_ALLOC_IOVA = 1 << 0 };
+int iopt_map_user_pages(struct io_pagetable *iopt, unsigned long *iova,
+			void __user *uptr, unsigned long length, int iommu_prot,
+			unsigned int flags);
+int iopt_map_pages(struct io_pagetable *iopt, struct iopt_pages *pages,
+		   unsigned long *dst_iova, unsigned long start_byte,
+		   unsigned long length, int iommu_prot, unsigned int flags);
+int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
+		    unsigned long length);
+int iopt_unmap_all(struct io_pagetable *iopt);
+
+int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova,
+		      unsigned long npages, struct page **out_pages, bool write);
+void iopt_unaccess_pages(struct io_pagetable *iopt, unsigned long iova,
+			 size_t npages);
+int iopt_table_add_domain(struct io_pagetable *iopt,
+			  struct iommu_domain *domain);
+void iopt_table_remove_domain(struct io_pagetable *iopt,
+			      struct iommu_domain *domain);
+int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt,
+					  struct iommu_group *group,
+					  phys_addr_t *sw_msi_start);
+int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
+		      unsigned long last, void *owner);
+void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner);
+
 struct iommufd_ctx {
 	struct file *filp;
 	struct xarray objects;