diff mbox series

[v3,10/15] iommufd: IOCTLs for the io_pagetable

Message ID 10-v3-402a7d6459de+24b-iommufd_jgg@nvidia.com (mailing list archive)
State New
Headers show
Series IOMMUFD Generic interface | expand

Commit Message

Jason Gunthorpe Oct. 25, 2022, 6:12 p.m. UTC
Connect the IOAS to its IOCTL interface. This exposes most of the
functionality in the io_pagetable to userspace.

This is intended to be the core of the generic interface that IOMMUFD will
provide. Every IOMMU driver should be able to implement an iommu_domain
that is compatible with this generic mechanism.

It is also designed to be easy to use for simple non virtual machine
monitor users, like DPDK:
 - Universal simple support for all IOMMUs (no PPC special path)
 - An IOVA allocator that considers the aperture and the allowed/reserved
   ranges
 - io_pagetable allows any number of iommu_domains to be connected to the
   IOAS
 - Automatic allocation and re-use of iommu_domains

Along with room in the design to add non-generic features to cater to
specific HW functionality.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/Makefile          |   1 +
 drivers/iommu/iommufd/ioas.c            | 384 ++++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h |  32 ++
 drivers/iommu/iommufd/main.c            |  45 +++
 include/uapi/linux/iommufd.h            | 239 +++++++++++++++
 5 files changed, 701 insertions(+)
 create mode 100644 drivers/iommu/iommufd/ioas.c

Comments

Jason Gunthorpe Oct. 26, 2022, 5:01 p.m. UTC | #1
On Tue, Oct 25, 2022 at 03:12:19PM -0300, Jason Gunthorpe wrote:
> +int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_ioas_alloc *cmd = ucmd->cmd;
> +	struct iommufd_ioas *ioas;
> +	int rc;
> +
> +	if (cmd->flags)
> +		return -EOPNOTSUPP;
> +
> +	ioas = iommufd_ioas_alloc(ucmd->ictx);
> +	if (IS_ERR(ioas))
> +		return PTR_ERR(ioas);
> +
> +	cmd->out_ioas_id = ioas->obj.id;
> +	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
> +	if (rc)
> +		goto out_table;
> +	iommufd_object_finalize(ucmd->ictx, &ioas->obj);
> +	return 0;
> +
> +out_table:
> +	iommufd_ioas_destroy(&ioas->obj);
> +	return rc;

syzkaller says this should be:

out_table:
	iommufd_object_abort_and_destroy(ucmd->ictx, &ioas->obj);

Jason
Jason Gunthorpe Oct. 26, 2022, 11:17 p.m. UTC | #2
On Tue, Oct 25, 2022 at 03:12:19PM -0300, Jason Gunthorpe wrote:
>  static struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
>  	IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id),
> +	IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
> +		 struct iommu_ioas_alloc, out_ioas_id),
> +	IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
> +		 struct iommu_ioas_allow_iovas, allowed_iovas),
> +	IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
> +		 src_iova),
> +	IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
> +		 struct iommu_ioas_iova_ranges, out_iova_alignment),
> +	IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map,
> +		 __reserved),

Syzkaller indirectly noticed that __reserved is no longer the last
item in this struct:

        IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map,
-                __reserved),
+                iova),

Also added a test to cover basic struct extensibility on all the ioctls.

Jason
Baolu Lu Oct. 29, 2022, 7:25 a.m. UTC | #3
On 2022/10/26 2:12, Jason Gunthorpe wrote:
> +/**
> + * struct iommu_ioas_iova_ranges - ioctl(IOMMU_IOAS_IOVA_RANGES)
> + * @size: sizeof(struct iommu_ioas_iova_ranges)
> + * @ioas_id: IOAS ID to read ranges from
> + * @num_iovas: Input/Output total number of ranges in the IOAS
> + * @__reserved: Must be 0
> + * @allowed_iovas: Pointer to the output array of struct iommu_iova_range
> + * @out_iova_alignment: Minimum alignment required for mapping IOVA
> + *
> + * Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these ranges
> + * is not allowed. out_num_iovas will be set to the total number of iovas and
> + * the out_valid_iovas[] will be filled in as space permits.

"out_num_iovas" and "out_valid_iovas[]" are outdated.

> + *
> + * The allowed ranges are dependent on the HW path the DMA operation takes, and
> + * can change during the lifetime of the IOAS. A fresh empty IOAS will have a
> + * full range, and each attached device will narrow the ranges based on that
> + * devices HW restrictions. Detatching a device can widen the ranges. Userspace
> + * should query ranges after every attach/detatch to know what IOVAs are valid
> + * for mapping.
> + *
> + * On input num_iovas is the length of the allowed_iovas array. On output it is
> + * the total number of iovas filled in. The ioctl will return -EMSGSIZE and set
> + * num_iovas to the required value if num_iovas is too small. In this case the
> + * caller should allocate a larger output array and re-issue the ioctl.
> + */
> +struct iommu_ioas_iova_ranges {
> +	__u32 size;
> +	__u32 ioas_id;
> +	__u32 num_iovas;
> +	__u32 __reserved;
> +	__aligned_u64 allowed_iovas;
> +	__aligned_u64 out_iova_alignment;
> +};
> +#define IOMMU_IOAS_IOVA_RANGES _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_IOVA_RANGES)
> +
> +/**
> + * struct iommu_ioas_allow_iovas - ioctl(IOMMU_IOAS_ALLOW_IOVAS)
> + * @size: sizeof(struct iommu_ioas_allow_iovas)
> + * @ioas_id: IOAS ID to allow IOVAs from

@num_iovas: The number of elements in @allowed_iovas array

> + * @allowed_iovas: Pointer to array of struct iommu_iova_range
> + *
> + * Ensure a range of IOVAs are always available for allocation. If this call
> + * succeeds then IOMMU_IOAS_IOVA_RANGES will never return a list of IOVA ranges
> + * that are narrower than the ranges provided here. This call will fail if
> + * IOMMU_IOAS_IOVA_RANGES is currently narrower than the given ranges.
> + *
> + * When an IOAS is first created the IOVA_RANGES will be maximally sized, and as
> + * devices are attached the IOVA will narrow based on the device restrictions.
> + * When an allowed range is specified any narrowing will be refused, ie device
> + * attachment can fail if the device requires limiting within the allowed range.
> + *
> + * Automatic IOVA allocation is also impacted by this call. MAP will only
> + * allocate within the allowed IOVAs if they are present.
> + *
> + * This call replaces the entire allowed list with the given list.
> + */
> +struct iommu_ioas_allow_iovas {
> +	__u32 size;
> +	__u32 ioas_id;
> +	__u32 num_iovas;
> +	__u32 __reserved;
> +	__aligned_u64 allowed_iovas;
> +};
> +#define IOMMU_IOAS_ALLOW_IOVAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOW_IOVAS)

Best regards,
baolu
Tian, Kevin Nov. 4, 2022, 8:32 a.m. UTC | #4
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Wednesday, October 26, 2022 2:12 AM
> 
> +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_ioas_allow_iovas *cmd = ucmd->cmd;
> +	struct rb_root_cached allowed_iova = RB_ROOT_CACHED;
> +	struct interval_tree_node *node;
> +	struct iommufd_ioas *ioas;
> +	struct io_pagetable *iopt;
> +	int rc = 0;
> +
> +	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
> +	if (IS_ERR(ioas))
> +		return PTR_ERR(ioas);
> +	iopt = &ioas->iopt;

Missed the check of __reserved field

> +
> +int iommufd_ioas_copy(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_ioas_copy *cmd = ucmd->cmd;
> +	struct iommufd_ioas *src_ioas;
> +	struct iommufd_ioas *dst_ioas;
> +	unsigned int flags = 0;
> +	LIST_HEAD(pages_list);
> +	unsigned long iova;
> +	int rc;
> +
> +	if ((cmd->flags &
> +	     ~(IOMMU_IOAS_MAP_FIXED_IOVA |
> IOMMU_IOAS_MAP_WRITEABLE |
> +	       IOMMU_IOAS_MAP_READABLE)))
> +		return -EOPNOTSUPP;
> +	if (cmd->length >= ULONG_MAX)
> +		return -EOVERFLOW;

and overflow on cmd->dest_iova/src_iova

> +
> +	src_ioas = iommufd_get_ioas(ucmd, cmd->src_ioas_id);
> +	if (IS_ERR(src_ioas))
> +		return PTR_ERR(src_ioas);
> +	rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length,
> +			    &pages_list);
> +	iommufd_put_object(&src_ioas->obj);
> +	if (rc)
> +		goto out_pages;

direct return given iopt_get_pages() already called
iopt_free_pages_list() upon error.

> +int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_ioas_unmap *cmd = ucmd->cmd;
> +	struct iommufd_ioas *ioas;
> +	unsigned long unmapped = 0;
> +	int rc;
> +
> +	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
> +	if (IS_ERR(ioas))
> +		return PTR_ERR(ioas);
> +
> +	if (cmd->iova == 0 && cmd->length == U64_MAX) {
> +		rc = iopt_unmap_all(&ioas->iopt, &unmapped);
> +		if (rc)
> +			goto out_put;
> +	} else {
> +		if (cmd->iova >= ULONG_MAX || cmd->length >=
> ULONG_MAX) {
> +			rc = -EOVERFLOW;
> +			goto out_put;
> +		}

Above check can be moved before iommufd_get_ioas().

> +static int iommufd_option(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_option *cmd = ucmd->cmd;
> +	int rc;
> +

lack of __reserved check

>  static struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
>  	IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct
> iommu_destroy, id),
> +	IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
> +		 struct iommu_ioas_alloc, out_ioas_id),
> +	IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
> +		 struct iommu_ioas_allow_iovas, allowed_iovas),
> +	IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct
> iommu_ioas_copy,
> +		 src_iova),
> +	IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
> +		 struct iommu_ioas_iova_ranges, out_iova_alignment),
> +	IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct
> iommu_ioas_map,
> +		 __reserved),
> +	IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct
> iommu_ioas_unmap,
> +		 length),
> +	IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
> +		 val64),
>  };

Just personal preference - it reads better to me if the above order (and
the enum definition in iommufd.h) can be same as how those commands
are defined/explained in iommufd.h.

> +/**
> + * struct iommu_ioas_iova_ranges - ioctl(IOMMU_IOAS_IOVA_RANGES)
> + * @size: sizeof(struct iommu_ioas_iova_ranges)
> + * @ioas_id: IOAS ID to read ranges from
> + * @num_iovas: Input/Output total number of ranges in the IOAS
> + * @__reserved: Must be 0
> + * @allowed_iovas: Pointer to the output array of struct iommu_iova_range
> + * @out_iova_alignment: Minimum alignment required for mapping IOVA
> + *
> + * Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these
> ranges
> + * is not allowed. out_num_iovas will be set to the total number of iovas
> and
> + * the out_valid_iovas[] will be filled in as space permits.

out_num_iovas and out_valid_iovas[] are stale.

> + *
> + * The allowed ranges are dependent on the HW path the DMA operation
> takes, and
> + * can change during the lifetime of the IOAS. A fresh empty IOAS will have a
> + * full range, and each attached device will narrow the ranges based on that
> + * devices HW restrictions. Detatching a device can widen the ranges.

devices -> device's

> +/**
> + * struct iommu_ioas_allow_iovas - ioctl(IOMMU_IOAS_ALLOW_IOVAS)
> + * @size: sizeof(struct iommu_ioas_allow_iovas)
> + * @ioas_id: IOAS ID to allow IOVAs from

missed num_iovas and __reserved

> + * @allowed_iovas: Pointer to array of struct iommu_iova_range
> + *
> + * Ensure a range of IOVAs are always available for allocation. If this call
> + * succeeds then IOMMU_IOAS_IOVA_RANGES will never return a list of
> IOVA ranges
> + * that are narrower than the ranges provided here. This call will fail if
> + * IOMMU_IOAS_IOVA_RANGES is currently narrower than the given ranges.
> + *
> + * When an IOAS is first created the IOVA_RANGES will be maximally sized,
> and as
> + * devices are attached the IOVA will narrow based on the device
> restrictions.
> + * When an allowed range is specified any narrowing will be refused, ie
> device
> + * attachment can fail if the device requires limiting within the allowed
> range.
> + *
> + * Automatic IOVA allocation is also impacted by this call. MAP will only
> + * allocate within the allowed IOVAs if they are present.

According to iopt_check_iova() FIXED_IOVA can specify an iova which
is not in allowed list but in the list of reported IOVA_RANGES. Is it
correct or make more sense to have FIXED_IOVA also under guard of
the allowed list (if violating then fail the map call)?

> +/**
> + * struct iommu_ioas_unmap - ioctl(IOMMU_IOAS_UNMAP)
> + * @size: sizeof(struct iommu_ioas_unmap)
> + * @ioas_id: IOAS ID to change the mapping of
> + * @iova: IOVA to start the unmapping at
> + * @length: Number of bytes to unmap, and return back the bytes
> unmapped
> + *
> + * Unmap an IOVA range. The iova/length must be a superset of a
> previously
> + * mapped range used with IOMMU_IOAS_PAGETABLE_MAP or COPY.

remove 'PAGETABLE'

> +/**
> + * enum iommufd_option
> + * @IOMMU_OPTION_RLIMIT_MODE:
> + *    Change how RLIMIT_MEMLOCK accounting works. The caller must have
> privilege
> + *    to invoke this. Value 0 (default) is user based accouting, 1 uses process
> + *    based accounting. Global option, object_id must be 0
> + * @IOMMU_OPTION_HUGE_PAGES:
> + *    Value 1 (default) allows contiguous pages to be combined when
> generating
> + *    iommu mappings. Value 0 disables combining, everything is mapped to
> + *    PAGE_SIZE. This can be useful for benchmarking.  This is a per-IOAS
> + *    option, the object_id must be the IOAS ID.

What about HWPT ID? Is there value of supporting HWPT's with different
mapping size attached to the same IOAS?

> +/**
> + * @size: sizeof(struct iommu_option)
> + * @option_id: One of enum iommufd_option
> + * @op: One of enum iommufd_option_ops
> + * @__reserved: Must be 0
> + * @object_id: ID of the object if required
> + * @val64: Option value to set or value returned on get
> + *
> + * Change a simple option value. This multiplexor allows controlling a
> options
> + * on objects. IOMMU_OPTION_OP_SET will load an option and
> IOMMU_OPTION_OP_GET
> + * will return the current value.
> + */

This is quite generic. Does it imply that future device capability reporting
can be also implemented based on this cmd, i.e. have OP_GET on a
device object?
Jason Gunthorpe Nov. 7, 2022, 2:17 p.m. UTC | #5
On Sat, Oct 29, 2022 at 03:25:00PM +0800, Baolu Lu wrote:
> On 2022/10/26 2:12, Jason Gunthorpe wrote:
> > +/**
> > + * struct iommu_ioas_iova_ranges - ioctl(IOMMU_IOAS_IOVA_RANGES)
> > + * @size: sizeof(struct iommu_ioas_iova_ranges)
> > + * @ioas_id: IOAS ID to read ranges from
> > + * @num_iovas: Input/Output total number of ranges in the IOAS
> > + * @__reserved: Must be 0
> > + * @allowed_iovas: Pointer to the output array of struct iommu_iova_range
> > + * @out_iova_alignment: Minimum alignment required for mapping IOVA
> > + *
> > + * Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these ranges
> > + * is not allowed. out_num_iovas will be set to the total number of iovas and
> > + * the out_valid_iovas[] will be filled in as space permits.
> 
> "out_num_iovas" and "out_valid_iovas[]" are outdated.
> 
> > +/**
> > + * struct iommu_ioas_allow_iovas - ioctl(IOMMU_IOAS_ALLOW_IOVAS)
> > + * @size: sizeof(struct iommu_ioas_allow_iovas)
> > + * @ioas_id: IOAS ID to allow IOVAs from
> 
> @num_iovas: The number of elements in @allowed_iovas array

Done both, thanks

Jason
Jason Gunthorpe Nov. 7, 2022, 3:02 p.m. UTC | #6
On Fri, Nov 04, 2022 at 08:32:30AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Wednesday, October 26, 2022 2:12 AM
> > 
> > +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd)
> > +{
> > +	struct iommu_ioas_allow_iovas *cmd = ucmd->cmd;
> > +	struct rb_root_cached allowed_iova = RB_ROOT_CACHED;
> > +	struct interval_tree_node *node;
> > +	struct iommufd_ioas *ioas;
> > +	struct io_pagetable *iopt;
> > +	int rc = 0;
> > +
> > +	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
> > +	if (IS_ERR(ioas))
> > +		return PTR_ERR(ioas);
> > +	iopt = &ioas->iopt;
> 
> Missed the check of __reserved field

Done

> > +
> > +int iommufd_ioas_copy(struct iommufd_ucmd *ucmd)
> > +{
> > +	struct iommu_ioas_copy *cmd = ucmd->cmd;
> > +	struct iommufd_ioas *src_ioas;
> > +	struct iommufd_ioas *dst_ioas;
> > +	unsigned int flags = 0;
> > +	LIST_HEAD(pages_list);
> > +	unsigned long iova;
> > +	int rc;
> > +
> > +	if ((cmd->flags &
> > +	     ~(IOMMU_IOAS_MAP_FIXED_IOVA |
> > IOMMU_IOAS_MAP_WRITEABLE |
> > +	       IOMMU_IOAS_MAP_READABLE)))
> > +		return -EOPNOTSUPP;
> > +	if (cmd->length >= ULONG_MAX)
> > +		return -EOVERFLOW;
> 
> and overflow on cmd->dest_iova/src_iova

Yep

> > +
> > +	src_ioas = iommufd_get_ioas(ucmd, cmd->src_ioas_id);
> > +	if (IS_ERR(src_ioas))
> > +		return PTR_ERR(src_ioas);
> > +	rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length,
> > +			    &pages_list);
> > +	iommufd_put_object(&src_ioas->obj);
> > +	if (rc)
> > +		goto out_pages;
> 
> direct return given iopt_get_pages() already called
> iopt_free_pages_list() upon error.

Ok

> > +int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
> > +{
> > +	struct iommu_ioas_unmap *cmd = ucmd->cmd;
> > +	struct iommufd_ioas *ioas;
> > +	unsigned long unmapped = 0;
> > +	int rc;
> > +
> > +	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
> > +	if (IS_ERR(ioas))
> > +		return PTR_ERR(ioas);
> > +
> > +	if (cmd->iova == 0 && cmd->length == U64_MAX) {
> > +		rc = iopt_unmap_all(&ioas->iopt, &unmapped);
> > +		if (rc)
> > +			goto out_put;
> > +	} else {
> > +		if (cmd->iova >= ULONG_MAX || cmd->length >=
> > ULONG_MAX) {
> > +			rc = -EOVERFLOW;
> > +			goto out_put;
> > +		}
> 
> Above check can be moved before iommufd_get_ioas().

They have to be after this:

	if (cmd->iova == 0 && cmd->length == U64_MAX) {

Or it will false trigger


> > +static int iommufd_option(struct iommufd_ucmd *ucmd)
> > +{
> > +	struct iommu_option *cmd = ucmd->cmd;
> > +	int rc;
> > +
> 
> lack of __reserved check

Done

> >  static struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
> >  	IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct
> > iommu_destroy, id),
> > +	IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
> > +		 struct iommu_ioas_alloc, out_ioas_id),
> > +	IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
> > +		 struct iommu_ioas_allow_iovas, allowed_iovas),
> > +	IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct
> > iommu_ioas_copy,
> > +		 src_iova),
> > +	IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
> > +		 struct iommu_ioas_iova_ranges, out_iova_alignment),
> > +	IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct
> > iommu_ioas_map,
> > +		 __reserved),
> > +	IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct
> > iommu_ioas_unmap,
> > +		 length),
> > +	IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
> > +		 val64),
> >  };
> 
> Just personal preference - it reads better to me if the above order (and
> the enum definition in iommufd.h) can be same as how those commands
> are defined/explained in iommufd.h.

I prefer "keep sorted" for these kinds of lists, it is much easier to
maintain in the long run

> > + * Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these
> > ranges
> > + * is not allowed. out_num_iovas will be set to the total number of iovas
> > and
> > + * the out_valid_iovas[] will be filled in as space permits.
> 
> out_num_iovas and out_valid_iovas[] are stale.

Done

> > + *
> > + * The allowed ranges are dependent on the HW path the DMA operation
> > takes, and
> > + * can change during the lifetime of the IOAS. A fresh empty IOAS will have a
> > + * full range, and each attached device will narrow the ranges based on that
> > + * devices HW restrictions. Detatching a device can widen the ranges.
> 
> devices -> device's

Ok

> > +/**
> > + * struct iommu_ioas_allow_iovas - ioctl(IOMMU_IOAS_ALLOW_IOVAS)
> > + * @size: sizeof(struct iommu_ioas_allow_iovas)
> > + * @ioas_id: IOAS ID to allow IOVAs from
> 
> missed num_iovas and __reserved

Hurm. how do I get make W=1 to cover the header files?

> > + * @allowed_iovas: Pointer to array of struct iommu_iova_range
> > + *
> > + * Ensure a range of IOVAs are always available for allocation. If this call
> > + * succeeds then IOMMU_IOAS_IOVA_RANGES will never return a list of
> > IOVA ranges
> > + * that are narrower than the ranges provided here. This call will fail if
> > + * IOMMU_IOAS_IOVA_RANGES is currently narrower than the given ranges.
> > + *
> > + * When an IOAS is first created the IOVA_RANGES will be maximally sized,
> > and as
> > + * devices are attached the IOVA will narrow based on the device
> > restrictions.
> > + * When an allowed range is specified any narrowing will be refused, ie
> > device
> > + * attachment can fail if the device requires limiting within the allowed
> > range.
> > + *
> > + * Automatic IOVA allocation is also impacted by this call. MAP will only
> > + * allocate within the allowed IOVAs if they are present.
> 
> According to iopt_check_iova() FIXED_IOVA can specify an iova which
> is not in allowed list but in the list of reported IOVA_RANGES. Is it
> correct or make more sense to have FIXED_IOVA also under guard of
> the allowed list (if violating then fail the map call)?

The concept was the allow list only really impacts domain
attachment. When a user uses FIXED they have to know what they are
doing. There is not a good reason to deny the user to use any IOVA
that is not restricted by the reserved list.

> > + * @length: Number of bytes to unmap, and return back the bytes
> > unmapped
> > + *
> > + * Unmap an IOVA range. The iova/length must be a superset of a
> > previously
> > + * mapped range used with IOMMU_IOAS_PAGETABLE_MAP or COPY.
> 
> remove 'PAGETABLE'

Done

> 
> > +/**
> > + * enum iommufd_option
> > + * @IOMMU_OPTION_RLIMIT_MODE:
> > + *    Change how RLIMIT_MEMLOCK accounting works. The caller must have
> > privilege
> > + *    to invoke this. Value 0 (default) is user based accouting, 1 uses process
> > + *    based accounting. Global option, object_id must be 0
> > + * @IOMMU_OPTION_HUGE_PAGES:
> > + *    Value 1 (default) allows contiguous pages to be combined when
> > generating
> > + *    iommu mappings. Value 0 disables combining, everything is mapped to
> > + *    PAGE_SIZE. This can be useful for benchmarking.  This is a per-IOAS
> > + *    option, the object_id must be the IOAS ID.
> 
> What about HWPT ID? Is there value of supporting HWPT's with different
> mapping size attached to the same IOAS?

This is a global IOAS flag, it just makes everything use PAGE_SIZE. It
is really only interesting for debugging and benchmarking. The test
suite and syzkaller have both made use of this to improve coverage.

> > +/**
> > + * @size: sizeof(struct iommu_option)
> > + * @option_id: One of enum iommufd_option
> > + * @op: One of enum iommufd_option_ops
> > + * @__reserved: Must be 0
> > + * @object_id: ID of the object if required
> > + * @val64: Option value to set or value returned on get
> > + *
> > + * Change a simple option value. This multiplexor allows controlling a
> > options
> > + * on objects. IOMMU_OPTION_OP_SET will load an option and
> > IOMMU_OPTION_OP_GET
> > + * will return the current value.
> > + */
> 
> This is quite generic. Does it imply that future device capability reporting
> can be also implemented based on this cmd, i.e. have OP_GET on a
> device object?

I don't view this as a way to do capabilities. I think we will have a
capability ioctl as well. This is really for something that can be
get & set.

Thanks,
Jason
Tian, Kevin Nov. 8, 2022, 2:05 a.m. UTC | #7
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Monday, November 7, 2022 11:02 PM
> > > + * @allowed_iovas: Pointer to array of struct iommu_iova_range
> > > + *
> > > + * Ensure a range of IOVAs are always available for allocation. If this call
> > > + * succeeds then IOMMU_IOAS_IOVA_RANGES will never return a list of
> > > IOVA ranges
> > > + * that are narrower than the ranges provided here. This call will fail if
> > > + * IOMMU_IOAS_IOVA_RANGES is currently narrower than the given
> ranges.
> > > + *
> > > + * When an IOAS is first created the IOVA_RANGES will be maximally
> sized,
> > > and as
> > > + * devices are attached the IOVA will narrow based on the device
> > > restrictions.
> > > + * When an allowed range is specified any narrowing will be refused, ie
> > > device
> > > + * attachment can fail if the device requires limiting within the allowed
> > > range.
> > > + *
> > > + * Automatic IOVA allocation is also impacted by this call. MAP will only
> > > + * allocate within the allowed IOVAs if they are present.
> >
> > According to iopt_check_iova() FIXED_IOVA can specify an iova which
> > is not in allowed list but in the list of reported IOVA_RANGES. Is it
> > correct or make more sense to have FIXED_IOVA also under guard of
> > the allowed list (if violating then fail the map call)?
> 
> The concept was the allow list only really impacts domain
> attachment. When a user uses FIXED they have to know what they are

it also impacts automatic IOVA

> doing. There is not a good reason to deny the user to use any IOVA
> that is not restricted by the reserved list.
> 

I just didn't get why different restrictions are applied to automatics vs.
fixed allocation.
Jason Gunthorpe Nov. 8, 2022, 5:29 p.m. UTC | #8
On Tue, Nov 08, 2022 at 02:05:20AM +0000, Tian, Kevin wrote:
> > The concept was the allow list only really impacts domain
> > attachment. When a user uses FIXED they have to know what they are
> 
> it also impacts automatic IOVA
> 
> > doing. There is not a good reason to deny the user to use any IOVA
> > that is not restricted by the reserved list.
> 
> I just didn't get why different restrictions are applied to automatics vs.
> fixed allocation.

Because it isn't a restriction on allocation, it is a restriction on
domain attachment. (and this is a bit confusing and subtle, but it is
what it was built for)

The purpose is to allow the IOMMU driver to allocate a domain with
knowledge of what the user would like to do. For instance a small
allowed range may allow the driver to allocate fewer hops in the IO
page table, and a create a domain with a smaller aperture.

"automatic" is supposed to find a good IOVA that is the best IOVA for
that mapping. We have defined the best IOVA as one that doesn't reach
into areas that future domains are allowed to mark as reserved.

fixed can do anything legal, including violate the allowances if it
wants. This just means a future domain attachment might fail. Domain's
won't look at any currently mapped IOVA, only the allowance ranges
when they size themselves.

Jason
Tian, Kevin Nov. 9, 2022, 2:50 a.m. UTC | #9
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Wednesday, November 9, 2022 1:30 AM
> 
> On Tue, Nov 08, 2022 at 02:05:20AM +0000, Tian, Kevin wrote:
> > > The concept was the allow list only really impacts domain
> > > attachment. When a user uses FIXED they have to know what they are
> >
> > it also impacts automatic IOVA
> >
> > > doing. There is not a good reason to deny the user to use any IOVA
> > > that is not restricted by the reserved list.
> >
> > I just didn't get why different restrictions are applied to automatics vs.
> > fixed allocation.
> 
> Because it isn't a restriction on allocation, it is a restriction on
> domain attachment. (and this is a bit confusing and subtle, but it is
> what it was built for)
> 
> The purpose is to allow the IOMMU driver to allocate a domain with
> knowledge of what the user would like to do. For instance a small
> allowed range may allow the driver to allocate fewer hops in the IO
> page table, and a create a domain with a smaller aperture.

if a domain can be created with a smaller aperture according to
allowed ranges then the impact is not just on future domain
attachment. a fixed IOVA outside of the allowed ranges will
find error on current domain too.

Should a driver which does this update the reserved ranges too?

> 
> "automatic" is supposed to find a good IOVA that is the best IOVA for
> that mapping. We have defined the best IOVA as one that doesn't reach
> into areas that future domains are allowed to mark as reserved.

'good' or 'best' just implies that the allocation prefers to permitted
ranges over reserved ranges. then when permitted ranges are used
up the allocator can then go to find less-favorable holes outside of
the permitted ranges if not reserved. same as what FIXED IOVA allows.

But the behavior in this uAPI is that automatic IOVAs can be only
in permitted ranges. This is a restriction on allocation instead of
preference.

IMHO I'd like this uAPI clearly defined as either of below:

1) only related to domain attach then automatic IOVA can be allocated
   outside of permitted ranges after the latter are used up;

2) applied to both domain attach and IOVA allocation which then applied
   to both automatic IOVA and fixed IOVA;

> 
> fixed can do anything legal, including violate the allowances if it
> wants. This just means a future domain attachment might fail. Domain's
> won't look at any currently mapped IOVA, only the allowance ranges
> when they size themselves.
> 
> Jason
Jason Gunthorpe Nov. 9, 2022, 1:05 p.m. UTC | #10
On Wed, Nov 09, 2022 at 02:50:34AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Wednesday, November 9, 2022 1:30 AM
> > 
> > On Tue, Nov 08, 2022 at 02:05:20AM +0000, Tian, Kevin wrote:
> > > > The concept was the allow list only really impacts domain
> > > > attachment. When a user uses FIXED they have to know what they are
> > >
> > > it also impacts automatic IOVA
> > >
> > > > doing. There is not a good reason to deny the user to use any IOVA
> > > > that is not restricted by the reserved list.
> > >
> > > I just didn't get why different restrictions are applied to automatics vs.
> > > fixed allocation.
> > 
> > Because it isn't a restriction on allocation, it is a restriction on
> > domain attachment. (and this is a bit confusing and subtle, but it is
> > what it was built for)
> > 
> > The purpose is to allow the IOMMU driver to allocate a domain with
> > knowledge of what the user would like to do. For instance a small
> > allowed range may allow the driver to allocate fewer hops in the IO
> > page table, and a create a domain with a smaller aperture.
> 
> if a domain can be created with a smaller aperture according to
> allowed ranges then the impact is not just on future domain
> attachment. a fixed IOVA outside of the allowed ranges will
> find error on current domain too.
>
> Should a driver which does this update the reserved ranges too?

Not reserved ranges, the domain should have a smaller aperture, and
the checks during domain attachment will refuse if IOVA is to be
mapped out side the domain's aperture.

So if someone mis-uses fixed mode and gets a domain with an aperture
that doesn't cover stuff they mapped then domain attachment will fail.

It is no different from mapping something before domain attachment.

> > "automatic" is supposed to find a good IOVA that is the best IOVA for
> > that mapping. We have defined the best IOVA as one that doesn't reach
> > into areas that future domains are allowed to mark as reserved.
> 
> 'good' or 'best' just implies that the allocation prefers to permitted
> ranges over reserved ranges. then when permitted ranges are used
> up the allocator can then go to find less-favorable holes outside of
> the permitted ranges if not reserved. same as what FIXED IOVA allows.
> 
> But the behavior in this uAPI is that automatic IOVAs can be only
> in permitted ranges. This is a restriction on allocation instead of
> preference.

Again, it is not a restriction. It is just how the automatic allocater
works.

> IMHO I'd like this uAPI clearly defined as either of below:
> 
> 1) only related to domain attach then automatic IOVA can be allocated
>    outside of permitted ranges after the latter are used up;

This makes automatic allocation functionally useless when composed
with allowed ranges. There is no reason to do this.

> 2) applied to both domain attach and IOVA allocation which then applied
>    to both automatic IOVA and fixed IOVA;

And this is more complicated and slow, without any real purpose.

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index b66a8c47ff55ec..2b4f36f1b72f9d 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -1,6 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 iommufd-y := \
 	io_pagetable.o \
+	ioas.o \
 	main.o \
 	pages.o
 
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
new file mode 100644
index 00000000000000..6963b7b0a3c957
--- /dev/null
+++ b/drivers/iommu/iommufd/ioas.c
@@ -0,0 +1,384 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#include <linux/interval_tree.h>
+#include <linux/iommufd.h>
+#include <linux/iommu.h>
+#include <uapi/linux/iommufd.h>
+
+#include "io_pagetable.h"
+
+void iommufd_ioas_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_ioas *ioas = container_of(obj, struct iommufd_ioas, obj);
+	int rc;
+
+	rc = iopt_unmap_all(&ioas->iopt, NULL);
+	WARN_ON(rc && rc != -ENOENT);
+	iopt_destroy_table(&ioas->iopt);
+}
+
+struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx)
+{
+	struct iommufd_ioas *ioas;
+	int rc;
+
+	ioas = iommufd_object_alloc(ictx, ioas, IOMMUFD_OBJ_IOAS);
+	if (IS_ERR(ioas))
+		return ioas;
+
+	rc = iopt_init_table(&ioas->iopt);
+	if (rc)
+		goto out_abort;
+	return ioas;
+
+out_abort:
+	iommufd_object_abort(ictx, &ioas->obj);
+	return ERR_PTR(rc);
+}
+
+int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_ioas_alloc *cmd = ucmd->cmd;
+	struct iommufd_ioas *ioas;
+	int rc;
+
+	if (cmd->flags)
+		return -EOPNOTSUPP;
+
+	ioas = iommufd_ioas_alloc(ucmd->ictx);
+	if (IS_ERR(ioas))
+		return PTR_ERR(ioas);
+
+	cmd->out_ioas_id = ioas->obj.id;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		goto out_table;
+	iommufd_object_finalize(ucmd->ictx, &ioas->obj);
+	return 0;
+
+out_table:
+	iommufd_ioas_destroy(&ioas->obj);
+	return rc;
+}
+
+int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_iova_range __user *ranges;
+	struct iommu_ioas_iova_ranges *cmd = ucmd->cmd;
+	struct iommufd_ioas *ioas;
+	struct interval_tree_span_iter span;
+	u32 max_iovas;
+	int rc;
+
+	if (cmd->__reserved)
+		return -EOPNOTSUPP;
+
+	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+	if (IS_ERR(ioas))
+		return PTR_ERR(ioas);
+
+	down_read(&ioas->iopt.iova_rwsem);
+	max_iovas = cmd->num_iovas;
+	ranges = u64_to_user_ptr(cmd->allowed_iovas);
+	cmd->num_iovas = 0;
+	cmd->out_iova_alignment = ioas->iopt.iova_alignment;
+	interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
+				    ULONG_MAX) {
+		if (!span.is_hole)
+			continue;
+		if (cmd->num_iovas < max_iovas) {
+			struct iommu_iova_range elm = {
+				.start = span.start_hole,
+				.last = span.last_hole,
+			};
+
+			if (copy_to_user(&ranges[cmd->num_iovas], &elm,
+					 sizeof(elm))) {
+				rc = -EFAULT;
+				goto out_put;
+			}
+		}
+		cmd->num_iovas++;
+	}
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		goto out_put;
+	if (cmd->num_iovas > max_iovas)
+		rc = -EMSGSIZE;
+out_put:
+	up_read(&ioas->iopt.iova_rwsem);
+	iommufd_put_object(&ioas->obj);
+	return rc;
+}
+
+static int iommufd_ioas_load_iovas(struct rb_root_cached *itree,
+				   struct iommu_iova_range __user *ranges,
+				   u32 num)
+{
+	u32 i;
+
+	for (i = 0; i != num; i++) {
+		struct iommu_iova_range range;
+		struct iopt_allowed *allowed;
+
+		if (copy_from_user(&range, ranges + i, sizeof(range)))
+			return -EFAULT;
+
+		if (range.start >= range.last)
+			return -EINVAL;
+
+		if (interval_tree_iter_first(itree, range.start, range.last))
+			return -EINVAL;
+
+		allowed = kzalloc(sizeof(*allowed), GFP_KERNEL_ACCOUNT);
+		if (!allowed)
+			return -ENOMEM;
+		allowed->node.start = range.start;
+		allowed->node.last = range.last;
+
+		interval_tree_insert(&allowed->node, itree);
+	}
+	return 0;
+}
+
+int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_ioas_allow_iovas *cmd = ucmd->cmd;
+	struct rb_root_cached allowed_iova = RB_ROOT_CACHED;
+	struct interval_tree_node *node;
+	struct iommufd_ioas *ioas;
+	struct io_pagetable *iopt;
+	int rc = 0;
+
+	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+	if (IS_ERR(ioas))
+		return PTR_ERR(ioas);
+	iopt = &ioas->iopt;
+
+	rc = iommufd_ioas_load_iovas(&allowed_iova,
+				     u64_to_user_ptr(cmd->allowed_iovas),
+				     cmd->num_iovas);
+	if (rc)
+		goto out_free;
+
+	rc = iopt_set_allow_iova(iopt, &allowed_iova);
+out_free:
+	while ((node = interval_tree_iter_first(&allowed_iova, 0, ULONG_MAX))) {
+		interval_tree_remove(node, &allowed_iova);
+		kfree(container_of(node, struct iopt_allowed, node));
+	}
+	iommufd_put_object(&ioas->obj);
+	return rc;
+}
+
+static int conv_iommu_prot(u32 map_flags)
+{
+	int iommu_prot;
+
+	/*
+	 * We provide no manual cache coherency ioctls to userspace and most
+	 * architectures make the CPU ops for cache flushing privileged.
+	 * Therefore we require the underlying IOMMU to support CPU coherent
+	 * operation. Support for IOMMU_CACHE is enforced by the
+	 * IOMMU_CAP_CACHE_COHERENCY test during bind.
+	 */
+	iommu_prot = IOMMU_CACHE;
+	if (map_flags & IOMMU_IOAS_MAP_WRITEABLE)
+		iommu_prot |= IOMMU_WRITE;
+	if (map_flags & IOMMU_IOAS_MAP_READABLE)
+		iommu_prot |= IOMMU_READ;
+	return iommu_prot;
+}
+
+int iommufd_ioas_map(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_ioas_map *cmd = ucmd->cmd;
+	struct iommufd_ioas *ioas;
+	unsigned int flags = 0;
+	unsigned long iova;
+	int rc;
+
+	if ((cmd->flags &
+	     ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
+	       IOMMU_IOAS_MAP_READABLE)) ||
+	    cmd->__reserved)
+		return -EOPNOTSUPP;
+	if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
+		return -EOVERFLOW;
+
+	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+	if (IS_ERR(ioas))
+		return PTR_ERR(ioas);
+
+	if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
+		flags = IOPT_ALLOC_IOVA;
+	iova = cmd->iova;
+	rc = iopt_map_user_pages(ucmd->ictx, &ioas->iopt, &iova,
+				 u64_to_user_ptr(cmd->user_va), cmd->length,
+				 conv_iommu_prot(cmd->flags), flags);
+	if (rc)
+		goto out_put;
+
+	cmd->iova = iova;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_put:
+	iommufd_put_object(&ioas->obj);
+	return rc;
+}
+
+int iommufd_ioas_copy(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_ioas_copy *cmd = ucmd->cmd;
+	struct iommufd_ioas *src_ioas;
+	struct iommufd_ioas *dst_ioas;
+	unsigned int flags = 0;
+	LIST_HEAD(pages_list);
+	unsigned long iova;
+	int rc;
+
+	if ((cmd->flags &
+	     ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
+	       IOMMU_IOAS_MAP_READABLE)))
+		return -EOPNOTSUPP;
+	if (cmd->length >= ULONG_MAX)
+		return -EOVERFLOW;
+
+	src_ioas = iommufd_get_ioas(ucmd, cmd->src_ioas_id);
+	if (IS_ERR(src_ioas))
+		return PTR_ERR(src_ioas);
+	rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length,
+			    &pages_list);
+	iommufd_put_object(&src_ioas->obj);
+	if (rc)
+		goto out_pages;
+
+	dst_ioas = iommufd_get_ioas(ucmd, cmd->dst_ioas_id);
+	if (IS_ERR(dst_ioas)) {
+		rc = PTR_ERR(dst_ioas);
+		goto out_pages;
+	}
+
+	if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
+		flags = IOPT_ALLOC_IOVA;
+	iova = cmd->dst_iova;
+	rc = iopt_map_pages(&dst_ioas->iopt, &pages_list, cmd->length, &iova,
+			    conv_iommu_prot(cmd->flags), flags);
+	if (rc)
+		goto out_put_dst;
+
+	cmd->dst_iova = iova;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_put_dst:
+	iommufd_put_object(&dst_ioas->obj);
+out_pages:
+	iopt_free_pages_list(&pages_list);
+	return rc;
+}
+
+int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_ioas_unmap *cmd = ucmd->cmd;
+	struct iommufd_ioas *ioas;
+	unsigned long unmapped = 0;
+	int rc;
+
+	ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+	if (IS_ERR(ioas))
+		return PTR_ERR(ioas);
+
+	if (cmd->iova == 0 && cmd->length == U64_MAX) {
+		rc = iopt_unmap_all(&ioas->iopt, &unmapped);
+		if (rc)
+			goto out_put;
+	} else {
+		if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) {
+			rc = -EOVERFLOW;
+			goto out_put;
+		}
+		rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length,
+				     &unmapped);
+		if (rc)
+			goto out_put;
+	}
+
+	cmd->length = unmapped;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+
+out_put:
+	iommufd_put_object(&ioas->obj);
+	return rc;
+}
+
+int iommufd_option_rlimit_mode(struct iommu_option *cmd,
+			       struct iommufd_ctx *ictx)
+{
+	if (cmd->op == IOMMU_OPTION_OP_GET) {
+		cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM;
+		return 0;
+	}
+	if (cmd->op == IOMMU_OPTION_OP_SET) {
+		int rc = 0;
+
+		if (!capable(CAP_SYS_RESOURCE))
+			return -EPERM;
+
+		xa_lock(&ictx->objects);
+		if (!xa_empty(&ictx->objects)) {
+			rc = -EBUSY;
+		} else {
+			if (cmd->val64 == 0)
+				ictx->account_mode = IOPT_PAGES_ACCOUNT_USER;
+			else if (cmd->val64 == 1)
+				ictx->account_mode = IOPT_PAGES_ACCOUNT_MM;
+			else
+				rc = -EINVAL;
+		}
+		xa_unlock(&ictx->objects);
+
+		return rc;
+	}
+	return -EOPNOTSUPP;
+}
+
+static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd,
+					  struct iommufd_ioas *ioas)
+{
+	if (cmd->op == IOMMU_OPTION_OP_GET) {
+		cmd->val64 = !ioas->iopt.disable_large_pages;
+		return 0;
+	}
+	if (cmd->op == IOMMU_OPTION_OP_SET) {
+		if (cmd->val64 == 0)
+			return iopt_disable_large_pages(&ioas->iopt);
+		if (cmd->val64 == 1) {
+			iopt_enable_large_pages(&ioas->iopt);
+			return 0;
+		}
+		return -EINVAL;
+	}
+	return -EOPNOTSUPP;
+}
+
+int iommufd_ioas_option(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_option *cmd = ucmd->cmd;
+	struct iommufd_ioas *ioas;
+	int rc = 0;
+
+	ioas = iommufd_get_ioas(ucmd, cmd->object_id);
+	if (IS_ERR(ioas))
+		return PTR_ERR(ioas);
+
+	switch (cmd->option_id) {
+	case IOMMU_OPTION_HUGE_PAGES:
+		rc = iommufd_ioas_option_huge_pages(cmd, ioas);
+		break;
+	default:
+		rc = -EOPNOTSUPP;
+	}
+
+	iommufd_put_object(&ioas->obj);
+	return rc;
+}
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index ba8ecdc209ab6e..20e1e3801fc525 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -11,6 +11,7 @@ 
 
 struct iommu_domain;
 struct iommu_group;
+struct iommu_option;
 
 struct iommufd_ctx {
 	struct file *file;
@@ -101,6 +102,7 @@  static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd,
 enum iommufd_object_type {
 	IOMMUFD_OBJ_NONE,
 	IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
+	IOMMUFD_OBJ_IOAS,
 };
 
 /* Base struct for all objects with a userspace ID handle. */
@@ -172,4 +174,34 @@  struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
 			     type),                                            \
 		     typeof(*(ptr)), obj)
 
+/*
+ * The IO Address Space (IOAS) pagetable is a virtual page table backed by the
+ * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The
+ * mapping is copied into all of the associated domains and made available to
+ * in-kernel users.
+ */
+struct iommufd_ioas {
+	struct iommufd_object obj;
+	struct io_pagetable iopt;
+};
+
+static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ucmd *ucmd,
+						    u32 id)
+{
+	return container_of(iommufd_get_object(ucmd->ictx, id,
+					       IOMMUFD_OBJ_IOAS),
+			    struct iommufd_ioas, obj);
+}
+
+struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx);
+int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_ioas_destroy(struct iommufd_object *obj);
+int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_map(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_copy(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
+int iommufd_option_rlimit_mode(struct iommu_option *cmd,
+			       struct iommufd_ctx *ictx);
 #endif
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index d4f3aa06ee28f8..c91156ba3e3ab3 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -202,8 +202,36 @@  static int iommufd_fops_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static int iommufd_option(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_option *cmd = ucmd->cmd;
+	int rc;
+
+	switch (cmd->option_id) {
+	case IOMMU_OPTION_RLIMIT_MODE:
+		rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
+		break;
+	case IOMMU_OPTION_HUGE_PAGES:
+		rc = iommufd_ioas_option(ucmd);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+	if (rc)
+		return rc;
+	if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64,
+			 &cmd->val64, sizeof(cmd->val64)))
+		return -EFAULT;
+	return 0;
+}
+
 union ucmd_buffer {
 	struct iommu_destroy destroy;
+	struct iommu_ioas_alloc alloc;
+	struct iommu_ioas_allow_iovas allow_iovas;
+	struct iommu_ioas_iova_ranges iova_ranges;
+	struct iommu_ioas_map map;
+	struct iommu_ioas_unmap unmap;
 };
 
 struct iommufd_ioctl_op {
@@ -224,6 +252,20 @@  struct iommufd_ioctl_op {
 	}
 static struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 	IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id),
+	IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
+		 struct iommu_ioas_alloc, out_ioas_id),
+	IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
+		 struct iommu_ioas_allow_iovas, allowed_iovas),
+	IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
+		 src_iova),
+	IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
+		 struct iommu_ioas_iova_ranges, out_iova_alignment),
+	IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map,
+		 __reserved),
+	IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap,
+		 length),
+	IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
+		 val64),
 };
 
 static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd,
@@ -310,6 +352,9 @@  void iommufd_ctx_put(struct iommufd_ctx *ictx)
 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD);
 
 static struct iommufd_object_ops iommufd_object_ops[] = {
+	[IOMMUFD_OBJ_IOAS] = {
+		.destroy = iommufd_ioas_destroy,
+	},
 };
 
 static struct miscdevice iommu_misc_dev = {
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index b77b7eb0aeb13c..06608cecbe19e6 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -37,6 +37,13 @@ 
 enum {
 	IOMMUFD_CMD_BASE = 0x80,
 	IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE,
+	IOMMUFD_CMD_IOAS_ALLOC,
+	IOMMUFD_CMD_IOAS_ALLOW_IOVAS,
+	IOMMUFD_CMD_IOAS_COPY,
+	IOMMUFD_CMD_IOAS_IOVA_RANGES,
+	IOMMUFD_CMD_IOAS_MAP,
+	IOMMUFD_CMD_IOAS_UNMAP,
+	IOMMUFD_CMD_OPTION,
 };
 
 /**
@@ -52,4 +59,236 @@  struct iommu_destroy {
 };
 #define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY)
 
+/**
+ * struct iommu_ioas_alloc - ioctl(IOMMU_IOAS_ALLOC)
+ * @size: sizeof(struct iommu_ioas_alloc)
+ * @flags: Must be 0
+ * @out_ioas_id: Output IOAS ID for the allocated object
+ *
+ * Allocate an IO Address Space (IOAS) which holds an IO Virtual Address (IOVA)
+ * to memory mapping.
+ */
+struct iommu_ioas_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 out_ioas_id;
+};
+#define IOMMU_IOAS_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOC)
+
+/**
+ * struct iommu_iova_range
+ * @start: First IOVA
+ * @last: Inclusive last IOVA
+ *
+ * An interval in IOVA space.
+ */
+struct iommu_iova_range {
+	__aligned_u64 start;
+	__aligned_u64 last;
+};
+
+/**
+ * struct iommu_ioas_iova_ranges - ioctl(IOMMU_IOAS_IOVA_RANGES)
+ * @size: sizeof(struct iommu_ioas_iova_ranges)
+ * @ioas_id: IOAS ID to read ranges from
+ * @num_iovas: Input/Output total number of ranges in the IOAS
+ * @__reserved: Must be 0
+ * @allowed_iovas: Pointer to the output array of struct iommu_iova_range
+ * @out_iova_alignment: Minimum alignment required for mapping IOVA
+ *
+ * Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these ranges
+ * is not allowed. out_num_iovas will be set to the total number of iovas and
+ * the out_valid_iovas[] will be filled in as space permits.
+ *
+ * The allowed ranges are dependent on the HW path the DMA operation takes, and
+ * can change during the lifetime of the IOAS. A fresh empty IOAS will have a
+ * full range, and each attached device will narrow the ranges based on that
+ * devices HW restrictions. Detatching a device can widen the ranges. Userspace
+ * should query ranges after every attach/detatch to know what IOVAs are valid
+ * for mapping.
+ *
+ * On input num_iovas is the length of the allowed_iovas array. On output it is
+ * the total number of iovas filled in. The ioctl will return -EMSGSIZE and set
+ * num_iovas to the required value if num_iovas is too small. In this case the
+ * caller should allocate a larger output array and re-issue the ioctl.
+ */
+struct iommu_ioas_iova_ranges {
+	__u32 size;
+	__u32 ioas_id;
+	__u32 num_iovas;
+	__u32 __reserved;
+	__aligned_u64 allowed_iovas;
+	__aligned_u64 out_iova_alignment;
+};
+#define IOMMU_IOAS_IOVA_RANGES _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_IOVA_RANGES)
+
+/**
+ * struct iommu_ioas_allow_iovas - ioctl(IOMMU_IOAS_ALLOW_IOVAS)
+ * @size: sizeof(struct iommu_ioas_allow_iovas)
+ * @ioas_id: IOAS ID to allow IOVAs from
+ * @allowed_iovas: Pointer to array of struct iommu_iova_range
+ *
+ * Ensure a range of IOVAs are always available for allocation. If this call
+ * succeeds then IOMMU_IOAS_IOVA_RANGES will never return a list of IOVA ranges
+ * that are narrower than the ranges provided here. This call will fail if
+ * IOMMU_IOAS_IOVA_RANGES is currently narrower than the given ranges.
+ *
+ * When an IOAS is first created the IOVA_RANGES will be maximally sized, and as
+ * devices are attached the IOVA will narrow based on the device restrictions.
+ * When an allowed range is specified any narrowing will be refused, ie device
+ * attachment can fail if the device requires limiting within the allowed range.
+ *
+ * Automatic IOVA allocation is also impacted by this call. MAP will only
+ * allocate within the allowed IOVAs if they are present.
+ *
+ * This call replaces the entire allowed list with the given list.
+ */
+struct iommu_ioas_allow_iovas {
+	__u32 size;
+	__u32 ioas_id;
+	__u32 num_iovas;
+	__u32 __reserved;
+	__aligned_u64 allowed_iovas;
+};
+#define IOMMU_IOAS_ALLOW_IOVAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOW_IOVAS)
+
+/**
+ * enum iommufd_ioas_map_flags - Flags for map and copy
+ * @IOMMU_IOAS_MAP_FIXED_IOVA: If clear the kernel will compute an appropriate
+ *                             IOVA to place the mapping at
+ * @IOMMU_IOAS_MAP_WRITEABLE: DMA is allowed to write to this mapping
+ * @IOMMU_IOAS_MAP_READABLE: DMA is allowed to read from this mapping
+ */
+enum iommufd_ioas_map_flags {
+	IOMMU_IOAS_MAP_FIXED_IOVA = 1 << 0,
+	IOMMU_IOAS_MAP_WRITEABLE = 1 << 1,
+	IOMMU_IOAS_MAP_READABLE = 1 << 2,
+};
+
+/**
+ * struct iommu_ioas_map - ioctl(IOMMU_IOAS_MAP)
+ * @size: sizeof(struct iommu_ioas_map)
+ * @flags: Combination of enum iommufd_ioas_map_flags
+ * @ioas_id: IOAS ID to change the mapping of
+ * @__reserved: Must be 0
+ * @user_va: Userspace pointer to start mapping from
+ * @length: Number of bytes to map
+ * @iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is set
+ *        then this must be provided as input.
+ *
+ * Set an IOVA mapping from a user pointer. If FIXED_IOVA is specified then the
+ * mapping will be established at iova, otherwise a suitable location based on
+ * the reserved and allowed lists will be automatically selected and returned in
+ * iova.
+ */
+struct iommu_ioas_map {
+	__u32 size;
+	__u32 flags;
+	__u32 ioas_id;
+	__u32 __reserved;
+	__aligned_u64 user_va;
+	__aligned_u64 length;
+	__aligned_u64 iova;
+};
+#define IOMMU_IOAS_MAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP)
+
+/**
+ * struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY)
+ * @size: sizeof(struct iommu_ioas_copy)
+ * @flags: Combination of enum iommufd_ioas_map_flags
+ * @dst_ioas_id: IOAS ID to change the mapping of
+ * @src_ioas_id: IOAS ID to copy from
+ * @length: Number of bytes to copy and map
+ * @dst_iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is
+ *            set then this must be provided as input.
+ * @src_iova: IOVA to start the copy
+ *
+ * Copy an already existing mapping from src_ioas_id and establish it in
+ * dst_ioas_id. The src iova/length must exactly match a range used with
+ * IOMMU_IOAS_MAP.
+ *
+ * This may be used to efficiently clone a subset of an IOAS to another, or as a
+ * kind of 'cache' to speed up mapping. Copy has an effciency advantage over
+ * establishing equivalent new mappings, as internal resources are shared, and
+ * the kernel will pin the user memory only once.
+ */
+struct iommu_ioas_copy {
+	__u32 size;
+	__u32 flags;
+	__u32 dst_ioas_id;
+	__u32 src_ioas_id;
+	__aligned_u64 length;
+	__aligned_u64 dst_iova;
+	__aligned_u64 src_iova;
+};
+#define IOMMU_IOAS_COPY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_COPY)
+
+/**
+ * struct iommu_ioas_unmap - ioctl(IOMMU_IOAS_UNMAP)
+ * @size: sizeof(struct iommu_ioas_unmap)
+ * @ioas_id: IOAS ID to change the mapping of
+ * @iova: IOVA to start the unmapping at
+ * @length: Number of bytes to unmap, and return back the bytes unmapped
+ *
+ * Unmap an IOVA range. The iova/length must be a superset of a previously
+ * mapped range used with IOMMU_IOAS_PAGETABLE_MAP or COPY. Splitting or
+ * truncating ranges is not allowed. The values 0 to U64_MAX will unmap
+ * everything.
+ */
+struct iommu_ioas_unmap {
+	__u32 size;
+	__u32 ioas_id;
+	__aligned_u64 iova;
+	__aligned_u64 length;
+};
+#define IOMMU_IOAS_UNMAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_UNMAP)
+
+/**
+ * enum iommufd_option
+ * @IOMMU_OPTION_RLIMIT_MODE:
+ *    Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege
+ *    to invoke this. Value 0 (default) is user based accouting, 1 uses process
+ *    based accounting. Global option, object_id must be 0
+ * @IOMMU_OPTION_HUGE_PAGES:
+ *    Value 1 (default) allows contiguous pages to be combined when generating
+ *    iommu mappings. Value 0 disables combining, everything is mapped to
+ *    PAGE_SIZE. This can be useful for benchmarking.  This is a per-IOAS
+ *    option, the object_id must be the IOAS ID.
+ */
+enum iommufd_option {
+	IOMMU_OPTION_RLIMIT_MODE = 0,
+	IOMMU_OPTION_HUGE_PAGES = 1,
+};
+
+/**
+ * enum iommufd_option_ops
+ * @IOMMU_OPTION_OP_SET: Set the option's value
+ * @IOMMU_OPTION_OP_GET: Get the option's value
+ */
+enum iommufd_option_ops {
+	IOMMU_OPTION_OP_SET = 0,
+	IOMMU_OPTION_OP_GET = 1,
+};
+
+/**
+ * @size: sizeof(struct iommu_option)
+ * @option_id: One of enum iommufd_option
+ * @op: One of enum iommufd_option_ops
+ * @__reserved: Must be 0
+ * @object_id: ID of the object if required
+ * @val64: Option value to set or value returned on get
+ *
+ * Change a simple option value. This multiplexor allows controlling a options
+ * on objects. IOMMU_OPTION_OP_SET will load an option and IOMMU_OPTION_OP_GET
+ * will return the current value.
+ */
+struct iommu_option {
+	__u32 size;
+	__u32 option_id;
+	__u16 op;
+	__u16 __reserved;
+	__u32 object_id;
+	__aligned_u64 val64;
+};
+#define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
 #endif