diff mbox series

[v3,08/17] iommufd: IOMMU_HWPT_ALLOC allocation with user data

Message ID 20230724110406.107212-9-yi.l.liu@intel.com (mailing list archive)
State New
Headers show
Series iommufd: Add nesting infrastructure | expand

Commit Message

Yi Liu July 24, 2023, 11:03 a.m. UTC
IOMMU_HWPT_ALLOC already supports iommu_domain allocation for usersapce.
But it can only allocate hw_pagetables linked with IOAS. There are needs
to support hw_pagetable allocation with parameters specified by user. For
example, in nested translation, user needs to allocate hw_pagetable for
the stage-1 translation (e.g. a single I/O page table or a set of I/O page
tables) with user data. It also needs to provide a stage-2 hw_pagetable
which is linked to the GPA IOAS.

This extends IOMMU_HWPT_ALLOC to accept user specified parameter and hwpt
ID in @pt_id field. It can be used to allocate user-managed stage-1 hwpt,
which requires a parent hwpt to point to the stage-2 translation.

Co-developed-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/iommu/iommufd/hw_pagetable.c | 65 ++++++++++++++++++++++++----
 drivers/iommu/iommufd/main.c         |  2 +-
 include/uapi/linux/iommufd.h         | 20 ++++++++-
 3 files changed, 77 insertions(+), 10 deletions(-)

Comments

Jason Gunthorpe July 28, 2023, 5:55 p.m. UTC | #1
On Mon, Jul 24, 2023 at 04:03:57AM -0700, Yi Liu wrote:

> +	switch (pt_obj->type) {
> +	case IOMMUFD_OBJ_IOAS:
> +		ioas = container_of(pt_obj, struct iommufd_ioas, obj);
> +		break;
> +	case IOMMUFD_OBJ_HW_PAGETABLE:
> +		/* pt_id points HWPT only when hwpt_type is !IOMMU_HWPT_TYPE_DEFAULT */
> +		if (cmd->hwpt_type == IOMMU_HWPT_TYPE_DEFAULT) {
> +			rc = -EINVAL;
> +			goto out_put_pt;
> +		}
> +
> +		parent = container_of(pt_obj, struct iommufd_hw_pagetable, obj);
> +		/*
> +		 * Cannot allocate user-managed hwpt linking to auto_created
> +		 * hwpt. If the parent hwpt is already a user-managed hwpt,
> +		 * don't allocate another user-managed hwpt linking to it.
> +		 */
> +		if (parent->auto_domain || parent->parent) {
> +			rc = -EINVAL;
> +			goto out_put_pt;
> +		}
> +		ioas = parent->ioas;

Why do we set ioas here? I would think it should be NULL.

I think it is looking like a mistake to try and re-use
iommufd_hw_pagetable_alloc() directly for the nested case. It should
not have a IOAS argument, it should not do enforce_cc, or iopt_*
functions

So must of the function is removed. Probably better to make a new
ioas-less function for the nesting case.

> diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
> index 510db114fc61..5f4420626421 100644
> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -426,7 +426,7 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
>  	IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info,
>  		 __reserved),
>  	IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
> -		 __reserved),
> +		 data_uptr),

Nono, these can never change once we put them it. It specifies the
hard minimum size that userspace must provide. If userspace gives less
than this then the ioctl always fails. Changing it breaks all the
existing software.

The core code ensures that the trailing part of the cmd struct is
zero'd the extended implementation must cope with Zero'd values, which
this does.

> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index f2026cde2d64..73bf9af91e99 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -364,12 +364,27 @@ enum iommu_hwpt_type {
>   * @pt_id: The IOAS to connect this HWPT to
>   * @out_hwpt_id: The ID of the new HWPT
>   * @__reserved: Must be 0
> + * @hwpt_type: One of enum iommu_hwpt_type
> + * @data_len: Length of the type specific data
> + * @data_uptr: User pointer to the type specific data
>   *
>   * Explicitly allocate a hardware page table object. This is the same object
>   * type that is returned by iommufd_device_attach() and represents the
>   * underlying iommu driver's iommu_domain kernel object.
>   *
> - * A HWPT will be created with the IOVA mappings from the given IOAS.
> + * A kernel-managed HWPT will be created with the mappings from the given
> + * IOAS via the @pt_id. The @hwpt_type for this allocation can be set to
> + * either IOMMU_HWPT_TYPE_DEFAULT or a pre-defined type corresponding to
> + * an I/O page table type supported by the underlying IOMMU hardware.


> + * A user-managed HWPT will be created from a given parent HWPT via the
> + * @pt_id, in which the parent HWPT must be allocated previously via the
> + * same ioctl from a given IOAS (@pt_id). In this case, the @hwpt_type
> + * must be set to a pre-defined type corresponding to an I/O page table
> + * type supported by the underlying IOMMU hardware.
> + *
> + * If the @hwpt_type is set to IOMMU_HWPT_TYPE_DEFAULT, both the @data_len
> + * and the @data_uptr will be ignored. Otherwise, both must be
> given.

 If the @hwpt_type is set to IOMMU_HWPT_TYPE_DEFAULT then @data_len
 must be zero.

Jason
Nicolin Chen July 28, 2023, 7:10 p.m. UTC | #2
On Fri, Jul 28, 2023 at 02:55:57PM -0300, Jason Gunthorpe wrote:
> On Mon, Jul 24, 2023 at 04:03:57AM -0700, Yi Liu wrote:
> 
> > +	switch (pt_obj->type) {
> > +	case IOMMUFD_OBJ_IOAS:
> > +		ioas = container_of(pt_obj, struct iommufd_ioas, obj);
> > +		break;
> > +	case IOMMUFD_OBJ_HW_PAGETABLE:
> > +		/* pt_id points HWPT only when hwpt_type is !IOMMU_HWPT_TYPE_DEFAULT */
> > +		if (cmd->hwpt_type == IOMMU_HWPT_TYPE_DEFAULT) {
> > +			rc = -EINVAL;
> > +			goto out_put_pt;
> > +		}
> > +
> > +		parent = container_of(pt_obj, struct iommufd_hw_pagetable, obj);
> > +		/*
> > +		 * Cannot allocate user-managed hwpt linking to auto_created
> > +		 * hwpt. If the parent hwpt is already a user-managed hwpt,
> > +		 * don't allocate another user-managed hwpt linking to it.
> > +		 */
> > +		if (parent->auto_domain || parent->parent) {
> > +			rc = -EINVAL;
> > +			goto out_put_pt;
> > +		}
> > +		ioas = parent->ioas;
> 
> Why do we set ioas here? I would think it should be NULL.
> 
> I think it is looking like a mistake to try and re-use
> iommufd_hw_pagetable_alloc() directly for the nested case. It should
> not have a IOAS argument, it should not do enforce_cc, or iopt_*
> functions
> 
> So must of the function is removed. Probably better to make a new
> ioas-less function for the nesting case.

OK.

@Yi, along with the PATCH-6 (IOMMU_RESV_SW_MSI), I will rework
on this -- mainly breaking up NESTED hwpt with IOAS.

Thanks
Nic
Tian, Kevin July 31, 2023, 6:31 a.m. UTC | #3
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Saturday, July 29, 2023 1:56 AM
> 
> On Mon, Jul 24, 2023 at 04:03:57AM -0700, Yi Liu wrote:
> 
> > +	switch (pt_obj->type) {
> > +	case IOMMUFD_OBJ_IOAS:
> > +		ioas = container_of(pt_obj, struct iommufd_ioas, obj);
> > +		break;
> > +	case IOMMUFD_OBJ_HW_PAGETABLE:
> > +		/* pt_id points HWPT only when hwpt_type
> is !IOMMU_HWPT_TYPE_DEFAULT */
> > +		if (cmd->hwpt_type == IOMMU_HWPT_TYPE_DEFAULT) {
> > +			rc = -EINVAL;
> > +			goto out_put_pt;
> > +		}
> > +
> > +		parent = container_of(pt_obj, struct iommufd_hw_pagetable,
> obj);
> > +		/*
> > +		 * Cannot allocate user-managed hwpt linking to
> auto_created
> > +		 * hwpt. If the parent hwpt is already a user-managed hwpt,
> > +		 * don't allocate another user-managed hwpt linking to it.
> > +		 */
> > +		if (parent->auto_domain || parent->parent) {
> > +			rc = -EINVAL;
> > +			goto out_put_pt;
> > +		}
> > +		ioas = parent->ioas;
> 
> Why do we set ioas here? I would think it should be NULL.
> 
> I think it is looking like a mistake to try and re-use
> iommufd_hw_pagetable_alloc() directly for the nested case. It should
> not have a IOAS argument, it should not do enforce_cc, or iopt_*
> functions

enforce_cc is still required since it's about memory accesses post
page table walking (no matter the walked table is single stage or
nested).

> 
> So must of the function is removed. Probably better to make a new
> ioas-less function for the nesting case.
> 
> > diff --git a/drivers/iommu/iommufd/main.c
> b/drivers/iommu/iommufd/main.c
> > index 510db114fc61..5f4420626421 100644
> > --- a/drivers/iommu/iommufd/main.c
> > +++ b/drivers/iommu/iommufd/main.c
> > @@ -426,7 +426,7 @@ static const struct iommufd_ioctl_op
> iommufd_ioctl_ops[] = {
> >  	IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct
> iommu_hw_info,
> >  		 __reserved),
> >  	IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct
> iommu_hwpt_alloc,
> > -		 __reserved),
> > +		 data_uptr),
> 
> Nono, these can never change once we put them it. It specifies the
> hard minimum size that userspace must provide. If userspace gives less
> than this then the ioctl always fails. Changing it breaks all the
> existing software.
> 
> The core code ensures that the trailing part of the cmd struct is
> zero'd the extended implementation must cope with Zero'd values, which
> this does.
> 

Ideally expanding uAPI structure size should come with new flag bits.

this one might be a bit special. hwpt_alloc() series was just queued to
iommufd-next. If the nesting series could come together in one cycle
then probably changing it in current way is fine since there is no
existing software. Otherwise we need follow common practice. 
Yi Liu July 31, 2023, 7:22 a.m. UTC | #4
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Saturday, July 29, 2023 3:10 AM
> 
> On Fri, Jul 28, 2023 at 02:55:57PM -0300, Jason Gunthorpe wrote:
> > On Mon, Jul 24, 2023 at 04:03:57AM -0700, Yi Liu wrote:
> >
> > > +	switch (pt_obj->type) {
> > > +	case IOMMUFD_OBJ_IOAS:
> > > +		ioas = container_of(pt_obj, struct iommufd_ioas, obj);
> > > +		break;
> > > +	case IOMMUFD_OBJ_HW_PAGETABLE:
> > > +		/* pt_id points HWPT only when hwpt_type
> is !IOMMU_HWPT_TYPE_DEFAULT */
> > > +		if (cmd->hwpt_type == IOMMU_HWPT_TYPE_DEFAULT) {
> > > +			rc = -EINVAL;
> > > +			goto out_put_pt;
> > > +		}
> > > +
> > > +		parent = container_of(pt_obj, struct iommufd_hw_pagetable, obj);
> > > +		/*
> > > +		 * Cannot allocate user-managed hwpt linking to auto_created
> > > +		 * hwpt. If the parent hwpt is already a user-managed hwpt,
> > > +		 * don't allocate another user-managed hwpt linking to it.
> > > +		 */
> > > +		if (parent->auto_domain || parent->parent) {
> > > +			rc = -EINVAL;
> > > +			goto out_put_pt;
> > > +		}
> > > +		ioas = parent->ioas;
> >
> > Why do we set ioas here? I would think it should be NULL.
> >
> > I think it is looking like a mistake to try and re-use
> > iommufd_hw_pagetable_alloc() directly for the nested case. It should
> > not have a IOAS argument, it should not do enforce_cc, or iopt_*
> > functions
> >
> > So must of the function is removed. Probably better to make a new
> > ioas-less function for the nesting case.
> 
> OK.
> 
> @Yi, along with the PATCH-6 (IOMMU_RESV_SW_MSI), I will rework
> on this -- mainly breaking up NESTED hwpt with IOAS.

Thanks. Then I'll address the hw_info comments first.

Regards,
Yi Liu
Jason Gunthorpe July 31, 2023, 1:16 p.m. UTC | #5
On Mon, Jul 31, 2023 at 06:31:20AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Saturday, July 29, 2023 1:56 AM
> > 
> > On Mon, Jul 24, 2023 at 04:03:57AM -0700, Yi Liu wrote:
> > 
> > > +	switch (pt_obj->type) {
> > > +	case IOMMUFD_OBJ_IOAS:
> > > +		ioas = container_of(pt_obj, struct iommufd_ioas, obj);
> > > +		break;
> > > +	case IOMMUFD_OBJ_HW_PAGETABLE:
> > > +		/* pt_id points HWPT only when hwpt_type
> > is !IOMMU_HWPT_TYPE_DEFAULT */
> > > +		if (cmd->hwpt_type == IOMMU_HWPT_TYPE_DEFAULT) {
> > > +			rc = -EINVAL;
> > > +			goto out_put_pt;
> > > +		}
> > > +
> > > +		parent = container_of(pt_obj, struct iommufd_hw_pagetable,
> > obj);
> > > +		/*
> > > +		 * Cannot allocate user-managed hwpt linking to
> > auto_created
> > > +		 * hwpt. If the parent hwpt is already a user-managed hwpt,
> > > +		 * don't allocate another user-managed hwpt linking to it.
> > > +		 */
> > > +		if (parent->auto_domain || parent->parent) {
> > > +			rc = -EINVAL;
> > > +			goto out_put_pt;
> > > +		}
> > > +		ioas = parent->ioas;
> > 
> > Why do we set ioas here? I would think it should be NULL.
> > 
> > I think it is looking like a mistake to try and re-use
> > iommufd_hw_pagetable_alloc() directly for the nested case. It should
> > not have a IOAS argument, it should not do enforce_cc, or iopt_*
> > functions
> 
> enforce_cc is still required since it's about memory accesses post
> page table walking (no matter the walked table is single stage or
> nested).

enforce_cc only has meaning if the kernel owns the IOPTEs, and if we
are creating a nest it does not. The guest has to set any necessary
IOPTE bits.

enforce_cc will be done on the parent of the nest as normal.

> Ideally expanding uAPI structure size should come with new flag bits.

Flags or some kind of 'zero is the same behavior as a smaller struct'
scheme.

This patch is doing the zero option:

 	__u32 __reserved;
+	__u32 hwpt_type;
+	__u32 data_len;
+	__aligned_u64 data_uptr;
 };

hwpt_type == 0 means default type
data_len == 0 means no data
data_uptr is ignored (zero is safe)

So there is no need to change it

Jason
Tian, Kevin Aug. 1, 2023, 2:35 a.m. UTC | #6
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Monday, July 31, 2023 9:16 PM
> 
> On Mon, Jul 31, 2023 at 06:31:20AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Saturday, July 29, 2023 1:56 AM
> > >
> > > On Mon, Jul 24, 2023 at 04:03:57AM -0700, Yi Liu wrote:
> > >
> > > > +	switch (pt_obj->type) {
> > > > +	case IOMMUFD_OBJ_IOAS:
> > > > +		ioas = container_of(pt_obj, struct iommufd_ioas, obj);
> > > > +		break;
> > > > +	case IOMMUFD_OBJ_HW_PAGETABLE:
> > > > +		/* pt_id points HWPT only when hwpt_type
> > > is !IOMMU_HWPT_TYPE_DEFAULT */
> > > > +		if (cmd->hwpt_type == IOMMU_HWPT_TYPE_DEFAULT) {
> > > > +			rc = -EINVAL;
> > > > +			goto out_put_pt;
> > > > +		}
> > > > +
> > > > +		parent = container_of(pt_obj, struct iommufd_hw_pagetable,
> > > obj);
> > > > +		/*
> > > > +		 * Cannot allocate user-managed hwpt linking to
> > > auto_created
> > > > +		 * hwpt. If the parent hwpt is already a user-managed hwpt,
> > > > +		 * don't allocate another user-managed hwpt linking to it.
> > > > +		 */
> > > > +		if (parent->auto_domain || parent->parent) {
> > > > +			rc = -EINVAL;
> > > > +			goto out_put_pt;
> > > > +		}
> > > > +		ioas = parent->ioas;
> > >
> > > Why do we set ioas here? I would think it should be NULL.
> > >
> > > I think it is looking like a mistake to try and re-use
> > > iommufd_hw_pagetable_alloc() directly for the nested case. It should
> > > not have a IOAS argument, it should not do enforce_cc, or iopt_*
> > > functions
> >
> > enforce_cc is still required since it's about memory accesses post
> > page table walking (no matter the walked table is single stage or
> > nested).
> 
> enforce_cc only has meaning if the kernel owns the IOPTEs, and if we
> are creating a nest it does not. The guest has to set any necessary
> IOPTE bits.
> 
> enforce_cc will be done on the parent of the nest as normal.

Ah, yes. What I described is the final behavior but in concept it's
done for the parent.

> 
> > Ideally expanding uAPI structure size should come with new flag bits.
> 
> Flags or some kind of 'zero is the same behavior as a smaller struct'
> scheme.
> 
> This patch is doing the zero option:
> 
>  	__u32 __reserved;
> +	__u32 hwpt_type;
> +	__u32 data_len;
> +	__aligned_u64 data_uptr;
>  };
> 
> hwpt_type == 0 means default type
> data_len == 0 means no data
> data_uptr is ignored (zero is safe)
> 
> So there is no need to change it
> 

Make sense
Nicolin Chen Aug. 2, 2023, 11:42 p.m. UTC | #7
On Mon, Jul 31, 2023 at 10:16:17AM -0300, Jason Gunthorpe wrote:
 
> > Ideally expanding uAPI structure size should come with new flag bits.
> 
> Flags or some kind of 'zero is the same behavior as a smaller struct'
> scheme.
> 
> This patch is doing the zero option:
> 
>  	__u32 __reserved;
> +	__u32 hwpt_type;
> +	__u32 data_len;
> +	__aligned_u64 data_uptr;
>  };
> 
> hwpt_type == 0 means default type
> data_len == 0 means no data
> data_uptr is ignored (zero is safe)
> 
> So there is no need to change it

TEST_LENGTH passing ".size = sizeof(struct _struct) - 1" expects a
-EINVAL error code from "if (ucmd.user_size < op->min_size)" check
in the iommufd_fops_ioctl(). This has been working when min_size is
exactly the size of the structure.

When the size of the structure becomes larger than min_size, i.e.
the passing size above is larger than min_size, it bypasses that
min_size sanity and goes down to an ioctl handler with a potential
risk. And actually, the size range can be [min_size, struct_size),
making it harder for us to sanitize with the existing code.

I wonder what's the generic way of sanitizing this case? And, it
seems that TEST_LENGTH needs some rework to test min_size only?

Thanks
Nic
Jason Gunthorpe Aug. 2, 2023, 11:43 p.m. UTC | #8
On Wed, Aug 02, 2023 at 04:42:10PM -0700, Nicolin Chen wrote:
> On Mon, Jul 31, 2023 at 10:16:17AM -0300, Jason Gunthorpe wrote:
>  
> > > Ideally expanding uAPI structure size should come with new flag bits.
> > 
> > Flags or some kind of 'zero is the same behavior as a smaller struct'
> > scheme.
> > 
> > This patch is doing the zero option:
> > 
> >  	__u32 __reserved;
> > +	__u32 hwpt_type;
> > +	__u32 data_len;
> > +	__aligned_u64 data_uptr;
> >  };
> > 
> > hwpt_type == 0 means default type
> > data_len == 0 means no data
> > data_uptr is ignored (zero is safe)
> > 
> > So there is no need to change it
> 
> TEST_LENGTH passing ".size = sizeof(struct _struct) - 1" expects a
> -EINVAL error code from "if (ucmd.user_size < op->min_size)" check
> in the iommufd_fops_ioctl(). This has been working when min_size is
> exactly the size of the structure.
> 
> When the size of the structure becomes larger than min_size, i.e.
> the passing size above is larger than min_size, it bypasses that
> min_size sanity and goes down to an ioctl handler with a potential
> risk. And actually, the size range can be [min_size, struct_size),
> making it harder for us to sanitize with the existing code.
> 
> I wonder what's the generic way of sanitizing this case? And, it
> seems that TEST_LENGTH needs some rework to test min_size only?

Yes, it should technically test using offsetof and a matching set of
struct members.

Jason
Nicolin Chen Aug. 3, 2023, 12:53 a.m. UTC | #9
On Wed, Aug 02, 2023 at 08:43:12PM -0300, Jason Gunthorpe wrote:
> On Wed, Aug 02, 2023 at 04:42:10PM -0700, Nicolin Chen wrote:
> > On Mon, Jul 31, 2023 at 10:16:17AM -0300, Jason Gunthorpe wrote:
> >  
> > > > Ideally expanding uAPI structure size should come with new flag bits.
> > > 
> > > Flags or some kind of 'zero is the same behavior as a smaller struct'
> > > scheme.
> > > 
> > > This patch is doing the zero option:
> > > 
> > >  	__u32 __reserved;
> > > +	__u32 hwpt_type;
> > > +	__u32 data_len;
> > > +	__aligned_u64 data_uptr;
> > >  };
> > > 
> > > hwpt_type == 0 means default type
> > > data_len == 0 means no data
> > > data_uptr is ignored (zero is safe)
> > > 
> > > So there is no need to change it
> > 
> > TEST_LENGTH passing ".size = sizeof(struct _struct) - 1" expects a
> > -EINVAL error code from "if (ucmd.user_size < op->min_size)" check
> > in the iommufd_fops_ioctl(). This has been working when min_size is
> > exactly the size of the structure.
> > 
> > When the size of the structure becomes larger than min_size, i.e.
> > the passing size above is larger than min_size, it bypasses that
> > min_size sanity and goes down to an ioctl handler with a potential
> > risk. And actually, the size range can be [min_size, struct_size),
> > making it harder for us to sanitize with the existing code.
> > 
> > I wonder what's the generic way of sanitizing this case? And, it
> > seems that TEST_LENGTH needs some rework to test min_size only?
> 
> Yes, it should technically test using offsetof and a matching set of
> struct members.

OK. I copied 3 lines for offsetofend from the kernel and did this:
--------------------------------------------------------------------------
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 6b075a68b928..a15a475c1243 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -86,12 +86,13 @@ TEST_F(iommufd, cmd_fail)

 TEST_F(iommufd, cmd_length)
 {
-#define TEST_LENGTH(_struct, _ioctl)                                     \
+#define TEST_LENGTH(_struct, _ioctl, _last)                              \
        {                                                                \
+               size_t min_size = offsetofend(struct _struct, _last);    \
                struct {                                                 \
                        struct _struct cmd;                              \
                        uint8_t extra;                                   \
-               } cmd = { .cmd = { .size = sizeof(struct _struct) - 1 }, \
+               } cmd = { .cmd = { .size = min_size - 1 },               \
                          .extra = UINT8_MAX };                          \
                int old_errno;                                           \
                int rc;                                                  \
--------------------------------------------------------------------------

Any misaligned size within the range of [min_size, struct_size) still
doesn't have a coverage though. Is this something that we have to let
it fail with a potential risk?

Thanks
Nic
Jason Gunthorpe Aug. 3, 2023, 4:47 p.m. UTC | #10
On Wed, Aug 02, 2023 at 05:53:40PM -0700, Nicolin Chen wrote:
> On Wed, Aug 02, 2023 at 08:43:12PM -0300, Jason Gunthorpe wrote:
> > On Wed, Aug 02, 2023 at 04:42:10PM -0700, Nicolin Chen wrote:
> > > On Mon, Jul 31, 2023 at 10:16:17AM -0300, Jason Gunthorpe wrote:
> > >  
> > > > > Ideally expanding uAPI structure size should come with new flag bits.
> > > > 
> > > > Flags or some kind of 'zero is the same behavior as a smaller struct'
> > > > scheme.
> > > > 
> > > > This patch is doing the zero option:
> > > > 
> > > >  	__u32 __reserved;
> > > > +	__u32 hwpt_type;
> > > > +	__u32 data_len;
> > > > +	__aligned_u64 data_uptr;
> > > >  };
> > > > 
> > > > hwpt_type == 0 means default type
> > > > data_len == 0 means no data
> > > > data_uptr is ignored (zero is safe)
> > > > 
> > > > So there is no need to change it
> > > 
> > > TEST_LENGTH passing ".size = sizeof(struct _struct) - 1" expects a
> > > -EINVAL error code from "if (ucmd.user_size < op->min_size)" check
> > > in the iommufd_fops_ioctl(). This has been working when min_size is
> > > exactly the size of the structure.
> > > 
> > > When the size of the structure becomes larger than min_size, i.e.
> > > the passing size above is larger than min_size, it bypasses that
> > > min_size sanity and goes down to an ioctl handler with a potential
> > > risk. And actually, the size range can be [min_size, struct_size),
> > > making it harder for us to sanitize with the existing code.
> > > 
> > > I wonder what's the generic way of sanitizing this case? And, it
> > > seems that TEST_LENGTH needs some rework to test min_size only?
> > 
> > Yes, it should technically test using offsetof and a matching set of
> > struct members.
> 
> OK. I copied 3 lines for offsetofend from the kernel and did this:
> --------------------------------------------------------------------------
> diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
> index 6b075a68b928..a15a475c1243 100644
> --- a/tools/testing/selftests/iommu/iommufd.c
> +++ b/tools/testing/selftests/iommu/iommufd.c
> @@ -86,12 +86,13 @@ TEST_F(iommufd, cmd_fail)
> 
>  TEST_F(iommufd, cmd_length)
>  {
> -#define TEST_LENGTH(_struct, _ioctl)                                     \
> +#define TEST_LENGTH(_struct, _ioctl, _last)                              \
>         {                                                                \
> +               size_t min_size = offsetofend(struct _struct, _last);    \
>                 struct {                                                 \
>                         struct _struct cmd;                              \
>                         uint8_t extra;                                   \
> -               } cmd = { .cmd = { .size = sizeof(struct _struct) - 1 }, \
> +               } cmd = { .cmd = { .size = min_size - 1 },               \
>                           .extra = UINT8_MAX };                          \
>                 int old_errno;                                           \
>                 int rc;                                                  \
> --------------------------------------------------------------------------
> 
> Any misaligned size within the range of [min_size, struct_size) still
> doesn't have a coverage though. Is this something that we have to let
> it fail with a potential risk?

It looks about right, I didn't try to test all the permutations, it
could be done but I'm not sure it has value.

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index c7301cf0e85a..97e4114226de 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -193,29 +193,75 @@  iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
 
 int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
 {
+	struct iommufd_hw_pagetable *hwpt, *parent = NULL;
+	union iommu_domain_user_data *data = NULL;
 	struct iommu_hwpt_alloc *cmd = ucmd->cmd;
-	struct iommufd_hw_pagetable *hwpt;
+	struct iommufd_object *pt_obj;
 	struct iommufd_device *idev;
 	struct iommufd_ioas *ioas;
-	int rc;
+	int rc = 0;
 
 	if (cmd->flags || cmd->__reserved)
 		return -EOPNOTSUPP;
+	if (!cmd->data_len && cmd->hwpt_type != IOMMU_HWPT_TYPE_DEFAULT)
+		return -EINVAL;
 
 	idev = iommufd_get_device(ucmd, cmd->dev_id);
 	if (IS_ERR(idev))
 		return PTR_ERR(idev);
 
-	ioas = iommufd_get_ioas(ucmd->ictx, cmd->pt_id);
-	if (IS_ERR(ioas)) {
-		rc = PTR_ERR(ioas);
+	pt_obj = iommufd_get_object(ucmd->ictx, cmd->pt_id, IOMMUFD_OBJ_ANY);
+	if (IS_ERR(pt_obj)) {
+		rc = -EINVAL;
 		goto out_put_idev;
 	}
 
+	switch (pt_obj->type) {
+	case IOMMUFD_OBJ_IOAS:
+		ioas = container_of(pt_obj, struct iommufd_ioas, obj);
+		break;
+	case IOMMUFD_OBJ_HW_PAGETABLE:
+		/* pt_id points HWPT only when hwpt_type is !IOMMU_HWPT_TYPE_DEFAULT */
+		if (cmd->hwpt_type == IOMMU_HWPT_TYPE_DEFAULT) {
+			rc = -EINVAL;
+			goto out_put_pt;
+		}
+
+		parent = container_of(pt_obj, struct iommufd_hw_pagetable, obj);
+		/*
+		 * Cannot allocate user-managed hwpt linking to auto_created
+		 * hwpt. If the parent hwpt is already a user-managed hwpt,
+		 * don't allocate another user-managed hwpt linking to it.
+		 */
+		if (parent->auto_domain || parent->parent) {
+			rc = -EINVAL;
+			goto out_put_pt;
+		}
+		ioas = parent->ioas;
+		break;
+	default:
+		rc = -EINVAL;
+		goto out_put_pt;
+	}
+
+	if (cmd->data_len) {
+		data = kzalloc(sizeof(*data), GFP_KERNEL);
+		if (!data) {
+			rc = -ENOMEM;
+			goto out_put_pt;
+		}
+
+		rc = copy_struct_from_user(data, sizeof(*data),
+					   u64_to_user_ptr(cmd->data_uptr),
+					   cmd->data_len);
+		if (rc)
+			goto out_free_data;
+	}
+
 	mutex_lock(&ioas->mutex);
 	hwpt = iommufd_hw_pagetable_alloc(ucmd->ictx, ioas, idev,
-					  IOMMU_HWPT_TYPE_DEFAULT,
-					  NULL, NULL, false);
+					  cmd->hwpt_type,
+					  parent, data, false);
 	if (IS_ERR(hwpt)) {
 		rc = PTR_ERR(hwpt);
 		goto out_unlock;
@@ -232,7 +278,10 @@  int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
 	iommufd_object_abort_and_destroy(ucmd->ictx, &hwpt->obj);
 out_unlock:
 	mutex_unlock(&ioas->mutex);
-	iommufd_put_object(&ioas->obj);
+out_free_data:
+	kfree(data);
+out_put_pt:
+	iommufd_put_object(pt_obj);
 out_put_idev:
 	iommufd_put_object(&idev->obj);
 	return rc;
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 510db114fc61..5f4420626421 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -426,7 +426,7 @@  static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 	IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info,
 		 __reserved),
 	IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
-		 __reserved),
+		 data_uptr),
 	IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
 		 struct iommu_ioas_alloc, out_ioas_id),
 	IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index f2026cde2d64..73bf9af91e99 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -364,12 +364,27 @@  enum iommu_hwpt_type {
  * @pt_id: The IOAS to connect this HWPT to
  * @out_hwpt_id: The ID of the new HWPT
  * @__reserved: Must be 0
+ * @hwpt_type: One of enum iommu_hwpt_type
+ * @data_len: Length of the type specific data
+ * @data_uptr: User pointer to the type specific data
  *
  * Explicitly allocate a hardware page table object. This is the same object
  * type that is returned by iommufd_device_attach() and represents the
  * underlying iommu driver's iommu_domain kernel object.
  *
- * A HWPT will be created with the IOVA mappings from the given IOAS.
+ * A kernel-managed HWPT will be created with the mappings from the given
+ * IOAS via the @pt_id. The @hwpt_type for this allocation can be set to
+ * either IOMMU_HWPT_TYPE_DEFAULT or a pre-defined type corresponding to
+ * an I/O page table type supported by the underlying IOMMU hardware.
+ *
+ * A user-managed HWPT will be created from a given parent HWPT via the
+ * @pt_id, in which the parent HWPT must be allocated previously via the
+ * same ioctl from a given IOAS (@pt_id). In this case, the @hwpt_type
+ * must be set to a pre-defined type corresponding to an I/O page table
+ * type supported by the underlying IOMMU hardware.
+ *
+ * If the @hwpt_type is set to IOMMU_HWPT_TYPE_DEFAULT, both the @data_len
+ * and the @data_uptr will be ignored. Otherwise, both must be given.
  */
 struct iommu_hwpt_alloc {
 	__u32 size;
@@ -378,6 +393,9 @@  struct iommu_hwpt_alloc {
 	__u32 pt_id;
 	__u32 out_hwpt_id;
 	__u32 __reserved;
+	__u32 hwpt_type;
+	__u32 data_len;
+	__aligned_u64 data_uptr;
 };
 #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC)