diff mbox series

[v3,10/19] iommufd: Add IOMMU_HWPT_GET_DIRTY_IOVA

Message ID 20230923012511.10379-11-joao.m.martins@oracle.com (mailing list archive)
State New, archived
Headers show
Series IOMMUFD Dirty Tracking | expand

Commit Message

Joao Martins Sept. 23, 2023, 1:25 a.m. UTC
Connect a hw_pagetable to the IOMMU core dirty tracking
read_and_clear_dirty iommu domain op. It exposes all of the functionality
for the UAPI that read the dirtied IOVAs while clearing the Dirty bits from
the PTEs

In doing so the previously internal iommufd_dirty_data structure is moved
over as the UAPI intermediate structure for representing iommufd dirty
bitmaps.

Contrary to past incantation of a similar interface in VFIO the IOVA range
to be scanned is tied in to the bitmap size, thus the application needs to
pass a appropriately sized bitmap address taking into account the iova
range being passed *and* page size ... as opposed to allowing
bitmap-iova != iova.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 drivers/iommu/iommufd/hw_pagetable.c    | 55 +++++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h | 11 ++---
 drivers/iommu/iommufd/main.c            |  3 ++
 include/uapi/linux/iommufd.h            | 36 ++++++++++++++++
 4 files changed, 98 insertions(+), 7 deletions(-)

Comments

Jason Gunthorpe Oct. 13, 2023, 4:22 p.m. UTC | #1
On Sat, Sep 23, 2023 at 02:25:02AM +0100, Joao Martins wrote:

> +int iommufd_check_iova_range(struct iommufd_ioas *ioas,
> +			     struct iommufd_dirty_data *bitmap)
> +{
> +	unsigned long pgshift, npages;
> +	size_t iommu_pgsize;
> +	int rc = -EINVAL;
> +
> +	pgshift = __ffs(bitmap->page_size);
> +	npages = bitmap->length >> pgshift;
> +
> +	if (!npages || (npages > ULONG_MAX))
> +		return rc;
> +
> +	iommu_pgsize = 1 << __ffs(ioas->iopt.iova_alignment);

iova_alignment is not a bitmask, it is the alignment itself, so is
redundant.

> +	/* allow only smallest supported pgsize */
> +	if (bitmap->page_size != iommu_pgsize)
> +		return rc;

!= is smallest?

Why are we restricting this anyhow? I thought the iova bitmap stuff
did all the adaptation automatically?

I can sort of see restricting the start/stop iova


> +	if (bitmap->iova & (iommu_pgsize - 1))
> +		return rc;
> +
> +	if (!bitmap->length || bitmap->length & (iommu_pgsize - 1))
> +		return rc;
> +
> +	return 0;
> +}

> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -316,6 +316,7 @@ union ucmd_buffer {
>  	struct iommu_option option;
>  	struct iommu_vfio_ioas vfio_ioas;
>  	struct iommu_hwpt_set_dirty set_dirty;
> +	struct iommu_hwpt_get_dirty_iova get_dirty_iova;
>  #ifdef CONFIG_IOMMUFD_TEST
>  	struct iommu_test_cmd test;
>  #endif
> @@ -361,6 +362,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
>  		 __reserved),
>  	IOCTL_OP(IOMMU_HWPT_SET_DIRTY, iommufd_hwpt_set_dirty,
>  		 struct iommu_hwpt_set_dirty, __reserved),
> +	IOCTL_OP(IOMMU_HWPT_GET_DIRTY_IOVA, iommufd_hwpt_get_dirty_iova,
> +		 struct iommu_hwpt_get_dirty_iova, bitmap.data),

Also keep sorted

>  #ifdef CONFIG_IOMMUFD_TEST
>  	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
>  #endif
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index 37079e72d243..b35b7d0c4be0 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -48,6 +48,7 @@ enum {
>  	IOMMUFD_CMD_HWPT_ALLOC,
>  	IOMMUFD_CMD_GET_HW_INFO,
>  	IOMMUFD_CMD_HWPT_SET_DIRTY,
> +	IOMMUFD_CMD_HWPT_GET_DIRTY_IOVA,
>  };
>  
>  /**
> @@ -481,4 +482,39 @@ struct iommu_hwpt_set_dirty {
>  	__u32 __reserved;
>  };
>  #define IOMMU_HWPT_SET_DIRTY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_SET_DIRTY)
> +
> +/**
> + * struct iommufd_dirty_bitmap - Dirty IOVA tracking bitmap
> + * @iova: base IOVA of the bitmap
> + * @length: IOVA size
> + * @page_size: page size granularity of each bit in the bitmap
> + * @data: bitmap where to set the dirty bits. The bitmap bits each
> + * represent a page_size which you deviate from an arbitrary iova.
> + * Checking a given IOVA is dirty:
> + *
> + *  data[(iova / page_size) / 64] & (1ULL << (iova % 64))
> + */
> +struct iommufd_dirty_data {
> +	__aligned_u64 iova;
> +	__aligned_u64 length;
> +	__aligned_u64 page_size;
> +	__aligned_u64 *data;
> +};

Is there a reason to add this struct? Does something else use it?

> +/**
> + * struct iommu_hwpt_get_dirty_iova - ioctl(IOMMU_HWPT_GET_DIRTY_IOVA)
> + * @size: sizeof(struct iommu_hwpt_get_dirty_iova)
> + * @hwpt_id: HW pagetable ID that represents the IOMMU domain.
> + * @flags: Flags to control dirty tracking status.
> + * @bitmap: Bitmap of the range of IOVA to read out
> + */
> +struct iommu_hwpt_get_dirty_iova {
> +	__u32 size;
> +	__u32 hwpt_id;
> +	__u32 flags;
> +	__u32 __reserved;
> +	struct iommufd_dirty_data bitmap;

vs inlining here?

I see you are passing it around the internal API, but that could
easily pass the whole command too

Jason
Joao Martins Oct. 13, 2023, 4:58 p.m. UTC | #2
On 13/10/2023 17:22, Jason Gunthorpe wrote:
> On Sat, Sep 23, 2023 at 02:25:02AM +0100, Joao Martins wrote:
> 
>> +int iommufd_check_iova_range(struct iommufd_ioas *ioas,
>> +			     struct iommufd_dirty_data *bitmap)
>> +{
>> +	unsigned long pgshift, npages;
>> +	size_t iommu_pgsize;
>> +	int rc = -EINVAL;
>> +
>> +	pgshift = __ffs(bitmap->page_size);
>> +	npages = bitmap->length >> pgshift;
>> +
>> +	if (!npages || (npages > ULONG_MAX))
>> +		return rc;
>> +
>> +	iommu_pgsize = 1 << __ffs(ioas->iopt.iova_alignment);
> 
> iova_alignment is not a bitmask, it is the alignment itself, so is
> redundant.
> 
Yes, let me remove it

>> +	/* allow only smallest supported pgsize */
>> +	if (bitmap->page_size != iommu_pgsize)
>> +		return rc;
> 
> != is smallest?
> 
> Why are we restricting this anyhow? I thought the iova bitmap stuff
> did all the adaptation automatically?
> 
yes, it does

> I can sort of see restricting the start/stop iova
>

There's no fundamental reason to restricting it; I am probably just too obsessed
with making the most granular tracking, but I shouldn't restrict the user to
track at some other page granularity

> 
>> +	if (bitmap->iova & (iommu_pgsize - 1))
>> +		return rc;
>> +
>> +	if (!bitmap->length || bitmap->length & (iommu_pgsize - 1))
>> +		return rc;
>> +
>> +	return 0;
>> +}
> 
>> --- a/drivers/iommu/iommufd/main.c
>> +++ b/drivers/iommu/iommufd/main.c
>> @@ -316,6 +316,7 @@ union ucmd_buffer {
>>  	struct iommu_option option;
>>  	struct iommu_vfio_ioas vfio_ioas;
>>  	struct iommu_hwpt_set_dirty set_dirty;
>> +	struct iommu_hwpt_get_dirty_iova get_dirty_iova;
>>  #ifdef CONFIG_IOMMUFD_TEST
>>  	struct iommu_test_cmd test;
>>  #endif
>> @@ -361,6 +362,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
>>  		 __reserved),
>>  	IOCTL_OP(IOMMU_HWPT_SET_DIRTY, iommufd_hwpt_set_dirty,
>>  		 struct iommu_hwpt_set_dirty, __reserved),
>> +	IOCTL_OP(IOMMU_HWPT_GET_DIRTY_IOVA, iommufd_hwpt_get_dirty_iova,
>> +		 struct iommu_hwpt_get_dirty_iova, bitmap.data),
> 
> Also keep sorted
> 
OK

>>  #ifdef CONFIG_IOMMUFD_TEST
>>  	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
>>  #endif
>> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
>> index 37079e72d243..b35b7d0c4be0 100644
>> --- a/include/uapi/linux/iommufd.h
>> +++ b/include/uapi/linux/iommufd.h
>> @@ -48,6 +48,7 @@ enum {
>>  	IOMMUFD_CMD_HWPT_ALLOC,
>>  	IOMMUFD_CMD_GET_HW_INFO,
>>  	IOMMUFD_CMD_HWPT_SET_DIRTY,
>> +	IOMMUFD_CMD_HWPT_GET_DIRTY_IOVA,
>>  };
>>  
>>  /**
>> @@ -481,4 +482,39 @@ struct iommu_hwpt_set_dirty {
>>  	__u32 __reserved;
>>  };
>>  #define IOMMU_HWPT_SET_DIRTY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_SET_DIRTY)
>> +
>> +/**
>> + * struct iommufd_dirty_bitmap - Dirty IOVA tracking bitmap
>> + * @iova: base IOVA of the bitmap
>> + * @length: IOVA size
>> + * @page_size: page size granularity of each bit in the bitmap
>> + * @data: bitmap where to set the dirty bits. The bitmap bits each
>> + * represent a page_size which you deviate from an arbitrary iova.
>> + * Checking a given IOVA is dirty:
>> + *
>> + *  data[(iova / page_size) / 64] & (1ULL << (iova % 64))
>> + */
>> +struct iommufd_dirty_data {
>> +	__aligned_u64 iova;
>> +	__aligned_u64 length;
>> +	__aligned_u64 page_size;
>> +	__aligned_u64 *data;
>> +};
> 
> Is there a reason to add this struct? Does something else use it?
> 
I was just reducing how much data I really need to pass around so consolidated
all that into a struct representing the bitmap data considering (...)

>> +/**
>> + * struct iommu_hwpt_get_dirty_iova - ioctl(IOMMU_HWPT_GET_DIRTY_IOVA)
>> + * @size: sizeof(struct iommu_hwpt_get_dirty_iova)
>> + * @hwpt_id: HW pagetable ID that represents the IOMMU domain.
>> + * @flags: Flags to control dirty tracking status.
>> + * @bitmap: Bitmap of the range of IOVA to read out
>> + */
>> +struct iommu_hwpt_get_dirty_iova {
>> +	__u32 size;
>> +	__u32 hwpt_id;
>> +	__u32 flags;
>> +	__u32 __reserved;
>> +	struct iommufd_dirty_data bitmap;
> 
> vs inlining here?
> 
> I see you are passing it around the internal API, but that could
> easily pass the whole command too

I use it for the read_and_clear_dirty_data (and it's input validation). Kinda
weird to do:

	iommu_read_and_clear(domain, flags, cmd)

Considering none of those functions pass command data around. If you prefer with
passing the whole command then I can go at it;
Jason Gunthorpe Oct. 13, 2023, 5:03 p.m. UTC | #3
On Fri, Oct 13, 2023 at 05:58:43PM +0100, Joao Martins wrote:
> > I can sort of see restricting the start/stop iova

> There's no fundamental reason to restricting it; I am probably just too obsessed
> with making the most granular tracking, but I shouldn't restrict the user to
> track at some other page granularity

I would not restrict it, it makes the ABI less compatbile if you restrict
it.


> > I see you are passing it around the internal API, but that could
> > easily pass the whole command too
> 
> I use it for the read_and_clear_dirty_data (and it's input validation). Kinda
> weird to do:
> 
> 	iommu_read_and_clear(domain, flags, cmd)
> 
> Considering none of those functions pass command data around. If you prefer with
> passing the whole command then I can go at it;

Compared to adding some weirdness to the uapi header, I would prefer
weirdness in the internal code

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 22354b0ba554..a5712992bb4b 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -219,3 +219,58 @@  int iommufd_hwpt_set_dirty(struct iommufd_ucmd *ucmd)
 	iommufd_put_object(&hwpt->obj);
 	return rc;
 }
+
+int iommufd_check_iova_range(struct iommufd_ioas *ioas,
+			     struct iommufd_dirty_data *bitmap)
+{
+	unsigned long pgshift, npages;
+	size_t iommu_pgsize;
+	int rc = -EINVAL;
+
+	pgshift = __ffs(bitmap->page_size);
+	npages = bitmap->length >> pgshift;
+
+	if (!npages || (npages > ULONG_MAX))
+		return rc;
+
+	iommu_pgsize = 1 << __ffs(ioas->iopt.iova_alignment);
+
+	/* allow only smallest supported pgsize */
+	if (bitmap->page_size != iommu_pgsize)
+		return rc;
+
+	if (bitmap->iova & (iommu_pgsize - 1))
+		return rc;
+
+	if (!bitmap->length || bitmap->length & (iommu_pgsize - 1))
+		return rc;
+
+	return 0;
+}
+
+int iommufd_hwpt_get_dirty_iova(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_hwpt_get_dirty_iova *cmd = ucmd->cmd;
+	struct iommufd_hw_pagetable *hwpt;
+	struct iommufd_ioas *ioas;
+	int rc = -EOPNOTSUPP;
+
+	if ((cmd->flags || cmd->__reserved))
+		return -EOPNOTSUPP;
+
+	hwpt = iommufd_get_hwpt(ucmd, cmd->hwpt_id);
+	if (IS_ERR(hwpt))
+		return PTR_ERR(hwpt);
+
+	ioas = hwpt->ioas;
+	rc = iommufd_check_iova_range(ioas, &cmd->bitmap);
+	if (rc)
+		goto out_put;
+
+	rc = iopt_read_and_clear_dirty_data(&ioas->iopt, hwpt->domain,
+					    cmd->flags, &cmd->bitmap);
+
+out_put:
+	iommufd_put_object(&hwpt->obj);
+	return rc;
+}
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 1101a1914513..608bb6eae64b 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -73,13 +73,6 @@  int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
 		    unsigned long length, unsigned long *unmapped);
 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped);
 
-struct iommufd_dirty_data {
-	unsigned long iova;
-	unsigned long length;
-	unsigned long page_size;
-	unsigned long long *data;
-};
-
 int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
 				   struct iommu_domain *domain,
 				   unsigned long flags,
@@ -239,6 +232,8 @@  int iommufd_option_rlimit_mode(struct iommu_option *cmd,
 			       struct iommufd_ctx *ictx);
 
 int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
+int iommufd_check_iova_range(struct iommufd_ioas *ioas,
+			     struct iommufd_dirty_data *bitmap);
 
 /*
  * A HW pagetable is called an iommu_domain inside the kernel. This user object
@@ -265,6 +260,8 @@  static inline struct iommufd_hw_pagetable *iommufd_get_hwpt(
 			    struct iommufd_hw_pagetable, obj);
 }
 int iommufd_hwpt_set_dirty(struct iommufd_ucmd *ucmd);
+int iommufd_hwpt_get_dirty_iova(struct iommufd_ucmd *ucmd);
+
 struct iommufd_hw_pagetable *
 iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
 			   struct iommufd_device *idev, u32 flags,
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index ec0c34086af3..17e356ffdf31 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -316,6 +316,7 @@  union ucmd_buffer {
 	struct iommu_option option;
 	struct iommu_vfio_ioas vfio_ioas;
 	struct iommu_hwpt_set_dirty set_dirty;
+	struct iommu_hwpt_get_dirty_iova get_dirty_iova;
 #ifdef CONFIG_IOMMUFD_TEST
 	struct iommu_test_cmd test;
 #endif
@@ -361,6 +362,8 @@  static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 __reserved),
 	IOCTL_OP(IOMMU_HWPT_SET_DIRTY, iommufd_hwpt_set_dirty,
 		 struct iommu_hwpt_set_dirty, __reserved),
+	IOCTL_OP(IOMMU_HWPT_GET_DIRTY_IOVA, iommufd_hwpt_get_dirty_iova,
+		 struct iommu_hwpt_get_dirty_iova, bitmap.data),
 #ifdef CONFIG_IOMMUFD_TEST
 	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
 #endif
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 37079e72d243..b35b7d0c4be0 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -48,6 +48,7 @@  enum {
 	IOMMUFD_CMD_HWPT_ALLOC,
 	IOMMUFD_CMD_GET_HW_INFO,
 	IOMMUFD_CMD_HWPT_SET_DIRTY,
+	IOMMUFD_CMD_HWPT_GET_DIRTY_IOVA,
 };
 
 /**
@@ -481,4 +482,39 @@  struct iommu_hwpt_set_dirty {
 	__u32 __reserved;
 };
 #define IOMMU_HWPT_SET_DIRTY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_SET_DIRTY)
+
+/**
+ * struct iommufd_dirty_bitmap - Dirty IOVA tracking bitmap
+ * @iova: base IOVA of the bitmap
+ * @length: IOVA size
+ * @page_size: page size granularity of each bit in the bitmap
+ * @data: bitmap where to set the dirty bits. The bitmap bits each
+ * represent a page_size which you deviate from an arbitrary iova.
+ * Checking a given IOVA is dirty:
+ *
+ *  data[(iova / page_size) / 64] & (1ULL << (iova % 64))
+ */
+struct iommufd_dirty_data {
+	__aligned_u64 iova;
+	__aligned_u64 length;
+	__aligned_u64 page_size;
+	__aligned_u64 *data;
+};
+
+/**
+ * struct iommu_hwpt_get_dirty_iova - ioctl(IOMMU_HWPT_GET_DIRTY_IOVA)
+ * @size: sizeof(struct iommu_hwpt_get_dirty_iova)
+ * @hwpt_id: HW pagetable ID that represents the IOMMU domain.
+ * @flags: Flags to control dirty tracking status.
+ * @bitmap: Bitmap of the range of IOVA to read out
+ */
+struct iommu_hwpt_get_dirty_iova {
+	__u32 size;
+	__u32 hwpt_id;
+	__u32 flags;
+	__u32 __reserved;
+	struct iommufd_dirty_data bitmap;
+};
+#define IOMMU_HWPT_GET_DIRTY_IOVA _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_GET_DIRTY_IOVA)
+
 #endif