diff mbox series

[v5,07/18] iommufd: Add IOMMU_HWPT_GET_DIRTY_BITMAP

Message ID 20231020222804.21850-8-joao.m.martins@oracle.com (mailing list archive)
State New, archived
Headers show
Series IOMMUFD Dirty Tracking | expand

Commit Message

Joao Martins Oct. 20, 2023, 10:27 p.m. UTC
Connect a hw_pagetable to the IOMMU core dirty tracking
read_and_clear_dirty iommu domain op. It exposes all of the functionality
for the UAPI that read the dirtied IOVAs while clearing the Dirty bits from
the PTEs.

In doing so, add an IO pagetable API iopt_read_and_clear_dirty_data() that
performs the reading of dirty IOPTEs for a given IOVA range and then
copying back to userspace bitmap.

Underneath it uses the IOMMU domain kernel API which will read the dirty
bits, as well as atomically clearing the IOPTE dirty bit and flushing the
IOTLB at the end. The IOVA bitmaps usage takes care of the iteration of the
bitmaps user pages efficiently and without copies. Within the iterator
function we iterate over io-pagetable contigous areas that have been
mapped.

Contrary to past incantation of a similar interface in VFIO the IOVA range
to be scanned is tied in to the bitmap size, thus the application needs to
pass a appropriately sized bitmap address taking into account the iova
range being passed *and* page size ... as opposed to allowing bitmap-iova
!= iova.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
---
 drivers/iommu/iommufd/hw_pagetable.c    | 52 ++++++++++++++
 drivers/iommu/iommufd/io_pagetable.c    | 90 +++++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h | 10 +++
 drivers/iommu/iommufd/main.c            |  4 ++
 include/uapi/linux/iommufd.h            | 34 ++++++++++
 5 files changed, 190 insertions(+)

Comments

Arnd Bergmann Oct. 23, 2023, 9:09 a.m. UTC | #1
On Sat, Oct 21, 2023, at 00:27, Joao Martins wrote:

> +int iommufd_check_iova_range(struct iommufd_ioas *ioas,
> +			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
> +{
> +	unsigned long npages;
> +	size_t iommu_pgsize;
> +	int rc = -EINVAL;
> +
> +	if (!bitmap->page_size)
> +		return rc;
> +
> +	npages = bitmap->length / bitmap->page_size;


The 64-bit division causes a link failure on 32-bit architectures:

arm-linux-gnueabi-ld: drivers/iommu/iommufd/hw_pagetable.o: in function `iommufd_check_iova_range':
hw_pagetable.c:(.text+0x77e): undefined reference to `__aeabi_uldivmod'
arm-linux-gnueabi/bin/arm-linux-gnueabi-ld: drivers/iommu/iommufd/hw_pagetable.o: in function `iommufd_hwpt_get_dirty_bitmap':
hw_pagetable.c:(.text+0x84c): undefined reference to `__aeabi_uldivmod'

I think it is safe to assume that the length of the bitmap
does not overflow an 'unsigned long', so it's probably
best to add a range check plus type cast, rather than an
expensive div_u64() here.

> +/**
> + * struct iommu_hwpt_get_dirty_bitmap - ioctl(IOMMU_HWPT_GET_DIRTY_BITMAP)
> + * @size: sizeof(struct iommu_hwpt_get_dirty_bitmap)
> + * @hwpt_id: HW pagetable ID that represents the IOMMU domain
> + * @flags: Must be zero
> + * @iova: base IOVA of the bitmap first bit
> + * @length: IOVA range size
> + * @page_size: page size granularity of each bit in the bitmap
> + * @data: bitmap where to set the dirty bits. The bitmap bits each
> + *        represent a page_size which you deviate from an arbitrary iova.
> + *
> + * Checking a given IOVA is dirty:
> + *
> + *  data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64))
> + *
> + * Walk the IOMMU pagetables for a given IOVA range to return a bitmap
> + * with the dirty IOVAs. In doing so it will also by default clear any
> + * dirty bit metadata set in the IOPTE.
> + */
> +struct iommu_hwpt_get_dirty_bitmap {
> +	__u32 size;
> +	__u32 hwpt_id;
> +	__u32 flags;
> +	__u32 __reserved;
> +	__aligned_u64 iova;
> +	__aligned_u64 length;
> +	__aligned_u64 page_size;
> +	__aligned_u64 *data;
> +};
> +#define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \
> +					IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP)
> +

This is a flawed definition for an ioctl data structure. While
it appears that you have tried hard to follow the recommendations
in Documentation/driver-api/ioctl.rst, you accidentally added
a pointer here, which breaks compat mode handling because of
the uninitialized padding after the 32-bit 'data' pointer.

The correct fix here is to use a __u64 value for the pointer
itself and convert it like:

diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index a26f8ae1034dd..3f6db5b18c266 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -465,7 +465,7 @@ iommu_read_and_clear_dirty(struct iommu_domain *domain,
 		return -EOPNOTSUPP;
 
 	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
-				 bitmap->page_size, bitmap->data);
+				 bitmap->page_size, u64_to_user_ptr(bitmap->data));
 	if (IS_ERR(iter))
 		return -ENOMEM;
 
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 50f98f01fe100..7fbf2d8a3c9b8 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -538,7 +538,7 @@ struct iommu_hwpt_get_dirty_bitmap {
 	__aligned_u64 iova;
 	__aligned_u64 length;
 	__aligned_u64 page_size;
-	__aligned_u64 *data;
+	__aligned_u64 data;
 };
 #define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \
 					IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP)


     Arnd
Joao Martins Oct. 23, 2023, 9:28 a.m. UTC | #2
On 23/10/2023 10:09, Arnd Bergmann wrote:
> On Sat, Oct 21, 2023, at 00:27, Joao Martins wrote:
> 
>> +int iommufd_check_iova_range(struct iommufd_ioas *ioas,
>> +			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
>> +{
>> +	unsigned long npages;
>> +	size_t iommu_pgsize;
>> +	int rc = -EINVAL;
>> +
>> +	if (!bitmap->page_size)
>> +		return rc;
>> +
>> +	npages = bitmap->length / bitmap->page_size;
> 
> 
> The 64-bit division causes a link failure on 32-bit architectures:
> 
> arm-linux-gnueabi-ld: drivers/iommu/iommufd/hw_pagetable.o: in function `iommufd_check_iova_range':
> hw_pagetable.c:(.text+0x77e): undefined reference to `__aeabi_uldivmod'
> arm-linux-gnueabi/bin/arm-linux-gnueabi-ld: drivers/iommu/iommufd/hw_pagetable.o: in function `iommufd_hwpt_get_dirty_bitmap':
> hw_pagetable.c:(.text+0x84c): undefined reference to `__aeabi_uldivmod'
> 
> I think it is safe to assume that the length of the bitmap
> does not overflow an 'unsigned long', 

Also I do check for the overflow but comes later; I should move the overflow
check from iopt_read_and_clear_dirty_data() into here as it's the only entry
point of that function, while deleting the duplicated alignment check from
iopt_read_and_clear_dirty_data().

> so it's probably
> best to add a range check plus type cast, rather than an
> expensive div_u64() here.
> 
OK

>> +/**
>> + * struct iommu_hwpt_get_dirty_bitmap - ioctl(IOMMU_HWPT_GET_DIRTY_BITMAP)
>> + * @size: sizeof(struct iommu_hwpt_get_dirty_bitmap)
>> + * @hwpt_id: HW pagetable ID that represents the IOMMU domain
>> + * @flags: Must be zero
>> + * @iova: base IOVA of the bitmap first bit
>> + * @length: IOVA range size
>> + * @page_size: page size granularity of each bit in the bitmap
>> + * @data: bitmap where to set the dirty bits. The bitmap bits each
>> + *        represent a page_size which you deviate from an arbitrary iova.
>> + *
>> + * Checking a given IOVA is dirty:
>> + *
>> + *  data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64))
>> + *
>> + * Walk the IOMMU pagetables for a given IOVA range to return a bitmap
>> + * with the dirty IOVAs. In doing so it will also by default clear any
>> + * dirty bit metadata set in the IOPTE.
>> + */
>> +struct iommu_hwpt_get_dirty_bitmap {
>> +	__u32 size;
>> +	__u32 hwpt_id;
>> +	__u32 flags;
>> +	__u32 __reserved;
>> +	__aligned_u64 iova;
>> +	__aligned_u64 length;
>> +	__aligned_u64 page_size;
>> +	__aligned_u64 *data;
>> +};
>> +#define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \
>> +					IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP)
>> +
> 
> This is a flawed definition for an ioctl data structure. While
> it appears that you have tried hard to follow the recommendations
> in Documentation/driver-api/ioctl.rst, you accidentally added
> a pointer here, which breaks compat mode handling because of
> the uninitialized padding after the 32-bit 'data' pointer.
> 
Right

> The correct fix here is to use a __u64 value for the pointer
> itself and convert it like:
> 
> diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
> index a26f8ae1034dd..3f6db5b18c266 100644
> --- a/drivers/iommu/iommufd/io_pagetable.c
> +++ b/drivers/iommu/iommufd/io_pagetable.c
> @@ -465,7 +465,7 @@ iommu_read_and_clear_dirty(struct iommu_domain *domain,
>  		return -EOPNOTSUPP;
>  
>  	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
> -				 bitmap->page_size, bitmap->data);
> +				 bitmap->page_size, u64_to_user_ptr(bitmap->data));
>  	if (IS_ERR(iter))
>  		return -ENOMEM;
>  
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index 50f98f01fe100..7fbf2d8a3c9b8 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -538,7 +538,7 @@ struct iommu_hwpt_get_dirty_bitmap {
>  	__aligned_u64 iova;
>  	__aligned_u64 length;
>  	__aligned_u64 page_size;
> -	__aligned_u64 *data;
> +	__aligned_u64 data;
>  };

This was meant to be like this from the beginning but it appears I've made the
the mistake since the first version. Thanks for both; I will fix it and respin
it for v6.
Jason Gunthorpe Oct. 23, 2023, 12:10 p.m. UTC | #3
On Mon, Oct 23, 2023 at 10:28:13AM +0100, Joao Martins wrote:
> > so it's probably
> > best to add a range check plus type cast, rather than an
> > expensive div_u64() here.
> 
> OK

Just keep it simple, we don't need to optimize for 32 bit. div_u64
will make the compiler happy.

> >> +struct iommu_hwpt_get_dirty_bitmap {
> >> +	__u32 size;
> >> +	__u32 hwpt_id;
> >> +	__u32 flags;
> >> +	__u32 __reserved;
> >> +	__aligned_u64 iova;
> >> +	__aligned_u64 length;
> >> +	__aligned_u64 page_size;
> >> +	__aligned_u64 *data;
> >> +};
> >> +#define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \
> >> +					IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP)
> >> +
> > 
> > This is a flawed definition for an ioctl data structure. While
> > it appears that you have tried hard to follow the recommendations
> > in Documentation/driver-api/ioctl.rst, you accidentally added
> > a pointer here, which breaks compat mode handling because of
> > the uninitialized padding after the 32-bit 'data' pointer.
> > 
> Right

oops how did we all miss that extra character :|

Jason
Arnd Bergmann Oct. 23, 2023, 12:41 p.m. UTC | #4
On Mon, Oct 23, 2023, at 14:10, Jason Gunthorpe wrote:
> On Mon, Oct 23, 2023 at 10:28:13AM +0100, Joao Martins wrote:
>> > so it's probably
>> > best to add a range check plus type cast, rather than an
>> > expensive div_u64() here.
>> 
>> OK
>
> Just keep it simple, we don't need to optimize for 32 bit. div_u64
> will make the compiler happy.

Fair enough. FWIW, I tried adding just the range check to see
if that would make the compiler turn it into a 32-bit division,
but that didn't work.

Some type of range check might still be good to have for
unrelated reasons.

    Arnd
Joao Martins Oct. 23, 2023, 3:56 p.m. UTC | #5
On 23/10/2023 13:41, Arnd Bergmann wrote:
> On Mon, Oct 23, 2023, at 14:10, Jason Gunthorpe wrote:
>> On Mon, Oct 23, 2023 at 10:28:13AM +0100, Joao Martins wrote:
>>>> so it's probably
>>>> best to add a range check plus type cast, rather than an
>>>> expensive div_u64() here.
>>>
>>> OK
>>
>> Just keep it simple, we don't need to optimize for 32 bit. div_u64
>> will make the compiler happy.
> 
> Fair enough. FWIW, I tried adding just the range check to see
> if that would make the compiler turn it into a 32-bit division,
> but that didn't work.
> 
> Some type of range check might still be good to have for
> unrelated reasons.

I can reproduce the arm32 build problem and I'm applying this diff below to this
patch to fix it. It essentially moves all the checks to
iommufd_check_iova_range(), including range-check and adding div_u64.

Additionally, perhaps should also move the iommufd_check_iova_range() invocation
via io_pagetable.c code rather than hw-pagetable code? It seems to make more
sense as there's nothing hw-pagetable specific that needs to be in here.

diff --git a/drivers/iommu/iommufd/hw_pagetable.c
b/drivers/iommu/iommufd/hw_pagetable.c
index a25042b0d3ba..4705954c51fe 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -224,23 +224,27 @@ int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd)
 int iommufd_check_iova_range(struct iommufd_ioas *ioas,
                             struct iommu_hwpt_get_dirty_bitmap *bitmap)
 {
-       unsigned long npages;
+       unsigned long npages, last_iova, iova = bitmap->iova;
+       unsigned long length = bitmap->length;
        size_t iommu_pgsize;
        int rc = -EINVAL;

        if (!bitmap->page_size)
                return rc;

-       npages = bitmap->length / bitmap->page_size;
+       if (check_add_overflow(iova, length - 1, &last_iova))
+               return -EOVERFLOW;
+
+       npages = div_u64(bitmap->length, bitmap->page_size);
        if (!npages || (npages > ULONG_MAX))
                return rc;

        iommu_pgsize = ioas->iopt.iova_alignment;

-       if (bitmap->iova & (iommu_pgsize - 1))
+       if (iova & (iommu_pgsize - 1))
                return rc;

-       if (!bitmap->length || bitmap->length & (iommu_pgsize - 1))
+       if (length || length & (iommu_pgsize - 1))
                return rc;

        return 0;
diff --git a/drivers/iommu/iommufd/io_pagetable.c
b/drivers/iommu/iommufd/io_pagetable.c
index 9955797862eb..00f5f60dc27e 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -486,15 +486,7 @@ int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
                                   unsigned long flags,
                                   struct iommu_hwpt_get_dirty_bitmap *bitmap)
 {
-       unsigned long last_iova, iova = bitmap->iova;
-       unsigned long length = bitmap->length;
-       int ret = -EINVAL;
-
-       if ((iova & (iopt->iova_alignment - 1)))
-               return -EINVAL;
-
-       if (check_add_overflow(iova, length - 1, &last_iova))
-               return -EOVERFLOW;
+       int ret;

        down_read(&iopt->iova_rwsem);
        ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
Jason Gunthorpe Oct. 23, 2023, 4:16 p.m. UTC | #6
On Mon, Oct 23, 2023 at 04:56:01PM +0100, Joao Martins wrote:
> On 23/10/2023 13:41, Arnd Bergmann wrote:
> > On Mon, Oct 23, 2023, at 14:10, Jason Gunthorpe wrote:
> >> On Mon, Oct 23, 2023 at 10:28:13AM +0100, Joao Martins wrote:
> >>>> so it's probably
> >>>> best to add a range check plus type cast, rather than an
> >>>> expensive div_u64() here.
> >>>
> >>> OK
> >>
> >> Just keep it simple, we don't need to optimize for 32 bit. div_u64
> >> will make the compiler happy.
> > 
> > Fair enough. FWIW, I tried adding just the range check to see
> > if that would make the compiler turn it into a 32-bit division,
> > but that didn't work.
> > 
> > Some type of range check might still be good to have for
> > unrelated reasons.
> 
> I can reproduce the arm32 build problem and I'm applying this diff below to this
> patch to fix it. It essentially moves all the checks to
> iommufd_check_iova_range(), including range-check and adding div_u64.
> 
> Additionally, perhaps should also move the iommufd_check_iova_range() invocation
> via io_pagetable.c code rather than hw-pagetable code? It seems to make more
> sense as there's nothing hw-pagetable specific that needs to be in here.

Don't you need the IOAS though?

Write it like this:

int iommufd_check_iova_range(struct iommufd_ioas *ioas,
			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	size_t iommu_pgsize = ioas->iopt.iova_alignment;
	u64 last_iova;

	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
		return -EOVERFLOW;

	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
		return -EOVERFLOW;

	if ((bitmap->iova & (iommu_pgsize - 1)) ||
	    ((last_iova + 1) & (iommu_pgsize - 1)))
		return -EINVAL;
	return 0;
}

And if 0 should really be rejected then check iova == last_iova

Jason
Joao Martins Oct. 23, 2023, 4:31 p.m. UTC | #7
On 23/10/2023 17:16, Jason Gunthorpe wrote:
> On Mon, Oct 23, 2023 at 04:56:01PM +0100, Joao Martins wrote:
>> On 23/10/2023 13:41, Arnd Bergmann wrote:
>>> On Mon, Oct 23, 2023, at 14:10, Jason Gunthorpe wrote:
>>>> On Mon, Oct 23, 2023 at 10:28:13AM +0100, Joao Martins wrote:
>>>>>> so it's probably
>>>>>> best to add a range check plus type cast, rather than an
>>>>>> expensive div_u64() here.
>>>>>
>>>>> OK
>>>>
>>>> Just keep it simple, we don't need to optimize for 32 bit. div_u64
>>>> will make the compiler happy.
>>>
>>> Fair enough. FWIW, I tried adding just the range check to see
>>> if that would make the compiler turn it into a 32-bit division,
>>> but that didn't work.
>>>
>>> Some type of range check might still be good to have for
>>> unrelated reasons.
>>
>> I can reproduce the arm32 build problem and I'm applying this diff below to this
>> patch to fix it. It essentially moves all the checks to
>> iommufd_check_iova_range(), including range-check and adding div_u64.
>>
>> Additionally, perhaps should also move the iommufd_check_iova_range() invocation
>> via io_pagetable.c code rather than hw-pagetable code? It seems to make more
>> sense as there's nothing hw-pagetable specific that needs to be in here.
> 
> Don't you need the IOAS though?
> 
No, for these checks I only need the iopt, which I already pass into
iopt_read_and_clear_dirty_data(). Everything can really be placed in the later.

> Write it like this:
> 
> int iommufd_check_iova_range(struct iommufd_ioas *ioas,
> 			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
> {
> 	size_t iommu_pgsize = ioas->iopt.iova_alignment;
> 	u64 last_iova;
> 
> 	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
> 		return -EOVERFLOW;
> 
> 	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
> 		return -EOVERFLOW;
> 
> 	if ((bitmap->iova & (iommu_pgsize - 1)) ||
> 	    ((last_iova + 1) & (iommu_pgsize - 1)))
> 		return -EINVAL;
> 	return 0;
> }
> 
> And if 0 should really be rejected then check iova == last_iova

It should; Perhaps extending the above and replicate that second the ::page_size
alignment check is important as it's what's used by the bitmap e.g.

int iommufd_check_iova_range(struct iommufd_ioas *ioas,
 			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
 	size_t iommu_pgsize = ioas->iopt.iova_alignment;
 	u64 last_iova;

 	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
 		return -EOVERFLOW;

 	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
 		return -EOVERFLOW;

 	if ((bitmap->iova & (iommu_pgsize - 1)) ||
 	    ((last_iova + 1) & (iommu_pgsize - 1)))
 		return -EINVAL;

 	if ((bitmap->iova & (bitmap->page_size - 1)) ||
 	    ((last_iova + 1) & (bitmap->page_size - 1)))
 		return -EINVAL;

 	return 0;
}
Jason Gunthorpe Oct. 23, 2023, 4:34 p.m. UTC | #8
On Mon, Oct 23, 2023 at 05:31:22PM +0100, Joao Martins wrote:
> > Write it like this:
> > 
> > int iommufd_check_iova_range(struct iommufd_ioas *ioas,
> > 			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
> > {
> > 	size_t iommu_pgsize = ioas->iopt.iova_alignment;
> > 	u64 last_iova;
> > 
> > 	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
> > 		return -EOVERFLOW;
> > 
> > 	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
> > 		return -EOVERFLOW;
> > 
> > 	if ((bitmap->iova & (iommu_pgsize - 1)) ||
> > 	    ((last_iova + 1) & (iommu_pgsize - 1)))
> > 		return -EINVAL;
> > 	return 0;
> > }
> > 
> > And if 0 should really be rejected then check iova == last_iova
> 
> It should; Perhaps extending the above and replicate that second the ::page_size
> alignment check is important as it's what's used by the bitmap e.g.

That makes sense, much clearer what it is trying to do that way

Jason
Joao Martins Oct. 23, 2023, 5:55 p.m. UTC | #9
On 23/10/2023 17:34, Jason Gunthorpe wrote:
> On Mon, Oct 23, 2023 at 05:31:22PM +0100, Joao Martins wrote:
>>> Write it like this:
>>>
>>> int iommufd_check_iova_range(struct iommufd_ioas *ioas,
>>> 			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
>>> {
>>> 	size_t iommu_pgsize = ioas->iopt.iova_alignment;
>>> 	u64 last_iova;
>>>
>>> 	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
>>> 		return -EOVERFLOW;
>>>
>>> 	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
>>> 		return -EOVERFLOW;
>>>
>>> 	if ((bitmap->iova & (iommu_pgsize - 1)) ||
>>> 	    ((last_iova + 1) & (iommu_pgsize - 1)))
>>> 		return -EINVAL;
>>> 	return 0;
>>> }
>>>
>>> And if 0 should really be rejected then check iova == last_iova
>>
>> It should; Perhaps extending the above and replicate that second the ::page_size
>> alignment check is important as it's what's used by the bitmap e.g.
> 
> That makes sense, much clearer what it is trying to do that way

In regards to clearity, I am still checking if bitmap::page_size being 0 or not,
to avoid being so implicitly in the last check i.e. (bitmap->iova &
(bitmap->page_size - 1)) being an and to 0xfffffffff.
Jason Gunthorpe Oct. 23, 2023, 6:08 p.m. UTC | #10
On Mon, Oct 23, 2023 at 06:55:56PM +0100, Joao Martins wrote:
> On 23/10/2023 17:34, Jason Gunthorpe wrote:
> > On Mon, Oct 23, 2023 at 05:31:22PM +0100, Joao Martins wrote:
> >>> Write it like this:
> >>>
> >>> int iommufd_check_iova_range(struct iommufd_ioas *ioas,
> >>> 			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
> >>> {
> >>> 	size_t iommu_pgsize = ioas->iopt.iova_alignment;
> >>> 	u64 last_iova;
> >>>
> >>> 	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
> >>> 		return -EOVERFLOW;
> >>>
> >>> 	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
> >>> 		return -EOVERFLOW;
> >>>
> >>> 	if ((bitmap->iova & (iommu_pgsize - 1)) ||
> >>> 	    ((last_iova + 1) & (iommu_pgsize - 1)))
> >>> 		return -EINVAL;
> >>> 	return 0;
> >>> }
> >>>
> >>> And if 0 should really be rejected then check iova == last_iova
> >>
> >> It should; Perhaps extending the above and replicate that second the ::page_size
> >> alignment check is important as it's what's used by the bitmap e.g.
> > 
> > That makes sense, much clearer what it is trying to do that way
> 
> In regards to clearity, I am still checking if bitmap::page_size being 0 or not,
> to avoid being so implicitly in the last check i.e. (bitmap->iova &
> (bitmap->page_size - 1)) being an and to 0xfffffffff.

I think you have to check explicitly, iova/last_iova could also be
zero..

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index db7ff8c99e29..72be43fa5248 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -220,3 +220,55 @@  int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd)
 	iommufd_put_object(&hwpt->obj);
 	return rc;
 }
+
+int iommufd_check_iova_range(struct iommufd_ioas *ioas,
+			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
+{
+	unsigned long npages;
+	size_t iommu_pgsize;
+	int rc = -EINVAL;
+
+	if (!bitmap->page_size)
+		return rc;
+
+	npages = bitmap->length / bitmap->page_size;
+	if (!npages || (npages > ULONG_MAX))
+		return rc;
+
+	iommu_pgsize = ioas->iopt.iova_alignment;
+
+	if (bitmap->iova & (iommu_pgsize - 1))
+		return rc;
+
+	if (!bitmap->length || bitmap->length & (iommu_pgsize - 1))
+		return rc;
+
+	return 0;
+}
+
+int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_hwpt_get_dirty_bitmap *cmd = ucmd->cmd;
+	struct iommufd_hw_pagetable *hwpt;
+	struct iommufd_ioas *ioas;
+	int rc = -EOPNOTSUPP;
+
+	if ((cmd->flags || cmd->__reserved))
+		return -EOPNOTSUPP;
+
+	hwpt = iommufd_get_hwpt(ucmd, cmd->hwpt_id);
+	if (IS_ERR(hwpt))
+		return PTR_ERR(hwpt);
+
+	ioas = hwpt->ioas;
+	rc = iommufd_check_iova_range(ioas, cmd);
+	if (rc)
+		goto out_put;
+
+	rc = iopt_read_and_clear_dirty_data(&ioas->iopt, hwpt->domain,
+					    cmd->flags, cmd);
+
+out_put:
+	iommufd_put_object(&hwpt->obj);
+	return rc;
+}
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 535d73466e15..cb168fc415b0 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -15,6 +15,7 @@ 
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
+#include <uapi/linux/iommufd.h>
 
 #include "io_pagetable.h"
 #include "double_span.h"
@@ -412,6 +413,95 @@  int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
 	return 0;
 }
 
+struct iova_bitmap_fn_arg {
+	struct io_pagetable *iopt;
+	struct iommu_domain *domain;
+	struct iommu_dirty_bitmap *dirty;
+};
+
+static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
+					unsigned long iova, size_t length,
+					void *opaque)
+{
+	struct iopt_area *area;
+	struct iopt_area_contig_iter iter;
+	struct iova_bitmap_fn_arg *arg = opaque;
+	struct iommu_domain *domain = arg->domain;
+	struct iommu_dirty_bitmap *dirty = arg->dirty;
+	const struct iommu_dirty_ops *ops = domain->dirty_ops;
+	unsigned long last_iova = iova + length - 1;
+	int ret;
+
+	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
+		unsigned long last = min(last_iova, iopt_area_last_iova(area));
+
+		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
+						last - iter.cur_iova + 1,
+						0, dirty);
+		if (ret)
+			return ret;
+	}
+
+	if (!iopt_area_contig_done(&iter))
+		return -EINVAL;
+	return 0;
+}
+
+static int iommu_read_and_clear_dirty(struct iommu_domain *domain,
+				      struct io_pagetable *iopt,
+				      unsigned long flags,
+				      struct iommu_hwpt_get_dirty_bitmap *bitmap)
+{
+	const struct iommu_dirty_ops *ops = domain->dirty_ops;
+	struct iommu_iotlb_gather gather;
+	struct iommu_dirty_bitmap dirty;
+	struct iova_bitmap_fn_arg arg;
+	struct iova_bitmap *iter;
+	int ret = 0;
+
+	if (!ops || !ops->read_and_clear_dirty)
+		return -EOPNOTSUPP;
+
+	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
+				 bitmap->page_size, bitmap->data);
+	if (IS_ERR(iter))
+		return -ENOMEM;
+
+	iommu_dirty_bitmap_init(&dirty, iter, &gather);
+
+	arg.iopt = iopt;
+	arg.domain = domain;
+	arg.dirty = &dirty;
+	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);
+
+	iommu_iotlb_sync(domain, &gather);
+	iova_bitmap_free(iter);
+
+	return ret;
+}
+
+int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
+				   struct iommu_domain *domain,
+				   unsigned long flags,
+				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
+{
+	unsigned long last_iova, iova = bitmap->iova;
+	unsigned long length = bitmap->length;
+	int ret = -EINVAL;
+
+	if ((iova & (iopt->iova_alignment - 1)))
+		return -EINVAL;
+
+	if (check_add_overflow(iova, length - 1, &last_iova))
+		return -EOVERFLOW;
+
+	down_read(&iopt->iova_rwsem);
+	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
+	up_read(&iopt->iova_rwsem);
+
+	return ret;
+}
+
 static int iopt_clear_dirty_data(struct io_pagetable *iopt,
 				 struct iommu_domain *domain)
 {
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index f896f47c705a..a2f83cd9706d 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -8,6 +8,8 @@ 
 #include <linux/xarray.h>
 #include <linux/refcount.h>
 #include <linux/uaccess.h>
+#include <linux/iommu.h>
+#include <linux/iova_bitmap.h>
 #include <uapi/linux/iommufd.h>
 
 struct iommu_domain;
@@ -71,6 +73,10 @@  int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
 		    unsigned long length, unsigned long *unmapped);
 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped);
 
+int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
+				   struct iommu_domain *domain,
+				   unsigned long flags,
+				   struct iommu_hwpt_get_dirty_bitmap *bitmap);
 int iopt_set_dirty_tracking(struct io_pagetable *iopt,
 			    struct iommu_domain *domain, bool enable);
 
@@ -226,6 +232,8 @@  int iommufd_option_rlimit_mode(struct iommu_option *cmd,
 			       struct iommufd_ctx *ictx);
 
 int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
+int iommufd_check_iova_range(struct iommufd_ioas *ioas,
+			     struct iommu_hwpt_get_dirty_bitmap *bitmap);
 
 /*
  * A HW pagetable is called an iommu_domain inside the kernel. This user object
@@ -252,6 +260,8 @@  static inline struct iommufd_hw_pagetable *iommufd_get_hwpt(
 			    struct iommufd_hw_pagetable, obj);
 }
 int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd);
+int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd);
+
 struct iommufd_hw_pagetable *
 iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
 			   struct iommufd_device *idev, u32 flags,
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 46fedd779714..d50f42a730aa 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -307,6 +307,7 @@  union ucmd_buffer {
 	struct iommu_destroy destroy;
 	struct iommu_hw_info info;
 	struct iommu_hwpt_alloc hwpt;
+	struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
 	struct iommu_hwpt_set_dirty_tracking set_dirty_tracking;
 	struct iommu_ioas_alloc alloc;
 	struct iommu_ioas_allow_iovas allow_iovas;
@@ -343,6 +344,8 @@  static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 __reserved),
 	IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
 		 __reserved),
+	IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap,
+		 struct iommu_hwpt_get_dirty_bitmap, data),
 	IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking,
 		 struct iommu_hwpt_set_dirty_tracking, __reserved),
 	IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
@@ -555,5 +558,6 @@  MODULE_ALIAS_MISCDEV(VFIO_MINOR);
 MODULE_ALIAS("devname:vfio/vfio");
 #endif
 MODULE_IMPORT_NS(IOMMUFD_INTERNAL);
+MODULE_IMPORT_NS(IOMMUFD);
 MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices");
 MODULE_LICENSE("GPL");
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 525b6e5cedee..06724bbe8af3 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -48,6 +48,7 @@  enum {
 	IOMMUFD_CMD_HWPT_ALLOC,
 	IOMMUFD_CMD_GET_HW_INFO,
 	IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING,
+	IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP,
 };
 
 /**
@@ -480,4 +481,37 @@  struct iommu_hwpt_set_dirty_tracking {
 };
 #define IOMMU_HWPT_SET_DIRTY_TRACKING _IO(IOMMUFD_TYPE, \
 					  IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING)
+
+/**
+ * struct iommu_hwpt_get_dirty_bitmap - ioctl(IOMMU_HWPT_GET_DIRTY_BITMAP)
+ * @size: sizeof(struct iommu_hwpt_get_dirty_bitmap)
+ * @hwpt_id: HW pagetable ID that represents the IOMMU domain
+ * @flags: Must be zero
+ * @iova: base IOVA of the bitmap first bit
+ * @length: IOVA range size
+ * @page_size: page size granularity of each bit in the bitmap
+ * @data: bitmap where to set the dirty bits. The bitmap bits each
+ *        represent a page_size which you deviate from an arbitrary iova.
+ *
+ * Checking a given IOVA is dirty:
+ *
+ *  data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64))
+ *
+ * Walk the IOMMU pagetables for a given IOVA range to return a bitmap
+ * with the dirty IOVAs. In doing so it will also by default clear any
+ * dirty bit metadata set in the IOPTE.
+ */
+struct iommu_hwpt_get_dirty_bitmap {
+	__u32 size;
+	__u32 hwpt_id;
+	__u32 flags;
+	__u32 __reserved;
+	__aligned_u64 iova;
+	__aligned_u64 length;
+	__aligned_u64 page_size;
+	__aligned_u64 *data;
+};
+#define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \
+					IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP)
+
 #endif