
[v3,2/4] iommu/io-pgtable-arm: Add read_and_clear_dirty() support

Message ID 20240430134308.1604-3-shameerali.kolothum.thodi@huawei.com (mailing list archive)
State New, archived
Series iommu/smmuv3: Add IOMMUFD dirty tracking support for SMMUv3

Commit Message

Shameerali Kolothum Thodi April 30, 2024, 1:43 p.m. UTC
The .read_and_clear_dirty() IOMMU domain op takes care of reading the dirty
bits (i.e. the PTE has DBM set and AP[2] clear) and marshalling them into a
bitmap of a given page size.

While reading the dirty bits we also set the PTE AP[2] bit to mark it
as writable-clean depending on read_and_clear_dirty() flags.

PTE states with respect to DBM bit:

                       DBM bit        AP[2]("RDONLY" bit)
1. writable_clean        1                 1
2. writable_dirty        1                 0
3. read-only             0                 1

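For illustration, the three states map onto the masks this patch introduces
as follows (a sketch only; these predicate names are hypothetical and not
part of the patch):

	/* writable_clean: DBM=1, AP[2]=1 */
	#define pte_is_writable_clean(pte) \
		(((pte) & ARM_LPAE_PTE_AP_WRITABLE_CLEAN) == ARM_LPAE_PTE_AP_WRITABLE_CLEAN)
	/* writable_dirty: DBM=1, AP[2]=0 -- what iopte_hw_dirty() tests */
	#define pte_is_writable_dirty(pte) \
		(((pte) & ARM_LPAE_PTE_AP_WRITABLE_CLEAN) == ARM_LPAE_PTE_DBM)
	/* read-only: DBM=0, AP[2]=1 */
	#define pte_is_read_only(pte) \
		(((pte) & ARM_LPAE_PTE_AP_WRITABLE_CLEAN) == ARM_LPAE_PTE_AP_RDONLY)
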
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 drivers/iommu/io-pgtable-arm.c | 105 ++++++++++++++++++++++++++++++++-
 1 file changed, 103 insertions(+), 2 deletions(-)

Comments

Ryan Roberts April 30, 2024, 2:51 p.m. UTC | #1
On 30/04/2024 14:43, Shameer Kolothum wrote:
> The .read_and_clear_dirty() IOMMU domain op takes care of reading the dirty
> bits (i.e. the PTE has DBM set and AP[2] clear) and marshalling them into a
> bitmap of a given page size.
> 
> While reading the dirty bits we also set the PTE AP[2] bit to mark it
> as writable-clean depending on read_and_clear_dirty() flags.
> 
> PTE states with respect to DBM bit:
> 
>                        DBM bit        AP[2]("RDONLY" bit)
> 1. writable_clean        1                 1
> 2. writable_dirty        1                 0
> 3. read-only             0                 1
> 
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>

Couple of nits/comments, but regardless:

Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>

> ---
>  drivers/iommu/io-pgtable-arm.c | 105 ++++++++++++++++++++++++++++++++-
>  1 file changed, 103 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> index f7828a7aad41..da6cc52859ba 100644
> --- a/drivers/iommu/io-pgtable-arm.c
> +++ b/drivers/iommu/io-pgtable-arm.c
> @@ -75,6 +75,7 @@
>  
>  #define ARM_LPAE_PTE_NSTABLE		(((arm_lpae_iopte)1) << 63)
>  #define ARM_LPAE_PTE_XN			(((arm_lpae_iopte)3) << 53)
> +#define ARM_LPAE_PTE_DBM		(((arm_lpae_iopte)1) << 51)
>  #define ARM_LPAE_PTE_AF			(((arm_lpae_iopte)1) << 10)
>  #define ARM_LPAE_PTE_SH_NS		(((arm_lpae_iopte)0) << 8)
>  #define ARM_LPAE_PTE_SH_OS		(((arm_lpae_iopte)2) << 8)
> @@ -84,7 +85,7 @@
>  
>  #define ARM_LPAE_PTE_ATTR_LO_MASK	(((arm_lpae_iopte)0x3ff) << 2)
>  /* Ignore the contiguous bit for block splitting */
> -#define ARM_LPAE_PTE_ATTR_HI_MASK	(((arm_lpae_iopte)6) << 52)
> +#define ARM_LPAE_PTE_ATTR_HI_MASK	(ARM_LPAE_PTE_XN | ARM_LPAE_PTE_DBM)
>  #define ARM_LPAE_PTE_ATTR_MASK		(ARM_LPAE_PTE_ATTR_LO_MASK |	\
>  					 ARM_LPAE_PTE_ATTR_HI_MASK)
>  /* Software bit for solving coherency races */
> @@ -92,7 +93,11 @@
>  
>  /* Stage-1 PTE */
>  #define ARM_LPAE_PTE_AP_UNPRIV		(((arm_lpae_iopte)1) << 6)
> -#define ARM_LPAE_PTE_AP_RDONLY		(((arm_lpae_iopte)2) << 6)
> +#define ARM_LPAE_PTE_AP_RDONLY_BIT	7
> +#define ARM_LPAE_PTE_AP_RDONLY		(((arm_lpae_iopte)1) << \
> +					   ARM_LPAE_PTE_AP_RDONLY_BIT)
> +#define ARM_LPAE_PTE_AP_WRITABLE_CLEAN	(ARM_LPAE_PTE_AP_RDONLY | \
> +					 ARM_LPAE_PTE_DBM)
>  #define ARM_LPAE_PTE_ATTRINDX_SHIFT	2
>  #define ARM_LPAE_PTE_nG			(((arm_lpae_iopte)1) << 11)
>  
> @@ -138,6 +143,9 @@
>  
>  #define iopte_prot(pte)	((pte) & ARM_LPAE_PTE_ATTR_MASK)
>  
> +#define iopte_hw_dirty(pte)	(((pte) & ARM_LPAE_PTE_AP_WRITABLE_CLEAN) == \
> +				   ARM_LPAE_PTE_DBM)

Could this simply be called "iopte_dirty"? The hw and sw dirty concepts only
come about in the arch code because we keep an additional sw bit to track dirty
in the pte, which you are not doing here.

> +
>  struct arm_lpae_io_pgtable {
>  	struct io_pgtable	iop;
>  
> @@ -729,6 +737,98 @@ static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,
>  	return iopte_to_paddr(pte, data) | iova;
>  }
>  
> +struct io_pgtable_walk_data {
> +	struct iommu_dirty_bitmap	*dirty;
> +	unsigned long			flags;
> +	u64				addr;
> +	const u64			end;
> +};
> +
> +static int __arm_lpae_iopte_walk_dirty(struct arm_lpae_io_pgtable *data,
> +				       struct io_pgtable_walk_data *walk_data,
> +				       arm_lpae_iopte *ptep,
> +				       int lvl);
> +
> +static int io_pgtable_visit_dirty(struct arm_lpae_io_pgtable *data,
> +				  struct io_pgtable_walk_data *walk_data,
> +				  arm_lpae_iopte *ptep, int lvl)
> +{
> +	struct io_pgtable *iop = &data->iop;
> +	arm_lpae_iopte pte = READ_ONCE(*ptep);
> +
> +	if (WARN_ON(!pte))

Wouldn't it actually be better to check that the pte is valid (i.e. bit 0 is
set)? And do we really need the warn here? I'm guessing you never expect to be
walking over a hole? If it is legitimately possible to walk over a hole, then
the WARN is definitely incorrect and you would want to continue walking rather
than return.

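For example (a sketch only, assuming holes should be skipped rather than
treated as an error; bit 0 is the LPAE valid bit, and alignment of addr to
the next entry boundary is omitted for brevity):

	if (!(pte & 1)) {	/* invalid entry: skip over the hole */
		walk_data->addr += ARM_LPAE_BLOCK_SIZE(lvl, data);
		return 0;
	}
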
> +		return -EINVAL;
> +
> +	if (iopte_leaf(pte, lvl, iop->fmt)) {
> +		size_t size = ARM_LPAE_BLOCK_SIZE(lvl, data);
> +
> +		if (iopte_hw_dirty(pte)) {
> +			iommu_dirty_bitmap_record(walk_data->dirty,
> +						  walk_data->addr, size);
> +			if (!(walk_data->flags & IOMMU_DIRTY_NO_CLEAR))
> +				set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT,
> +					(unsigned long *)ptep);

Just checking, set_bit() is atomic right? So safe against racing updates with
the HW. Pretty sure that's the case...

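For reference, the contrast being asked about (a sketch; set_bit() is the
atomic variant):

	set_bit(nr, addr);	/* atomic RMW: only the target bit changes */
	__set_bit(nr, addr);	/* plain read-modify-write: a concurrent HW
				 * update to another bit of the same pte
				 * could be lost */
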
> +		}
> +		walk_data->addr += size;
> +		return 0;
> +	}
> +
> +	ptep = iopte_deref(pte, data);
> +	return __arm_lpae_iopte_walk_dirty(data, walk_data, ptep, lvl + 1);
> +}
> +
> +static int __arm_lpae_iopte_walk_dirty(struct arm_lpae_io_pgtable *data,
> +				       struct io_pgtable_walk_data *walk_data,
> +				       arm_lpae_iopte *ptep,
> +				       int lvl)
> +{
> +	u32 idx;
> +	int max_entries, ret;
> +
> +	if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
> +		return -EINVAL;
> +
> +	if (lvl == data->start_level)
> +		max_entries = ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte);
> +	else
> +		max_entries = ARM_LPAE_PTES_PER_TABLE(data);
> +
> +	for (idx = ARM_LPAE_LVL_IDX(walk_data->addr, lvl, data);
> +	     (idx < max_entries) && (walk_data->addr < walk_data->end); ++idx) {
> +		ret = io_pgtable_visit_dirty(data, walk_data, ptep + idx, lvl);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops,
> +					 unsigned long iova, size_t size,
> +					 unsigned long flags,
> +					 struct iommu_dirty_bitmap *dirty)
> +{
> +	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
> +	struct io_pgtable_cfg *cfg = &data->iop.cfg;
> +	struct io_pgtable_walk_data walk_data = {
> +		.dirty = dirty,
> +		.flags = flags,
> +		.addr = iova,
> +		.end = iova + size,
> +	};
> +	arm_lpae_iopte *ptep = data->pgd;
> +	int lvl = data->start_level;
> +
> +	if (WARN_ON(!size))
> +		return -EINVAL;
> +	if (WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1)))
> +		return -EINVAL;
> +	if (data->iop.fmt != ARM_64_LPAE_S1)
> +		return -EINVAL;
> +
> +	return __arm_lpae_iopte_walk_dirty(data, &walk_data, ptep, lvl);
> +}
> +
>  static void arm_lpae_restrict_pgsizes(struct io_pgtable_cfg *cfg)
>  {
>  	unsigned long granule, page_sizes;
> @@ -807,6 +907,7 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg)
>  		.map_pages	= arm_lpae_map_pages,
>  		.unmap_pages	= arm_lpae_unmap_pages,
>  		.iova_to_phys	= arm_lpae_iova_to_phys,
> +		.read_and_clear_dirty = arm_lpae_read_and_clear_dirty,
>  	};
>  
>  	return data;
Jason Gunthorpe May 12, 2024, 12:51 p.m. UTC | #2
On Tue, Apr 30, 2024 at 02:43:06PM +0100, Shameer Kolothum wrote:
> +static int io_pgtable_visit_dirty(struct arm_lpae_io_pgtable *data,
> +				  struct io_pgtable_walk_data *walk_data,
> +				  arm_lpae_iopte *ptep, int lvl)
> +{
> +	struct io_pgtable *iop = &data->iop;
> +	arm_lpae_iopte pte = READ_ONCE(*ptep);
> +
> +	if (WARN_ON(!pte))
> +		return -EINVAL;

This seems poorly placed, why would pte ever be zero?

> +	if (iopte_leaf(pte, lvl, iop->fmt)) {
> +		size_t size = ARM_LPAE_BLOCK_SIZE(lvl, data);
> +
> +		if (iopte_hw_dirty(pte)) {
> +			iommu_dirty_bitmap_record(walk_data->dirty,
> +						  walk_data->addr, size);
> +			if (!(walk_data->flags & IOMMU_DIRTY_NO_CLEAR))
> +				set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT,
> +					(unsigned long *)ptep);
> +		}
> +		walk_data->addr += size;
> +		return 0;
> +	}
> +
> +	ptep = iopte_deref(pte, data);

This would be a better spot: if the pte doesn't indicate a next-level
table then something has gone wrong; otherwise the returned pointer
has to be valid.

Something like this maybe?

static inline bool iopte_table(arm_lpae_iopte pte, int lvl,
			      enum io_pgtable_fmt fmt)
{
	if (lvl == (ARM_LPAE_MAX_LEVELS - 1))
		return false;
	return iopte_type(pte) == ARM_LPAE_PTE_TYPE_TABLE;
}

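With that, the deref path could then validate before dereferencing (a sketch
based on the helper above):

	if (WARN_ON(!iopte_table(pte, lvl, iop->fmt)))
		return -EINVAL;

	ptep = iopte_deref(pte, data);
	return __arm_lpae_iopte_walk_dirty(data, walk_data, ptep, lvl + 1);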

> +	return __arm_lpae_iopte_walk_dirty(data, walk_data, ptep, lvl + 1);
> +}
> +
> +static int __arm_lpae_iopte_walk_dirty(struct arm_lpae_io_pgtable *data,
> +				       struct io_pgtable_walk_data *walk_data,
> +				       arm_lpae_iopte *ptep,
> +				       int lvl)
> +{
> +	u32 idx;
> +	int max_entries, ret;
> +
> +	if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
> +		return -EINVAL;

And if the above iopte_deref() could check for a valid next level (with
no valid level at MAX_LEVELS) then this can never happen either and
can be removed.

Jason
Tian, Kevin May 22, 2024, 7:12 a.m. UTC | #3
> From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> Sent: Tuesday, April 30, 2024 9:43 PM
> 
> @@ -92,7 +93,11 @@
> 
>  /* Stage-1 PTE */
>  #define ARM_LPAE_PTE_AP_UNPRIV		(((arm_lpae_iopte)1) << 6)
> -#define ARM_LPAE_PTE_AP_RDONLY		(((arm_lpae_iopte)2) << 6)
> +#define ARM_LPAE_PTE_AP_RDONLY_BIT	7
> +#define ARM_LPAE_PTE_AP_RDONLY		(((arm_lpae_iopte)1) << \
> +					   ARM_LPAE_PTE_AP_RDONLY_BIT)
> +#define ARM_LPAE_PTE_AP_WRITABLE_CLEAN	(ARM_LPAE_PTE_AP_RDONLY | \
> +					 ARM_LPAE_PTE_DBM)

Based on the usage, is it clearer to be xxx_WRITEABLE_MASK?

>  #define ARM_LPAE_PTE_ATTRINDX_SHIFT	2
>  #define ARM_LPAE_PTE_nG			(((arm_lpae_iopte)1) << 11)
> 
> @@ -138,6 +143,9 @@
> 
>  #define iopte_prot(pte)	((pte) & ARM_LPAE_PTE_ATTR_MASK)
> 
> +#define iopte_hw_dirty(pte)	(((pte) & ARM_LPAE_PTE_AP_WRITABLE_CLEAN) == \
> +				   ARM_LPAE_PTE_DBM)
> +

iopte_is_writeable_dirty()?

and the following "set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT,
(unsigned long *)ptep);" could be wrapped as:

	iopte_set_writable_clean(ptep);

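i.e. something along these lines (a sketch of the suggested helpers, names
as proposed above):

	#define iopte_is_writeable_dirty(pte)				\
		(((pte) & ARM_LPAE_PTE_AP_WRITABLE_CLEAN) == ARM_LPAE_PTE_DBM)

	static inline void iopte_set_writable_clean(arm_lpae_iopte *ptep)
	{
		set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT, (unsigned long *)ptep);
	}
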
> +
> +static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops,
> +					 unsigned long iova, size_t size,
> +					 unsigned long flags,
> +					 struct iommu_dirty_bitmap *dirty)
> +{
> +	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
> +	struct io_pgtable_cfg *cfg = &data->iop.cfg;
> +	struct io_pgtable_walk_data walk_data = {
> +		.dirty = dirty,
> +		.flags = flags,
> +		.addr = iova,
> +		.end = iova + size,
> +	};
> +	arm_lpae_iopte *ptep = data->pgd;
> +	int lvl = data->start_level;
> +
> +	if (WARN_ON(!size))
> +		return -EINVAL;
> +	if (WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1)))
> +		return -EINVAL;
> +	if (data->iop.fmt != ARM_64_LPAE_S1)
> +		return -EINVAL;
> +
> +	return __arm_lpae_iopte_walk_dirty(data, &walk_data, ptep, lvl);

Intel/AMD drivers also check:

        if (!dmar_domain->dirty_tracking && dirty->bitmap)
                return -EINVAL;
Jason Gunthorpe May 22, 2024, 12:37 p.m. UTC | #4
On Wed, May 22, 2024 at 07:12:27AM +0000, Tian, Kevin wrote:
> > +static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops,
> > +					 unsigned long iova, size_t size,
> > +					 unsigned long flags,
> > +					 struct iommu_dirty_bitmap *dirty)
> > +{
> > +	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
> > +	struct io_pgtable_cfg *cfg = &data->iop.cfg;
> > +	struct io_pgtable_walk_data walk_data = {
> > +		.dirty = dirty,
> > +		.flags = flags,
> > +		.addr = iova,
> > +		.end = iova + size,
> > +	};
> > +	arm_lpae_iopte *ptep = data->pgd;
> > +	int lvl = data->start_level;
> > +
> > +	if (WARN_ON(!size))
> > +		return -EINVAL;
> > +	if (WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1)))
> > +		return -EINVAL;
> > +	if (data->iop.fmt != ARM_64_LPAE_S1)
> > +		return -EINVAL;
> > +
> > +	return __arm_lpae_iopte_walk_dirty(data, &walk_data, ptep, lvl);
> 
> Intel/AMD drivers also check:
> 
>         if (!dmar_domain->dirty_tracking && dirty->bitmap)
>                 return -EINVAL;

Those both seem redundant checks on a fast path?

Jason
Shameerali Kolothum Thodi May 22, 2024, 2:03 p.m. UTC | #5
> -----Original Message-----
> From: Tian, Kevin <kevin.tian@intel.com>
> Sent: Wednesday, May 22, 2024 8:12 AM
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> iommu@lists.linux.dev; linux-arm-kernel@lists.infradead.org
> Cc: robin.murphy@arm.com; will@kernel.org; joro@8bytes.org;
> jgg@nvidia.com; ryan.roberts@arm.com; nicolinc@nvidia.com;
> mshavit@google.com; eric.auger@redhat.com; joao.m.martins@oracle.com;
> jiangkunkun <jiangkunkun@huawei.com>; zhukeqian
> <zhukeqian1@huawei.com>; Linuxarm <linuxarm@huawei.com>
> Subject: RE: [PATCH v3 2/4] iommu/io-pgtable-arm: Add read_and_clear_dirty()
> support
> 
> > From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> > Sent: Tuesday, April 30, 2024 9:43 PM
> >
> > @@ -92,7 +93,11 @@
> >
> >  /* Stage-1 PTE */
> >  #define ARM_LPAE_PTE_AP_UNPRIV		(((arm_lpae_iopte)1) << 6)
> > -#define ARM_LPAE_PTE_AP_RDONLY		(((arm_lpae_iopte)2) << 6)
> > +#define ARM_LPAE_PTE_AP_RDONLY_BIT	7
> > +#define ARM_LPAE_PTE_AP_RDONLY		(((arm_lpae_iopte)1) << \
> > +					   ARM_LPAE_PTE_AP_RDONLY_BIT)
> > +#define ARM_LPAE_PTE_AP_WRITABLE_CLEAN	(ARM_LPAE_PTE_AP_RDONLY | \
> > +					 ARM_LPAE_PTE_DBM)
> 
> Based on the usage, is it clearer to be xxx_WRITEABLE_MASK?

I think in patch 4 we use this to set the PTE Writeable Clean. Anyway I will
revisit this following Jason's comment on just setting the DBM bit there.

> 
> >  #define ARM_LPAE_PTE_ATTRINDX_SHIFT	2
> >  #define ARM_LPAE_PTE_nG			(((arm_lpae_iopte)1) << 11)
> >
> > @@ -138,6 +143,9 @@
> >
> >  #define iopte_prot(pte)	((pte) & ARM_LPAE_PTE_ATTR_MASK)
> >
> > +#define iopte_hw_dirty(pte)	(((pte) & ARM_LPAE_PTE_AP_WRITABLE_CLEAN) == \
> > +				   ARM_LPAE_PTE_DBM)
> > +
> 
> iopte_is_writeable_dirty()?
> 
> and the following "set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT,
> (unsigned long *)ptep);" could be wrapped as:
> 
> 	iopte_set_writable_clean(ptep);

Ok. Will consider this during respin.
 
> > +
> > +static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops,
> > +					 unsigned long iova, size_t size,
> > +					 unsigned long flags,
> > +					 struct iommu_dirty_bitmap *dirty)
> > +{
> > +	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
> > +	struct io_pgtable_cfg *cfg = &data->iop.cfg;
> > +	struct io_pgtable_walk_data walk_data = {
> > +		.dirty = dirty,
> > +		.flags = flags,
> > +		.addr = iova,
> > +		.end = iova + size,
> > +	};
> > +	arm_lpae_iopte *ptep = data->pgd;
> > +	int lvl = data->start_level;
> > +
> > +	if (WARN_ON(!size))
> > +		return -EINVAL;
> > +	if (WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1)))
> > +		return -EINVAL;
> > +	if (data->iop.fmt != ARM_64_LPAE_S1)
> > +		return -EINVAL;
> > +
> > +	return __arm_lpae_iopte_walk_dirty(data, &walk_data, ptep, lvl);
> 
> Intel/AMD drivers also check:
> 
>         if (!dmar_domain->dirty_tracking && dirty->bitmap)
>                 return -EINVAL;

Is that really required? Is the concern here that the user may issue the
GET_DIRTY_BITMAP IOCTL without any validation and that may
result in unnecessary scanning of page tables? Maybe we should
handle it in core code then, I think.

Thanks,
Shameer
Joao Martins May 22, 2024, 2:37 p.m. UTC | #6
On 22/05/2024 15:03, Shameerali Kolothum Thodi wrote:
> 
> 
>> -----Original Message-----
>> From: Tian, Kevin <kevin.tian@intel.com>
>> Sent: Wednesday, May 22, 2024 8:12 AM
>> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
>> iommu@lists.linux.dev; linux-arm-kernel@lists.infradead.org
>> Cc: robin.murphy@arm.com; will@kernel.org; joro@8bytes.org;
>> jgg@nvidia.com; ryan.roberts@arm.com; nicolinc@nvidia.com;
>> mshavit@google.com; eric.auger@redhat.com; joao.m.martins@oracle.com;
>> jiangkunkun <jiangkunkun@huawei.com>; zhukeqian
>> <zhukeqian1@huawei.com>; Linuxarm <linuxarm@huawei.com>
>> Subject: RE: [PATCH v3 2/4] iommu/io-pgtable-arm: Add read_and_clear_dirty()
>> support
>>
>>> From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
>>> Sent: Tuesday, April 30, 2024 9:43 PM
>>> +
>>> +static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops,
>>> +					 unsigned long iova, size_t size,
>>> +					 unsigned long flags,
>>> +					 struct iommu_dirty_bitmap *dirty)
>>> +{
>>> +	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
>>> +	struct io_pgtable_cfg *cfg = &data->iop.cfg;
>>> +	struct io_pgtable_walk_data walk_data = {
>>> +		.dirty = dirty,
>>> +		.flags = flags,
>>> +		.addr = iova,
>>> +		.end = iova + size,
>>> +	};
>>> +	arm_lpae_iopte *ptep = data->pgd;
>>> +	int lvl = data->start_level;
>>> +
>>> +	if (WARN_ON(!size))
>>> +		return -EINVAL;
>>> +	if (WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1)))
>>> +		return -EINVAL;
>>> +	if (data->iop.fmt != ARM_64_LPAE_S1)
>>> +		return -EINVAL;
>>> +
>>> +	return __arm_lpae_iopte_walk_dirty(data, &walk_data, ptep, lvl);
>>
>> Intel/AMD drivers also check:
>>
>>         if (!dmar_domain->dirty_tracking && dirty->bitmap)
>>                 return -EINVAL;
> 
> Is that really required? Is the concern here is user may issue
> GET_DIRTY_BITMAP IOCTL without any validation and that may
> result in unnecessary scanning of page tables? May be we should
> handle it in core code then I think.

This is just to catch the case where IOMMUFD can call into read_and_clear()
without dirty tracking enabled and without a bitmap structure to clear dirty
bits -- in order to ensure a clean PTE data snapshot after start(). And thus it
errors out on others.

Right now we don't track in iommufd that dirty tracking is active in the domain;
the vendor driver does so in its own way. I mean we could move that to core, but
we end up duplicating what the iommu driver is already doing.
Jason Gunthorpe May 22, 2024, 4:56 p.m. UTC | #7
On Wed, May 22, 2024 at 03:37:57PM +0100, Joao Martins wrote:
 
> This is just to catch the case where IOMMUFD can call into read_and_clear()
> without dirty tracking enabled and without a bitmap structure to clear dirty
> bits -- in order to ensure a clean PTE data snapshot after start(). 

Is that broken then?

iopt_clear_dirty_data() sets the NULL:

	iommu_dirty_bitmap_init(&dirty, NULL, &gather);
		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
						iopt_area_length(area), 0,
						&dirty);

But AMD, for instance, does nothing:

	spin_lock_irqsave(&pdomain->lock, lflags);
	if (!pdomain->dirty_tracking && dirty->bitmap) {
		spin_unlock_irqrestore(&pdomain->lock, lflags);
		return -EINVAL;
	}
	spin_unlock_irqrestore(&pdomain->lock, lflags);

	return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);

It certainly didn't clear the IOPTEs.

AFAIK the NULL should be ignored here:

static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty,
					     unsigned long iova,
					     unsigned long length)
{
	if (dirty->bitmap)
		iova_bitmap_set(dirty->bitmap, iova, length);

Not above. That looks like a bug. Yes?

Thanks,
Jason
Joao Martins May 22, 2024, 5:10 p.m. UTC | #8
On 22/05/2024 17:56, Jason Gunthorpe wrote:
> On Wed, May 22, 2024 at 03:37:57PM +0100, Joao Martins wrote:
>  
>> This is just to catch the case where IOMMUFD can call into read_and_clear()
>> without dirty tracking enabled and without a bitmap structure to clear dirty
>> bits -- in order to ensure a clean PTE data snapshot after start(). 
> 
> Is that broken then?
> 

It's not: the check errors out when the caller ends up calling read-and-clear
with a bitmap but without having started dirty tracking. iopt_clear_dirty_data()
passes a null bitmap, so it goes through and walks and clears the IOPTEs
*without* recording them in the bitmap.

> iopt_clear_dirty_data() sets the NULL:
> 
> 	iommu_dirty_bitmap_init(&dirty, NULL, &gather);
> 		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
> 						iopt_area_length(area), 0,
> 						&dirty);
> 
Right

> But AMD, for instance, does nothing:
> 
> 	spin_lock_irqsave(&pdomain->lock, lflags);
> 	if (!pdomain->dirty_tracking && dirty->bitmap) {
> 		spin_unlock_irqrestore(&pdomain->lock, lflags);
> 		return -EINVAL;
> 	}
> 	spin_unlock_irqrestore(&pdomain->lock, lflags);
> 
> 	return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
> 
Same for Intel (except it has a comment explaining the why).

But it's on purpose.

> It certainly didn't clear the IOPTEs.
>
But notice that this gets called *without* dirty::bitmap set, so it goes through
and walks the page tables and clears the dirty bits. It just doesn't record them
in the bitmap.

> AFAIK the NULL should be ignored here:
> 
> static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty,
> 					     unsigned long iova,
> 					     unsigned long length)
> {
> 	if (dirty->bitmap)
> 		iova_bitmap_set(dirty->bitmap, iova, length);
>

Right, yes. As intended. When we pass a NULL bitmap we mean that we don't care
about dirty info. But you are still clearing the IOPTEs in the driver.

> Not above. That looks like a bug. Yes?
> 
It's not a bug.

It might be too subtle or maybe strange -- I recall raising this as not being
too obvious IIRC.

All this was doing was to allow the iommu driver's read_and_clear to clear the
bits without recording bitmap info -- which is what it is doing.

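To spell out the two call patterns (a sketch using the existing
iommu_dirty_bitmap_init() helper):

	/* start of tracking: clear all dirty bits, record nothing */
	iommu_dirty_bitmap_init(&dirty, NULL, &gather);

	/* query: record dirty IOVAs into the user bitmap (and clear them,
	 * unless IOMMU_DIRTY_NO_CLEAR is set) */
	iommu_dirty_bitmap_init(&dirty, bitmap, &gather);
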
In any case I am happy to clean this up if there's something more elegant in
people's minds.
Jason Gunthorpe May 22, 2024, 5:50 p.m. UTC | #9
On Wed, May 22, 2024 at 06:10:59PM +0100, Joao Martins wrote:
> On 22/05/2024 17:56, Jason Gunthorpe wrote:
> > On Wed, May 22, 2024 at 03:37:57PM +0100, Joao Martins wrote:
> >  
> >> This is just to catch the case where IOMMUFD can call into read_and_clear()
> >> without dirty tracking enabled and without a bitmap structure to clear dirty
> >> bits -- in order to ensure a clean PTE data snapshot after start(). 
> > 
> > Is that broken then?
> > 
> 
> It's not: the check errors out when the caller ends up calling read-and-clear
> with a bitmap but without having started dirty tracking. iopt_clear_dirty_data()
> passes a null bitmap, so it goes through and walks and clears the IOPTEs
> *without* recording them in the bitmap.

It is not "without recording them in the bitmap", saying that is the
confusing thing. The purpose of that 'if' is to return -EINVAL if
dirty tracking is not turned on and we query the bitmap.

More like this puts it in the common code and writes it in a more
straightforward way with better locking:

diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index d35c1b8c8e65ce..b2cb557d3ea427 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2645,13 +2645,6 @@ static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
 	if (!ops || !ops->read_and_clear_dirty)
 		return -EOPNOTSUPP;
 
-	spin_lock_irqsave(&pdomain->lock, lflags);
-	if (!pdomain->dirty_tracking && dirty->bitmap) {
-		spin_unlock_irqrestore(&pdomain->lock, lflags);
-		return -EINVAL;
-	}
-	spin_unlock_irqrestore(&pdomain->lock, lflags);
-
 	return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
 }
 
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 50eb9aed47cc58..844f2cf061911f 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4797,15 +4797,6 @@ static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
 	unsigned long end = iova + size - 1;
 	unsigned long pgsize;
 
-	/*
-	 * IOMMUFD core calls into a dirty tracking disabled domain without an
-	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
-	 * have occurred when we stopped dirty tracking. This ensures that we
-	 * never inherit dirtied bits from a previous cycle.
-	 */
-	if (!dmar_domain->dirty_tracking && dirty->bitmap)
-		return -EINVAL;
-
 	do {
 		struct dma_pte *pte;
 		int lvl = 0;
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 05fd9d3abf1b80..d116179809042d 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -536,7 +536,10 @@ int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
 		return ret;
 
 	down_read(&iopt->iova_rwsem);
-	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
+	if (!iopt->dirty_tracking_enabled)
+		ret = -EINVAL;
+	else
+		ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
 	up_read(&iopt->iova_rwsem);
 
 	return ret;
@@ -580,7 +583,11 @@ int iopt_set_dirty_tracking(struct io_pagetable *iopt,
 	if (!ops)
 		return -EOPNOTSUPP;
 
-	down_read(&iopt->iova_rwsem);
+	down_write(&iopt->iova_rwsem);
+	if (iopt->dirty_tracking_enabled == enable) {
+		ret = 0;
+		goto out_unlock;
+	}
 
 	/* Clear dirty bits from PTEs to ensure a clean snapshot */
 	if (enable) {
@@ -590,9 +597,11 @@ int iopt_set_dirty_tracking(struct io_pagetable *iopt,
 	}
 
 	ret = ops->set_dirty_tracking(domain, enable);
-
+	if (ret)
+		goto out_unlock;
+	iopt->dirty_tracking_enabled = enable;
 out_unlock:
-	up_read(&iopt->iova_rwsem);
+	up_write(&iopt->iova_rwsem);
 	return ret;
 }
 
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 991f864d1f9bc1..de3761e15cab54 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -52,6 +52,7 @@ struct io_pagetable {
 	/* IOVA that cannot be allocated, struct iopt_reserved */
 	struct rb_root_cached reserved_itree;
 	u8 disable_large_pages;
+	u8 dirty_tracking_enabled;
 	unsigned long iova_alignment;
 };
Joao Martins May 22, 2024, 6:15 p.m. UTC | #10
On 22/05/2024 18:50, Jason Gunthorpe wrote:
> On Wed, May 22, 2024 at 06:10:59PM +0100, Joao Martins wrote:
>> On 22/05/2024 17:56, Jason Gunthorpe wrote:
>>> On Wed, May 22, 2024 at 03:37:57PM +0100, Joao Martins wrote:
>>>  
>>>> This is just to catch the case where IOMMUFD can call into read_and_clear()
>>>> without dirty tracking enabled and without a bitmap structure to clear dirty
>>>> bits -- in order to ensure a clean PTE data snapshot after start(). 
>>>
>>> Is that broken then?
>>>
>>
>> It's not: The check errors out the caller ends up calling read-and-clear with a
>> bitmap but without having started dirty tracking. the iopt_clear_dirty_data()
>> passes a null bitmap, it goes through and it walks and clears the IOPTEs
>> *without* recording them in the bitmap.
> 
> It is not "without recording them in the bitmap", saying that is the
> confusing thing. The purpose of that 'if' is to return -EINVAL if
> dirty tracking is not turned on and we query the bitmap.
> 
Right.

> More like this puts it in the common code and writes it in a more
> straightforward way with better locking:
> 
Yes, this snippet you pasted would be equivalent to the current way indeed.
Looks good.

I think I was trying so hard not to duplicate 'state of dirty tracking' between
the iommu driver and iommufd core that I unintentionally ended up convoluting it
with this check in the driver :/

> diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
> index d35c1b8c8e65ce..b2cb557d3ea427 100644
> --- a/drivers/iommu/amd/iommu.c
> +++ b/drivers/iommu/amd/iommu.c
> @@ -2645,13 +2645,6 @@ static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
>  	if (!ops || !ops->read_and_clear_dirty)
>  		return -EOPNOTSUPP;
>  
> -	spin_lock_irqsave(&pdomain->lock, lflags);
> -	if (!pdomain->dirty_tracking && dirty->bitmap) {
> -		spin_unlock_irqrestore(&pdomain->lock, lflags);
> -		return -EINVAL;
> -	}
> -	spin_unlock_irqrestore(&pdomain->lock, lflags);
> -
>  	return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
>  }
>  
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index 50eb9aed47cc58..844f2cf061911f 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -4797,15 +4797,6 @@ static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
>  	unsigned long end = iova + size - 1;
>  	unsigned long pgsize;
>  
> -	/*
> -	 * IOMMUFD core calls into a dirty tracking disabled domain without an
> -	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
> -	 * have occurred when we stopped dirty tracking. This ensures that we
> -	 * never inherit dirtied bits from a previous cycle.
> -	 */
> -	if (!dmar_domain->dirty_tracking && dirty->bitmap)
> -		return -EINVAL;
> -
>  	do {
>  		struct dma_pte *pte;
>  		int lvl = 0;
> diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
> index 05fd9d3abf1b80..d116179809042d 100644
> --- a/drivers/iommu/iommufd/io_pagetable.c
> +++ b/drivers/iommu/iommufd/io_pagetable.c
> @@ -536,7 +536,10 @@ int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
>  		return ret;
>  
>  	down_read(&iopt->iova_rwsem);
> -	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
> +	if (!iopt->dirty_tracking_enabled)
> +		ret = -EINVAL;
> +	else
> +		ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
>  	up_read(&iopt->iova_rwsem);
>  
>  	return ret;
> @@ -580,7 +583,11 @@ int iopt_set_dirty_tracking(struct io_pagetable *iopt,
>  	if (!ops)
>  		return -EOPNOTSUPP;
>  
> -	down_read(&iopt->iova_rwsem);
> +	down_write(&iopt->iova_rwsem);
> +	if (iopt->dirty_tracking_enabled == enable) {
> +		ret = 0;
> +		goto out_unlock;
> +	}
>  
>  	/* Clear dirty bits from PTEs to ensure a clean snapshot */
>  	if (enable) {
> @@ -590,9 +597,11 @@ int iopt_set_dirty_tracking(struct io_pagetable *iopt,
>  	}
>  
>  	ret = ops->set_dirty_tracking(domain, enable);
> -
> +	if (ret)
> +		goto out_unlock;
> +	iopt->dirty_tracking_enabled = enable;
>  out_unlock:
> -	up_read(&iopt->iova_rwsem);
> +	up_write(&iopt->iova_rwsem);
>  	return ret;
>  }
>  
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index 991f864d1f9bc1..de3761e15cab54 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -52,6 +52,7 @@ struct io_pagetable {
>  	/* IOVA that cannot be allocated, struct iopt_reserved */
>  	struct rb_root_cached reserved_itree;
>  	u8 disable_large_pages;
> +	u8 dirty_tracking_enabled;
>  	unsigned long iova_alignment;
>  };
>
Joao Martins May 22, 2024, 6:39 p.m. UTC | #11
On 22/05/2024 19:15, Joao Martins wrote:
> On 22/05/2024 18:50, Jason Gunthorpe wrote:
>> On Wed, May 22, 2024 at 06:10:59PM +0100, Joao Martins wrote:
>>> On 22/05/2024 17:56, Jason Gunthorpe wrote:
>>>> On Wed, May 22, 2024 at 03:37:57PM +0100, Joao Martins wrote:
>>>>  
>>>>> This is just to catch the case where IOMMUFD can call into read_and_clear()
>>>>> without dirty tracking enabled and without a bitmap structure to clear dirty
>>>>> bits -- in order to ensure a clean PTE data snapshot after start(). 
>>>>
>>>> Is that broken then?
>>>>
>>>
>>> It's not: the check errors out when the caller ends up calling read-and-clear
>>> with a bitmap but without having started dirty tracking. iopt_clear_dirty_data()
>>> passes a null bitmap, so it goes through and walks and clears the IOPTEs
>>> *without* recording them in the bitmap.
>>
>> It is not "without recording them in the bitmap", saying that is the
>> confusing thing. The purpose of that 'if' is to return -EINVAL if
>> dirty tracking is not turned on and we query the bitmap.
>>
> Right.
> 
>> More like this puts it in the common code and writes it in a more
>> straightforward way with better locking:
>>
> Yes, this snippet you pasted would be equivalent to the current way indeed.
> Looks good.
> 
> I think I was trying so hard not to duplicate 'state of dirty tracking' between
> the iommu driver and iommufd core that I unintentionally ended up convoluting it
> with this check in the driver :/
> 

To this point above (...)

>> diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
>> index 05fd9d3abf1b80..d116179809042d 100644
>> --- a/drivers/iommu/iommufd/io_pagetable.c
>> +++ b/drivers/iommu/iommufd/io_pagetable.c
>> @@ -536,7 +536,10 @@ int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
>>  		return ret;
>>  
>>  	down_read(&iopt->iova_rwsem);
>> -	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
>> +	if (!iopt->dirty_tracking_enabled)
>> +		ret = -EINVAL;
>> +	else
>> +		ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
>>  	up_read(&iopt->iova_rwsem);
>>  
>>  	return ret;
>> @@ -580,7 +583,11 @@ int iopt_set_dirty_tracking(struct io_pagetable *iopt,
>>  	if (!ops)
>>  		return -EOPNOTSUPP;
>>  
>> -	down_read(&iopt->iova_rwsem);
>> +	down_write(&iopt->iova_rwsem);
>> +	if (iopt->dirty_tracking_enabled == enable) {
>> +		ret = 0;
>> +		goto out_unlock;
>> +	}
>>  

We have a similar check in the set_dirty_tracking iommu op of the intel/amd iommu
drivers that could be removed. Though I am not sure it's right to remove it even
as we move this up the stack.

>>  	/* Clear dirty bits from PTEs to ensure a clean snapshot */
>>  	if (enable) {
>> @@ -590,9 +597,11 @@ int iopt_set_dirty_tracking(struct io_pagetable *iopt,
>>  	}
>>  
>>  	ret = ops->set_dirty_tracking(domain, enable);
>> -
>> +	if (ret)
>> +		goto out_unlock;
>> +	iopt->dirty_tracking_enabled = enable;
>>  out_unlock:
>> -	up_read(&iopt->iova_rwsem);
>> +	up_write(&iopt->iova_rwsem);
>>  	return ret;
>>  }
>>  
>> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
>> index 991f864d1f9bc1..de3761e15cab54 100644
>> --- a/drivers/iommu/iommufd/iommufd_private.h
>> +++ b/drivers/iommu/iommufd/iommufd_private.h
>> @@ -52,6 +52,7 @@ struct io_pagetable {
>>  	/* IOVA that cannot be allocated, struct iopt_reserved */
>>  	struct rb_root_cached reserved_itree;
>>  	u8 disable_large_pages;
>> +	u8 dirty_tracking_enabled;
>>  	unsigned long iova_alignment;
>>  };
>>  
>
Tian, Kevin May 23, 2024, 3:30 a.m. UTC | #12
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Thursday, May 23, 2024 1:51 AM
> 
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index 991f864d1f9bc1..de3761e15cab54 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -52,6 +52,7 @@ struct io_pagetable {
>  	/* IOVA that cannot be allocated, struct iopt_reserved */
>  	struct rb_root_cached reserved_itree;
>  	u8 disable_large_pages;
> +	u8 dirty_tracking_enabled;
>  	unsigned long iova_alignment;
>  };
> 

should it be a hwpt flag instead?
Joao Martins May 24, 2024, 11:30 a.m. UTC | #13
On 23/05/2024 04:30, Tian, Kevin wrote:
>> From: Jason Gunthorpe <jgg@nvidia.com>
>> Sent: Thursday, May 23, 2024 1:51 AM
>>
>> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
>> index 991f864d1f9bc1..de3761e15cab54 100644
>> --- a/drivers/iommu/iommufd/iommufd_private.h
>> +++ b/drivers/iommu/iommufd/iommufd_private.h
>> @@ -52,6 +52,7 @@ struct io_pagetable {
>>  	/* IOVA that cannot be allocated, struct iopt_reserved */
>>  	struct rb_root_cached reserved_itree;
>>  	u8 disable_large_pages;
>> +	u8 dirty_tracking_enabled;
>>  	unsigned long iova_alignment;
>>  };
>>
> 
> should it be a hwpt flag instead?
> 

Most of this deals with iopt locks and walking iopt areas to clear dirty. So
this being an iopt attribute looks cleaner in implementation. But I think I see
your point, considering it represents an iommu domain property.

	Joao
Jason Gunthorpe May 24, 2024, 2:07 p.m. UTC | #14
On Fri, May 24, 2024 at 12:30:22PM +0100, Joao Martins wrote:
> On 23/05/2024 04:30, Tian, Kevin wrote:
> >> From: Jason Gunthorpe <jgg@nvidia.com>
> >> Sent: Thursday, May 23, 2024 1:51 AM
> >>
> >> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> >> index 991f864d1f9bc1..de3761e15cab54 100644
> >> --- a/drivers/iommu/iommufd/iommufd_private.h
> >> +++ b/drivers/iommu/iommufd/iommufd_private.h
> >> @@ -52,6 +52,7 @@ struct io_pagetable {
> >>  	/* IOVA that cannot be allocated, struct iopt_reserved */
> >>  	struct rb_root_cached reserved_itree;
> >>  	u8 disable_large_pages;
> >> +	u8 dirty_tracking_enabled;
> >>  	unsigned long iova_alignment;
> >>  };
> >>
> > 
> > should it be a hwpt flag instead?
> > 
> 
> Most of this deals with iopt locks and walking iopt areas to clear dirty. So
> this being an iopt attribute looks cleaner in implementation. But I think I see
> your point, considering it represents an iommu domain property.

Yeah, the original idea of the hwpt/iopt split was to keep code that
was principally iommu_domain related away from the code which was
principally attachment and lifetime related. Since this value is
covered by iopt locks it makes sense in this struct.

Not sure the split will stand the test of time as we keep finding
reasons to muddle the boundary :\

Jason
Tian, Kevin May 27, 2024, 1:21 a.m. UTC | #15
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Friday, May 24, 2024 10:08 PM
> 
> On Fri, May 24, 2024 at 12:30:22PM +0100, Joao Martins wrote:
> > On 23/05/2024 04:30, Tian, Kevin wrote:
> > >> From: Jason Gunthorpe <jgg@nvidia.com>
> > >> Sent: Thursday, May 23, 2024 1:51 AM
> > >>
> > >> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> > >> index 991f864d1f9bc1..de3761e15cab54 100644
> > >> --- a/drivers/iommu/iommufd/iommufd_private.h
> > >> +++ b/drivers/iommu/iommufd/iommufd_private.h
> > >> @@ -52,6 +52,7 @@ struct io_pagetable {
> > >>  	/* IOVA that cannot be allocated, struct iopt_reserved */
> > >>  	struct rb_root_cached reserved_itree;
> > >>  	u8 disable_large_pages;
> > >> +	u8 dirty_tracking_enabled;
> > >>  	unsigned long iova_alignment;
> > >>  };
> > >>
> > >
> > > should it be a hwpt flag instead?
> > >
> >
> > Most of this deals with iopt locks and walking iopt areas to clear dirty. So
> > this being an iopt attribute looks cleaner in implementation. But I think I see
> > your point, considering it represents an iommu domain property.
> 
> Yeah, the original idea of the hwpt/iopt split was to keep code that
> was principally iommu_domain related away from the code which was
> principally attachment and lifetime related. Since this value is
> covered by iopt locks it makes sense in this struct.
> 
> Not sure the split will stand the test of time as we keep finding
> reasons to muddle the boundary :\
> 

emmm there could be multiple domains under an iopt while the dirty
tracking toggling must be done per domain.

With an iopt flag like this only the 1st domain under the iopt will be
properly configured?
Joao Martins May 27, 2024, 9:50 a.m. UTC | #16
On 27/05/2024 02:21, Tian, Kevin wrote:
>> From: Jason Gunthorpe <jgg@nvidia.com>
>> Sent: Friday, May 24, 2024 10:08 PM
>>
>> On Fri, May 24, 2024 at 12:30:22PM +0100, Joao Martins wrote:
>>> On 23/05/2024 04:30, Tian, Kevin wrote:
>>>>> From: Jason Gunthorpe <jgg@nvidia.com>
>>>>> Sent: Thursday, May 23, 2024 1:51 AM
>>>>>
>>>>> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
>>>>> index 991f864d1f9bc1..de3761e15cab54 100644
>>>>> --- a/drivers/iommu/iommufd/iommufd_private.h
>>>>> +++ b/drivers/iommu/iommufd/iommufd_private.h
>>>>> @@ -52,6 +52,7 @@ struct io_pagetable {
>>>>>  	/* IOVA that cannot be allocated, struct iopt_reserved */
>>>>>  	struct rb_root_cached reserved_itree;
>>>>>  	u8 disable_large_pages;
>>>>> +	u8 dirty_tracking_enabled;
>>>>>  	unsigned long iova_alignment;
>>>>>  };
>>>>>
>>>>
>>>> should it be a hwpt flag instead?
>>>>
>>>
>>> Most of this deals with iopt locks and walking iopt areas to clear dirty. So
>>> this being an iopt attribute looks cleaner in implementation. But I think I see
>>> your point, considering it represents an iommu domain property.
>>
>> Yeah, the original idea of the hwpt/iopt split was to keep code that
>> was principally iommu_domain related away from the code which was
>> principally attachment and lifetime related. Since this value is
>> covered by iopt locks it makes sense in this struct.
>>
>> Not sure the split will stand the test of time as we keep finding
>> reasons to muddle the boundary :\
>>
> 
> emmm there could be multiple domains under an iopt while the dirty
> tracking toggling must be done per domain.
> 
> With an iopt flag like this only the 1st domain under the iopt will be
> properly configured?
> 

Using intel as the reference, given that it's the only one that has that
implemented: so far it seems to be the other way around, i.e. a parent domain
sets its children's DPT in the iommu driver?

	Joao
Jason Gunthorpe June 1, 2024, 6:55 p.m. UTC | #17
On Mon, May 27, 2024 at 01:21:14AM +0000, Tian, Kevin wrote:

> emmm there could be multiple domains under an iopt while the dirty
> tracking toggling must be done per domain.

For some reason I thought we had made this per ioas, so yeah it is not
right like I showed.

Jason
Joao Martins June 3, 2024, 6:50 p.m. UTC | #18
On 01/06/2024 19:55, Jason Gunthorpe wrote:
> On Mon, May 27, 2024 at 01:21:14AM +0000, Tian, Kevin wrote:
> 
>> emmm there could be multiple domains under an iopt while the dirty
>> tracking toggling must be done per domain.
> 
> For some reason I thought we had made this per ioas, so yeah it is not
> right like I showed.
> 

Something like below instead?

diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 721ede5a1faf..9756ff18b5f2 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2645,18 +2645,10 @@ static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
 {
        struct protection_domain *pdomain = to_pdomain(domain);
        struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
-       unsigned long lflags;

        if (!ops || !ops->read_and_clear_dirty)
                return -EOPNOTSUPP;

-       spin_lock_irqsave(&pdomain->lock, lflags);
-       if (!pdomain->dirty_tracking && dirty->bitmap) {
-               spin_unlock_irqrestore(&pdomain->lock, lflags);
-               return -EINVAL;
-       }
-       spin_unlock_irqrestore(&pdomain->lock, lflags);
-
        return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
 }

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 15411d8d1b3d..6db733f73922 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4799,15 +4799,6 @@ static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
        unsigned long end = iova + size - 1;
        unsigned long pgsize;

-       /*
-        * IOMMUFD core calls into a dirty tracking disabled domain without an
-        * IOVA bitmap set in order to clean dirty bits in all PTEs that might
-        * have occurred when we stopped dirty tracking. This ensures that we
-        * never inherit dirtied bits from a previous cycle.
-        */
-       if (!dmar_domain->dirty_tracking && dirty->bitmap)
-               return -EINVAL;
-
        do {
                struct dma_pte *pte;
                int lvl = 0;
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 33d142f8057d..de3064ee849e 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -331,7 +331,9 @@ int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd)
 {
        struct iommu_hwpt_set_dirty_tracking *cmd = ucmd->cmd;
        struct iommufd_hwpt_paging *hwpt_paging;
-       struct iommufd_ioas *ioas;
+       const struct iommu_dirty_ops *ops;
+       struct iommu_domain *domain;
+       struct io_pagetable *iopt;
        int rc = -EOPNOTSUPP;
        bool enable;

@@ -342,12 +344,35 @@ int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd)
        if (IS_ERR(hwpt_paging))
                return PTR_ERR(hwpt_paging);

-       ioas = hwpt_paging->ioas;
+       iopt = &hwpt_paging->ioas->iopt;
        enable = cmd->flags & IOMMU_HWPT_DIRTY_TRACKING_ENABLE;
+       domain = hwpt_paging->common.domain;

-       rc = iopt_set_dirty_tracking(&ioas->iopt, hwpt_paging->common.domain,
-                                    enable);
+       ops = domain->dirty_ops;
+       if (!ops) {
+               rc = -EOPNOTSUPP;
+               goto out_unlock;
+       }
+
+       down_write(&iopt->iova_rwsem);
+       if (hwpt_paging->dirty_tracking == enable) {
+               rc = 0;
+               goto out_unlock;
+       }
+
+       /* Clear dirty bits from PTEs to ensure a clean snapshot */
+       if (enable) {
+               rc = iopt_clear_dirty_data(iopt, domain);
+               if (rc)
+                       goto out_unlock;
+       }

+       rc = ops->set_dirty_tracking(domain, enable);
+       if (rc)
+               goto out_unlock;
+       hwpt_paging->dirty_tracking = enable;
+out_unlock:
+       up_write(&iopt->iova_rwsem);
        iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj);
        return rc;
 }
@@ -356,7 +381,7 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd)
 {
        struct iommu_hwpt_get_dirty_bitmap *cmd = ucmd->cmd;
        struct iommufd_hwpt_paging *hwpt_paging;
-       struct iommufd_ioas *ioas;
+       struct io_pagetable *iopt;
        int rc = -EOPNOTSUPP;

        if ((cmd->flags & ~(IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR)) ||
@@ -367,10 +392,19 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd)
        if (IS_ERR(hwpt_paging))
                return PTR_ERR(hwpt_paging);

-       ioas = hwpt_paging->ioas;
+       iopt = &hwpt_paging->ioas->iopt;
+
+       down_read(&iopt->iova_rwsem);
+       if (!hwpt_paging->dirty_tracking) {
+               rc = -EINVAL;
+               goto out_put_hwpt;
+       }
+
        rc = iopt_read_and_clear_dirty_data(
-               &ioas->iopt, hwpt_paging->common.domain, cmd->flags, cmd);
+               iopt, hwpt_paging->common.domain, cmd->flags, cmd);

+out_put_hwpt:
+       up_read(&iopt->iova_rwsem);
        iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj);
        return rc;
 }
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index b40eab6ebe4c..25547d79f294 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -532,19 +532,17 @@ int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
 {
        int ret;

+       lockdep_assert_held_read(&iopt->iova_rwsem);
+
        ret = iommufd_check_iova_range(iopt, bitmap);
        if (ret)
                return ret;

-       down_read(&iopt->iova_rwsem);
-       ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
-       up_read(&iopt->iova_rwsem);
-
-       return ret;
+       return iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
 }

-static int iopt_clear_dirty_data(struct io_pagetable *iopt,
-                                struct iommu_domain *domain)
+int iopt_clear_dirty_data(struct io_pagetable *iopt,
+                         struct iommu_domain *domain)
 {
        const struct iommu_dirty_ops *ops = domain->dirty_ops;
        struct iommu_iotlb_gather gather;
@@ -576,31 +574,6 @@ static int iopt_clear_dirty_data(struct io_pagetable *iopt,
        return ret;
 }

-int iopt_set_dirty_tracking(struct io_pagetable *iopt,
-                           struct iommu_domain *domain, bool enable)
-{
-       const struct iommu_dirty_ops *ops = domain->dirty_ops;
-       int ret = 0;
-
-       if (!ops)
-               return -EOPNOTSUPP;
-
-       down_read(&iopt->iova_rwsem);
-
-       /* Clear dirty bits from PTEs to ensure a clean snapshot */
-       if (enable) {
-               ret = iopt_clear_dirty_data(iopt, domain);
-               if (ret)
-                       goto out_unlock;
-       }
-
-       ret = ops->set_dirty_tracking(domain, enable);
-
-out_unlock:
-       up_read(&iopt->iova_rwsem);
-       return ret;
-}
-
 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
                   unsigned long length, struct list_head *pages_list)
 {
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 991f864d1f9b..9f946df4ca77 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -78,8 +78,8 @@ int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
                                   struct iommu_domain *domain,
                                   unsigned long flags,
                                   struct iommu_hwpt_get_dirty_bitmap *bitmap);
-int iopt_set_dirty_tracking(struct io_pagetable *iopt,
-                           struct iommu_domain *domain, bool enable);
+int iopt_clear_dirty_data(struct io_pagetable *iopt,
+                         struct iommu_domain *domain);

 void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
                                 unsigned long length);
@@ -301,6 +301,7 @@ struct iommufd_hwpt_paging {
        bool enforce_cache_coherency : 1;
        bool msi_cookie : 1;
        bool nest_parent : 1;
+       bool dirty_tracking : 1;
        /* Head at iommufd_ioas::hwpt_list */
        struct list_head hwpt_item;
 };

Patch

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index f7828a7aad41..da6cc52859ba 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -75,6 +75,7 @@ 
 
 #define ARM_LPAE_PTE_NSTABLE		(((arm_lpae_iopte)1) << 63)
 #define ARM_LPAE_PTE_XN			(((arm_lpae_iopte)3) << 53)
+#define ARM_LPAE_PTE_DBM		(((arm_lpae_iopte)1) << 51)
 #define ARM_LPAE_PTE_AF			(((arm_lpae_iopte)1) << 10)
 #define ARM_LPAE_PTE_SH_NS		(((arm_lpae_iopte)0) << 8)
 #define ARM_LPAE_PTE_SH_OS		(((arm_lpae_iopte)2) << 8)
@@ -84,7 +85,7 @@ 
 
 #define ARM_LPAE_PTE_ATTR_LO_MASK	(((arm_lpae_iopte)0x3ff) << 2)
 /* Ignore the contiguous bit for block splitting */
-#define ARM_LPAE_PTE_ATTR_HI_MASK	(((arm_lpae_iopte)6) << 52)
+#define ARM_LPAE_PTE_ATTR_HI_MASK	(ARM_LPAE_PTE_XN | ARM_LPAE_PTE_DBM)
 #define ARM_LPAE_PTE_ATTR_MASK		(ARM_LPAE_PTE_ATTR_LO_MASK |	\
 					 ARM_LPAE_PTE_ATTR_HI_MASK)
 /* Software bit for solving coherency races */
@@ -92,7 +93,11 @@ 
 
 /* Stage-1 PTE */
 #define ARM_LPAE_PTE_AP_UNPRIV		(((arm_lpae_iopte)1) << 6)
-#define ARM_LPAE_PTE_AP_RDONLY		(((arm_lpae_iopte)2) << 6)
+#define ARM_LPAE_PTE_AP_RDONLY_BIT	7
+#define ARM_LPAE_PTE_AP_RDONLY		(((arm_lpae_iopte)1) << \
+					   ARM_LPAE_PTE_AP_RDONLY_BIT)
+#define ARM_LPAE_PTE_AP_WRITABLE_CLEAN	(ARM_LPAE_PTE_AP_RDONLY | \
+					 ARM_LPAE_PTE_DBM)
 #define ARM_LPAE_PTE_ATTRINDX_SHIFT	2
 #define ARM_LPAE_PTE_nG			(((arm_lpae_iopte)1) << 11)
 
@@ -138,6 +143,9 @@ 
 
 #define iopte_prot(pte)	((pte) & ARM_LPAE_PTE_ATTR_MASK)
 
+#define iopte_hw_dirty(pte)	(((pte) & ARM_LPAE_PTE_AP_WRITABLE_CLEAN) == \
+				   ARM_LPAE_PTE_DBM)
+
 struct arm_lpae_io_pgtable {
 	struct io_pgtable	iop;
 
@@ -729,6 +737,98 @@  static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,
 	return iopte_to_paddr(pte, data) | iova;
 }
 
+struct io_pgtable_walk_data {
+	struct iommu_dirty_bitmap	*dirty;
+	unsigned long			flags;
+	u64				addr;
+	const u64			end;
+};
+
+static int __arm_lpae_iopte_walk_dirty(struct arm_lpae_io_pgtable *data,
+				       struct io_pgtable_walk_data *walk_data,
+				       arm_lpae_iopte *ptep,
+				       int lvl);
+
+static int io_pgtable_visit_dirty(struct arm_lpae_io_pgtable *data,
+				  struct io_pgtable_walk_data *walk_data,
+				  arm_lpae_iopte *ptep, int lvl)
+{
+	struct io_pgtable *iop = &data->iop;
+	arm_lpae_iopte pte = READ_ONCE(*ptep);
+
+	if (WARN_ON(!pte))
+		return -EINVAL;
+
+	if (iopte_leaf(pte, lvl, iop->fmt)) {
+		size_t size = ARM_LPAE_BLOCK_SIZE(lvl, data);
+
+		if (iopte_hw_dirty(pte)) {
+			iommu_dirty_bitmap_record(walk_data->dirty,
+						  walk_data->addr, size);
+			if (!(walk_data->flags & IOMMU_DIRTY_NO_CLEAR))
+				set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT,
+					(unsigned long *)ptep);
+		}
+		walk_data->addr += size;
+		return 0;
+	}
+
+	ptep = iopte_deref(pte, data);
+	return __arm_lpae_iopte_walk_dirty(data, walk_data, ptep, lvl + 1);
+}
+
+static int __arm_lpae_iopte_walk_dirty(struct arm_lpae_io_pgtable *data,
+				       struct io_pgtable_walk_data *walk_data,
+				       arm_lpae_iopte *ptep,
+				       int lvl)
+{
+	u32 idx;
+	int max_entries, ret;
+
+	if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
+		return -EINVAL;
+
+	if (lvl == data->start_level)
+		max_entries = ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte);
+	else
+		max_entries = ARM_LPAE_PTES_PER_TABLE(data);
+
+	for (idx = ARM_LPAE_LVL_IDX(walk_data->addr, lvl, data);
+	     (idx < max_entries) && (walk_data->addr < walk_data->end); ++idx) {
+		ret = io_pgtable_visit_dirty(data, walk_data, ptep + idx, lvl);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops,
+					 unsigned long iova, size_t size,
+					 unsigned long flags,
+					 struct iommu_dirty_bitmap *dirty)
+{
+	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+	struct io_pgtable_walk_data walk_data = {
+		.dirty = dirty,
+		.flags = flags,
+		.addr = iova,
+		.end = iova + size,
+	};
+	arm_lpae_iopte *ptep = data->pgd;
+	int lvl = data->start_level;
+
+	if (WARN_ON(!size))
+		return -EINVAL;
+	if (WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1)))
+		return -EINVAL;
+	if (data->iop.fmt != ARM_64_LPAE_S1)
+		return -EINVAL;
+
+	return __arm_lpae_iopte_walk_dirty(data, &walk_data, ptep, lvl);
+}
+
 static void arm_lpae_restrict_pgsizes(struct io_pgtable_cfg *cfg)
 {
 	unsigned long granule, page_sizes;
@@ -807,6 +907,7 @@  arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg)
 		.map_pages	= arm_lpae_map_pages,
 		.unmap_pages	= arm_lpae_unmap_pages,
 		.iova_to_phys	= arm_lpae_iova_to_phys,
+		.read_and_clear_dirty = arm_lpae_read_and_clear_dirty,
 	};
 
 	return data;