Message ID | 20240430134308.1604-3-shameerali.kolothum.thodi@huawei.com
---|---
State | New, archived
Series | iommu/smmuv3: Add IOMMUFD dirty tracking support for SMMUv3
On 30/04/2024 14:43, Shameer Kolothum wrote: > .read_and_clear_dirty() IOMMU domain op takes care of reading the dirty > bits (i.e. PTE has DBM set and AP[2] clear) and marshalling into a > bitmap of a given page size. > > While reading the dirty bits we also set the PTE AP[2] bit to mark it > as writable-clean depending on read_and_clear_dirty() flags. > > PTE states with respect to DBM bit: > > DBM bit AP[2]("RDONLY" bit) > 1. writable_clean 1 1 > 2. writable_dirty 1 0 > 3. read-only 0 1 > > Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> Couple of nits/comments, but regardless: Reviewed-by: Ryan Roberts <ryan.roberts@arm.com> > --- > drivers/iommu/io-pgtable-arm.c | 105 ++++++++++++++++++++++++++++++++- > 1 file changed, 103 insertions(+), 2 deletions(-) > > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c > index f7828a7aad41..da6cc52859ba 100644 > --- a/drivers/iommu/io-pgtable-arm.c > +++ b/drivers/iommu/io-pgtable-arm.c > @@ -75,6 +75,7 @@ > > #define ARM_LPAE_PTE_NSTABLE (((arm_lpae_iopte)1) << 63) > #define ARM_LPAE_PTE_XN (((arm_lpae_iopte)3) << 53) > +#define ARM_LPAE_PTE_DBM (((arm_lpae_iopte)1) << 51) > #define ARM_LPAE_PTE_AF (((arm_lpae_iopte)1) << 10) > #define ARM_LPAE_PTE_SH_NS (((arm_lpae_iopte)0) << 8) > #define ARM_LPAE_PTE_SH_OS (((arm_lpae_iopte)2) << 8) > @@ -84,7 +85,7 @@ > > #define ARM_LPAE_PTE_ATTR_LO_MASK (((arm_lpae_iopte)0x3ff) << 2) > /* Ignore the contiguous bit for block splitting */ > -#define ARM_LPAE_PTE_ATTR_HI_MASK (((arm_lpae_iopte)6) << 52) > +#define ARM_LPAE_PTE_ATTR_HI_MASK (ARM_LPAE_PTE_XN | ARM_LPAE_PTE_DBM) > #define ARM_LPAE_PTE_ATTR_MASK (ARM_LPAE_PTE_ATTR_LO_MASK | \ > ARM_LPAE_PTE_ATTR_HI_MASK) > /* Software bit for solving coherency races */ > @@ -92,7 +93,11 @@ > > /* Stage-1 PTE */ > #define ARM_LPAE_PTE_AP_UNPRIV (((arm_lpae_iopte)1) << 6) > -#define ARM_LPAE_PTE_AP_RDONLY (((arm_lpae_iopte)2) << 6) > +#define ARM_LPAE_PTE_AP_RDONLY_BIT 7 > +#define ARM_LPAE_PTE_AP_RDONLY (((arm_lpae_iopte)1) << \ > + ARM_LPAE_PTE_AP_RDONLY_BIT) > +#define ARM_LPAE_PTE_AP_WRITABLE_CLEAN (ARM_LPAE_PTE_AP_RDONLY | \ > + ARM_LPAE_PTE_DBM) > #define ARM_LPAE_PTE_ATTRINDX_SHIFT 2 > #define ARM_LPAE_PTE_nG (((arm_lpae_iopte)1) << 11) > > @@ -138,6 +143,9 @@ > > #define iopte_prot(pte) ((pte) & ARM_LPAE_PTE_ATTR_MASK) > > +#define iopte_hw_dirty(pte) (((pte) & ARM_LPAE_PTE_AP_WRITABLE_CLEAN) == \ > + ARM_LPAE_PTE_DBM) Could this simply be called "iopte_dirty"? The hw and sw dirty concept only come about in the arch code because we keep an additional sw bit to track dirty in the pte, which you are not doing here. > + > struct arm_lpae_io_pgtable { > struct io_pgtable iop; > > @@ -729,6 +737,98 @@ static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops, > return iopte_to_paddr(pte, data) | iova; > } > > +struct io_pgtable_walk_data { > + struct iommu_dirty_bitmap *dirty; > + unsigned long flags; > + u64 addr; > + const u64 end; > +}; > + > +static int __arm_lpae_iopte_walk_dirty(struct arm_lpae_io_pgtable *data, > + struct io_pgtable_walk_data *walk_data, > + arm_lpae_iopte *ptep, > + int lvl); > + > +static int io_pgtable_visit_dirty(struct arm_lpae_io_pgtable *data, > + struct io_pgtable_walk_data *walk_data, > + arm_lpae_iopte *ptep, int lvl) > +{ > + struct io_pgtable *iop = &data->iop; > + arm_lpae_iopte pte = READ_ONCE(*ptep); > + > + if (WARN_ON(!pte)) Wouldn't it actually be better to check that the pte is valid (i.e. bit 0 is set)? And do we really need the warn here? 
I'm guessing you never expect to be walking over a hole? - if it is legitimately possible to walk over a hole, then the WARN is definitely incorrect and you would want to continue walking rather than return. > + return -EINVAL; > + > + if (iopte_leaf(pte, lvl, iop->fmt)) { > + size_t size = ARM_LPAE_BLOCK_SIZE(lvl, data); > + > + if (iopte_hw_dirty(pte)) { > + iommu_dirty_bitmap_record(walk_data->dirty, > + walk_data->addr, size); > + if (!(walk_data->flags & IOMMU_DIRTY_NO_CLEAR)) > + set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT, > + (unsigned long *)ptep); Just checking, set_bit() is atomic right? So safe against racing updates with the HW. Pretty sure that's the case... > + } > + walk_data->addr += size; > + return 0; > + } > + > + ptep = iopte_deref(pte, data); > + return __arm_lpae_iopte_walk_dirty(data, walk_data, ptep, lvl + 1); > +} > + > +static int __arm_lpae_iopte_walk_dirty(struct arm_lpae_io_pgtable *data, > + struct io_pgtable_walk_data *walk_data, > + arm_lpae_iopte *ptep, > + int lvl) > +{ > + u32 idx; > + int max_entries, ret; > + > + if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) > + return -EINVAL; > + > + if (lvl == data->start_level) > + max_entries = ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte); > + else > + max_entries = ARM_LPAE_PTES_PER_TABLE(data); > + > + for (idx = ARM_LPAE_LVL_IDX(walk_data->addr, lvl, data); > + (idx < max_entries) && (walk_data->addr < walk_data->end); ++idx) { > + ret = io_pgtable_visit_dirty(data, walk_data, ptep + idx, lvl); > + if (ret) > + return ret; > + } > + > + return 0; > +} > + > +static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops, > + unsigned long iova, size_t size, > + unsigned long flags, > + struct iommu_dirty_bitmap *dirty) > +{ > + struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); > + struct io_pgtable_cfg *cfg = &data->iop.cfg; > + struct io_pgtable_walk_data walk_data = { > + .dirty = dirty, > + .flags = flags, > + .addr = iova, > + .end = iova + size, > + }; > + arm_lpae_iopte *ptep = data->pgd; > + int lvl = data->start_level; > + > + if (WARN_ON(!size)) > + return -EINVAL; > + if (WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1))) > + return -EINVAL; > + if (data->iop.fmt != ARM_64_LPAE_S1) > + return -EINVAL; > + > + return __arm_lpae_iopte_walk_dirty(data, &walk_data, ptep, lvl); > +} > + > static void arm_lpae_restrict_pgsizes(struct io_pgtable_cfg *cfg) > { > unsigned long granule, page_sizes; > @@ -807,6 +907,7 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg) > .map_pages = arm_lpae_map_pages, > .unmap_pages = arm_lpae_unmap_pages, > .iova_to_phys = arm_lpae_iova_to_phys, > + .read_and_clear_dirty = arm_lpae_read_and_clear_dirty, > }; > > return data;
On Tue, Apr 30, 2024 at 02:43:06PM +0100, Shameer Kolothum wrote: > +static int io_pgtable_visit_dirty(struct arm_lpae_io_pgtable *data, > + struct io_pgtable_walk_data *walk_data, > + arm_lpae_iopte *ptep, int lvl) > +{ > + struct io_pgtable *iop = &data->iop; > + arm_lpae_iopte pte = READ_ONCE(*ptep); > + > + if (WARN_ON(!pte)) > + return -EINVAL; This seems poorly placed, why would pte ever be zero? > + if (iopte_leaf(pte, lvl, iop->fmt)) { > + size_t size = ARM_LPAE_BLOCK_SIZE(lvl, data); > + > + if (iopte_hw_dirty(pte)) { > + iommu_dirty_bitmap_record(walk_data->dirty, > + walk_data->addr, size); > + if (!(walk_data->flags & IOMMU_DIRTY_NO_CLEAR)) > + set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT, > + (unsigned long *)ptep); > + } > + walk_data->addr += size; > + return 0; > + } > + > + ptep = iopte_deref(pte, data); This would be a better spot, if the pte doesn't indicate a next level table then something has gone wrong, otherwise the returned pointer has to be valid. Something like this maybe? static inline bool iopte_table(arm_lpae_iopte pte, int lvl, enum io_pgtable_fmt fmt) { if (lvl == (ARM_LPAE_MAX_LEVELS - 1)) return false; return iopte_type(pte) == ARM_LPAE_PTE_TYPE_TABLE; } > + return __arm_lpae_iopte_walk_dirty(data, walk_data, ptep, lvl + 1); > +} > + > +static int __arm_lpae_iopte_walk_dirty(struct arm_lpae_io_pgtable *data, > + struct io_pgtable_walk_data *walk_data, > + arm_lpae_iopte *ptep, > + int lvl) > +{ > + u32 idx; > + int max_entries, ret; > + > + if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) > + return -EINVAL; And if the above ipte_deref() could check for a valid next level (with no valid level at MAX_LEVELS) then this can never happen either and can be removed. Jason
> From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> > Sent: Tuesday, April 30, 2024 9:43 PM > > @@ -92,7 +93,11 @@ > > /* Stage-1 PTE */ > #define ARM_LPAE_PTE_AP_UNPRIV (((arm_lpae_iopte)1) << 6) > -#define ARM_LPAE_PTE_AP_RDONLY (((arm_lpae_iopte)2) << 6) > +#define ARM_LPAE_PTE_AP_RDONLY_BIT 7 > +#define ARM_LPAE_PTE_AP_RDONLY (((arm_lpae_iopte)1) << \ > + ARM_LPAE_PTE_AP_RDONLY_BIT) > +#define ARM_LPAE_PTE_AP_WRITABLE_CLEAN > (ARM_LPAE_PTE_AP_RDONLY | \ > + ARM_LPAE_PTE_DBM) based on the usage is it clearer to be xxx_WRITEABLE_MASK? > #define ARM_LPAE_PTE_ATTRINDX_SHIFT 2 > #define ARM_LPAE_PTE_nG (((arm_lpae_iopte)1) << 11) > > @@ -138,6 +143,9 @@ > > #define iopte_prot(pte) ((pte) & ARM_LPAE_PTE_ATTR_MASK) > > +#define iopte_hw_dirty(pte) (((pte) & > ARM_LPAE_PTE_AP_WRITABLE_CLEAN) == \ > + ARM_LPAE_PTE_DBM) > + iopte_is_writeable_dirty()? and the following "set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT, (unsigned long *)ptep);" could be wrapped as: iopte_set_writable_clean(ptep); > + > +static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops, > + unsigned long iova, size_t size, > + unsigned long flags, > + struct iommu_dirty_bitmap *dirty) > +{ > + struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); > + struct io_pgtable_cfg *cfg = &data->iop.cfg; > + struct io_pgtable_walk_data walk_data = { > + .dirty = dirty, > + .flags = flags, > + .addr = iova, > + .end = iova + size, > + }; > + arm_lpae_iopte *ptep = data->pgd; > + int lvl = data->start_level; > + > + if (WARN_ON(!size)) > + return -EINVAL; > + if (WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1))) > + return -EINVAL; > + if (data->iop.fmt != ARM_64_LPAE_S1) > + return -EINVAL; > + > + return __arm_lpae_iopte_walk_dirty(data, &walk_data, ptep, lvl); Intel/AMD drivers also checks: if (!dmar_domain->dirty_tracking && dirty->bitmap) return -EINVAL;
On Wed, May 22, 2024 at 07:12:27AM +0000, Tian, Kevin wrote:
> > +static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops,
> > +                                         unsigned long iova, size_t size,
> > +                                         unsigned long flags,
> > +                                         struct iommu_dirty_bitmap *dirty)
> > +{
> > +        struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
> > +        struct io_pgtable_cfg *cfg = &data->iop.cfg;
> > +        struct io_pgtable_walk_data walk_data = {
> > +                .dirty = dirty,
> > +                .flags = flags,
> > +                .addr = iova,
> > +                .end = iova + size,
> > +        };
> > +        arm_lpae_iopte *ptep = data->pgd;
> > +        int lvl = data->start_level;
> > +
> > +        if (WARN_ON(!size))
> > +                return -EINVAL;
> > +        if (WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1)))
> > +                return -EINVAL;
> > +        if (data->iop.fmt != ARM_64_LPAE_S1)
> > +                return -EINVAL;
> > +
> > +        return __arm_lpae_iopte_walk_dirty(data, &walk_data, ptep, lvl);
>
> Intel/AMD drivers also checks:
>
>   if (!dmar_domain->dirty_tracking && dirty->bitmap)
>        return -EINVAL;

Those both seem redundant checks on a fast path?

Jason
> -----Original Message----- > From: Tian, Kevin <kevin.tian@intel.com> > Sent: Wednesday, May 22, 2024 8:12 AM > To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>; > iommu@lists.linux.dev; linux-arm-kernel@lists.infradead.org > Cc: robin.murphy@arm.com; will@kernel.org; joro@8bytes.org; > jgg@nvidia.com; ryan.roberts@arm.com; nicolinc@nvidia.com; > mshavit@google.com; eric.auger@redhat.com; joao.m.martins@oracle.com; > jiangkunkun <jiangkunkun@huawei.com>; zhukeqian > <zhukeqian1@huawei.com>; Linuxarm <linuxarm@huawei.com> > Subject: RE: [PATCH v3 2/4] iommu/io-pgtable-arm: Add read_and_clear_dirty() > support > > > From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> > > Sent: Tuesday, April 30, 2024 9:43 PM > > > > @@ -92,7 +93,11 @@ > > > > /* Stage-1 PTE */ > > #define ARM_LPAE_PTE_AP_UNPRIV (((arm_lpae_iopte)1) << 6) > > -#define ARM_LPAE_PTE_AP_RDONLY (((arm_lpae_iopte)2) << 6) > > +#define ARM_LPAE_PTE_AP_RDONLY_BIT 7 > > +#define ARM_LPAE_PTE_AP_RDONLY (((arm_lpae_iopte)1) << \ > > + ARM_LPAE_PTE_AP_RDONLY_BIT) > > +#define ARM_LPAE_PTE_AP_WRITABLE_CLEAN > > (ARM_LPAE_PTE_AP_RDONLY | \ > > + ARM_LPAE_PTE_DBM) > > based on the usage is it clearer to be xxx_WRITEABLE_MASK? I think in patch 4 we use this to set the PTE Writeable Clean. Anyway I will revisit this following Jason's comment on just setting the DBM bit there. > > > #define ARM_LPAE_PTE_ATTRINDX_SHIFT 2 > > #define ARM_LPAE_PTE_nG (((arm_lpae_iopte)1) << 11) > > > > @@ -138,6 +143,9 @@ > > > > #define iopte_prot(pte) ((pte) & ARM_LPAE_PTE_ATTR_MASK) > > > > +#define iopte_hw_dirty(pte) (((pte) & > > ARM_LPAE_PTE_AP_WRITABLE_CLEAN) == \ > > + ARM_LPAE_PTE_DBM) > > + > > iopte_is_writeable_dirty()? > > and the following "set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT, > (unsigned long *)ptep);" could be wrapped as: > > iopte_set_writable_clean(ptep); Ok. Will consider this during respin. > > + > > +static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops, > > + unsigned long iova, size_t size, > > + unsigned long flags, > > + struct iommu_dirty_bitmap *dirty) > > +{ > > + struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); > > + struct io_pgtable_cfg *cfg = &data->iop.cfg; > > + struct io_pgtable_walk_data walk_data = { > > + .dirty = dirty, > > + .flags = flags, > > + .addr = iova, > > + .end = iova + size, > > + }; > > + arm_lpae_iopte *ptep = data->pgd; > > + int lvl = data->start_level; > > + > > + if (WARN_ON(!size)) > > + return -EINVAL; > > + if (WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1))) > > + return -EINVAL; > > + if (data->iop.fmt != ARM_64_LPAE_S1) > > + return -EINVAL; > > + > > + return __arm_lpae_iopte_walk_dirty(data, &walk_data, ptep, lvl); > > Intel/AMD drivers also checks: > > if (!dmar_domain->dirty_tracking && dirty->bitmap) > return -EINVAL; Is that really required? Is the concern here is user may issue GET_DIRTY_BITMAP IOCTL without any validation and that may result in unnecessary scanning of page tables? May be we should handle it in core code then I think. Thanks, Shameer
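[For reference, a minimal sketch of what the wrappers suggested above could look like, assuming they simply rename the expressions already present in the v3 patch in io-pgtable-arm.c; the helper names come from the review discussion, not from applied code:]

        /* True when DBM is set and AP[2] is clear, i.e. HW marked the entry dirty */
        #define iopte_writeable_dirty(pte)                                      \
                (((pte) & ARM_LPAE_PTE_AP_WRITABLE_CLEAN) == ARM_LPAE_PTE_DBM)

        /* Atomically set AP[2] so the entry reads back as writable-clean */
        #define iopte_set_writable_clean(ptep)                                  \
                set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT, (unsigned long *)(ptep))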
On 22/05/2024 15:03, Shameerali Kolothum Thodi wrote: > > >> -----Original Message----- >> From: Tian, Kevin <kevin.tian@intel.com> >> Sent: Wednesday, May 22, 2024 8:12 AM >> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>; >> iommu@lists.linux.dev; linux-arm-kernel@lists.infradead.org >> Cc: robin.murphy@arm.com; will@kernel.org; joro@8bytes.org; >> jgg@nvidia.com; ryan.roberts@arm.com; nicolinc@nvidia.com; >> mshavit@google.com; eric.auger@redhat.com; joao.m.martins@oracle.com; >> jiangkunkun <jiangkunkun@huawei.com>; zhukeqian >> <zhukeqian1@huawei.com>; Linuxarm <linuxarm@huawei.com> >> Subject: RE: [PATCH v3 2/4] iommu/io-pgtable-arm: Add read_and_clear_dirty() >> support >> >>> From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> >>> Sent: Tuesday, April 30, 2024 9:43 PM >>> + >>> +static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops, >>> + unsigned long iova, size_t size, >>> + unsigned long flags, >>> + struct iommu_dirty_bitmap *dirty) >>> +{ >>> + struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); >>> + struct io_pgtable_cfg *cfg = &data->iop.cfg; >>> + struct io_pgtable_walk_data walk_data = { >>> + .dirty = dirty, >>> + .flags = flags, >>> + .addr = iova, >>> + .end = iova + size, >>> + }; >>> + arm_lpae_iopte *ptep = data->pgd; >>> + int lvl = data->start_level; >>> + >>> + if (WARN_ON(!size)) >>> + return -EINVAL; >>> + if (WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1))) >>> + return -EINVAL; >>> + if (data->iop.fmt != ARM_64_LPAE_S1) >>> + return -EINVAL; >>> + >>> + return __arm_lpae_iopte_walk_dirty(data, &walk_data, ptep, lvl); >> >> Intel/AMD drivers also checks: >> >> if (!dmar_domain->dirty_tracking && dirty->bitmap) >> return -EINVAL; > > Is that really required? Is the concern here is user may issue > GET_DIRTY_BITMAP IOCTL without any validation and that may > result in unnecessary scanning of page tables? May be we should > handle it in core code then I think. This is just to catch the case where IOMMUFD can call into read_and_clear() without dirty tracking enabled and without a bitmap structure to clear dirty bits -- in order to ensure a clean PTE data snapshot after start(). And thus it errors out on others. Right now we don't track in iommufd that dirty tracking is active in the domain, vendor does so in its own way. I mean we could move that to core, but we end up duplicating what the iommu driver is already doing.
On Wed, May 22, 2024 at 03:37:57PM +0100, Joao Martins wrote:

> This is just to catch the case where IOMMUFD can call into read_and_clear()
> without dirty tracking enabled and without a bitmap structure to clear dirty
> bits -- in order to ensure a clean PTE data snapshot after start().

Is that broken then?

iopt_clear_dirty_data() sets the NULL:

        iommu_dirty_bitmap_init(&dirty, NULL, &gather);
        ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
                                        iopt_area_length(area), 0, &dirty);

But AMD, for instance, does nothing:

        spin_lock_irqsave(&pdomain->lock, lflags);
        if (!pdomain->dirty_tracking && dirty->bitmap) {
                spin_unlock_irqrestore(&pdomain->lock, lflags);
                return -EINVAL;
        }
        spin_unlock_irqrestore(&pdomain->lock, lflags);

        return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);

It certainly didn't clear the IOPTEs.

AFAIK the NULL should be ignored here:

static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty,
                                             unsigned long iova,
                                             unsigned long length)
{
        if (dirty->bitmap)
                iova_bitmap_set(dirty->bitmap, iova, length);

Not above.

That looks like a bug. Yes?

Thanks,
Jason
On 22/05/2024 17:56, Jason Gunthorpe wrote: > On Wed, May 22, 2024 at 03:37:57PM +0100, Joao Martins wrote: > >> This is just to catch the case where IOMMUFD can call into read_and_clear() >> without dirty tracking enabled and without a bitmap structure to clear dirty >> bits -- in order to ensure a clean PTE data snapshot after start(). > > Is that broken then? > It's not: The check errors out the caller ends up calling read-and-clear with a bitmap but without having started dirty tracking. the iopt_clear_dirty_data() passes a null bitmap, it goes through and it walks and clears the IOPTEs *without* recording them in the bitmap. > iopt_clear_dirty_data() sets the NULL: > > iommu_dirty_bitmap_init(&dirty, NULL, &gather); > ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area), > iopt_area_length(area), 0, > &dirty); > Right > But AMD, for instance, does nothing: > > spin_lock_irqsave(&pdomain->lock, lflags); > if (!pdomain->dirty_tracking && dirty->bitmap) { > spin_unlock_irqrestore(&pdomain->lock, lflags); > return -EINVAL; > } > spin_unlock_irqrestore(&pdomain->lock, lflags); > > return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); > Same for Intel (except it has a comment explaining the why). But it's on purpose. > It certainly didn't clear the IOPTEs. > But notice that this gets called *without* dirty::bitmap set so it goes through and walk the page tables and clear the dirty bits. It just doesn't record in the bitmap. > AFAIK the NULL should be ignored here: > > static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty, > unsigned long iova, > unsigned long length) > { > if (dirty->bitmap) > iova_bitmap_set(dirty->bitmap, iova, length); > Right, yes. As intended. When we pass NULL bitmap we mean that we don't care about dirty info. But you are still clearing the IOPTEs in the driver. > Not above. That looks like a bug. Yes? > It's not a bug. It might be too subtle or maybe strange -- I recall raising this as not being too obvious IIRC. All this doing was to allow read_and_clear iommu driver to clear the bits without recording bitmap info -- which is what it is doing. In any case I am happy to clean this up if there's something more elegant in people's mind.
On Wed, May 22, 2024 at 06:10:59PM +0100, Joao Martins wrote: > On 22/05/2024 17:56, Jason Gunthorpe wrote: > > On Wed, May 22, 2024 at 03:37:57PM +0100, Joao Martins wrote: > > > >> This is just to catch the case where IOMMUFD can call into read_and_clear() > >> without dirty tracking enabled and without a bitmap structure to clear dirty > >> bits -- in order to ensure a clean PTE data snapshot after start(). > > > > Is that broken then? > > > > It's not: The check errors out the caller ends up calling read-and-clear with a > bitmap but without having started dirty tracking. the iopt_clear_dirty_data() > passes a null bitmap, it goes through and it walks and clears the IOPTEs > *without* recording them in the bitmap. It is not "without recording them in the bitmap", saying that is the confusing thing. The purpose of that 'if' is to return -EINVAL if dirty tracking is not turned on and we query the bitmap. More like this puts it in the common code and writes it in a more straightforward way with better locking: diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index d35c1b8c8e65ce..b2cb557d3ea427 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2645,13 +2645,6 @@ static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain, if (!ops || !ops->read_and_clear_dirty) return -EOPNOTSUPP; - spin_lock_irqsave(&pdomain->lock, lflags); - if (!pdomain->dirty_tracking && dirty->bitmap) { - spin_unlock_irqrestore(&pdomain->lock, lflags); - return -EINVAL; - } - spin_unlock_irqrestore(&pdomain->lock, lflags); - return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 50eb9aed47cc58..844f2cf061911f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4797,15 +4797,6 @@ static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, unsigned long end = iova + size - 1; unsigned long pgsize; - /* - * IOMMUFD core calls into a dirty tracking disabled domain without an - * IOVA bitmap set in order to clean dirty bits in all PTEs that might - * have occurred when we stopped dirty tracking. This ensures that we - * never inherit dirtied bits from a previous cycle. 
- */ - if (!dmar_domain->dirty_tracking && dirty->bitmap) - return -EINVAL; - do { struct dma_pte *pte; int lvl = 0; diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 05fd9d3abf1b80..d116179809042d 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -536,7 +536,10 @@ int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, return ret; down_read(&iopt->iova_rwsem); - ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap); + if (!iopt->dirty_tracking_enabled) + ret = -EINVAL; + else + ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap); up_read(&iopt->iova_rwsem); return ret; @@ -580,7 +583,11 @@ int iopt_set_dirty_tracking(struct io_pagetable *iopt, if (!ops) return -EOPNOTSUPP; - down_read(&iopt->iova_rwsem); + down_write(&iopt->iova_rwsem); + if (iopt->dirty_tracking_enabled == enable) { + ret = 0; + goto out_unlock; + } /* Clear dirty bits from PTEs to ensure a clean snapshot */ if (enable) { @@ -590,9 +597,11 @@ int iopt_set_dirty_tracking(struct io_pagetable *iopt, } ret = ops->set_dirty_tracking(domain, enable); - + if (ret) + goto out_unlock; + iopt->dirty_tracking_enabled = enable; out_unlock: - up_read(&iopt->iova_rwsem); + up_write(&iopt->iova_rwsem); return ret; } diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 991f864d1f9bc1..de3761e15cab54 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -52,6 +52,7 @@ struct io_pagetable { /* IOVA that cannot be allocated, struct iopt_reserved */ struct rb_root_cached reserved_itree; u8 disable_large_pages; + u8 dirty_tracking_enabled; unsigned long iova_alignment; };
On 22/05/2024 18:50, Jason Gunthorpe wrote: > On Wed, May 22, 2024 at 06:10:59PM +0100, Joao Martins wrote: >> On 22/05/2024 17:56, Jason Gunthorpe wrote: >>> On Wed, May 22, 2024 at 03:37:57PM +0100, Joao Martins wrote: >>> >>>> This is just to catch the case where IOMMUFD can call into read_and_clear() >>>> without dirty tracking enabled and without a bitmap structure to clear dirty >>>> bits -- in order to ensure a clean PTE data snapshot after start(). >>> >>> Is that broken then? >>> >> >> It's not: The check errors out the caller ends up calling read-and-clear with a >> bitmap but without having started dirty tracking. the iopt_clear_dirty_data() >> passes a null bitmap, it goes through and it walks and clears the IOPTEs >> *without* recording them in the bitmap. > > It is not "without recording them in the bitmap", saying that is the > confusing thing. The purpose of that 'if' is to return -EINVAL if > dirty tracking is not turned on and we query the bitmap. > Right. > More like this puts it in the common code and writes it in a more > straightforward way with better locking: > Yes, This snip you pasted would be the equivalent to the current way indeed. Looks good I think I was trying too hard not to duplicate 'state of dirty tracking' between iommu driver and iommufd core that I unintendedly ended up convoluting with this check in the driver :/ > diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c > index d35c1b8c8e65ce..b2cb557d3ea427 100644 > --- a/drivers/iommu/amd/iommu.c > +++ b/drivers/iommu/amd/iommu.c > @@ -2645,13 +2645,6 @@ static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain, > if (!ops || !ops->read_and_clear_dirty) > return -EOPNOTSUPP; > > - spin_lock_irqsave(&pdomain->lock, lflags); > - if (!pdomain->dirty_tracking && dirty->bitmap) { > - spin_unlock_irqrestore(&pdomain->lock, lflags); > - return -EINVAL; > - } > - spin_unlock_irqrestore(&pdomain->lock, lflags); > - > return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); > } > > diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c > index 50eb9aed47cc58..844f2cf061911f 100644 > --- a/drivers/iommu/intel/iommu.c > +++ b/drivers/iommu/intel/iommu.c > @@ -4797,15 +4797,6 @@ static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, > unsigned long end = iova + size - 1; > unsigned long pgsize; > > - /* > - * IOMMUFD core calls into a dirty tracking disabled domain without an > - * IOVA bitmap set in order to clean dirty bits in all PTEs that might > - * have occurred when we stopped dirty tracking. This ensures that we > - * never inherit dirtied bits from a previous cycle. 
> - */ > - if (!dmar_domain->dirty_tracking && dirty->bitmap) > - return -EINVAL; > - > do { > struct dma_pte *pte; > int lvl = 0; > diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c > index 05fd9d3abf1b80..d116179809042d 100644 > --- a/drivers/iommu/iommufd/io_pagetable.c > +++ b/drivers/iommu/iommufd/io_pagetable.c > @@ -536,7 +536,10 @@ int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, > return ret; > > down_read(&iopt->iova_rwsem); > - ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap); > + if (!iopt->dirty_tracking_enabled) > + ret = -EINVAL; > + else > + ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap); > up_read(&iopt->iova_rwsem); > > return ret; > @@ -580,7 +583,11 @@ int iopt_set_dirty_tracking(struct io_pagetable *iopt, > if (!ops) > return -EOPNOTSUPP; > > - down_read(&iopt->iova_rwsem); > + down_write(&iopt->iova_rwsem); > + if (iopt->dirty_tracking_enabled == enable) { > + ret = 0; > + goto out_unlock; > + } > > /* Clear dirty bits from PTEs to ensure a clean snapshot */ > if (enable) { > @@ -590,9 +597,11 @@ int iopt_set_dirty_tracking(struct io_pagetable *iopt, > } > > ret = ops->set_dirty_tracking(domain, enable); > - > + if (ret) > + goto out_unlock; > + iopt->dirty_tracking_enabled = enable; > out_unlock: > - up_read(&iopt->iova_rwsem); > + up_write(&iopt->iova_rwsem); > return ret; > } > > diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h > index 991f864d1f9bc1..de3761e15cab54 100644 > --- a/drivers/iommu/iommufd/iommufd_private.h > +++ b/drivers/iommu/iommufd/iommufd_private.h > @@ -52,6 +52,7 @@ struct io_pagetable { > /* IOVA that cannot be allocated, struct iopt_reserved */ > struct rb_root_cached reserved_itree; > u8 disable_large_pages; > + u8 dirty_tracking_enabled; > unsigned long iova_alignment; > }; >
On 22/05/2024 19:15, Joao Martins wrote: > On 22/05/2024 18:50, Jason Gunthorpe wrote: >> On Wed, May 22, 2024 at 06:10:59PM +0100, Joao Martins wrote: >>> On 22/05/2024 17:56, Jason Gunthorpe wrote: >>>> On Wed, May 22, 2024 at 03:37:57PM +0100, Joao Martins wrote: >>>> >>>>> This is just to catch the case where IOMMUFD can call into read_and_clear() >>>>> without dirty tracking enabled and without a bitmap structure to clear dirty >>>>> bits -- in order to ensure a clean PTE data snapshot after start(). >>>> >>>> Is that broken then? >>>> >>> >>> It's not: The check errors out the caller ends up calling read-and-clear with a >>> bitmap but without having started dirty tracking. the iopt_clear_dirty_data() >>> passes a null bitmap, it goes through and it walks and clears the IOPTEs >>> *without* recording them in the bitmap. >> >> It is not "without recording them in the bitmap", saying that is the >> confusing thing. The purpose of that 'if' is to return -EINVAL if >> dirty tracking is not turned on and we query the bitmap. >> > Right. > >> More like this puts it in the common code and writes it in a more >> straightforward way with better locking: >> > Yes, This snip you pasted would be the equivalent to the current way indeed. > Looks good > > I think I was trying too hard not to duplicate 'state of dirty tracking' between > iommu driver and iommufd core that I unintendedly ended up convoluting with this > check in the driver :/ > To this point above (...) >> diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c >> index 05fd9d3abf1b80..d116179809042d 100644 >> --- a/drivers/iommu/iommufd/io_pagetable.c >> +++ b/drivers/iommu/iommufd/io_pagetable.c >> @@ -536,7 +536,10 @@ int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, >> return ret; >> >> down_read(&iopt->iova_rwsem); >> - ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap); >> + if (!iopt->dirty_tracking_enabled) >> + ret = -EINVAL; >> + else >> + ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap); >> up_read(&iopt->iova_rwsem); >> >> return ret; >> @@ -580,7 +583,11 @@ int iopt_set_dirty_tracking(struct io_pagetable *iopt, >> if (!ops) >> return -EOPNOTSUPP; >> >> - down_read(&iopt->iova_rwsem); >> + down_write(&iopt->iova_rwsem); >> + if (iopt->dirty_tracking_enabled == enable) { >> + ret = 0; >> + goto out_unlock; >> + } >> We have a similar check in set_dirty_tracking iommu op of intel/amd iommu drivers that could be removed. Though I am not sure it's right to remove it even as we move this up the stack >> /* Clear dirty bits from PTEs to ensure a clean snapshot */ >> if (enable) { >> @@ -590,9 +597,11 @@ int iopt_set_dirty_tracking(struct io_pagetable *iopt, >> } >> >> ret = ops->set_dirty_tracking(domain, enable); >> - >> + if (ret) >> + goto out_unlock; >> + iopt->dirty_tracking_enabled = enable; >> out_unlock: >> - up_read(&iopt->iova_rwsem); >> + up_write(&iopt->iova_rwsem); >> return ret; >> } >> >> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h >> index 991f864d1f9bc1..de3761e15cab54 100644 >> --- a/drivers/iommu/iommufd/iommufd_private.h >> +++ b/drivers/iommu/iommufd/iommufd_private.h >> @@ -52,6 +52,7 @@ struct io_pagetable { >> /* IOVA that cannot be allocated, struct iopt_reserved */ >> struct rb_root_cached reserved_itree; >> u8 disable_large_pages; >> + u8 dirty_tracking_enabled; >> unsigned long iova_alignment; >> }; >> >
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Thursday, May 23, 2024 1:51 AM
>
> diff --git a/drivers/iommu/iommufd/iommufd_private.h
> b/drivers/iommu/iommufd/iommufd_private.h
> index 991f864d1f9bc1..de3761e15cab54 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -52,6 +52,7 @@ struct io_pagetable {
>         /* IOVA that cannot be allocated, struct iopt_reserved */
>         struct rb_root_cached reserved_itree;
>         u8 disable_large_pages;
> +       u8 dirty_tracking_enabled;
>         unsigned long iova_alignment;
> };

should it be a hwpt flag instead?
On 23/05/2024 04:30, Tian, Kevin wrote: >> From: Jason Gunthorpe <jgg@nvidia.com> >> Sent: Thursday, May 23, 2024 1:51 AM >> >> diff --git a/drivers/iommu/iommufd/iommufd_private.h >> b/drivers/iommu/iommufd/iommufd_private.h >> index 991f864d1f9bc1..de3761e15cab54 100644 >> --- a/drivers/iommu/iommufd/iommufd_private.h >> +++ b/drivers/iommu/iommufd/iommufd_private.h >> @@ -52,6 +52,7 @@ struct io_pagetable { >> /* IOVA that cannot be allocated, struct iopt_reserved */ >> struct rb_root_cached reserved_itree; >> u8 disable_large_pages; >> + u8 dirty_tracking_enabled; >> unsigned long iova_alignment; >> }; >> > > should it be a hwpt flag instead? > Most of this deals with iopt locks and walking iopt areas to clear dirty. So this being a iopt attribute looks cleaner in implementation. But I think I see your point suggestion considering it represents a iommu domain property. Joao
On Fri, May 24, 2024 at 12:30:22PM +0100, Joao Martins wrote: > On 23/05/2024 04:30, Tian, Kevin wrote: > >> From: Jason Gunthorpe <jgg@nvidia.com> > >> Sent: Thursday, May 23, 2024 1:51 AM > >> > >> diff --git a/drivers/iommu/iommufd/iommufd_private.h > >> b/drivers/iommu/iommufd/iommufd_private.h > >> index 991f864d1f9bc1..de3761e15cab54 100644 > >> --- a/drivers/iommu/iommufd/iommufd_private.h > >> +++ b/drivers/iommu/iommufd/iommufd_private.h > >> @@ -52,6 +52,7 @@ struct io_pagetable { > >> /* IOVA that cannot be allocated, struct iopt_reserved */ > >> struct rb_root_cached reserved_itree; > >> u8 disable_large_pages; > >> + u8 dirty_tracking_enabled; > >> unsigned long iova_alignment; > >> }; > >> > > > > should it be a hwpt flag instead? > > > > Most of this deals with iopt locks and walking iopt areas to clear dirty. So > this being a iopt attribute looks cleaner in implementation. But I think I see > your point suggestion considering it represents a iommu domain property. Yeah, the original idea of the hwpt/iopt split was to keep code that was principly iommu_domain related away from the code which was principally attachment and lifetime related. Since this value is covered by iopt locks it makes sense in this struct. Not sure the split will stand the test of time as we keep finding reasons to muddle the boundary :\ Jason
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Friday, May 24, 2024 10:08 PM > > On Fri, May 24, 2024 at 12:30:22PM +0100, Joao Martins wrote: > > On 23/05/2024 04:30, Tian, Kevin wrote: > > >> From: Jason Gunthorpe <jgg@nvidia.com> > > >> Sent: Thursday, May 23, 2024 1:51 AM > > >> > > >> diff --git a/drivers/iommu/iommufd/iommufd_private.h > > >> b/drivers/iommu/iommufd/iommufd_private.h > > >> index 991f864d1f9bc1..de3761e15cab54 100644 > > >> --- a/drivers/iommu/iommufd/iommufd_private.h > > >> +++ b/drivers/iommu/iommufd/iommufd_private.h > > >> @@ -52,6 +52,7 @@ struct io_pagetable { > > >> /* IOVA that cannot be allocated, struct iopt_reserved */ > > >> struct rb_root_cached reserved_itree; > > >> u8 disable_large_pages; > > >> + u8 dirty_tracking_enabled; > > >> unsigned long iova_alignment; > > >> }; > > >> > > > > > > should it be a hwpt flag instead? > > > > > > > Most of this deals with iopt locks and walking iopt areas to clear dirty. So > > this being a iopt attribute looks cleaner in implementation. But I think I see > > your point suggestion considering it represents a iommu domain property. > > Yeah, the original idea of the hwpt/iopt split was to keep code that > was principly iommu_domain related away from the code which was > principally attachment and lifetime related. Since this value is > covered by iopt locks it makes sense in this struct. > > Not sure the split will stand the test of time as we keep finding > reasons to muddle the boundary :\ > emmm there could be multiple domains under a iopt while the dirty tracking toggling must be done per domain. with a iopt flag like this only the 1st domain under the iopt will be properly configured?
On 27/05/2024 02:21, Tian, Kevin wrote: >> From: Jason Gunthorpe <jgg@nvidia.com> >> Sent: Friday, May 24, 2024 10:08 PM >> >> On Fri, May 24, 2024 at 12:30:22PM +0100, Joao Martins wrote: >>> On 23/05/2024 04:30, Tian, Kevin wrote: >>>>> From: Jason Gunthorpe <jgg@nvidia.com> >>>>> Sent: Thursday, May 23, 2024 1:51 AM >>>>> >>>>> diff --git a/drivers/iommu/iommufd/iommufd_private.h >>>>> b/drivers/iommu/iommufd/iommufd_private.h >>>>> index 991f864d1f9bc1..de3761e15cab54 100644 >>>>> --- a/drivers/iommu/iommufd/iommufd_private.h >>>>> +++ b/drivers/iommu/iommufd/iommufd_private.h >>>>> @@ -52,6 +52,7 @@ struct io_pagetable { >>>>> /* IOVA that cannot be allocated, struct iopt_reserved */ >>>>> struct rb_root_cached reserved_itree; >>>>> u8 disable_large_pages; >>>>> + u8 dirty_tracking_enabled; >>>>> unsigned long iova_alignment; >>>>> }; >>>>> >>>> >>>> should it be a hwpt flag instead? >>>> >>> >>> Most of this deals with iopt locks and walking iopt areas to clear dirty. So >>> this being a iopt attribute looks cleaner in implementation. But I think I see >>> your point suggestion considering it represents a iommu domain property. >> >> Yeah, the original idea of the hwpt/iopt split was to keep code that >> was principly iommu_domain related away from the code which was >> principally attachment and lifetime related. Since this value is >> covered by iopt locks it makes sense in this struct. >> >> Not sure the split will stand the test of time as we keep finding >> reasons to muddle the boundary :\ >> > > emmm there could be multiple domains under a iopt while the dirty > tracking toggling must be done per domain. > > with a iopt flag like this only the 1st domain under the iopt will be > properly configured? > Using intel as the reference given that it's the only one that has that implemented: so far seems to be the other way around i.e. a parent domain sets its childs' DPT in the iommu driver? Joao
On Mon, May 27, 2024 at 01:21:14AM +0000, Tian, Kevin wrote:

> emmm there could be multiple domains under a iopt while the dirty
> tracking toggling must be done per domain.

For some reason I thought we had made this per ioas, so yeah it is not
right like I showed.

Jason
On 01/06/2024 19:55, Jason Gunthorpe wrote: > On Mon, May 27, 2024 at 01:21:14AM +0000, Tian, Kevin wrote: > >> emmm there could be multiple domains under a iopt while the dirty >> tracking toggling must be done per domain. > > For some reason I thought we had made this per ioas, so yeah it is not > right like I showed. > Something like below instead? diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 721ede5a1faf..9756ff18b5f2 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2645,18 +2645,10 @@ static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain, { struct protection_domain *pdomain = to_pdomain(domain); struct io_pgtable_ops *ops = &pdomain->iop.iop.ops; - unsigned long lflags; if (!ops || !ops->read_and_clear_dirty) return -EOPNOTSUPP; - spin_lock_irqsave(&pdomain->lock, lflags); - if (!pdomain->dirty_tracking && dirty->bitmap) { - spin_unlock_irqrestore(&pdomain->lock, lflags); - return -EINVAL; - } - spin_unlock_irqrestore(&pdomain->lock, lflags); - return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 15411d8d1b3d..6db733f73922 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4799,15 +4799,6 @@ static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, unsigned long end = iova + size - 1; unsigned long pgsize; - /* - * IOMMUFD core calls into a dirty tracking disabled domain without an - * IOVA bitmap set in order to clean dirty bits in all PTEs that might - * have occurred when we stopped dirty tracking. This ensures that we - * never inherit dirtied bits from a previous cycle. - */ - if (!dmar_domain->dirty_tracking && dirty->bitmap) - return -EINVAL; - do { struct dma_pte *pte; int lvl = 0; diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 33d142f8057d..de3064ee849e 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -331,7 +331,9 @@ int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_set_dirty_tracking *cmd = ucmd->cmd; struct iommufd_hwpt_paging *hwpt_paging; - struct iommufd_ioas *ioas; + const struct iommu_dirty_ops *ops; + struct iommu_domain *domain; + struct io_pagetable *iopt; int rc = -EOPNOTSUPP; bool enable; @@ -342,12 +344,35 @@ int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd) if (IS_ERR(hwpt_paging)) return PTR_ERR(hwpt_paging); - ioas = hwpt_paging->ioas; + iopt = &hwpt_paging->ioas->iopt; enable = cmd->flags & IOMMU_HWPT_DIRTY_TRACKING_ENABLE; + domain = hwpt_paging->common.domain; - rc = iopt_set_dirty_tracking(&ioas->iopt, hwpt_paging->common.domain, - enable); + ops = domain->dirty_ops; + if (!ops) { + rc = -EOPNOTSUPP; + goto out_unlock; + } + + down_write(&iopt->iova_rwsem); + if (hwpt_paging->dirty_tracking == enable) { + rc = 0; + goto out_unlock; + } + + /* Clear dirty bits from PTEs to ensure a clean snapshot */ + if (enable) { + rc = iopt_clear_dirty_data(iopt, domain); + if (rc) + goto out_unlock; + } + rc = ops->set_dirty_tracking(domain, enable); + if (rc) + goto out_unlock; + hwpt_paging->dirty_tracking = enable; +out_unlock: + up_write(&iopt->iova_rwsem); iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); return rc; } @@ -356,7 +381,7 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_get_dirty_bitmap *cmd = ucmd->cmd; struct iommufd_hwpt_paging *hwpt_paging; - struct iommufd_ioas 
*ioas; + struct io_pagetable *iopt; int rc = -EOPNOTSUPP; if ((cmd->flags & ~(IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR)) || @@ -367,10 +392,19 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd) if (IS_ERR(hwpt_paging)) return PTR_ERR(hwpt_paging); - ioas = hwpt_paging->ioas; + iopt = &hwpt_paging->ioas->iopt; + + down_read(&iopt->iova_rwsem); + if (!hwpt_paging->dirty_tracking) { + rc = -EINVAL; + goto out_put_hwpt; + } + rc = iopt_read_and_clear_dirty_data( - &ioas->iopt, hwpt_paging->common.domain, cmd->flags, cmd); + iopt, hwpt_paging->common.domain, cmd->flags, cmd); +out_put_hwpt: + up_read(&iopt->iova_rwsem); iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); return rc; } diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index b40eab6ebe4c..25547d79f294 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -532,19 +532,17 @@ int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, { int ret; + lockdep_assert_held_read(&iopt->iova_rwsem); + ret = iommufd_check_iova_range(iopt, bitmap); if (ret) return ret; - down_read(&iopt->iova_rwsem); - ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap); - up_read(&iopt->iova_rwsem); - - return ret; + return iommu_read_and_clear_dirty(domain, iopt, flags, bitmap); } -static int iopt_clear_dirty_data(struct io_pagetable *iopt, - struct iommu_domain *domain) +int iopt_clear_dirty_data(struct io_pagetable *iopt, + struct iommu_domain *domain) { const struct iommu_dirty_ops *ops = domain->dirty_ops; struct iommu_iotlb_gather gather; @@ -576,31 +574,6 @@ static int iopt_clear_dirty_data(struct io_pagetable *iopt, return ret; } -int iopt_set_dirty_tracking(struct io_pagetable *iopt, - struct iommu_domain *domain, bool enable) -{ - const struct iommu_dirty_ops *ops = domain->dirty_ops; - int ret = 0; - - if (!ops) - return -EOPNOTSUPP; - - down_read(&iopt->iova_rwsem); - - /* Clear dirty bits from PTEs to ensure a clean snapshot */ - if (enable) { - ret = iopt_clear_dirty_data(iopt, domain); - if (ret) - goto out_unlock; - } - - ret = ops->set_dirty_tracking(domain, enable); - -out_unlock: - up_read(&iopt->iova_rwsem); - return ret; -} - int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, unsigned long length, struct list_head *pages_list) { diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 991f864d1f9b..9f946df4ca77 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -78,8 +78,8 @@ int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, struct iommu_domain *domain, unsigned long flags, struct iommu_hwpt_get_dirty_bitmap *bitmap); -int iopt_set_dirty_tracking(struct io_pagetable *iopt, - struct iommu_domain *domain, bool enable); +int iopt_clear_dirty_data(struct io_pagetable *iopt, + struct iommu_domain *domain); void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, unsigned long length); @@ -301,6 +301,7 @@ struct iommufd_hwpt_paging { bool enforce_cache_coherency : 1; bool msi_cookie : 1; bool nest_parent : 1; + bool dirty_tracking : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; };
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index f7828a7aad41..da6cc52859ba 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -75,6 +75,7 @@
 
 #define ARM_LPAE_PTE_NSTABLE           (((arm_lpae_iopte)1) << 63)
 #define ARM_LPAE_PTE_XN                (((arm_lpae_iopte)3) << 53)
+#define ARM_LPAE_PTE_DBM               (((arm_lpae_iopte)1) << 51)
 #define ARM_LPAE_PTE_AF                (((arm_lpae_iopte)1) << 10)
 #define ARM_LPAE_PTE_SH_NS             (((arm_lpae_iopte)0) << 8)
 #define ARM_LPAE_PTE_SH_OS             (((arm_lpae_iopte)2) << 8)
@@ -84,7 +85,7 @@
 
 #define ARM_LPAE_PTE_ATTR_LO_MASK      (((arm_lpae_iopte)0x3ff) << 2)
 /* Ignore the contiguous bit for block splitting */
-#define ARM_LPAE_PTE_ATTR_HI_MASK      (((arm_lpae_iopte)6) << 52)
+#define ARM_LPAE_PTE_ATTR_HI_MASK      (ARM_LPAE_PTE_XN | ARM_LPAE_PTE_DBM)
 #define ARM_LPAE_PTE_ATTR_MASK         (ARM_LPAE_PTE_ATTR_LO_MASK |    \
                                         ARM_LPAE_PTE_ATTR_HI_MASK)
 /* Software bit for solving coherency races */
@@ -92,7 +93,11 @@
 
 /* Stage-1 PTE */
 #define ARM_LPAE_PTE_AP_UNPRIV         (((arm_lpae_iopte)1) << 6)
-#define ARM_LPAE_PTE_AP_RDONLY         (((arm_lpae_iopte)2) << 6)
+#define ARM_LPAE_PTE_AP_RDONLY_BIT     7
+#define ARM_LPAE_PTE_AP_RDONLY         (((arm_lpae_iopte)1) << \
+                                          ARM_LPAE_PTE_AP_RDONLY_BIT)
+#define ARM_LPAE_PTE_AP_WRITABLE_CLEAN (ARM_LPAE_PTE_AP_RDONLY | \
+                                        ARM_LPAE_PTE_DBM)
 #define ARM_LPAE_PTE_ATTRINDX_SHIFT    2
 #define ARM_LPAE_PTE_nG                (((arm_lpae_iopte)1) << 11)
 
@@ -138,6 +143,9 @@
 
 #define iopte_prot(pte)        ((pte) & ARM_LPAE_PTE_ATTR_MASK)
 
+#define iopte_hw_dirty(pte)    (((pte) & ARM_LPAE_PTE_AP_WRITABLE_CLEAN) == \
+                                  ARM_LPAE_PTE_DBM)
+
 struct arm_lpae_io_pgtable {
         struct io_pgtable       iop;
 
@@ -729,6 +737,98 @@ static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,
         return iopte_to_paddr(pte, data) | iova;
 }
 
+struct io_pgtable_walk_data {
+        struct iommu_dirty_bitmap       *dirty;
+        unsigned long                   flags;
+        u64                             addr;
+        const u64                       end;
+};
+
+static int __arm_lpae_iopte_walk_dirty(struct arm_lpae_io_pgtable *data,
+                                       struct io_pgtable_walk_data *walk_data,
+                                       arm_lpae_iopte *ptep,
+                                       int lvl);
+
+static int io_pgtable_visit_dirty(struct arm_lpae_io_pgtable *data,
+                                  struct io_pgtable_walk_data *walk_data,
+                                  arm_lpae_iopte *ptep, int lvl)
+{
+        struct io_pgtable *iop = &data->iop;
+        arm_lpae_iopte pte = READ_ONCE(*ptep);
+
+        if (WARN_ON(!pte))
+                return -EINVAL;
+
+        if (iopte_leaf(pte, lvl, iop->fmt)) {
+                size_t size = ARM_LPAE_BLOCK_SIZE(lvl, data);
+
+                if (iopte_hw_dirty(pte)) {
+                        iommu_dirty_bitmap_record(walk_data->dirty,
+                                                  walk_data->addr, size);
+                        if (!(walk_data->flags & IOMMU_DIRTY_NO_CLEAR))
+                                set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT,
+                                        (unsigned long *)ptep);
+                }
+                walk_data->addr += size;
+                return 0;
+        }
+
+        ptep = iopte_deref(pte, data);
+        return __arm_lpae_iopte_walk_dirty(data, walk_data, ptep, lvl + 1);
+}
+
+static int __arm_lpae_iopte_walk_dirty(struct arm_lpae_io_pgtable *data,
+                                       struct io_pgtable_walk_data *walk_data,
+                                       arm_lpae_iopte *ptep,
+                                       int lvl)
+{
+        u32 idx;
+        int max_entries, ret;
+
+        if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
+                return -EINVAL;
+
+        if (lvl == data->start_level)
+                max_entries = ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte);
+        else
+                max_entries = ARM_LPAE_PTES_PER_TABLE(data);
+
+        for (idx = ARM_LPAE_LVL_IDX(walk_data->addr, lvl, data);
+             (idx < max_entries) && (walk_data->addr < walk_data->end); ++idx) {
+                ret = io_pgtable_visit_dirty(data, walk_data, ptep + idx, lvl);
+                if (ret)
+                        return ret;
+        }
+
+        return 0;
+}
+
+static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops,
+                                         unsigned long iova, size_t size,
+                                         unsigned long flags,
+                                         struct iommu_dirty_bitmap *dirty)
+{
+        struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
+        struct io_pgtable_cfg *cfg = &data->iop.cfg;
+        struct io_pgtable_walk_data walk_data = {
+                .dirty = dirty,
+                .flags = flags,
+                .addr = iova,
+                .end = iova + size,
+        };
+        arm_lpae_iopte *ptep = data->pgd;
+        int lvl = data->start_level;
+
+        if (WARN_ON(!size))
+                return -EINVAL;
+        if (WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1)))
+                return -EINVAL;
+        if (data->iop.fmt != ARM_64_LPAE_S1)
+                return -EINVAL;
+
+        return __arm_lpae_iopte_walk_dirty(data, &walk_data, ptep, lvl);
+}
+
 static void arm_lpae_restrict_pgsizes(struct io_pgtable_cfg *cfg)
 {
         unsigned long granule, page_sizes;
@@ -807,6 +907,7 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg)
                 .map_pages      = arm_lpae_map_pages,
                 .unmap_pages    = arm_lpae_unmap_pages,
                 .iova_to_phys   = arm_lpae_iova_to_phys,
+                .read_and_clear_dirty = arm_lpae_read_and_clear_dirty,
         };
 
         return data;
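[For context, a rough sketch of how a domain-level caller can reach the new callback through io_pgtable_ops, following the same pattern as the AMD driver code quoted earlier in the thread; the function name here is illustrative only, and the actual SMMUv3 wiring is added in a later patch of this series:]

        #include <linux/io-pgtable.h>
        #include <linux/iommu.h>

        static int domain_read_and_clear_dirty(struct io_pgtable_ops *ops,
                                               unsigned long iova, size_t size,
                                               unsigned long flags,
                                               struct iommu_dirty_bitmap *dirty)
        {
                /* Page-table formats without dirty tracking don't provide the op */
                if (!ops || !ops->read_and_clear_dirty)
                        return -EOPNOTSUPP;

                return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
        }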
.read_and_clear_dirty() IOMMU domain op takes care of reading the dirty
bits (i.e. PTE has DBM set and AP[2] clear) and marshalling into a
bitmap of a given page size.

While reading the dirty bits we also set the PTE AP[2] bit to mark it
as writable-clean depending on read_and_clear_dirty() flags.

PTE states with respect to DBM bit:

                       DBM bit   AP[2]("RDONLY" bit)
  1. writable_clean       1              1
  2. writable_dirty       1              0
  3. read-only            0              1

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 drivers/iommu/io-pgtable-arm.c | 105 ++++++++++++++++++++++++++++++++-
 1 file changed, 103 insertions(+), 2 deletions(-)
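[To make the state table above concrete, a small self-contained sketch — plain C using the bit positions from the patch, not kernel code — that classifies a stage-1 PTE by its DBM and AP[2] bits:]

        #include <stdint.h>

        #define PTE_DBM        (UINT64_C(1) << 51)  /* Dirty Bit Modifier */
        #define PTE_AP_RDONLY  (UINT64_C(1) << 7)   /* AP[2]: write-protected */

        enum pte_wr_state { WRITABLE_CLEAN, WRITABLE_DIRTY, READ_ONLY };

        static enum pte_wr_state pte_wr_state(uint64_t pte)
        {
                if (!(pte & PTE_DBM))
                        return READ_ONLY;                      /* 3. DBM=0, AP[2]=1 */
                return (pte & PTE_AP_RDONLY) ? WRITABLE_CLEAN  /* 1. DBM=1, AP[2]=1 */
                                             : WRITABLE_DIRTY; /* 2. DBM=1, AP[2]=0 */
        }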