diff mbox series

[RFC,v4,01/13] iommu: Introduce dirty log tracking framework

Message ID 20210507102211.8836-2-zhukeqian1@huawei.com (mailing list archive)
State New, archived
Headers show
Series iommu/smmuv3: Implement hardware dirty log tracking | expand

Commit Message

Keqian Zhu May 7, 2021, 10:21 a.m. UTC
Some types of IOMMU are capable of tracking DMA dirty log, such as
ARM SMMU with HTTU or Intel IOMMU with SLADE. This introduces the
dirty log tracking framework in the IOMMU base layer.

Four new essential interfaces are added, and we maintaince the status
of dirty log tracking in iommu_domain.
1. iommu_support_dirty_log: Check whether domain supports dirty log tracking
2. iommu_switch_dirty_log: Perform actions to start|stop dirty log tracking
3. iommu_sync_dirty_log: Sync dirty log from IOMMU into a dirty bitmap
4. iommu_clear_dirty_log: Clear dirty log of IOMMU by a mask bitmap

Note: Don't concurrently call these interfaces with other ops that
access underlying page table.

Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 drivers/iommu/iommu.c        | 201 +++++++++++++++++++++++++++++++++++
 include/linux/iommu.h        |  63 +++++++++++
 include/trace/events/iommu.h |  63 +++++++++++
 3 files changed, 327 insertions(+)

Comments

Lu Baolu May 8, 2021, 3:46 a.m. UTC | #1
Hi Keqian,

On 5/7/21 6:21 PM, Keqian Zhu wrote:
> Some types of IOMMU are capable of tracking DMA dirty log, such as
> ARM SMMU with HTTU or Intel IOMMU with SLADE. This introduces the
> dirty log tracking framework in the IOMMU base layer.
> 
> Four new essential interfaces are added, and we maintaince the status
> of dirty log tracking in iommu_domain.
> 1. iommu_support_dirty_log: Check whether domain supports dirty log tracking
> 2. iommu_switch_dirty_log: Perform actions to start|stop dirty log tracking
> 3. iommu_sync_dirty_log: Sync dirty log from IOMMU into a dirty bitmap
> 4. iommu_clear_dirty_log: Clear dirty log of IOMMU by a mask bitmap
> 
> Note: Don't concurrently call these interfaces with other ops that
> access underlying page table.
> 
> Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
> Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
> ---
>   drivers/iommu/iommu.c        | 201 +++++++++++++++++++++++++++++++++++
>   include/linux/iommu.h        |  63 +++++++++++
>   include/trace/events/iommu.h |  63 +++++++++++
>   3 files changed, 327 insertions(+)
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 808ab70d5df5..0d15620d1e90 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -1940,6 +1940,7 @@ static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
>   	domain->type = type;
>   	/* Assume all sizes by default; the driver may override this later */
>   	domain->pgsize_bitmap  = bus->iommu_ops->pgsize_bitmap;
> +	mutex_init(&domain->switch_log_lock);
>   
>   	return domain;
>   }
> @@ -2703,6 +2704,206 @@ int iommu_set_pgtable_quirks(struct iommu_domain *domain,
>   }
>   EXPORT_SYMBOL_GPL(iommu_set_pgtable_quirks);
>   
> +bool iommu_support_dirty_log(struct iommu_domain *domain)
> +{
> +	const struct iommu_ops *ops = domain->ops;
> +
> +	return ops->support_dirty_log && ops->support_dirty_log(domain);
> +}
> +EXPORT_SYMBOL_GPL(iommu_support_dirty_log);

I suppose this interface is to ask the vendor IOMMU driver to check
whether each device/iommu in the domain supports dirty bit tracking.
But what will happen if new devices with different tracking capability
are added afterward?

To make things simple, is it possible to support this tracking only when
all underlying IOMMUs support dirty bit tracking?

Or, the more crazy idea is that we don't need to check this capability
at all. If dirty bit tracking is not supported by hardware, just mark
all pages dirty?

> +
> +int iommu_switch_dirty_log(struct iommu_domain *domain, bool enable,
> +			   unsigned long iova, size_t size, int prot)
> +{
> +	const struct iommu_ops *ops = domain->ops;
> +	unsigned long orig_iova = iova;
> +	unsigned int min_pagesz;
> +	size_t orig_size = size;
> +	bool flush = false;
> +	int ret = 0;
> +
> +	if (unlikely(!ops->switch_dirty_log))
> +		return -ENODEV;
> +
> +	min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
> +	if (!IS_ALIGNED(iova | size, min_pagesz)) {
> +		pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n",
> +		       iova, size, min_pagesz);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&domain->switch_log_lock);
> +	if (enable && domain->dirty_log_tracking) {
> +		ret = -EBUSY;
> +		goto out;
> +	} else if (!enable && !domain->dirty_log_tracking) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	pr_debug("switch_dirty_log %s for: iova 0x%lx size 0x%zx\n",
> +		 enable ? "enable" : "disable", iova, size);
> +
> +	while (size) {
> +		size_t pgsize = iommu_pgsize(domain, iova, size);
> +
> +		flush = true;
> +		ret = ops->switch_dirty_log(domain, enable, iova, pgsize, prot);

Per minimal page callback is much expensive. How about using (pagesize,
count), so that all pages with the same page size could be handled in a
single indirect call? I remember I commented this during last review,
but I don't mind doing it again.

Best regards,
baolu

> +		if (ret)
> +			break;
> +
> +		pr_debug("switch_dirty_log handled: iova 0x%lx size 0x%zx\n",
> +			 iova, pgsize);
> +
> +		iova += pgsize;
> +		size -= pgsize;
> +	}
> +
> +	if (flush)
> +		iommu_flush_iotlb_all(domain);
> +
> +	if (!ret) {
> +		domain->dirty_log_tracking = enable;
> +		trace_switch_dirty_log(orig_iova, orig_size, enable);
> +	}
> +out:
> +	mutex_unlock(&domain->switch_log_lock);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_switch_dirty_log);
> +
> +int iommu_sync_dirty_log(struct iommu_domain *domain, unsigned long iova,
> +			 size_t size, unsigned long *bitmap,
> +			 unsigned long base_iova, unsigned long bitmap_pgshift)
> +{
> +	const struct iommu_ops *ops = domain->ops;
> +	unsigned long orig_iova = iova;
> +	unsigned int min_pagesz;
> +	size_t orig_size = size;
> +	int ret = 0;
> +
> +	if (unlikely(!ops->sync_dirty_log))
> +		return -ENODEV;
> +
> +	min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
> +	if (!IS_ALIGNED(iova | size, min_pagesz)) {
> +		pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n",
> +		       iova, size, min_pagesz);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&domain->switch_log_lock);
> +	if (!domain->dirty_log_tracking) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	pr_debug("sync_dirty_log for: iova 0x%lx size 0x%zx\n", iova, size);
> +
> +	while (size) {
> +		size_t pgsize = iommu_pgsize(domain, iova, size);
> +
> +		ret = ops->sync_dirty_log(domain, iova, pgsize,
> +					  bitmap, base_iova, bitmap_pgshift);
> +		if (ret)
> +			break;
> +
> +		pr_debug("sync_dirty_log handled: iova 0x%lx size 0x%zx\n",
> +			 iova, pgsize);
> +
> +		iova += pgsize;
> +		size -= pgsize;
> +	}
> +
> +	if (!ret)
> +		trace_sync_dirty_log(orig_iova, orig_size);
> +out:
> +	mutex_unlock(&domain->switch_log_lock);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_sync_dirty_log);
> +
> +static int __iommu_clear_dirty_log(struct iommu_domain *domain,
> +				   unsigned long iova, size_t size,
> +				   unsigned long *bitmap,
> +				   unsigned long base_iova,
> +				   unsigned long bitmap_pgshift)
> +{
> +	const struct iommu_ops *ops = domain->ops;
> +	unsigned long orig_iova = iova;
> +	size_t orig_size = size;
> +	int ret = 0;
> +
> +	if (unlikely(!ops->clear_dirty_log))
> +		return -ENODEV;
> +
> +	pr_debug("clear_dirty_log for: iova 0x%lx size 0x%zx\n", iova, size);
> +
> +	while (size) {
> +		size_t pgsize = iommu_pgsize(domain, iova, size);
> +
> +		ret = ops->clear_dirty_log(domain, iova, pgsize, bitmap,
> +					   base_iova, bitmap_pgshift);
> +		if (ret)
> +			break;
> +
> +		pr_debug("clear_dirty_log handled: iova 0x%lx size 0x%zx\n",
> +			 iova, pgsize);
> +
> +		iova += pgsize;
> +		size -= pgsize;
> +	}
> +
> +	if (!ret)
> +		trace_clear_dirty_log(orig_iova, orig_size);
> +
> +	return ret;
> +}
> +
> +int iommu_clear_dirty_log(struct iommu_domain *domain,
> +			  unsigned long iova, size_t size,
> +			  unsigned long *bitmap, unsigned long base_iova,
> +			  unsigned long bitmap_pgshift)
> +{
> +	unsigned long riova, rsize;
> +	unsigned int min_pagesz;
> +	bool flush = false;
> +	int rs, re, start, end;
> +	int ret = 0;
> +
> +	min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
> +	if (!IS_ALIGNED(iova | size, min_pagesz)) {
> +		pr_err("unaligned: iova 0x%lx min_pagesz 0x%x\n",
> +		       iova, min_pagesz);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&domain->switch_log_lock);
> +	if (!domain->dirty_log_tracking) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	start = (iova - base_iova) >> bitmap_pgshift;
> +	end = start + (size >> bitmap_pgshift);
> +	bitmap_for_each_set_region(bitmap, rs, re, start, end) {
> +		flush = true;
> +		riova = base_iova + (rs << bitmap_pgshift);
> +		rsize = (re - rs) << bitmap_pgshift;
> +		ret = __iommu_clear_dirty_log(domain, riova, rsize, bitmap,
> +					      base_iova, bitmap_pgshift);
> +		if (ret)
> +			break;
> +	}
> +
> +	if (flush)
> +		iommu_flush_iotlb_all(domain);
> +out:
> +	mutex_unlock(&domain->switch_log_lock);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_clear_dirty_log);
> +
>   void iommu_get_resv_regions(struct device *dev, struct list_head *list)
>   {
>   	const struct iommu_ops *ops = dev->bus->iommu_ops;
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 32d448050bf7..e0e40dda974d 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -87,6 +87,8 @@ struct iommu_domain {
>   	void *handler_token;
>   	struct iommu_domain_geometry geometry;
>   	void *iova_cookie;
> +	bool dirty_log_tracking;
> +	struct mutex switch_log_lock;
>   };
>   
>   enum iommu_cap {
> @@ -193,6 +195,10 @@ struct iommu_iotlb_gather {
>    * @device_group: find iommu group for a particular device
>    * @enable_nesting: Enable nesting
>    * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*)
> + * @support_dirty_log: Check whether domain supports dirty log tracking
> + * @switch_dirty_log: Perform actions to start|stop dirty log tracking
> + * @sync_dirty_log: Sync dirty log from IOMMU into a dirty bitmap
> + * @clear_dirty_log: Clear dirty log of IOMMU by a mask bitmap
>    * @get_resv_regions: Request list of reserved regions for a device
>    * @put_resv_regions: Free list of reserved regions for a device
>    * @apply_resv_region: Temporary helper call-back for iova reserved ranges
> @@ -245,6 +251,22 @@ struct iommu_ops {
>   	int (*set_pgtable_quirks)(struct iommu_domain *domain,
>   				  unsigned long quirks);
>   
> +	/*
> +	 * Track dirty log. Note: Don't concurrently call these interfaces with
> +	 * other ops that access underlying page table.
> +	 */
> +	bool (*support_dirty_log)(struct iommu_domain *domain);
> +	int (*switch_dirty_log)(struct iommu_domain *domain, bool enable,
> +				unsigned long iova, size_t size, int prot);
> +	int (*sync_dirty_log)(struct iommu_domain *domain,
> +			      unsigned long iova, size_t size,
> +			      unsigned long *bitmap, unsigned long base_iova,
> +			      unsigned long bitmap_pgshift);
> +	int (*clear_dirty_log)(struct iommu_domain *domain,
> +			       unsigned long iova, size_t size,
> +			       unsigned long *bitmap, unsigned long base_iova,
> +			       unsigned long bitmap_pgshift);
> +
>   	/* Request/Free a list of reserved regions for a device */
>   	void (*get_resv_regions)(struct device *dev, struct list_head *list);
>   	void (*put_resv_regions)(struct device *dev, struct list_head *list);
> @@ -475,6 +497,17 @@ extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *);
>   int iommu_enable_nesting(struct iommu_domain *domain);
>   int iommu_set_pgtable_quirks(struct iommu_domain *domain,
>   		unsigned long quirks);
> +extern bool iommu_support_dirty_log(struct iommu_domain *domain);
> +extern int iommu_switch_dirty_log(struct iommu_domain *domain, bool enable,
> +				  unsigned long iova, size_t size, int prot);
> +extern int iommu_sync_dirty_log(struct iommu_domain *domain, unsigned long iova,
> +				size_t size, unsigned long *bitmap,
> +				unsigned long base_iova,
> +				unsigned long bitmap_pgshift);
> +extern int iommu_clear_dirty_log(struct iommu_domain *domain, unsigned long iova,
> +				 size_t dma_size, unsigned long *bitmap,
> +				 unsigned long base_iova,
> +				 unsigned long bitmap_pgshift);
>   
>   void iommu_set_dma_strict(bool val);
>   bool iommu_get_dma_strict(struct iommu_domain *domain);
> @@ -848,6 +881,36 @@ static inline int iommu_set_pgtable_quirks(struct iommu_domain *domain,
>   	return 0;
>   }
>   
> +static inline bool iommu_support_dirty_log(struct iommu_domain *domain)
> +{
> +	return false;
> +}
> +
> +static inline int iommu_switch_dirty_log(struct iommu_domain *domain,
> +					 bool enable, unsigned long iova,
> +					 size_t size, int prot)
> +{
> +	return -EINVAL;
> +}
> +
> +static inline int iommu_sync_dirty_log(struct iommu_domain *domain,
> +				       unsigned long iova, size_t size,
> +				       unsigned long *bitmap,
> +				       unsigned long base_iova,
> +				       unsigned long pgshift)
> +{
> +	return -EINVAL;
> +}
> +
> +static inline int iommu_clear_dirty_log(struct iommu_domain *domain,
> +					unsigned long iova, size_t size,
> +					unsigned long *bitmap,
> +					unsigned long base_iova,
> +					unsigned long pgshift)
> +{
> +	return -EINVAL;
> +}
> +
>   static inline int iommu_device_register(struct iommu_device *iommu,
>   					const struct iommu_ops *ops,
>   					struct device *hwdev)
> diff --git a/include/trace/events/iommu.h b/include/trace/events/iommu.h
> index 72b4582322ff..6436d693d357 100644
> --- a/include/trace/events/iommu.h
> +++ b/include/trace/events/iommu.h
> @@ -129,6 +129,69 @@ TRACE_EVENT(unmap,
>   	)
>   );
>   
> +TRACE_EVENT(switch_dirty_log,
> +
> +	TP_PROTO(unsigned long iova, size_t size, bool enable),
> +
> +	TP_ARGS(iova, size, enable),
> +
> +	TP_STRUCT__entry(
> +		__field(u64, iova)
> +		__field(size_t, size)
> +		__field(bool, enable)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->iova = iova;
> +		__entry->size = size;
> +		__entry->enable = enable;
> +	),
> +
> +	TP_printk("IOMMU: iova=0x%016llx size=%zu enable=%u",
> +			__entry->iova, __entry->size, __entry->enable
> +	)
> +);
> +
> +TRACE_EVENT(sync_dirty_log,
> +
> +	TP_PROTO(unsigned long iova, size_t size),
> +
> +	TP_ARGS(iova, size),
> +
> +	TP_STRUCT__entry(
> +		__field(u64, iova)
> +		__field(size_t, size)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->iova = iova;
> +		__entry->size = size;
> +	),
> +
> +	TP_printk("IOMMU: iova=0x%016llx size=%zu", __entry->iova,
> +			__entry->size)
> +);
> +
> +TRACE_EVENT(clear_dirty_log,
> +
> +	TP_PROTO(unsigned long iova, size_t size),
> +
> +	TP_ARGS(iova, size),
> +
> +	TP_STRUCT__entry(
> +		__field(u64, iova)
> +		__field(size_t, size)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->iova = iova;
> +		__entry->size = size;
> +	),
> +
> +	TP_printk("IOMMU: iova=0x%016llx size=%zu", __entry->iova,
> +			__entry->size)
> +);
> +
>   DECLARE_EVENT_CLASS(iommu_error,
>   
>   	TP_PROTO(struct device *dev, unsigned long iova, int flags),
>
Keqian Zhu May 8, 2021, 7:35 a.m. UTC | #2
Hi Baolu,

On 2021/5/8 11:46, Lu Baolu wrote:
> Hi Keqian,
> 
> On 5/7/21 6:21 PM, Keqian Zhu wrote:
>> Some types of IOMMU are capable of tracking DMA dirty log, such as
>> ARM SMMU with HTTU or Intel IOMMU with SLADE. This introduces the
>> dirty log tracking framework in the IOMMU base layer.
>>
>> Four new essential interfaces are added, and we maintaince the status
>> of dirty log tracking in iommu_domain.
>> 1. iommu_support_dirty_log: Check whether domain supports dirty log tracking
>> 2. iommu_switch_dirty_log: Perform actions to start|stop dirty log tracking
>> 3. iommu_sync_dirty_log: Sync dirty log from IOMMU into a dirty bitmap
>> 4. iommu_clear_dirty_log: Clear dirty log of IOMMU by a mask bitmap
>>
>> Note: Don't concurrently call these interfaces with other ops that
>> access underlying page table.
>>
>> Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
>> Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
>> ---
>>   drivers/iommu/iommu.c        | 201 +++++++++++++++++++++++++++++++++++
>>   include/linux/iommu.h        |  63 +++++++++++
>>   include/trace/events/iommu.h |  63 +++++++++++
>>   3 files changed, 327 insertions(+)
>>
>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>> index 808ab70d5df5..0d15620d1e90 100644
>> --- a/drivers/iommu/iommu.c
>> +++ b/drivers/iommu/iommu.c
>> @@ -1940,6 +1940,7 @@ static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
>>       domain->type = type;
>>       /* Assume all sizes by default; the driver may override this later */
>>       domain->pgsize_bitmap  = bus->iommu_ops->pgsize_bitmap;
>> +    mutex_init(&domain->switch_log_lock);
>>         return domain;
>>   }
>> @@ -2703,6 +2704,206 @@ int iommu_set_pgtable_quirks(struct iommu_domain *domain,
>>   }
>>   EXPORT_SYMBOL_GPL(iommu_set_pgtable_quirks);
>>   +bool iommu_support_dirty_log(struct iommu_domain *domain)
>> +{
>> +    const struct iommu_ops *ops = domain->ops;
>> +
>> +    return ops->support_dirty_log && ops->support_dirty_log(domain);
>> +}
>> +EXPORT_SYMBOL_GPL(iommu_support_dirty_log);
> 
> I suppose this interface is to ask the vendor IOMMU driver to check
> whether each device/iommu in the domain supports dirty bit tracking.
> But what will happen if new devices with different tracking capability
> are added afterward?
Yep, this is considered in the vfio part. We will query again after attaching or
detaching devices from the domain.  When the domain becomes capable, we enable
dirty log for it. When it becomes not capable, we disable dirty log for it.

> 
> To make things simple, is it possible to support this tracking only when
> all underlying IOMMUs support dirty bit tracking?
IIUC, all underlying IOMMUs you refer is of system wide. I think this idea may has
two issues. 1) The target domain may just contains part of system IOMMUs. 2) The
dirty tracking capability can be related to the capability of devices. For example,
we can track dirty log based on IOPF, which needs the capability of devices. That's
to say, we can make this framework more common.

> 
> Or, the more crazy idea is that we don't need to check this capability
> at all. If dirty bit tracking is not supported by hardware, just mark
> all pages dirty?
Yeah, I think this idea is nice :).

Still one concern is that we may have other dirty tracking methods in the future,
if we can't track dirty through iommu, we can still try other methods.

If there is no interface to check this capability, we have no chance to try
other methods. What do you think?

> 
>> +
>> +int iommu_switch_dirty_log(struct iommu_domain *domain, bool enable,
>> +               unsigned long iova, size_t size, int prot)
>> +{
>> +    const struct iommu_ops *ops = domain->ops;
>> +    unsigned long orig_iova = iova;
>> +    unsigned int min_pagesz;
>> +    size_t orig_size = size;
>> +    bool flush = false;
>> +    int ret = 0;
>> +
>> +    if (unlikely(!ops->switch_dirty_log))
>> +        return -ENODEV;
>> +
>> +    min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
>> +    if (!IS_ALIGNED(iova | size, min_pagesz)) {
>> +        pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n",
>> +               iova, size, min_pagesz);
>> +        return -EINVAL;
>> +    }
>> +
>> +    mutex_lock(&domain->switch_log_lock);
>> +    if (enable && domain->dirty_log_tracking) {
>> +        ret = -EBUSY;
>> +        goto out;
>> +    } else if (!enable && !domain->dirty_log_tracking) {
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +
>> +    pr_debug("switch_dirty_log %s for: iova 0x%lx size 0x%zx\n",
>> +         enable ? "enable" : "disable", iova, size);
>> +
>> +    while (size) {
>> +        size_t pgsize = iommu_pgsize(domain, iova, size);
>> +
>> +        flush = true;
>> +        ret = ops->switch_dirty_log(domain, enable, iova, pgsize, prot);
> 
> Per minimal page callback is much expensive. How about using (pagesize,
> count), so that all pages with the same page size could be handled in a
> single indirect call? I remember I commented this during last review,
> but I don't mind doing it again.
Thanks for reminding me again :). I'll do that in next version.

Thanks,
Keqian
Lu Baolu May 10, 2021, 1:08 a.m. UTC | #3
Hi Keqian,

On 5/8/21 3:35 PM, Keqian Zhu wrote:
> Hi Baolu,
> 
> On 2021/5/8 11:46, Lu Baolu wrote:
>> Hi Keqian,
>>
>> On 5/7/21 6:21 PM, Keqian Zhu wrote:
>>> Some types of IOMMU are capable of tracking DMA dirty log, such as
>>> ARM SMMU with HTTU or Intel IOMMU with SLADE. This introduces the
>>> dirty log tracking framework in the IOMMU base layer.
>>>
>>> Four new essential interfaces are added, and we maintaince the status
>>> of dirty log tracking in iommu_domain.
>>> 1. iommu_support_dirty_log: Check whether domain supports dirty log tracking
>>> 2. iommu_switch_dirty_log: Perform actions to start|stop dirty log tracking
>>> 3. iommu_sync_dirty_log: Sync dirty log from IOMMU into a dirty bitmap
>>> 4. iommu_clear_dirty_log: Clear dirty log of IOMMU by a mask bitmap
>>>
>>> Note: Don't concurrently call these interfaces with other ops that
>>> access underlying page table.
>>>
>>> Signed-off-by: Keqian Zhu<zhukeqian1@huawei.com>
>>> Signed-off-by: Kunkun Jiang<jiangkunkun@huawei.com>
>>> ---
>>>    drivers/iommu/iommu.c        | 201 +++++++++++++++++++++++++++++++++++
>>>    include/linux/iommu.h        |  63 +++++++++++
>>>    include/trace/events/iommu.h |  63 +++++++++++
>>>    3 files changed, 327 insertions(+)
>>>
>>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>>> index 808ab70d5df5..0d15620d1e90 100644
>>> --- a/drivers/iommu/iommu.c
>>> +++ b/drivers/iommu/iommu.c
>>> @@ -1940,6 +1940,7 @@ static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
>>>        domain->type = type;
>>>        /* Assume all sizes by default; the driver may override this later */
>>>        domain->pgsize_bitmap  = bus->iommu_ops->pgsize_bitmap;
>>> +    mutex_init(&domain->switch_log_lock);
>>>          return domain;
>>>    }
>>> @@ -2703,6 +2704,206 @@ int iommu_set_pgtable_quirks(struct iommu_domain *domain,
>>>    }
>>>    EXPORT_SYMBOL_GPL(iommu_set_pgtable_quirks);
>>>    +bool iommu_support_dirty_log(struct iommu_domain *domain)
>>> +{
>>> +    const struct iommu_ops *ops = domain->ops;
>>> +
>>> +    return ops->support_dirty_log && ops->support_dirty_log(domain);
>>> +}
>>> +EXPORT_SYMBOL_GPL(iommu_support_dirty_log);
>> I suppose this interface is to ask the vendor IOMMU driver to check
>> whether each device/iommu in the domain supports dirty bit tracking.
>> But what will happen if new devices with different tracking capability
>> are added afterward?
> Yep, this is considered in the vfio part. We will query again after attaching or
> detaching devices from the domain.  When the domain becomes capable, we enable
> dirty log for it. When it becomes not capable, we disable dirty log for it.

If that's the case, why not putting this logic in the iommu subsystem so
that it doesn't need to be duplicate in different upper layers?

For example, add something like dirty_page_trackable in the struct of
iommu_domain and ask the vendor iommu driver to update it once any
device is added/removed to/from the domain. It's also better to disallow
any domain attach/detach once the dirty page tracking is on.

> 
>> To make things simple, is it possible to support this tracking only when
>> all underlying IOMMUs support dirty bit tracking?
> IIUC, all underlying IOMMUs you refer is of system wide. I think this idea may has
> two issues. 1) The target domain may just contains part of system IOMMUs. 2) The
> dirty tracking capability can be related to the capability of devices. For example,
> we can track dirty log based on IOPF, which needs the capability of devices. That's
> to say, we can make this framework more common.

Yes. Fair enough. Thanks for sharing.

> 
>> Or, the more crazy idea is that we don't need to check this capability
>> at all. If dirty bit tracking is not supported by hardware, just mark
>> all pages dirty?
> Yeah, I think this idea is nice:).
> 
> Still one concern is that we may have other dirty tracking methods in the future,
> if we can't track dirty through iommu, we can still try other methods.
> 
> If there is no interface to check this capability, we have no chance to try
> other methods. What do you think?
> 

Agreed.

Best regards,
baolu
Keqian Zhu May 10, 2021, 11:07 a.m. UTC | #4
Hi Baolu,

On 2021/5/10 9:08, Lu Baolu wrote:
> Hi Keqian,
> 
> On 5/8/21 3:35 PM, Keqian Zhu wrote:
>> Hi Baolu,
>>
>> On 2021/5/8 11:46, Lu Baolu wrote:
>>> Hi Keqian,
>>>
>>> On 5/7/21 6:21 PM, Keqian Zhu wrote:
>>>> Some types of IOMMU are capable of tracking DMA dirty log, such as
>>>> ARM SMMU with HTTU or Intel IOMMU with SLADE. This introduces the
>>>> dirty log tracking framework in the IOMMU base layer.
>>>>
>>>> Four new essential interfaces are added, and we maintaince the status
>>>> of dirty log tracking in iommu_domain.
>>>> 1. iommu_support_dirty_log: Check whether domain supports dirty log tracking
>>>> 2. iommu_switch_dirty_log: Perform actions to start|stop dirty log tracking
>>>> 3. iommu_sync_dirty_log: Sync dirty log from IOMMU into a dirty bitmap
>>>> 4. iommu_clear_dirty_log: Clear dirty log of IOMMU by a mask bitmap
>>>>
>>>> Note: Don't concurrently call these interfaces with other ops that
>>>> access underlying page table.
>>>>
>>>> Signed-off-by: Keqian Zhu<zhukeqian1@huawei.com>
>>>> Signed-off-by: Kunkun Jiang<jiangkunkun@huawei.com>
>>>> ---
>>>>    drivers/iommu/iommu.c        | 201 +++++++++++++++++++++++++++++++++++
>>>>    include/linux/iommu.h        |  63 +++++++++++
>>>>    include/trace/events/iommu.h |  63 +++++++++++
>>>>    3 files changed, 327 insertions(+)
>>>>
>>>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>>>> index 808ab70d5df5..0d15620d1e90 100644
>>>> --- a/drivers/iommu/iommu.c
>>>> +++ b/drivers/iommu/iommu.c
>>>> @@ -1940,6 +1940,7 @@ static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
>>>>        domain->type = type;
>>>>        /* Assume all sizes by default; the driver may override this later */
>>>>        domain->pgsize_bitmap  = bus->iommu_ops->pgsize_bitmap;
>>>> +    mutex_init(&domain->switch_log_lock);
>>>>          return domain;
>>>>    }
>>>> @@ -2703,6 +2704,206 @@ int iommu_set_pgtable_quirks(struct iommu_domain *domain,
>>>>    }
>>>>    EXPORT_SYMBOL_GPL(iommu_set_pgtable_quirks);
>>>>    +bool iommu_support_dirty_log(struct iommu_domain *domain)
>>>> +{
>>>> +    const struct iommu_ops *ops = domain->ops;
>>>> +
>>>> +    return ops->support_dirty_log && ops->support_dirty_log(domain);
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(iommu_support_dirty_log);
>>> I suppose this interface is to ask the vendor IOMMU driver to check
>>> whether each device/iommu in the domain supports dirty bit tracking.
>>> But what will happen if new devices with different tracking capability
>>> are added afterward?
>> Yep, this is considered in the vfio part. We will query again after attaching or
>> detaching devices from the domain.  When the domain becomes capable, we enable
>> dirty log for it. When it becomes not capable, we disable dirty log for it.
> 
> If that's the case, why not putting this logic in the iommu subsystem so
> that it doesn't need to be duplicate in different upper layers?
> 
> For example, add something like dirty_page_trackable in the struct of
> iommu_domain and ask the vendor iommu driver to update it once any
> device is added/removed to/from the domain. It's also better to disallow
If we do it, the upper layer still needs to query the capability from domain and switch
dirty log tracking for it. Or do you mean the domain can switch dirty log tracking automatically
when its capability change? If so, I think we're lack of some flexibility. The upper layer
may have it's own policy, such as only enable dirty log tracking when all domains are capable,
and disable dirty log tracking when just one domain is not capable.

> any domain attach/detach once the dirty page tracking is on.
Yep, this can greatly simplify our code logic, but I don't know whether our maintainers
agree that, as they may think that IOMMU dirty logging should not change original domain
behaviors.


Thanks,
Keqian
Lu Baolu May 11, 2021, 3:12 a.m. UTC | #5
Hi Keqian,

On 5/10/21 7:07 PM, Keqian Zhu wrote:
>>>> I suppose this interface is to ask the vendor IOMMU driver to check
>>>> whether each device/iommu in the domain supports dirty bit tracking.
>>>> But what will happen if new devices with different tracking capability
>>>> are added afterward?
>>> Yep, this is considered in the vfio part. We will query again after attaching or
>>> detaching devices from the domain.  When the domain becomes capable, we enable
>>> dirty log for it. When it becomes not capable, we disable dirty log for it.
>> If that's the case, why not putting this logic in the iommu subsystem so
>> that it doesn't need to be duplicate in different upper layers?
>>
>> For example, add something like dirty_page_trackable in the struct of
>> iommu_domain and ask the vendor iommu driver to update it once any
>> device is added/removed to/from the domain. It's also better to disallow
> If we do it, the upper layer still needs to query the capability from domain and switch
> dirty log tracking for it. Or do you mean the domain can switch dirty log tracking automatically
> when its capability change? If so, I think we're lack of some flexibility. The upper layer
> may have it's own policy, such as only enable dirty log tracking when all domains are capable,
> and disable dirty log tracking when just one domain is not capable.

I may not get you.

Assume that dirty_page_trackable is an attribution of an iommu_domain.
This attribution might be changed once a new device (with different
capability) added or removed. So it should be updated every time a new
device is attached or detached. This work could be done by the vendor
iommu driver on the path of dev_attach/dev_detach callback.

For upper layers, before starting page tracking, they check the
dirty_page_trackable attribution of the domain and start it only it's
capable. Once the page tracking is switched on the vendor iommu driver
(or iommu core) should block further device attach/detach operations
until page tracking is stopped.

> 
>> any domain attach/detach once the dirty page tracking is on.
> Yep, this can greatly simplify our code logic, but I don't know whether our maintainers
> agree that, as they may think that IOMMU dirty logging should not change original domain
> behaviors.

The maintainer owns the last word, but we need to work out a generic and
self-contained API set.

Best regards,
baolu
Keqian Zhu May 11, 2021, 7:40 a.m. UTC | #6
Hi Baolu,

On 2021/5/11 11:12, Lu Baolu wrote:
> Hi Keqian,
> 
> On 5/10/21 7:07 PM, Keqian Zhu wrote:
>>>>> I suppose this interface is to ask the vendor IOMMU driver to check
>>>>> whether each device/iommu in the domain supports dirty bit tracking.
>>>>> But what will happen if new devices with different tracking capability
>>>>> are added afterward?
>>>> Yep, this is considered in the vfio part. We will query again after attaching or
>>>> detaching devices from the domain.  When the domain becomes capable, we enable
>>>> dirty log for it. When it becomes not capable, we disable dirty log for it.
>>> If that's the case, why not putting this logic in the iommu subsystem so
>>> that it doesn't need to be duplicate in different upper layers?
>>>
>>> For example, add something like dirty_page_trackable in the struct of
>>> iommu_domain and ask the vendor iommu driver to update it once any
>>> device is added/removed to/from the domain. It's also better to disallow
>> If we do it, the upper layer still needs to query the capability from domain and switch
>> dirty log tracking for it. Or do you mean the domain can switch dirty log tracking automatically
>> when its capability change? If so, I think we're lack of some flexibility. The upper layer
>> may have it's own policy, such as only enable dirty log tracking when all domains are capable,
>> and disable dirty log tracking when just one domain is not capable.
> 
> I may not get you.
> 
> Assume that dirty_page_trackable is an attribution of an iommu_domain.
> This attribution might be changed once a new device (with different
> capability) added or removed. So it should be updated every time a new
> device is attached or detached. This work could be done by the vendor
> iommu driver on the path of dev_attach/dev_detach callback.
Yes, this is what I understand you.

> 
> For upper layers, before starting page tracking, they check the
> dirty_page_trackable attribution of the domain and start it only it's
> capable. Once the page tracking is switched on the vendor iommu driver
> (or iommu core) should block further device attach/detach operations
> until page tracking is stopped.
But when a domain becomes capable after detaching a device, the upper layer
still needs to query it and enable dirty log for it...

To make things coordinated, maybe the upper layer can register a notifier,
when the domain's capability change, the upper layer do not need to query, instead
they just need to realize a callback, and do their specific policy in the callback.
What do you think?

> 
>>
>>> any domain attach/detach once the dirty page tracking is on.
>> Yep, this can greatly simplify our code logic, but I don't know whether our maintainers
>> agree that, as they may think that IOMMU dirty logging should not change original domain
>> behaviors.
> 
> The maintainer owns the last word, but we need to work out a generic and
> self-contained API set.
OK, I see.

Thanks,
Keqian
Lu Baolu May 12, 2021, 3:20 a.m. UTC | #7
On 5/11/21 3:40 PM, Keqian Zhu wrote:
>> For upper layers, before starting page tracking, they check the
>> dirty_page_trackable attribution of the domain and start it only it's
>> capable. Once the page tracking is switched on the vendor iommu driver
>> (or iommu core) should block further device attach/detach operations
>> until page tracking is stopped.
> But when a domain becomes capable after detaching a device, the upper layer
> still needs to query it and enable dirty log for it...
> 
> To make things coordinated, maybe the upper layer can register a notifier,
> when the domain's capability change, the upper layer do not need to query, instead
> they just need to realize a callback, and do their specific policy in the callback.
> What do you think?
> 

That might be an option. But why not checking domain's attribution every
time a new tracking period is about to start?

Best regards,
baolu
Keqian Zhu May 12, 2021, 8:44 a.m. UTC | #8
On 2021/5/12 11:20, Lu Baolu wrote:
> On 5/11/21 3:40 PM, Keqian Zhu wrote:
>>> For upper layers, before starting page tracking, they check the
>>> dirty_page_trackable attribution of the domain and start it only it's
>>> capable. Once the page tracking is switched on the vendor iommu driver
>>> (or iommu core) should block further device attach/detach operations
>>> until page tracking is stopped.
>> But when a domain becomes capable after detaching a device, the upper layer
>> still needs to query it and enable dirty log for it...
>>
>> To make things coordinated, maybe the upper layer can register a notifier,
>> when the domain's capability change, the upper layer do not need to query, instead
>> they just need to realize a callback, and do their specific policy in the callback.
>> What do you think?
>>
> 
> That might be an option. But why not checking domain's attribution every
> time a new tracking period is about to start?
Hi Baolu,

I'll add an attribution in iommu_domain, and the vendor iommu driver will update
the attribution when attach/detach devices.

The attribute should be protected by a lock, so the upper layer shouldn't access
the attribute directly. Then the iommu_domain_support_dirty_log() still should be
retained. Does this design looks good to you?

Thanks,
Keqian
Lu Baolu May 12, 2021, 11:36 a.m. UTC | #9
Hi keqian,

On 5/12/21 4:44 PM, Keqian Zhu wrote:
> 
> 
> On 2021/5/12 11:20, Lu Baolu wrote:
>> On 5/11/21 3:40 PM, Keqian Zhu wrote:
>>>> For upper layers, before starting page tracking, they check the
>>>> dirty_page_trackable attribution of the domain and start it only it's
>>>> capable. Once the page tracking is switched on the vendor iommu driver
>>>> (or iommu core) should block further device attach/detach operations
>>>> until page tracking is stopped.
>>> But when a domain becomes capable after detaching a device, the upper layer
>>> still needs to query it and enable dirty log for it...
>>>
>>> To make things coordinated, maybe the upper layer can register a notifier,
>>> when the domain's capability change, the upper layer do not need to query, instead
>>> they just need to realize a callback, and do their specific policy in the callback.
>>> What do you think?
>>>
>>
>> That might be an option. But why not checking domain's attribution every
>> time a new tracking period is about to start?
> Hi Baolu,
> 
> I'll add an attribution in iommu_domain, and the vendor iommu driver will update
> the attribution when attach/detach devices.
> 
> The attribute should be protected by a lock, so the upper layer shouldn't access
> the attribute directly. Then the iommu_domain_support_dirty_log() still should be
> retained. Does this design looks good to you?

Yes, that's what I was thinking of. But I am not sure whether it worth
of a lock here. It seems not to be a valid behavior for upper layer to
attach or detach any device while doing the dirty page tracking.

Best regards,
baolu
Keqian Zhu May 13, 2021, 10:58 a.m. UTC | #10
On 2021/5/12 19:36, Lu Baolu wrote:
> Hi keqian,
> 
> On 5/12/21 4:44 PM, Keqian Zhu wrote:
>>
>>
>> On 2021/5/12 11:20, Lu Baolu wrote:
>>> On 5/11/21 3:40 PM, Keqian Zhu wrote:
>>>>> For upper layers, before starting page tracking, they check the
>>>>> dirty_page_trackable attribution of the domain and start it only it's
>>>>> capable. Once the page tracking is switched on the vendor iommu driver
>>>>> (or iommu core) should block further device attach/detach operations
>>>>> until page tracking is stopped.
>>>> But when a domain becomes capable after detaching a device, the upper layer
>>>> still needs to query it and enable dirty log for it...
>>>>
>>>> To make things coordinated, maybe the upper layer can register a notifier,
>>>> when the domain's capability change, the upper layer do not need to query, instead
>>>> they just need to realize a callback, and do their specific policy in the callback.
>>>> What do you think?
>>>>
>>>
>>> That might be an option. But why not checking domain's attribution every
>>> time a new tracking period is about to start?
>> Hi Baolu,
>>
>> I'll add an attribution in iommu_domain, and the vendor iommu driver will update
>> the attribution when attach/detach devices.
>>
>> The attribute should be protected by a lock, so the upper layer shouldn't access
>> the attribute directly. Then the iommu_domain_support_dirty_log() still should be
>> retained. Does this design looks good to you?
> 
> Yes, that's what I was thinking of. But I am not sure whether it worth
> of a lock here. It seems not to be a valid behavior for upper layer to
> attach or detach any device while doing the dirty page tracking.
Hi Baolu,

Right, if the "detach|attach" interfaces and "dirty tracking" interfaces can be called concurrently,
a lock in iommu_domain_support_dirty_log() is still not enough. I will add another note for the dirty
tracking interfaces.

Do you have other suggestions? I will accelerate the progress, so I plan to send out v5 next week.

Thanks,
Keqian
Lu Baolu May 13, 2021, 12:02 p.m. UTC | #11
On 5/13/21 6:58 PM, Keqian Zhu wrote:
> 
> 
> On 2021/5/12 19:36, Lu Baolu wrote:
>> Hi keqian,
>>
>> On 5/12/21 4:44 PM, Keqian Zhu wrote:
>>>
>>>
>>> On 2021/5/12 11:20, Lu Baolu wrote:
>>>> On 5/11/21 3:40 PM, Keqian Zhu wrote:
>>>>>> For upper layers, before starting page tracking, they check the
>>>>>> dirty_page_trackable attribution of the domain and start it only it's
>>>>>> capable. Once the page tracking is switched on the vendor iommu driver
>>>>>> (or iommu core) should block further device attach/detach operations
>>>>>> until page tracking is stopped.
>>>>> But when a domain becomes capable after detaching a device, the upper layer
>>>>> still needs to query it and enable dirty log for it...
>>>>>
>>>>> To make things coordinated, maybe the upper layer can register a notifier,
>>>>> when the domain's capability change, the upper layer do not need to query, instead
>>>>> they just need to realize a callback, and do their specific policy in the callback.
>>>>> What do you think?
>>>>>
>>>>
>>>> That might be an option. But why not checking domain's attribution every
>>>> time a new tracking period is about to start?
>>> Hi Baolu,
>>>
>>> I'll add an attribution in iommu_domain, and the vendor iommu driver will update
>>> the attribution when attach/detach devices.
>>>
>>> The attribute should be protected by a lock, so the upper layer shouldn't access
>>> the attribute directly. Then the iommu_domain_support_dirty_log() still should be
>>> retained. Does this design looks good to you?
>>
>> Yes, that's what I was thinking of. But I am not sure whether it worth
>> of a lock here. It seems not to be a valid behavior for upper layer to
>> attach or detach any device while doing the dirty page tracking.
> Hi Baolu,
> 
> Right, if the "detach|attach" interfaces and "dirty tracking" interfaces can be called concurrently,
> a lock in iommu_domain_support_dirty_log() is still not enough. I will add another note for the dirty
> tracking interfaces.
> 
> Do you have other suggestions? I will accelerate the progress, so I plan to send out v5 next week.

No further comments expect below nit:

"iommu_switch_dirty_log: Perform actions to start|stop dirty log tracking"

How about splitting it into
  - iommu_start_dirty_log()
  - iommu_stop_dirty_log()

Not a strong opinion anyway.

Best regards,
baolu
Keqian Zhu May 14, 2021, 2:30 a.m. UTC | #12
On 2021/5/13 20:02, Lu Baolu wrote:
> On 5/13/21 6:58 PM, Keqian Zhu wrote:
>>
>>
>> On 2021/5/12 19:36, Lu Baolu wrote:
>>> Hi keqian,
>>>
>>> On 5/12/21 4:44 PM, Keqian Zhu wrote:
>>>>
>>>>
>>>> On 2021/5/12 11:20, Lu Baolu wrote:
>>>>> On 5/11/21 3:40 PM, Keqian Zhu wrote:
>>>>>>> For upper layers, before starting page tracking, they check the
>>>>>>> dirty_page_trackable attribution of the domain and start it only it's
>>>>>>> capable. Once the page tracking is switched on the vendor iommu driver
>>>>>>> (or iommu core) should block further device attach/detach operations
>>>>>>> until page tracking is stopped.
>>>>>> But when a domain becomes capable after detaching a device, the upper layer
>>>>>> still needs to query it and enable dirty log for it...
>>>>>>
>>>>>> To make things coordinated, maybe the upper layer can register a notifier,
>>>>>> when the domain's capability change, the upper layer do not need to query, instead
>>>>>> they just need to realize a callback, and do their specific policy in the callback.
>>>>>> What do you think?
>>>>>>
>>>>>
>>>>> That might be an option. But why not checking domain's attribution every
>>>>> time a new tracking period is about to start?
>>>> Hi Baolu,
>>>>
>>>> I'll add an attribution in iommu_domain, and the vendor iommu driver will update
>>>> the attribution when attach/detach devices.
>>>>
>>>> The attribute should be protected by a lock, so the upper layer shouldn't access
>>>> the attribute directly. Then the iommu_domain_support_dirty_log() still should be
>>>> retained. Does this design looks good to you?
>>>
>>> Yes, that's what I was thinking of. But I am not sure whether it worth
>>> of a lock here. It seems not to be a valid behavior for upper layer to
>>> attach or detach any device while doing the dirty page tracking.
>> Hi Baolu,
>>
>> Right, if the "detach|attach" interfaces and "dirty tracking" interfaces can be called concurrently,
>> a lock in iommu_domain_support_dirty_log() is still not enough. I will add another note for the dirty
>> tracking interfaces.
>>
>> Do you have other suggestions? I will accelerate the progress, so I plan to send out v5 next week.
> 
> No further comments expect below nit:
> 
> "iommu_switch_dirty_log: Perform actions to start|stop dirty log tracking"
> 
> How about splitting it into
>  - iommu_start_dirty_log()
>  - iommu_stop_dirty_log()
Yeah, actually this is my original version, and the "switch" style is suggested by Yi Sun.
Anyway, I think both is OK, and the "switch" style can reduce some code.

Thanks,
Keqian

> 
> Not a strong opinion anyway.
> 
> Best regards,
> baolu
> .
>
diff mbox series

Patch

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 808ab70d5df5..0d15620d1e90 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1940,6 +1940,7 @@  static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
 	domain->type = type;
 	/* Assume all sizes by default; the driver may override this later */
 	domain->pgsize_bitmap  = bus->iommu_ops->pgsize_bitmap;
+	mutex_init(&domain->switch_log_lock);
 
 	return domain;
 }
@@ -2703,6 +2704,206 @@  int iommu_set_pgtable_quirks(struct iommu_domain *domain,
 }
 EXPORT_SYMBOL_GPL(iommu_set_pgtable_quirks);
 
+bool iommu_support_dirty_log(struct iommu_domain *domain)
+{
+	const struct iommu_ops *ops = domain->ops;
+
+	return ops->support_dirty_log && ops->support_dirty_log(domain);
+}
+EXPORT_SYMBOL_GPL(iommu_support_dirty_log);
+
+int iommu_switch_dirty_log(struct iommu_domain *domain, bool enable,
+			   unsigned long iova, size_t size, int prot)
+{
+	const struct iommu_ops *ops = domain->ops;
+	unsigned long orig_iova = iova;
+	unsigned int min_pagesz;
+	size_t orig_size = size;
+	bool flush = false;
+	int ret = 0;
+
+	if (unlikely(!ops->switch_dirty_log))
+		return -ENODEV;
+
+	min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
+	if (!IS_ALIGNED(iova | size, min_pagesz)) {
+		pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n",
+		       iova, size, min_pagesz);
+		return -EINVAL;
+	}
+
+	mutex_lock(&domain->switch_log_lock);
+	if (enable && domain->dirty_log_tracking) {
+		ret = -EBUSY;
+		goto out;
+	} else if (!enable && !domain->dirty_log_tracking) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	pr_debug("switch_dirty_log %s for: iova 0x%lx size 0x%zx\n",
+		 enable ? "enable" : "disable", iova, size);
+
+	while (size) {
+		size_t pgsize = iommu_pgsize(domain, iova, size);
+
+		flush = true;
+		ret = ops->switch_dirty_log(domain, enable, iova, pgsize, prot);
+		if (ret)
+			break;
+
+		pr_debug("switch_dirty_log handled: iova 0x%lx size 0x%zx\n",
+			 iova, pgsize);
+
+		iova += pgsize;
+		size -= pgsize;
+	}
+
+	if (flush)
+		iommu_flush_iotlb_all(domain);
+
+	if (!ret) {
+		domain->dirty_log_tracking = enable;
+		trace_switch_dirty_log(orig_iova, orig_size, enable);
+	}
+out:
+	mutex_unlock(&domain->switch_log_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_switch_dirty_log);
+
+int iommu_sync_dirty_log(struct iommu_domain *domain, unsigned long iova,
+			 size_t size, unsigned long *bitmap,
+			 unsigned long base_iova, unsigned long bitmap_pgshift)
+{
+	const struct iommu_ops *ops = domain->ops;
+	unsigned long orig_iova = iova;
+	unsigned int min_pagesz;
+	size_t orig_size = size;
+	int ret = 0;
+
+	if (unlikely(!ops->sync_dirty_log))
+		return -ENODEV;
+
+	min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
+	if (!IS_ALIGNED(iova | size, min_pagesz)) {
+		pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n",
+		       iova, size, min_pagesz);
+		return -EINVAL;
+	}
+
+	mutex_lock(&domain->switch_log_lock);
+	if (!domain->dirty_log_tracking) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	pr_debug("sync_dirty_log for: iova 0x%lx size 0x%zx\n", iova, size);
+
+	while (size) {
+		size_t pgsize = iommu_pgsize(domain, iova, size);
+
+		ret = ops->sync_dirty_log(domain, iova, pgsize,
+					  bitmap, base_iova, bitmap_pgshift);
+		if (ret)
+			break;
+
+		pr_debug("sync_dirty_log handled: iova 0x%lx size 0x%zx\n",
+			 iova, pgsize);
+
+		iova += pgsize;
+		size -= pgsize;
+	}
+
+	if (!ret)
+		trace_sync_dirty_log(orig_iova, orig_size);
+out:
+	mutex_unlock(&domain->switch_log_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_sync_dirty_log);
+
+static int __iommu_clear_dirty_log(struct iommu_domain *domain,
+				   unsigned long iova, size_t size,
+				   unsigned long *bitmap,
+				   unsigned long base_iova,
+				   unsigned long bitmap_pgshift)
+{
+	const struct iommu_ops *ops = domain->ops;
+	unsigned long orig_iova = iova;
+	size_t orig_size = size;
+	int ret = 0;
+
+	if (unlikely(!ops->clear_dirty_log))
+		return -ENODEV;
+
+	pr_debug("clear_dirty_log for: iova 0x%lx size 0x%zx\n", iova, size);
+
+	while (size) {
+		size_t pgsize = iommu_pgsize(domain, iova, size);
+
+		ret = ops->clear_dirty_log(domain, iova, pgsize, bitmap,
+					   base_iova, bitmap_pgshift);
+		if (ret)
+			break;
+
+		pr_debug("clear_dirty_log handled: iova 0x%lx size 0x%zx\n",
+			 iova, pgsize);
+
+		iova += pgsize;
+		size -= pgsize;
+	}
+
+	if (!ret)
+		trace_clear_dirty_log(orig_iova, orig_size);
+
+	return ret;
+}
+
+int iommu_clear_dirty_log(struct iommu_domain *domain,
+			  unsigned long iova, size_t size,
+			  unsigned long *bitmap, unsigned long base_iova,
+			  unsigned long bitmap_pgshift)
+{
+	unsigned long riova, rsize;
+	unsigned int min_pagesz;
+	bool flush = false;
+	int rs, re, start, end;
+	int ret = 0;
+
+	min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
+	if (!IS_ALIGNED(iova | size, min_pagesz)) {
+		pr_err("unaligned: iova 0x%lx min_pagesz 0x%x\n",
+		       iova, min_pagesz);
+		return -EINVAL;
+	}
+
+	mutex_lock(&domain->switch_log_lock);
+	if (!domain->dirty_log_tracking) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	start = (iova - base_iova) >> bitmap_pgshift;
+	end = start + (size >> bitmap_pgshift);
+	bitmap_for_each_set_region(bitmap, rs, re, start, end) {
+		flush = true;
+		riova = base_iova + (rs << bitmap_pgshift);
+		rsize = (re - rs) << bitmap_pgshift;
+		ret = __iommu_clear_dirty_log(domain, riova, rsize, bitmap,
+					      base_iova, bitmap_pgshift);
+		if (ret)
+			break;
+	}
+
+	if (flush)
+		iommu_flush_iotlb_all(domain);
+out:
+	mutex_unlock(&domain->switch_log_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_dirty_log);
+
 void iommu_get_resv_regions(struct device *dev, struct list_head *list)
 {
 	const struct iommu_ops *ops = dev->bus->iommu_ops;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 32d448050bf7..e0e40dda974d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -87,6 +87,8 @@  struct iommu_domain {
 	void *handler_token;
 	struct iommu_domain_geometry geometry;
 	void *iova_cookie;
+	bool dirty_log_tracking;
+	struct mutex switch_log_lock;
 };
 
 enum iommu_cap {
@@ -193,6 +195,10 @@  struct iommu_iotlb_gather {
  * @device_group: find iommu group for a particular device
  * @enable_nesting: Enable nesting
  * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*)
+ * @support_dirty_log: Check whether domain supports dirty log tracking
+ * @switch_dirty_log: Perform actions to start|stop dirty log tracking
+ * @sync_dirty_log: Sync dirty log from IOMMU into a dirty bitmap
+ * @clear_dirty_log: Clear dirty log of IOMMU by a mask bitmap
  * @get_resv_regions: Request list of reserved regions for a device
  * @put_resv_regions: Free list of reserved regions for a device
  * @apply_resv_region: Temporary helper call-back for iova reserved ranges
@@ -245,6 +251,22 @@  struct iommu_ops {
 	int (*set_pgtable_quirks)(struct iommu_domain *domain,
 				  unsigned long quirks);
 
+	/*
+	 * Track dirty log. Note: Don't concurrently call these interfaces with
+	 * other ops that access underlying page table.
+	 */
+	bool (*support_dirty_log)(struct iommu_domain *domain);
+	int (*switch_dirty_log)(struct iommu_domain *domain, bool enable,
+				unsigned long iova, size_t size, int prot);
+	int (*sync_dirty_log)(struct iommu_domain *domain,
+			      unsigned long iova, size_t size,
+			      unsigned long *bitmap, unsigned long base_iova,
+			      unsigned long bitmap_pgshift);
+	int (*clear_dirty_log)(struct iommu_domain *domain,
+			       unsigned long iova, size_t size,
+			       unsigned long *bitmap, unsigned long base_iova,
+			       unsigned long bitmap_pgshift);
+
 	/* Request/Free a list of reserved regions for a device */
 	void (*get_resv_regions)(struct device *dev, struct list_head *list);
 	void (*put_resv_regions)(struct device *dev, struct list_head *list);
@@ -475,6 +497,17 @@  extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *);
 int iommu_enable_nesting(struct iommu_domain *domain);
 int iommu_set_pgtable_quirks(struct iommu_domain *domain,
 		unsigned long quirks);
+extern bool iommu_support_dirty_log(struct iommu_domain *domain);
+extern int iommu_switch_dirty_log(struct iommu_domain *domain, bool enable,
+				  unsigned long iova, size_t size, int prot);
+extern int iommu_sync_dirty_log(struct iommu_domain *domain, unsigned long iova,
+				size_t size, unsigned long *bitmap,
+				unsigned long base_iova,
+				unsigned long bitmap_pgshift);
+extern int iommu_clear_dirty_log(struct iommu_domain *domain, unsigned long iova,
+				 size_t dma_size, unsigned long *bitmap,
+				 unsigned long base_iova,
+				 unsigned long bitmap_pgshift);
 
 void iommu_set_dma_strict(bool val);
 bool iommu_get_dma_strict(struct iommu_domain *domain);
@@ -848,6 +881,36 @@  static inline int iommu_set_pgtable_quirks(struct iommu_domain *domain,
 	return 0;
 }
 
+static inline bool iommu_support_dirty_log(struct iommu_domain *domain)
+{
+	return false;
+}
+
+static inline int iommu_switch_dirty_log(struct iommu_domain *domain,
+					 bool enable, unsigned long iova,
+					 size_t size, int prot)
+{
+	return -EINVAL;
+}
+
+static inline int iommu_sync_dirty_log(struct iommu_domain *domain,
+				       unsigned long iova, size_t size,
+				       unsigned long *bitmap,
+				       unsigned long base_iova,
+				       unsigned long pgshift)
+{
+	return -EINVAL;
+}
+
+static inline int iommu_clear_dirty_log(struct iommu_domain *domain,
+					unsigned long iova, size_t size,
+					unsigned long *bitmap,
+					unsigned long base_iova,
+					unsigned long pgshift)
+{
+	return -EINVAL;
+}
+
 static inline int iommu_device_register(struct iommu_device *iommu,
 					const struct iommu_ops *ops,
 					struct device *hwdev)
diff --git a/include/trace/events/iommu.h b/include/trace/events/iommu.h
index 72b4582322ff..6436d693d357 100644
--- a/include/trace/events/iommu.h
+++ b/include/trace/events/iommu.h
@@ -129,6 +129,69 @@  TRACE_EVENT(unmap,
 	)
 );
 
+TRACE_EVENT(switch_dirty_log,
+
+	TP_PROTO(unsigned long iova, size_t size, bool enable),
+
+	TP_ARGS(iova, size, enable),
+
+	TP_STRUCT__entry(
+		__field(u64, iova)
+		__field(size_t, size)
+		__field(bool, enable)
+	),
+
+	TP_fast_assign(
+		__entry->iova = iova;
+		__entry->size = size;
+		__entry->enable = enable;
+	),
+
+	TP_printk("IOMMU: iova=0x%016llx size=%zu enable=%u",
+			__entry->iova, __entry->size, __entry->enable
+	)
+);
+
+TRACE_EVENT(sync_dirty_log,
+
+	TP_PROTO(unsigned long iova, size_t size),
+
+	TP_ARGS(iova, size),
+
+	TP_STRUCT__entry(
+		__field(u64, iova)
+		__field(size_t, size)
+	),
+
+	TP_fast_assign(
+		__entry->iova = iova;
+		__entry->size = size;
+	),
+
+	TP_printk("IOMMU: iova=0x%016llx size=%zu", __entry->iova,
+			__entry->size)
+);
+
+TRACE_EVENT(clear_dirty_log,
+
+	TP_PROTO(unsigned long iova, size_t size),
+
+	TP_ARGS(iova, size),
+
+	TP_STRUCT__entry(
+		__field(u64, iova)
+		__field(size_t, size)
+	),
+
+	TP_fast_assign(
+		__entry->iova = iova;
+		__entry->size = size;
+	),
+
+	TP_printk("IOMMU: iova=0x%016llx size=%zu", __entry->iova,
+			__entry->size)
+);
+
 DECLARE_EVENT_CLASS(iommu_error,
 
 	TP_PROTO(struct device *dev, unsigned long iova, int flags),