[RFC,04/19] iommu: Add an unmap API that returns dirtied IOPTEs

Message ID: 20220428210933.3583-5-joao.m.martins@oracle.com (mailing list archive)
State: New, archived
Series: IOMMUFD Dirty Tracking

Commit Message

Joao Martins April 28, 2022, 9:09 p.m. UTC
Today, the dirty state is lost when an IOVA is unmapped, so the page
would not be migrated to the destination, potentially leading the
guest into error.

Add an unmap API that reads the dirty bit and sets it in the
user-passed bitmap. This unmap IOMMU API tackles a potentially racy
update to the dirty bit *when* DMA is performed on an IOVA that is
being unmapped at the same time.

The new unmap_read_dirty/unmap_pages_read_dirty ops do not replace the
existing unmap/unmap_pages ops; they are only used when explicitly
called with a dirty bitmap passed in.

It could be said that such a guest is buggy and, rather than a special
unmap path tackling the theoretical race ... it would suffice to fetch
the dirty bits (with GET_DIRTY_IOVA) and then unmap the IOVA.
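
To make the intended usage concrete, a minimal caller sketch follows
(illustration only, not part of the patch); it assumes the
iommu_dirty_bitmap has already been set up by earlier patches in this
series and only exercises the entry points added here:

/*
 * Illustrative caller: unmap an IOVA range and collect the dirty
 * state of the IOPTEs being torn down.
 */
static size_t unmap_and_collect_dirty(struct iommu_domain *domain,
				      unsigned long iova, size_t size,
				      struct iommu_dirty_bitmap *dirty)
{
	/* No bitmap supplied: behave like the existing unmap. */
	if (!dirty)
		return iommu_unmap(domain, iova, size);

	/* Clears the IOPTEs and records their dirty bits in @dirty. */
	return iommu_unmap_read_dirty(domain, iova, size, dirty);
}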

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 drivers/iommu/iommu.c      | 43 +++++++++++++++++++++++++++++++-------
 include/linux/io-pgtable.h | 10 +++++++++
 include/linux/iommu.h      | 12 +++++++++++
 3 files changed, 58 insertions(+), 7 deletions(-)

Comments

Baolu Lu April 30, 2022, 5:12 a.m. UTC | #1
On 2022/4/29 05:09, Joao Martins wrote:
> Today, the dirty state is lost when an IOVA is unmapped, so the page
> would not be migrated to the destination, potentially leading the
> guest into error.
> 
> Add an unmap API that reads the dirty bit and sets it in the
> user-passed bitmap. This unmap IOMMU API tackles a potentially racy
> update to the dirty bit *when* DMA is performed on an IOVA that is
> being unmapped at the same time.
> 
> The new unmap_read_dirty/unmap_pages_read_dirty ops do not replace the
> existing unmap/unmap_pages ops; they are only used when explicitly
> called with a dirty bitmap passed in.
> 
> It could be said that such a guest is buggy and, rather than a special
> unmap path tackling the theoretical race ... it would suffice to fetch
> the dirty bits (with GET_DIRTY_IOVA) and then unmap the IOVA.

I am not sure whether this API could solve the race.

size_t iommu_unmap(struct iommu_domain *domain,
                    unsigned long iova, size_t size)
{
         struct iommu_iotlb_gather iotlb_gather;
         size_t ret;

         iommu_iotlb_gather_init(&iotlb_gather);
         ret = __iommu_unmap(domain, iova, size, &iotlb_gather);
         iommu_iotlb_sync(domain, &iotlb_gather);

         return ret;
}

The PTEs are cleared before the IOTLB invalidation. What if a DMA
write happens after the PTE is cleared but before the IOTLB
invalidation, while the old PTE happens to still be cached?

Best regards,
baolu
Joao Martins May 2, 2022, 12:22 p.m. UTC | #2
On 4/30/22 06:12, Baolu Lu wrote:
> On 2022/4/29 05:09, Joao Martins wrote:
>> Today, the dirty state is lost when an IOVA is unmapped, so the page
>> would not be migrated to the destination, potentially leading the
>> guest into error.
>>
>> Add an unmap API that reads the dirty bit and sets it in the
>> user-passed bitmap. This unmap IOMMU API tackles a potentially racy
>> update to the dirty bit *when* DMA is performed on an IOVA that is
>> being unmapped at the same time.
>>
>> The new unmap_read_dirty/unmap_pages_read_dirty ops do not replace the
>> existing unmap/unmap_pages ops; they are only used when explicitly
>> called with a dirty bitmap passed in.
>>
>> It could be said that such a guest is buggy and, rather than a special
>> unmap path tackling the theoretical race ... it would suffice to fetch
>> the dirty bits (with GET_DIRTY_IOVA) and then unmap the IOVA.
> 
> I am not sure whether this API could solve the race.
> 

Yeah, it doesn't fully solve the race, as DMA can still potentially
occur until the IOMMU has to re-walk the page tables (i.e. after the
IOTLB flush).


> size_t iommu_unmap(struct iommu_domain *domain,
>                     unsigned long iova, size_t size)
> {
>          struct iommu_iotlb_gather iotlb_gather;
>          size_t ret;
> 
>          iommu_iotlb_gather_init(&iotlb_gather);
>          ret = __iommu_unmap(domain, iova, size, &iotlb_gather);
>          iommu_iotlb_sync(domain, &iotlb_gather);
> 
>          return ret;
> }
> 
> The PTEs are cleared before the IOTLB invalidation. What if a DMA
> write happens after the PTE is cleared but before the IOTLB
> invalidation, while the old PTE happens to still be cached?


Yeap, Jason/Robin raised similar points.

To fully handle this we need to force the PTEs read-only first and
check the dirty bit afterwards. So perhaps, if we want to go to the
extent of fully stopping DMA -- which none of the unmap APIs ever
guarantee -- we need more of a write-protect API that optionally
fetches the dirty bits, and then the unmap remains as it was (prior to
this series).
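
Roughly it would look like the sketch below (illustration only;
iommu_domain_wrprotect() is a made-up name for such a write-protect
API, while read_and_clear_dirty is the domain op added earlier in the
series):

/*
 * Hypothetical race-free flow, not implemented by this series:
 * 1) write-protect the range and flush the IOTLB so the device can no
 *    longer set new dirty bits (iommu_domain_wrprotect() is a
 *    placeholder name for such an API);
 * 2) harvest the now-stable dirty bits via the read_and_clear_dirty op;
 * 3) unmap as before this series.
 */
static size_t unmap_collect_dirty_stopped(struct iommu_domain *domain,
					  unsigned long iova, size_t size,
					  struct iommu_dirty_bitmap *dirty)
{
	iommu_domain_wrprotect(domain, iova, size);	/* hypothetical */
	if (domain->ops->read_and_clear_dirty)
		domain->ops->read_and_clear_dirty(domain, iova, size, dirty);
	return iommu_unmap(domain, iova, size);
}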

Now, whether this race is worth solving isn't clear (bearing in mind
that solving it will add a lot of overhead), and git/mailing-list
archaeology doesn't answer whether this was ever useful in practice :(

	Joao

Patch

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index d18b9ddbcce4..cc04263709ee 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2289,12 +2289,25 @@  EXPORT_SYMBOL_GPL(iommu_map_atomic);
 
 static size_t __iommu_unmap_pages(struct iommu_domain *domain,
 				  unsigned long iova, size_t size,
-				  struct iommu_iotlb_gather *iotlb_gather)
+				  struct iommu_iotlb_gather *iotlb_gather,
+				  struct iommu_dirty_bitmap *dirty)
 {
 	const struct iommu_domain_ops *ops = domain->ops;
 	size_t pgsize, count;
 
 	pgsize = iommu_pgsize(domain, iova, iova, size, &count);
+
+	if (dirty) {
+		if (!ops->unmap_read_dirty && !ops->unmap_pages_read_dirty)
+			return 0;
+
+		return ops->unmap_pages_read_dirty ?
+		       ops->unmap_pages_read_dirty(domain, iova, pgsize,
+						   count, iotlb_gather, dirty) :
+		       ops->unmap_read_dirty(domain, iova, pgsize,
+					     iotlb_gather, dirty);
+	}
+
 	return ops->unmap_pages ?
 	       ops->unmap_pages(domain, iova, pgsize, count, iotlb_gather) :
 	       ops->unmap(domain, iova, pgsize, iotlb_gather);
@@ -2302,7 +2315,8 @@  static size_t __iommu_unmap_pages(struct iommu_domain *domain,
 
 static size_t __iommu_unmap(struct iommu_domain *domain,
 			    unsigned long iova, size_t size,
-			    struct iommu_iotlb_gather *iotlb_gather)
+			    struct iommu_iotlb_gather *iotlb_gather,
+			    struct iommu_dirty_bitmap *dirty)
 {
 	const struct iommu_domain_ops *ops = domain->ops;
 	size_t unmapped_page, unmapped = 0;
@@ -2337,9 +2351,8 @@  static size_t __iommu_unmap(struct iommu_domain *domain,
 	 * or we hit an area that isn't mapped.
 	 */
 	while (unmapped < size) {
-		unmapped_page = __iommu_unmap_pages(domain, iova,
-						    size - unmapped,
-						    iotlb_gather);
+		unmapped_page = __iommu_unmap_pages(domain, iova, size - unmapped,
+						    iotlb_gather, dirty);
 		if (!unmapped_page)
 			break;
 
@@ -2361,18 +2374,34 @@  size_t iommu_unmap(struct iommu_domain *domain,
 	size_t ret;
 
 	iommu_iotlb_gather_init(&iotlb_gather);
-	ret = __iommu_unmap(domain, iova, size, &iotlb_gather);
+	ret = __iommu_unmap(domain, iova, size, &iotlb_gather, NULL);
 	iommu_iotlb_sync(domain, &iotlb_gather);
 
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_unmap);
 
+size_t iommu_unmap_read_dirty(struct iommu_domain *domain,
+			      unsigned long iova, size_t size,
+			      struct iommu_dirty_bitmap *dirty)
+{
+	struct iommu_iotlb_gather iotlb_gather;
+	size_t ret;
+
+	iommu_iotlb_gather_init(&iotlb_gather);
+	ret = __iommu_unmap(domain, iova, size, &iotlb_gather, dirty);
+	iommu_iotlb_sync(domain, &iotlb_gather);
+
+	return ret;
+
+}
+EXPORT_SYMBOL_GPL(iommu_unmap_read_dirty);
+
 size_t iommu_unmap_fast(struct iommu_domain *domain,
 			unsigned long iova, size_t size,
 			struct iommu_iotlb_gather *iotlb_gather)
 {
-	return __iommu_unmap(domain, iova, size, iotlb_gather);
+	return __iommu_unmap(domain, iova, size, iotlb_gather, NULL);
 }
 EXPORT_SYMBOL_GPL(iommu_unmap_fast);
 
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 82b39925c21f..c2ebfe037f5d 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -171,6 +171,16 @@  struct io_pgtable_ops {
 	int (*read_and_clear_dirty)(struct io_pgtable_ops *ops,
 				    unsigned long iova, size_t size,
 				    struct iommu_dirty_bitmap *dirty);
+	size_t (*unmap_read_dirty)(struct io_pgtable_ops *ops,
+				   unsigned long iova,
+				   size_t size,
+				   struct iommu_iotlb_gather *gather,
+				   struct iommu_dirty_bitmap *dirty);
+	size_t (*unmap_pages_read_dirty)(struct io_pgtable_ops *ops,
+					 unsigned long iova,
+					 size_t pgsize, size_t pgcount,
+					 struct iommu_iotlb_gather *gather,
+					 struct iommu_dirty_bitmap *dirty);
 };
 
 /**
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index ca076365d77b..7c66b4e00556 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -340,6 +340,15 @@  struct iommu_domain_ops {
 	int (*read_and_clear_dirty)(struct iommu_domain *domain,
 				    unsigned long iova, size_t size,
 				    struct iommu_dirty_bitmap *dirty);
+	size_t (*unmap_read_dirty)(struct iommu_domain *domain,
+				   unsigned long iova, size_t size,
+				   struct iommu_iotlb_gather *iotlb_gather,
+				   struct iommu_dirty_bitmap *dirty);
+	size_t (*unmap_pages_read_dirty)(struct iommu_domain *domain,
+					 unsigned long iova,
+					 size_t pgsize, size_t pgcount,
+					 struct iommu_iotlb_gather *iotlb_gather,
+					 struct iommu_dirty_bitmap *dirty);
 };
 
 /**
@@ -463,6 +472,9 @@  extern int iommu_map_atomic(struct iommu_domain *domain, unsigned long iova,
 			    phys_addr_t paddr, size_t size, int prot);
 extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova,
 			  size_t size);
+extern size_t iommu_unmap_read_dirty(struct iommu_domain *domain,
+				     unsigned long iova, size_t size,
+				     struct iommu_dirty_bitmap *dirty);
 extern size_t iommu_unmap_fast(struct iommu_domain *domain,
 			       unsigned long iova, size_t size,
 			       struct iommu_iotlb_gather *iotlb_gather);