
iommu/io-pgtable-arm: Remove split on unmap behavior

Message ID 0-v1-8c5f369ec2e5+75-arm_no_split_jgg@nvidia.com (mailing list archive)
State New
Series iommu/io-pgtable-arm: Remove split on unmap behavior

Commit Message

Jason Gunthorpe Oct. 18, 2024, 5:19 p.m. UTC
Of the page table implementations (AMD v1/2, VT-D SS, ARM32, DART)
arm_lpae is unique in how it handles partial unmap of large IOPTEs.

All other drivers will unmap the large IOPTE and return its length.  For
example if a 2M IOPTE is present and the first 4K is requested to be
unmapped then unmap will remove the whole 2M and report 2M as the result.

arm_lpae instead replaces the IOPTE with a table of smaller IOPTEs, unmaps
the 4K and returns 4K. This is actually an illegal/non-hitless operation
on at least SMMUv3 because of the BBM level 0 rules.
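
To illustrate the difference from the caller's side (iommu_unmap() is the
real API; the helper, the IOVA and the sizes below are made up for the
example):

#include <linux/iommu.h>
#include <linux/sizes.h>

static void unmap_first_4k_of_2m_block(struct iommu_domain *domain)
{
	size_t unmapped;

	/* A 2M block IOPTE already maps IOVA 0x200000 */
	unmapped = iommu_unmap(domain, 0x200000, SZ_4K);

	/*
	 * The other implementations listed above: the whole 2M block is
	 * removed and unmapped == SZ_2M, i.e. more than was asked for.
	 *
	 * arm_lpae today: the block is replaced by a table of 4K IOPTEs,
	 * only the first one is cleared, and unmapped == SZ_4K. With this
	 * patch arm_lpae also removes the whole block and returns SZ_2M.
	 */
}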

Long ago VFIO could trigger a path like this; today I know of no user of
this functionality.

Given it doesn't work fully correctly on SMMUv3 and would create
portability problems if any user depends on it, remove the unique support
in arm_lpae and align with the expected iommu interface.

Outside the iommu users, this will potentially affect io_pgtable users of
ARM_32_LPAE_S1, ARM_32_LPAE_S2, ARM_64_LPAE_S1, ARM_64_LPAE_S2, and
ARM_MALI_LPAE formats.

Cc: Boris Brezillon <boris.brezillon@collabora.com>
Cc: Steven Price <steven.price@arm.com>
Cc: Liviu Dudau <liviu.dudau@arm.com>
Cc: dri-devel@lists.freedesktop.org
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/io-pgtable-arm.c | 72 +++-------------------------------
 1 file changed, 6 insertions(+), 66 deletions(-)

I don't know of anything in the iommu space that needs this, and this is the only
page table implementation in iommu that does it.

I'm not sure about DRM; I looked for a while and only Panthor was unclear.

If DRM does use this, I'd respin this to make it into a quirk and iommu won't
set it.

Thanks,
Jason



base-commit: 27ab08d646a1b53330229a97100200c9567d28b5

Comments

Boris Brezillon Oct. 21, 2024, 9:17 a.m. UTC | #1
On Fri, 18 Oct 2024 14:19:26 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> Of the page table implementations (AMD v1/2, VT-D SS, ARM32, DART)
> arm_lpae is unique in how it handles partial unmap of large IOPTEs.
> 
> All other drivers will unmap the large IOPTE and return it's length.  For
> example if a 2M IOPTE is present and the first 4K is requested to be
> unmapped then unmap will remove the whole 2M and report 2M as the result.
> 
> arm_lpae instead replaces the IOPTE with a table of smaller IOPTEs, unmaps
> the 4K and returns 4k. This is actually an illegal/non-hitless operation
> on at least SMMUv3 because of the BBM level 0 rules.
> 
> Long ago VFIO could trigger a path like this, today I know of no user of
> this functionality.
> 
> Given it doesn't work fully correctly on SMMUv3 and would create
> portability problems if any user depends on it, remove the unique support
> in arm_lpae and align with the expected iommu interface.
> 
> Outside the iommu users, this will potentially effect io_pgtable users of
> ARM_32_LPAE_S1, ARM_32_LPAE_S2, ARM_64_LPAE_S1, ARM_64_LPAE_S2, and
> ARM_MALI_LPAE formats.
> 
> Cc: Boris Brezillon <boris.brezillon@collabora.com>
> Cc: Steven Price <steven.price@arm.com>
> Cc: Liviu Dudau <liviu.dudau@arm.com>
> Cc: dri-devel@lists.freedesktop.org
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/iommu/io-pgtable-arm.c | 72 +++-------------------------------
>  1 file changed, 6 insertions(+), 66 deletions(-)
> 
> I don't know anything in the iommu space that needs this, and this is the only
> page table implementation in iommu that does it.
> 
> I'm not sure about DRM, I looked for awhile and only Panthor was unclear.

In theory, Panthor can do partial unmaps (unmapping a subregion of a
physically contiguous 2M section). In practice, it's not something we
rely on yet, so I don't think it's a blocker. If we ever want to support
that, we can always do it in two steps (unmap the 2M region and remap
the borders). At some point it'd be good to have some kind of atomic
page table updates, so we don't have this short period of time during
which nothing is mapped (between the unmap and the remap), but that's a
different story.
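
A rough sketch of that two-step fallback against the io-pgtable ops
(hypothetical helper, 4K granule assumed; map_pages()/unmap_pages() are the
real io_pgtable_ops callbacks, and the caller is assumed to still know the
section's physical address and prot):

#include <linux/errno.h>
#include <linux/io-pgtable.h>
#include <linux/sizes.h>

/*
 * Punch a 4K hole into an existing, physically contiguous 2M section by
 * unmapping the whole section and remapping the two borders with 4K pages.
 * Not hitless: nothing is mapped between step 1 and steps 2/3.
 */
static int punch_4k_hole(struct io_pgtable_ops *ops, unsigned long sect_iova,
			 phys_addr_t sect_paddr, unsigned long hole_iova,
			 int prot, gfp_t gfp)
{
	size_t before = hole_iova - sect_iova;
	size_t after = (sect_iova + SZ_2M) - (hole_iova + SZ_4K);
	size_t mapped;
	int ret;

	/* 1) Drop the whole 2M section */
	if (ops->unmap_pages(ops, sect_iova, SZ_2M, 1, NULL) != SZ_2M)
		return -EINVAL;

	/* 2) Remap everything below the hole */
	if (before) {
		ret = ops->map_pages(ops, sect_iova, sect_paddr, SZ_4K,
				     before / SZ_4K, prot, gfp, &mapped);
		if (ret)
			return ret;
	}

	/* 3) Remap everything above the hole */
	if (after) {
		ret = ops->map_pages(ops, hole_iova + SZ_4K,
				     sect_paddr + before + SZ_4K, SZ_4K,
				     after / SZ_4K, prot, gfp, &mapped);
		if (ret)
			return ret;
	}
	return 0;
}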

> 
> If DRM does use this, I'd respin this to make it into a quirk and iommu won't
> set it.
> 
> Thanks,
> Jason
> 
> 
> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> index 0e67f1721a3d98..a51fae9c909111 100644
> --- a/drivers/iommu/io-pgtable-arm.c
> +++ b/drivers/iommu/io-pgtable-arm.c
> @@ -569,66 +569,6 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop)
>  	kfree(data);
>  }
>  
> -static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
> -				       struct iommu_iotlb_gather *gather,
> -				       unsigned long iova, size_t size,
> -				       arm_lpae_iopte blk_pte, int lvl,
> -				       arm_lpae_iopte *ptep, size_t pgcount)
> -{
> -	struct io_pgtable_cfg *cfg = &data->iop.cfg;
> -	arm_lpae_iopte pte, *tablep;
> -	phys_addr_t blk_paddr;
> -	size_t tablesz = ARM_LPAE_GRANULE(data);
> -	size_t split_sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
> -	int ptes_per_table = ARM_LPAE_PTES_PER_TABLE(data);
> -	int i, unmap_idx_start = -1, num_entries = 0, max_entries;
> -
> -	if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
> -		return 0;
> -
> -	tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg, data->iop.cookie);
> -	if (!tablep)
> -		return 0; /* Bytes unmapped */
> -
> -	if (size == split_sz) {
> -		unmap_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data);
> -		max_entries = ptes_per_table - unmap_idx_start;
> -		num_entries = min_t(int, pgcount, max_entries);
> -	}
> -
> -	blk_paddr = iopte_to_paddr(blk_pte, data);
> -	pte = iopte_prot(blk_pte);
> -
> -	for (i = 0; i < ptes_per_table; i++, blk_paddr += split_sz) {
> -		/* Unmap! */
> -		if (i >= unmap_idx_start && i < (unmap_idx_start + num_entries))
> -			continue;
> -
> -		__arm_lpae_init_pte(data, blk_paddr, pte, lvl, 1, &tablep[i]);
> -	}
> -
> -	pte = arm_lpae_install_table(tablep, ptep, blk_pte, data);
> -	if (pte != blk_pte) {
> -		__arm_lpae_free_pages(tablep, tablesz, cfg, data->iop.cookie);
> -		/*
> -		 * We may race against someone unmapping another part of this
> -		 * block, but anything else is invalid. We can't misinterpret
> -		 * a page entry here since we're never at the last level.
> -		 */
> -		if (iopte_type(pte) != ARM_LPAE_PTE_TYPE_TABLE)
> -			return 0;
> -
> -		tablep = iopte_deref(pte, data);
> -	} else if (unmap_idx_start >= 0) {
> -		for (i = 0; i < num_entries; i++)
> -			io_pgtable_tlb_add_page(&data->iop, gather, iova + i * size, size);
> -
> -		return num_entries * size;
> -	}
> -
> -	return __arm_lpae_unmap(data, gather, iova, size, pgcount, lvl, tablep);
> -}
> -
>  static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>  			       struct iommu_iotlb_gather *gather,
>  			       unsigned long iova, size_t size, size_t pgcount,
> @@ -678,12 +618,12 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>  
>  		return i * size;
>  	} else if (iopte_leaf(pte, lvl, iop->fmt)) {
> -		/*
> -		 * Insert a table at the next level to map the old region,
> -		 * minus the part we want to unmap
> -		 */
> -		return arm_lpae_split_blk_unmap(data, gather, iova, size, pte,
> -						lvl + 1, ptep, pgcount);
> +		/* Unmap the entire large IOPTE and return its size */
> +		size = ARM_LPAE_BLOCK_SIZE(lvl, data);
> +		__arm_lpae_clear_pte(ptep, &iop->cfg, 1);
> +		if (gather && !iommu_iotlb_gather_queued(gather))
> +			io_pgtable_tlb_add_page(iop, gather, iova, size);
> +		return size;
>  	}
>  
>  	/* Keep on walkin' */
> 
> base-commit: 27ab08d646a1b53330229a97100200c9567d28b5
Steven Price Oct. 21, 2024, 11:32 a.m. UTC | #2
On 21/10/2024 10:17, Boris Brezillon wrote:
> On Fri, 18 Oct 2024 14:19:26 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
>> Of the page table implementations (AMD v1/2, VT-D SS, ARM32, DART)
>> arm_lpae is unique in how it handles partial unmap of large IOPTEs.
>>
>> All other drivers will unmap the large IOPTE and return it's length.  For
>> example if a 2M IOPTE is present and the first 4K is requested to be
>> unmapped then unmap will remove the whole 2M and report 2M as the result.
>>
>> arm_lpae instead replaces the IOPTE with a table of smaller IOPTEs, unmaps
>> the 4K and returns 4k. This is actually an illegal/non-hitless operation
>> on at least SMMUv3 because of the BBM level 0 rules.

Mali GPUs can (theoretically) do this safely because we can lock a
region during the operation (LOCKADDR register). But neither Panfrost
nor Panthor makes use of this today.

>> Long ago VFIO could trigger a path like this, today I know of no user of
>> this functionality.
>>
>> Given it doesn't work fully correctly on SMMUv3 and would create
>> portability problems if any user depends on it, remove the unique support
>> in arm_lpae and align with the expected iommu interface.
>>
>> Outside the iommu users, this will potentially effect io_pgtable users of
>> ARM_32_LPAE_S1, ARM_32_LPAE_S2, ARM_64_LPAE_S1, ARM_64_LPAE_S2, and
>> ARM_MALI_LPAE formats.
>>
>> Cc: Boris Brezillon <boris.brezillon@collabora.com>
>> Cc: Steven Price <steven.price@arm.com>
>> Cc: Liviu Dudau <liviu.dudau@arm.com>
>> Cc: dri-devel@lists.freedesktop.org
>> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
>> ---
>>  drivers/iommu/io-pgtable-arm.c | 72 +++-------------------------------
>>  1 file changed, 6 insertions(+), 66 deletions(-)
>>
>> I don't know anything in the iommu space that needs this, and this is the only
>> page table implementation in iommu that does it.
>>
>> I'm not sure about DRM, I looked for awhile and only Panthor was unclear.
> 
> I theory, Panthor can do partial unmaps (unmapping a subregion of a
> physically contiguous 2M section). In practice, it's not something we
> rely on yet, so I don't think it's a blocker. If we ever want to support

As above, we fail to lock the 2M section before updating the page
tables, so this functionality is broken today in Panthor.

> that, we can always do it in two steps (unmap the 2M region and remap
> the borders). At some point it'd be good to have some kind of atomic
> page table updates, so we don't have this short period of time during
> which nothing is mapped (between the unmap and the remap), but that's a
> different story.

The GPU hardware provides that. The only possible missing piece is that
the driver needs to know ahead of time that the unmap would unmap the 2M
region so it can do the correct lock before the entries are removed.

> 
>>
>> If DRM does use this, I'd respin this to make it into a quirk and iommu won't
>> set it.

You should be safe as far as Panfrost and Panthor are concerned.

Thanks,
Steve

>>
>> Thanks,
>> Jason
>>
>>
>> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
>> index 0e67f1721a3d98..a51fae9c909111 100644
>> --- a/drivers/iommu/io-pgtable-arm.c
>> +++ b/drivers/iommu/io-pgtable-arm.c
>> @@ -569,66 +569,6 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop)
>>  	kfree(data);
>>  }
>>  
>> -static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
>> -				       struct iommu_iotlb_gather *gather,
>> -				       unsigned long iova, size_t size,
>> -				       arm_lpae_iopte blk_pte, int lvl,
>> -				       arm_lpae_iopte *ptep, size_t pgcount)
>> -{
>> -	struct io_pgtable_cfg *cfg = &data->iop.cfg;
>> -	arm_lpae_iopte pte, *tablep;
>> -	phys_addr_t blk_paddr;
>> -	size_t tablesz = ARM_LPAE_GRANULE(data);
>> -	size_t split_sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
>> -	int ptes_per_table = ARM_LPAE_PTES_PER_TABLE(data);
>> -	int i, unmap_idx_start = -1, num_entries = 0, max_entries;
>> -
>> -	if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
>> -		return 0;
>> -
>> -	tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg, data->iop.cookie);
>> -	if (!tablep)
>> -		return 0; /* Bytes unmapped */
>> -
>> -	if (size == split_sz) {
>> -		unmap_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data);
>> -		max_entries = ptes_per_table - unmap_idx_start;
>> -		num_entries = min_t(int, pgcount, max_entries);
>> -	}
>> -
>> -	blk_paddr = iopte_to_paddr(blk_pte, data);
>> -	pte = iopte_prot(blk_pte);
>> -
>> -	for (i = 0; i < ptes_per_table; i++, blk_paddr += split_sz) {
>> -		/* Unmap! */
>> -		if (i >= unmap_idx_start && i < (unmap_idx_start + num_entries))
>> -			continue;
>> -
>> -		__arm_lpae_init_pte(data, blk_paddr, pte, lvl, 1, &tablep[i]);
>> -	}
>> -
>> -	pte = arm_lpae_install_table(tablep, ptep, blk_pte, data);
>> -	if (pte != blk_pte) {
>> -		__arm_lpae_free_pages(tablep, tablesz, cfg, data->iop.cookie);
>> -		/*
>> -		 * We may race against someone unmapping another part of this
>> -		 * block, but anything else is invalid. We can't misinterpret
>> -		 * a page entry here since we're never at the last level.
>> -		 */
>> -		if (iopte_type(pte) != ARM_LPAE_PTE_TYPE_TABLE)
>> -			return 0;
>> -
>> -		tablep = iopte_deref(pte, data);
>> -	} else if (unmap_idx_start >= 0) {
>> -		for (i = 0; i < num_entries; i++)
>> -			io_pgtable_tlb_add_page(&data->iop, gather, iova + i * size, size);
>> -
>> -		return num_entries * size;
>> -	}
>> -
>> -	return __arm_lpae_unmap(data, gather, iova, size, pgcount, lvl, tablep);
>> -}
>> -
>>  static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>>  			       struct iommu_iotlb_gather *gather,
>>  			       unsigned long iova, size_t size, size_t pgcount,
>> @@ -678,12 +618,12 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>>  
>>  		return i * size;
>>  	} else if (iopte_leaf(pte, lvl, iop->fmt)) {
>> -		/*
>> -		 * Insert a table at the next level to map the old region,
>> -		 * minus the part we want to unmap
>> -		 */
>> -		return arm_lpae_split_blk_unmap(data, gather, iova, size, pte,
>> -						lvl + 1, ptep, pgcount);
>> +		/* Unmap the entire large IOPTE and return its size */
>> +		size = ARM_LPAE_BLOCK_SIZE(lvl, data);
>> +		__arm_lpae_clear_pte(ptep, &iop->cfg, 1);
>> +		if (gather && !iommu_iotlb_gather_queued(gather))
>> +			io_pgtable_tlb_add_page(iop, gather, iova, size);
>> +		return size;
>>  	}
>>  
>>  	/* Keep on walkin' */
>>
>> base-commit: 27ab08d646a1b53330229a97100200c9567d28b5
>
Jason Gunthorpe Oct. 21, 2024, 12:17 p.m. UTC | #3
On Mon, Oct 21, 2024 at 12:32:21PM +0100, Steven Price wrote:

> > that, we can always do it in two steps (unmap the 2M region and remap
> > the borders). At some point it'd be good to have some kind of atomic
> > page table updates, so we don't have this short period of time during
> > which nothing is mapped (between the unmap and the remap), but that's a
> > different story.
> 
> The GPU hardware provides that. The only possible missing piece is that
> the driver needs to know ahead of time that the unmap would unmap the 2M
> region so it can do the correct lock before the entries are removed.

It looks like we need atomic update for some confidential compute
scenarios, so I am working toward that with the coming generic pt
stuff.

> >> If DRM does use this, I'd respin this to make it into a quirk and iommu won't
> >> set it.
> 
> You should be safe as far as Panfrost and Panthor are concerned.

Great, thanks!

Jason
Robin Murphy Oct. 21, 2024, 1:50 p.m. UTC | #4
On 21/10/2024 1:17 pm, Jason Gunthorpe wrote:
> On Mon, Oct 21, 2024 at 12:32:21PM +0100, Steven Price wrote:
> 
>>> that, we can always do it in two steps (unmap the 2M region and remap
>>> the borders). At some point it'd be good to have some kind of atomic
>>> page table updates, so we don't have this short period of time during
>>> which nothing is mapped (between the unmap and the remap), but that's a
>>> different story.
>>
>> The GPU hardware provides that. The only possible missing piece is that
>> the driver needs to know ahead of time that the unmap would unmap the 2M
>> region so it can do the correct lock before the entries are removed.
> 
> It looks like we need atomic update for some confidential compute
> scenarios, so I am working toward that with the coming generic pt
> stuff.

Beware that whatever the Mali drivers might have the option to do for 
themselves, there's still no notion of "atomic update" for SMMU and 
io-pgtable-arm in general, other than perhaps for permission changes - 
even BBML is quite explicitly non-atomic, as it's defined in terms of 
two otherwise-identical mappings existing at the same time, just 
guaranteeing that while they do, you'll still get behaviour consistent 
with one *or* the other, and not anything in-between.

As far as this patch goes, though, I would not be at all unhappy to see 
the back of split_blk_unmap... However if we are going to do this then 
I'd like even more to formally define it as the behaviour of 
iommu_unmap() and fix up all the other drivers which still behave 
differently (the statement in the commit message is incorrect - 
io-pgtable-arm-v7s still splits; at least exynos fails the unmap entirely).

Thanks,
Robin.
Jason Gunthorpe Oct. 21, 2024, 2:20 p.m. UTC | #5
On Mon, Oct 21, 2024 at 02:50:34PM +0100, Robin Murphy wrote:

> Beware that whatever the Mali drivers might have the option to do for
> themselves, there's still no notion of "atomic update" for SMMU and
> io-pgtable-arm in general, other than perhaps for permission changes - even
> BBML is quite explicitly non-atomic, as it's defined in terms of two
> otherwise-identical mappings existing at the same time, just guaranteeing
> that while they do, you'll still get behaviour consistent with one *or* the
> other, and not anything in-between.

One or the other is mostly what atomic means in this context. CC just
doesn't want any sort of fault.

> As far as this patch goes, though, I would not be at all unhappy to see the
> back of split_blk_unmap... However if we are going to do this then I'd like
> even more to formally define it as the behaviour of iommu_unmap() and fix up
> all the other drivers which behave still differently (the statement in the
> commit message is incorrect - io-pgtable-arm-v7s still splits; at least
> exynos fails the unmap entirely.)

Hmm, my kunit does cover io-pgtable-arm-v7s but I see there is a
mistake.  I can remove it and test it there too.

Exynos looks like it triggers WARN_ONs so nobody uses it there at least:

	if (lv1ent_section(ent)) {
		if (WARN_ON(size < SECT_SIZE)) {
			err_pgsize = SECT_SIZE;
			goto err;
		}

We just need to delete that WARN block?

I'm not sure what to document here; I don't think we should tell
people to rely on this given we can't test every single
implementation. I'm mostly interested to ensure that nobody has
quietly started relying on split behavior.

Jason
Will Deacon Oct. 24, 2024, 1:05 p.m. UTC | #6
Hi Jason,

On Fri, Oct 18, 2024 at 02:19:26PM -0300, Jason Gunthorpe wrote:
> Of the page table implementations (AMD v1/2, VT-D SS, ARM32, DART)
> arm_lpae is unique in how it handles partial unmap of large IOPTEs.
> 
> All other drivers will unmap the large IOPTE and return it's length.  For
> example if a 2M IOPTE is present and the first 4K is requested to be
> unmapped then unmap will remove the whole 2M and report 2M as the result.
> 
> arm_lpae instead replaces the IOPTE with a table of smaller IOPTEs, unmaps
> the 4K and returns 4k. This is actually an illegal/non-hitless operation
> on at least SMMUv3 because of the BBM level 0 rules.
> 
> Long ago VFIO could trigger a path like this, today I know of no user of
> this functionality.
> 
> Given it doesn't work fully correctly on SMMUv3 and would create
> portability problems if any user depends on it, remove the unique support
> in arm_lpae and align with the expected iommu interface.
> 
> Outside the iommu users, this will potentially effect io_pgtable users of
> ARM_32_LPAE_S1, ARM_32_LPAE_S2, ARM_64_LPAE_S1, ARM_64_LPAE_S2, and
> ARM_MALI_LPAE formats.
> 
> Cc: Boris Brezillon <boris.brezillon@collabora.com>
> Cc: Steven Price <steven.price@arm.com>
> Cc: Liviu Dudau <liviu.dudau@arm.com>
> Cc: dri-devel@lists.freedesktop.org
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/iommu/io-pgtable-arm.c | 72 +++-------------------------------
>  1 file changed, 6 insertions(+), 66 deletions(-)

I'd love to drop this, but I'm sure it was needed when I added it :/

My recollection is hazy, but I seem to remember VFIO using the largest
page sizes in the IOMMU 'pgsize_bitmap' for map() requests but then
using the smallest page size for unmap() requests, so you'd end up
cracking block mappings when tearing down a VM with assigned devices.

Is this what you're referring to when you say:

  > Long ago VFIO could trigger a path like this, today I know of no user of
  > this functionality.

If so, please can you provide a reference to the patch that moved VFIO
off that problematic behaviour?

Thanks!

Will
Jason Gunthorpe Oct. 24, 2024, 1:44 p.m. UTC | #7
On Thu, Oct 24, 2024 at 02:05:53PM +0100, Will Deacon wrote:

> My recollection is hazy, but I seem to remember VFIO using the largest
> page sizes in the IOMMU 'pgsize_bitmap' for map() requests but then
> using the smallest page size for unmap() requests, so you'd end up
> cracking block mappings when tearing down a VM with assigne devices.
>
> Is this what you're referring to when you say?

Sounds like it, but I'm really hazy on the long-ago history here.

>   > Long ago VFIO could trigger a path like this, today I know of no user of
>   > this functionality.
> 
> If so, please can you provide a reference to the patch that moved VFIO
> off that problematic behaviour?

Looking more deeply, I'm not really sure VFIO ever required this.

vfio commit 166fd7d94afd ("vfio: hugepage support for
vfio_iommu_type1") is the thing that added the huge page support, and
it called map like:

+               ret = iommu_map(iommu->domain, iova,
+                               (phys_addr_t)pfn << PAGE_SHIFT,
+                               npage << PAGE_SHIFT, prot);

But then the unmap path still looked like:

+               unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
+               unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
+                                            unmapped >> PAGE_SHIFT,
+                                            dma->prot, false);

So at that time it was relying on the "unmap smaller gives the larger
size" trick that we see Intel and AMD implementing today. No
requirement for split, but the ARM split code could be triggered.
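
In other words, the old teardown was effectively doing (a sketch, not the
actual vfio code):

	while (iova < end) {
		unmapped = iommu_unmap(domain, iova, PAGE_SIZE);
		if (!unmapped)
			break;
		/* advances by 2M/1G when the driver removed a whole block */
		iova += unmapped;
	}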

Next came the introduction of VFIO_TYPE1v2_IOMMU which eliminated the
ability for userspace to request splitting a mapping. Userspace can
only unmap what userspace maps. commit 1ef3e2bc0422
("vfio/iommu_type1: Multi-IOMMU domain support")

    To do this, our DMA tracking needs to change.  We currently try to
    coalesce user mappings into as few tracking entries as possible.  The
    problem then becomes that we lose granularity of user mappings.  We've
    never guaranteed that a user is able to unmap at a finer granularity
    than the original mapping, but we must honor the granularity of the
    original mapping.  This coalescing code is therefore removed, allowing
    only unmaps covering complete maps.  The change in accounting is
    fairly small here, a typical QEMU VM will start out with roughly a
    dozen entries, so it's arguable if this coalescing was ever needed.

That blocked any requirement for splitting driven by the uAPI. Note that
uAPI-based splitting never worked right and AFAICT AMD didn't
implement split at the time.

Finally, commit 6fe1010d6d9c ("vfio/type1: DMA unmap chunking")
changed the unmap loop to this:

-               unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE);
+               /*
+                * To optimize for fewer iommu_unmap() calls, each of which
+                * may require hardware cache flushing, try to find the
+                * largest contiguous physical memory chunk to unmap.
+                */
+               for (len = PAGE_SIZE;
+                    !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
+                       next = iommu_iova_to_phys(domain->domain, iova + len);
+                       if (next != phys + len)
+                               break;
+               }
+
+               unmapped = iommu_unmap(domain->domain, iova, len);

fgsp=true is only possible on AMD, and this loop effectively
guarantees that the iommu driver will never see a partial huge page
unmap as the size is discovered via iommu_iova_to_phys() before
calling unmap.

At that point only the AMD driver will ever see a page size that is
smaller than what is in the radix tree.

Jason
Will Deacon Nov. 1, 2024, 11:54 a.m. UTC | #8
Hi Jason,

On Thu, Oct 24, 2024 at 10:44:11AM -0300, Jason Gunthorpe wrote:
> On Thu, Oct 24, 2024 at 02:05:53PM +0100, Will Deacon wrote:
> 
> > My recollection is hazy, but I seem to remember VFIO using the largest
> > page sizes in the IOMMU 'pgsize_bitmap' for map() requests but then
> > using the smallest page size for unmap() requests, so you'd end up
> > cracking block mappings when tearing down a VM with assigne devices.
> >
> > Is this what you're referring to when you say?
> 
> Sounds like it, but I'm really hazy on the long ago history here.
> 
> >   > Long ago VFIO could trigger a path like this, today I know of no user of
> >   > this functionality.
> > 
> > If so, please can you provide a reference to the patch that moved VFIO
> > off that problematic behaviour?
> 
> Looking more deeply, I'm not really sure VFIO ever required this.
> 
> vfio commit 166fd7d94afd ("vfio: hugepage support for
> vfio_iommu_type1") is the thing that added the huge page support, and
> it called map like:
> 
> +               ret = iommu_map(iommu->domain, iova,
> +                               (phys_addr_t)pfn << PAGE_SHIFT,
> +                               npage << PAGE_SHIFT, prot);
> 
> But then the unmap path still looked like:
> 
> +               unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
> +               unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
> +                                            unmapped >> PAGE_SHIFT,
> +                                            dma->prot, false);
> 
> So at that time it was relying on the "unmap smaller gives the larger
> size" trick that we see Intel and AMD implementing today. No
> requirement for split, but the ARM split code could be triggered.

Urgh, I'm not sure I was ever fully aware of that "trick". That's really
horrible!

> Next came the introduction of VFIO_TYPE1v2_IOMMU which eliminated the
> ability for userspace to request splitting a mapping. Userspace can
> only unmap what userspace maps. commit 1ef3e2bc0422
> ("vfio/iommu_type1: Multi-IOMMU domain support")
> 
>     To do this, our DMA tracking needs to change.  We currently try to
>     coalesce user mappings into as few tracking entries as possible.  The
>     problem then becomes that we lose granularity of user mappings.  We've
>     never guaranteed that a user is able to unmap at a finer granularity
>     than the original mapping, but we must honor the granularity of the
>     original mapping.  This coalescing code is therefore removed, allowing
>     only unmaps covering complete maps.  The change in accounting is
>     fairly small here, a typical QEMU VM will start out with roughly a
>     dozen entries, so it's arguable if this coalescing was ever needed.
> 
> That blocked any requirement for splitting driven by the uAPI. Noting
> uAPI based splitting never worked right and AFAICT AMD didn't
> implement split at the time.
> 
> Finally, commit 6fe1010d6d9c ("vfio/type1: DMA unmap chunking")
> changed the unmap loop to this:
> 
> -               unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE);
> +               /*
> +                * To optimize for fewer iommu_unmap() calls, each of which
> +                * may require hardware cache flushing, try to find the
> +                * largest contiguous physical memory chunk to unmap.
> +                */
> +               for (len = PAGE_SIZE;
> +                    !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
> +                       next = iommu_iova_to_phys(domain->domain, iova + len);
> +                       if (next != phys + len)
> +                               break;
> +               }
> +
> +               unmapped = iommu_unmap(domain->domain, iova, len);
> 
> fgsp=true is only possible on AMD, and this loop effectively
> guarantees that the iommu driver will never see a partial huge page
> unmap as the size is discovered via iommu_iova_to_phys() before
> calling unmap.

Talking to Robin, I was reminded of:

7c6d90e2bb1a ("iommu/io-pgtable-arm: Fix iova_to_phys for block entries")

which looks like it fixes a bug which would've defeated the VFIO chunking
in your snippet above.

But we're all good now, so I'm in favour of dropping this. Let's just
cram some of this history into the commit message so we know why we're
happy to do so.

Lemme go look at the actual diff now...

Cheers,

Will
Will Deacon Nov. 1, 2024, 11:58 a.m. UTC | #9
On Fri, Oct 18, 2024 at 02:19:26PM -0300, Jason Gunthorpe wrote:
> Of the page table implementations (AMD v1/2, VT-D SS, ARM32, DART)
> arm_lpae is unique in how it handles partial unmap of large IOPTEs.
> 
> All other drivers will unmap the large IOPTE and return it's length.  For
> example if a 2M IOPTE is present and the first 4K is requested to be
> unmapped then unmap will remove the whole 2M and report 2M as the result.
> 
> arm_lpae instead replaces the IOPTE with a table of smaller IOPTEs, unmaps
> the 4K and returns 4k. This is actually an illegal/non-hitless operation
> on at least SMMUv3 because of the BBM level 0 rules.
> 
> Long ago VFIO could trigger a path like this, today I know of no user of
> this functionality.
> 
> Given it doesn't work fully correctly on SMMUv3 and would create
> portability problems if any user depends on it, remove the unique support
> in arm_lpae and align with the expected iommu interface.
> 
> Outside the iommu users, this will potentially effect io_pgtable users of
> ARM_32_LPAE_S1, ARM_32_LPAE_S2, ARM_64_LPAE_S1, ARM_64_LPAE_S2, and
> ARM_MALI_LPAE formats.
> 
> Cc: Boris Brezillon <boris.brezillon@collabora.com>
> Cc: Steven Price <steven.price@arm.com>
> Cc: Liviu Dudau <liviu.dudau@arm.com>
> Cc: dri-devel@lists.freedesktop.org
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/iommu/io-pgtable-arm.c | 72 +++-------------------------------
>  1 file changed, 6 insertions(+), 66 deletions(-)
> 
> I don't know anything in the iommu space that needs this, and this is the only
> page table implementation in iommu that does it.

I think the v7s code does it as well, so please can you apply the same
treatment to arm_v7s_split_blk_unmap()?

> @@ -678,12 +618,12 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>  
>  		return i * size;
>  	} else if (iopte_leaf(pte, lvl, iop->fmt)) {
> -		/*
> -		 * Insert a table at the next level to map the old region,
> -		 * minus the part we want to unmap
> -		 */
> -		return arm_lpae_split_blk_unmap(data, gather, iova, size, pte,
> -						lvl + 1, ptep, pgcount);
> +		/* Unmap the entire large IOPTE and return its size */
> +		size = ARM_LPAE_BLOCK_SIZE(lvl, data);

If I understand your other message correctly, we shouldn't actually get
into this situation any more, right? In which case, can we WARN_ONCE()
and return 0 instead? Over-unmapping is filthy!
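
Something along these lines (just a sketch, reusing the identifiers from
the hunk above):

	} else if (iopte_leaf(pte, lvl, iop->fmt)) {
		/* Refuse to over-unmap part of a large IOPTE */
		WARN_ONCE(1, "partial unmap of a large IOPTE");
		return 0;
	}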

Will
Jason Gunthorpe Nov. 1, 2024, 1:40 p.m. UTC | #10
On Fri, Oct 18, 2024 at 02:19:26PM -0300, Jason Gunthorpe wrote:
> Of the page table implementations (AMD v1/2, VT-D SS, ARM32, DART)
> arm_lpae is unique in how it handles partial unmap of large IOPTEs.
> 
> All other drivers will unmap the large IOPTE and return it's length.  For
> example if a 2M IOPTE is present and the first 4K is requested to be
> unmapped then unmap will remove the whole 2M and report 2M as the result.
> 
> arm_lpae instead replaces the IOPTE with a table of smaller IOPTEs, unmaps
> the 4K and returns 4k. This is actually an illegal/non-hitless operation
> on at least SMMUv3 because of the BBM level 0 rules.
> 
> Long ago VFIO could trigger a path like this, today I know of no user of
> this functionality.
> 
> Given it doesn't work fully correctly on SMMUv3 and would create
> portability problems if any user depends on it, remove the unique support
> in arm_lpae and align with the expected iommu interface.
> 
> Outside the iommu users, this will potentially effect io_pgtable users of
> ARM_32_LPAE_S1, ARM_32_LPAE_S2, ARM_64_LPAE_S1, ARM_64_LPAE_S2, and
> ARM_MALI_LPAE formats.
> 
> Cc: Boris Brezillon <boris.brezillon@collabora.com>
> Cc: Steven Price <steven.price@arm.com>
> Cc: Liviu Dudau <liviu.dudau@arm.com>
> Cc: dri-devel@lists.freedesktop.org
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/iommu/io-pgtable-arm.c | 72 +++-------------------------------
>  1 file changed, 6 insertions(+), 66 deletions(-)

Updated commit message - Will, let me know if you want me to resend
with this, or any changes:

iommu/io-pgtable-arm: Remove split on unmap behavior

A minority of page table implementations (arm_lpae, armv7) are unique in
how they handle partial unmap of large IOPTEs.

Other implementations will unmap the large IOPTE and return its
length. For example if a 2M IOPTE is present and the first 4K is requested
to be unmapped then unmap will remove the whole 2M and report 2M as the
result.

arm_lpae instead replaces the IOPTE with a table of smaller IOPTEs, unmaps
the 4K and returns 4K. This is actually an illegal/non-hitless operation
on at least SMMUv3 because of the BBM level 0 rules.

Will says this was done to support VFIO, but upon deeper analysis this was
never strictly necessary:

 https://lore.kernel.org/all/20241024134411.GA6956@nvidia.com/

In summary, historical VFIO supported the AMD behavior of unmapping the
whole large IOPTE and returning the size, even if asked to unmap a
portion. The driver would see this as a request to split a large IOPTE.
Modern VFIO always unmaps entire large IOPTEs (except on AMD) and drivers
don't see an IOPTE split.

Given it doesn't work fully correctly on SMMUv3 and relying on this ARM-unique
behavior would create portability problems across IOMMU drivers, retire
this functionality.

Outside the iommu users, this will potentially affect io_pgtable users of
ARM_32_LPAE_S1, ARM_32_LPAE_S2, ARM_64_LPAE_S1, ARM_64_LPAE_S2, and
ARM_MALI_LPAE formats.
Jason Gunthorpe Nov. 1, 2024, 3:37 p.m. UTC | #11
On Fri, Nov 01, 2024 at 11:58:29AM +0000, Will Deacon wrote:
> On Fri, Oct 18, 2024 at 02:19:26PM -0300, Jason Gunthorpe wrote:
> > Of the page table implementations (AMD v1/2, VT-D SS, ARM32, DART)
> > arm_lpae is unique in how it handles partial unmap of large IOPTEs.
> > 
> > All other drivers will unmap the large IOPTE and return it's length.  For
> > example if a 2M IOPTE is present and the first 4K is requested to be
> > unmapped then unmap will remove the whole 2M and report 2M as the result.
> > 
> > arm_lpae instead replaces the IOPTE with a table of smaller IOPTEs, unmaps
> > the 4K and returns 4k. This is actually an illegal/non-hitless operation
> > on at least SMMUv3 because of the BBM level 0 rules.
> > 
> > Long ago VFIO could trigger a path like this, today I know of no user of
> > this functionality.
> > 
> > Given it doesn't work fully correctly on SMMUv3 and would create
> > portability problems if any user depends on it, remove the unique support
> > in arm_lpae and align with the expected iommu interface.
> > 
> > Outside the iommu users, this will potentially effect io_pgtable users of
> > ARM_32_LPAE_S1, ARM_32_LPAE_S2, ARM_64_LPAE_S1, ARM_64_LPAE_S2, and
> > ARM_MALI_LPAE formats.
> > 
> > Cc: Boris Brezillon <boris.brezillon@collabora.com>
> > Cc: Steven Price <steven.price@arm.com>
> > Cc: Liviu Dudau <liviu.dudau@arm.com>
> > Cc: dri-devel@lists.freedesktop.org
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > ---
> >  drivers/iommu/io-pgtable-arm.c | 72 +++-------------------------------
> >  1 file changed, 6 insertions(+), 66 deletions(-)
> > 
> > I don't know anything in the iommu space that needs this, and this is the only
> > page table implementation in iommu that does it.
> 
> I think the v7s code does it as well, so please can you apply the same
> treatment to arm_v7s_split_blk_unmap()?

I have that patch written; I'm not as confident in it as it is much
more complex, but it passes my simple tests.

However, if we make it fail and WARN_ON, that should simplify it a lot.

> > @@ -678,12 +618,12 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
> >  
> >  		return i * size;
> >  	} else if (iopte_leaf(pte, lvl, iop->fmt)) {
> > -		/*
> > -		 * Insert a table at the next level to map the old region,
> > -		 * minus the part we want to unmap
> > -		 */
> > -		return arm_lpae_split_blk_unmap(data, gather, iova, size, pte,
> > -						lvl + 1, ptep, pgcount);
> > +		/* Unmap the entire large IOPTE and return its size */
> > +		size = ARM_LPAE_BLOCK_SIZE(lvl, data);
> 
> If I understand your other message correctly, we shouldn't actually get
> into this situation any more, right? In which case, can we WARN_ONCE()
> and return 0 instead? Over-unmapping is filthy!

VFIO won't do it (except on AMD); I have not tried to figure out if
something else might depend on it over-unmapping.

So, OK, let's try the WARN_ON and it is very easy to put the above
hunk back as a fixup if someone hits it.

Jason
Will Deacon Nov. 4, 2024, 11:32 a.m. UTC | #12
On Fri, Nov 01, 2024 at 10:40:05AM -0300, Jason Gunthorpe wrote:
> On Fri, Oct 18, 2024 at 02:19:26PM -0300, Jason Gunthorpe wrote:
> > Of the page table implementations (AMD v1/2, VT-D SS, ARM32, DART)
> > arm_lpae is unique in how it handles partial unmap of large IOPTEs.
> > 
> > All other drivers will unmap the large IOPTE and return it's length.  For
> > example if a 2M IOPTE is present and the first 4K is requested to be
> > unmapped then unmap will remove the whole 2M and report 2M as the result.
> > 
> > arm_lpae instead replaces the IOPTE with a table of smaller IOPTEs, unmaps
> > the 4K and returns 4k. This is actually an illegal/non-hitless operation
> > on at least SMMUv3 because of the BBM level 0 rules.
> > 
> > Long ago VFIO could trigger a path like this, today I know of no user of
> > this functionality.
> > 
> > Given it doesn't work fully correctly on SMMUv3 and would create
> > portability problems if any user depends on it, remove the unique support
> > in arm_lpae and align with the expected iommu interface.
> > 
> > Outside the iommu users, this will potentially effect io_pgtable users of
> > ARM_32_LPAE_S1, ARM_32_LPAE_S2, ARM_64_LPAE_S1, ARM_64_LPAE_S2, and
> > ARM_MALI_LPAE formats.
> > 
> > Cc: Boris Brezillon <boris.brezillon@collabora.com>
> > Cc: Steven Price <steven.price@arm.com>
> > Cc: Liviu Dudau <liviu.dudau@arm.com>
> > Cc: dri-devel@lists.freedesktop.org
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > ---
> >  drivers/iommu/io-pgtable-arm.c | 72 +++-------------------------------
> >  1 file changed, 6 insertions(+), 66 deletions(-)
> 
> Updated commit message - Will let me know if you want me to resend
> with this, or any changes:

Thanks! Please send a new version with this text, the WARN we discussed
in the other part of the thread, and also attacking the v7s code.

Cheers,

Will

Patch

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 0e67f1721a3d98..a51fae9c909111 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -569,66 +569,6 @@  static void arm_lpae_free_pgtable(struct io_pgtable *iop)
 	kfree(data);
 }
 
-static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
-				       struct iommu_iotlb_gather *gather,
-				       unsigned long iova, size_t size,
-				       arm_lpae_iopte blk_pte, int lvl,
-				       arm_lpae_iopte *ptep, size_t pgcount)
-{
-	struct io_pgtable_cfg *cfg = &data->iop.cfg;
-	arm_lpae_iopte pte, *tablep;
-	phys_addr_t blk_paddr;
-	size_t tablesz = ARM_LPAE_GRANULE(data);
-	size_t split_sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
-	int ptes_per_table = ARM_LPAE_PTES_PER_TABLE(data);
-	int i, unmap_idx_start = -1, num_entries = 0, max_entries;
-
-	if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
-		return 0;
-
-	tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg, data->iop.cookie);
-	if (!tablep)
-		return 0; /* Bytes unmapped */
-
-	if (size == split_sz) {
-		unmap_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data);
-		max_entries = ptes_per_table - unmap_idx_start;
-		num_entries = min_t(int, pgcount, max_entries);
-	}
-
-	blk_paddr = iopte_to_paddr(blk_pte, data);
-	pte = iopte_prot(blk_pte);
-
-	for (i = 0; i < ptes_per_table; i++, blk_paddr += split_sz) {
-		/* Unmap! */
-		if (i >= unmap_idx_start && i < (unmap_idx_start + num_entries))
-			continue;
-
-		__arm_lpae_init_pte(data, blk_paddr, pte, lvl, 1, &tablep[i]);
-	}
-
-	pte = arm_lpae_install_table(tablep, ptep, blk_pte, data);
-	if (pte != blk_pte) {
-		__arm_lpae_free_pages(tablep, tablesz, cfg, data->iop.cookie);
-		/*
-		 * We may race against someone unmapping another part of this
-		 * block, but anything else is invalid. We can't misinterpret
-		 * a page entry here since we're never at the last level.
-		 */
-		if (iopte_type(pte) != ARM_LPAE_PTE_TYPE_TABLE)
-			return 0;
-
-		tablep = iopte_deref(pte, data);
-	} else if (unmap_idx_start >= 0) {
-		for (i = 0; i < num_entries; i++)
-			io_pgtable_tlb_add_page(&data->iop, gather, iova + i * size, size);
-
-		return num_entries * size;
-	}
-
-	return __arm_lpae_unmap(data, gather, iova, size, pgcount, lvl, tablep);
-}
-
 static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 			       struct iommu_iotlb_gather *gather,
 			       unsigned long iova, size_t size, size_t pgcount,
@@ -678,12 +618,12 @@  static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 
 		return i * size;
 	} else if (iopte_leaf(pte, lvl, iop->fmt)) {
-		/*
-		 * Insert a table at the next level to map the old region,
-		 * minus the part we want to unmap
-		 */
-		return arm_lpae_split_blk_unmap(data, gather, iova, size, pte,
-						lvl + 1, ptep, pgcount);
+		/* Unmap the entire large IOPTE and return its size */
+		size = ARM_LPAE_BLOCK_SIZE(lvl, data);
+		__arm_lpae_clear_pte(ptep, &iop->cfg, 1);
+		if (gather && !iommu_iotlb_gather_queued(gather))
+			io_pgtable_tlb_add_page(iop, gather, iova, size);
+		return size;
 	}
 
 	/* Keep on walkin' */