[1/3] iommu/io-pgtable-arm: Add nents_per_pgtable in struct io_pgtable_cfg

Message ID 0fe68babdb3a07adf024ed471fead4e3eb7e703f.1692693557.git.nicolinc@nvidia.com (mailing list archive)
State New, archived
Series iommu/arm-smmu-v3: Reduce latency in __arm_smmu_tlb_inv_range()

Commit Message

Nicolin Chen Aug. 22, 2023, 8:45 a.m. UTC
Add a new nents_per_pgtable in struct io_pgtable_cfg to store the number
of entries per IO pagetable, so it can be passed back to an IOMMU driver.
It will be used by one of the following changes to set the maximum number
of TLBI operations in the arm-smmu-v3 driver.

Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/io-pgtable-arm.c | 3 +++
 include/linux/io-pgtable.h     | 2 ++
 2 files changed, 5 insertions(+)

Comments

Robin Murphy Aug. 22, 2023, 9:19 a.m. UTC | #1
On 2023-08-22 09:45, Nicolin Chen wrote:
> Add a new nents_per_pgtable in struct io_pgtable_cfg to store the number
> of entries per IO pagetable, so it can be passed back to an IOMMU driver.
> It will be used by one of the following changes to set the maximum number
> of TLBI operations in the arm-smmu-v3 driver.
> 
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>   drivers/iommu/io-pgtable-arm.c | 3 +++
>   include/linux/io-pgtable.h     | 2 ++
>   2 files changed, 5 insertions(+)
> 
> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> index 72dcdd468cf3..7583d9ecca2b 100644
> --- a/drivers/iommu/io-pgtable-arm.c
> +++ b/drivers/iommu/io-pgtable-arm.c
> @@ -891,6 +891,7 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
>   
>   	/* TTBR */
>   	cfg->arm_lpae_s1_cfg.ttbr = virt_to_phys(data->pgd);
> +	cfg->nents_per_pgtable = 1 << data->bits_per_level;
>   	return &data->iop;
>   
>   out_free_data:
> @@ -993,6 +994,7 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
>   
>   	/* VTTBR */
>   	cfg->arm_lpae_s2_cfg.vttbr = virt_to_phys(data->pgd);
> +	cfg->nents_per_pgtable = 1 << data->bits_per_level;
>   	return &data->iop;
>   
>   out_free_data:
> @@ -1071,6 +1073,7 @@ arm_mali_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
>   					  ARM_MALI_LPAE_TTBR_ADRMODE_TABLE;
>   	if (cfg->coherent_walk)
>   		cfg->arm_mali_lpae_cfg.transtab |= ARM_MALI_LPAE_TTBR_SHARE_OUTER;
> +	cfg->nents_per_pgtable = 1 << data->bits_per_level;

The result of this highly complex and expensive calculation is clearly 
redundant with the existing bits_per_level field, so why do we need to 
waste space storing it when the driver could simply use bits_per_level?

Thanks,
Robin.

>   	return &data->iop;
>   
> diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
> index 1b7a44b35616..4b55a327abc1 100644
> --- a/include/linux/io-pgtable.h
> +++ b/include/linux/io-pgtable.h
> @@ -55,6 +55,7 @@ struct iommu_flush_ops {
>    *                 tables.
>    * @ias:           Input address (iova) size, in bits.
>    * @oas:           Output address (paddr) size, in bits.
> + * @nents_per_pgtable: Number of entries per page table.
>    * @coherent_walk  A flag to indicate whether or not page table walks made
>    *                 by the IOMMU are coherent with the CPU caches.
>    * @tlb:           TLB management callbacks for this set of tables.
> @@ -96,6 +97,7 @@ struct io_pgtable_cfg {
>   	unsigned long			pgsize_bitmap;
>   	unsigned int			ias;
>   	unsigned int			oas;
> +	unsigned int			nents_per_pgtable;
>   	bool				coherent_walk;
>   	const struct iommu_flush_ops	*tlb;
>   	struct device			*iommu_dev;
Nicolin Chen Aug. 22, 2023, 4:42 p.m. UTC | #2
On Tue, Aug 22, 2023 at 10:19:21AM +0100, Robin Murphy wrote:

> >   out_free_data:
> > @@ -1071,6 +1073,7 @@ arm_mali_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
> >                                         ARM_MALI_LPAE_TTBR_ADRMODE_TABLE;
> >       if (cfg->coherent_walk)
> >               cfg->arm_mali_lpae_cfg.transtab |= ARM_MALI_LPAE_TTBR_SHARE_OUTER;
> > +     cfg->nents_per_pgtable = 1 << data->bits_per_level;
> 
> The result of this highly complex and expensive calculation is clearly
> redundant with the existing bits_per_level field, so why do we need to
> waste space storing when the driver could simply use bits_per_level?

bits_per_level is in the private struct arm_lpae_io_pgtable, while
drivers can only access struct io_pgtable_cfg. Are you suggesting
to move bits_per_level out of the private struct arm_lpae_io_pgtable
to the public struct io_pgtable_cfg?

Or am I missing another bits_per_level?
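
Roughly, the current split looks like this (simplified, fields trimmed):

/* include/linux/io-pgtable.h -- what IOMMU drivers can see */
struct io_pgtable_cfg {
	unsigned long	pgsize_bitmap;
	unsigned int	ias;
	unsigned int	oas;
	/* ... no bits_per_level here ... */
};

/* drivers/iommu/io-pgtable-arm.c -- private to the LPAE format code */
struct arm_lpae_io_pgtable {
	struct io_pgtable	iop;
	int			bits_per_level;
	/* ... */
};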

Thanks
Nicolin
Robin Murphy Aug. 29, 2023, 3:37 p.m. UTC | #3
On 2023-08-22 17:42, Nicolin Chen wrote:
> On Tue, Aug 22, 2023 at 10:19:21AM +0100, Robin Murphy wrote:
> 
>>>    out_free_data:
>>> @@ -1071,6 +1073,7 @@ arm_mali_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
>>>                                          ARM_MALI_LPAE_TTBR_ADRMODE_TABLE;
>>>        if (cfg->coherent_walk)
>>>                cfg->arm_mali_lpae_cfg.transtab |= ARM_MALI_LPAE_TTBR_SHARE_OUTER;
>>> +     cfg->nents_per_pgtable = 1 << data->bits_per_level;
>>
>> The result of this highly complex and expensive calculation is clearly
>> redundant with the existing bits_per_level field, so why do we need to
>> waste space storing when the driver could simply use bits_per_level?
> 
> bits_per_level is in the private struct arm_lpae_io_pgtable, while
> drivers can only access struct io_pgtable_cfg. Are you suggesting
> to move bits_per_level out of the private struct arm_lpae_io_pgtable
> to the public struct io_pgtable_cfg?
> 
> Or am I missing another bits_per_level?

Bleh, apologies, I always confuse myself trying to remember the fiddly 
design of io-pgtable data. However, I think this then ends up proving 
the opposite point - the number of pages per table only happens to be a 
fixed constant for certain formats like LPAE, but does not necessarily 
generalise. For instance for a single v7s config it would be 1024 or 256 
or 16 depending on what has actually been unmapped.

The mechanism as proposed implicitly assumes LPAE format, so I still 
think we're better off making that assumption explicit. And at that 
point arm-smmu-v3 can then freely admit it already knows the number is 
simply 1/8th of the domain page size.

Thanks,
Robin.
Nicolin Chen Aug. 29, 2023, 8:15 p.m. UTC | #4
On Tue, Aug 29, 2023 at 04:37:00PM +0100, Robin Murphy wrote:

> On 2023-08-22 17:42, Nicolin Chen wrote:
> > On Tue, Aug 22, 2023 at 10:19:21AM +0100, Robin Murphy wrote:
> > 
> > > >    out_free_data:
> > > > @@ -1071,6 +1073,7 @@ arm_mali_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
> > > >                                          ARM_MALI_LPAE_TTBR_ADRMODE_TABLE;
> > > >        if (cfg->coherent_walk)
> > > >                cfg->arm_mali_lpae_cfg.transtab |= ARM_MALI_LPAE_TTBR_SHARE_OUTER;
> > > > +     cfg->nents_per_pgtable = 1 << data->bits_per_level;
> > > 
> > > The result of this highly complex and expensive calculation is clearly
> > > redundant with the existing bits_per_level field, so why do we need to
> > > waste space storing when the driver could simply use bits_per_level?
> > 
> > bits_per_level is in the private struct arm_lpae_io_pgtable, while
> > drivers can only access struct io_pgtable_cfg. Are you suggesting
> > to move bits_per_level out of the private struct arm_lpae_io_pgtable
> > to the public struct io_pgtable_cfg?
> > 
> > Or am I missing another bits_per_level?
> 
> Bleh, apologies, I always confuse myself trying to remember the fiddly
> design of io-pgtable data. However, I think this then ends up proving
> the opposite point - the number of pages per table only happens to be a
> fixed constant for certain formats like LPAE, but does not necessarily
> generalise. For instance for a single v7s config it would be 1024 or 256
> or 16 depending on what has actually been unmapped.
> 
> The mechanism as proposed implicitly assumes LPAE format, so I still
> think we're better off making that assumption explicit. And at that
> point arm-smmu-v3 can then freely admit it already knows the number is
> simply 1/8th of the domain page size.

Hmm, I am not getting that "1/8th" part, would you mind elaborating?

Also, what we need is actually an arbitrary number for max_tlbi_ops.
And I think it could be independent of the page size, i.e. either a
4K pgsize or a 64K pgsize could use the same max_tlbi_ops number,
because what eventually impacts the latency is the number of loops
of building/issuing commands.

So, taking your point above that nents_per_pgtable isn't as general
as what we have in the MMU tlbflush code, perhaps we could just
decouple max_tlbi_ops from the pgtable and pgsize, and instead define
something like this in the SMMUv3 driver:
	/*
	 * A request for a large number of TLBI commands could result in a big
	 * overhead and latency on SMMUs without ARM_SMMU_FEAT_RANGE_INV. Set
	 * a threshold to the number, so the driver would switch to one single
	 * full-range command.
	 */
	#define MAX_TLBI_OPS 8192

Any thought?

Thanks
Nicolin
Robin Murphy Aug. 29, 2023, 9:25 p.m. UTC | #5
On 2023-08-29 21:15, Nicolin Chen wrote:
> On Tue, Aug 29, 2023 at 04:37:00PM +0100, Robin Murphy wrote:
> 
>> On 2023-08-22 17:42, Nicolin Chen wrote:
>>> On Tue, Aug 22, 2023 at 10:19:21AM +0100, Robin Murphy wrote:
>>>
>>>>>     out_free_data:
>>>>> @@ -1071,6 +1073,7 @@ arm_mali_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
>>>>>                                           ARM_MALI_LPAE_TTBR_ADRMODE_TABLE;
>>>>>         if (cfg->coherent_walk)
>>>>>                 cfg->arm_mali_lpae_cfg.transtab |= ARM_MALI_LPAE_TTBR_SHARE_OUTER;
>>>>> +     cfg->nents_per_pgtable = 1 << data->bits_per_level;
>>>>
>>>> The result of this highly complex and expensive calculation is clearly
>>>> redundant with the existing bits_per_level field, so why do we need to
>>>> waste space storing when the driver could simply use bits_per_level?
>>>
>>> bits_per_level is in the private struct arm_lpae_io_pgtable, while
>>> drivers can only access struct io_pgtable_cfg. Are you suggesting
>>> to move bits_per_level out of the private struct arm_lpae_io_pgtable
>>> to the public struct io_pgtable_cfg?
>>>
>>> Or am I missing another bits_per_level?
>>
>> Bleh, apologies, I always confuse myself trying to remember the fiddly
>> design of io-pgtable data. However, I think this then ends up proving
>> the opposite point - the number of pages per table only happens to be a
>> fixed constant for certain formats like LPAE, but does not necessarily
>> generalise. For instance for a single v7s config it would be 1024 or 256
>> or 16 depending on what has actually been unmapped.
>>
>> The mechanism as proposed implicitly assumes LPAE format, so I still
>> think we're better off making that assumption explicit. And at that
>> point arm-smmu-v3 can then freely admit it already knows the number is
>> simply 1/8th of the domain page size.
> 
> Hmm, I am not getting that "1/8th" part, would you mind elaborating?

If we know the format is LPAE, then we already know that nearly all 
pagetable levels are one full page, and the PTEs are 64 bits long. No 
magic data conduit required.
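
i.e. roughly:

	nents_per_table = granule / sizeof(u64)		/* 64-bit PTEs */
			= 4K / 8  -> 512
			  16K / 8 -> 2048
			  64K / 8 -> 8192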

> Also, what we need is actually an arbitrary number for max_tlbi_ops.
> And I think it could be irrelevant to the page size, i.e. either a
> 4K pgsize or a 64K pgsize could use the same max_tlbi_ops number,
> because what eventually impacts the latency is the number of loops
> of building/issuing commands.

Although there is perhaps a counter-argument for selective invalidation, 
that if you're using 64K pages to improve TLB hit rates vs. 4K, then you 
have more to lose from overinvalidation, so maybe a single threshold 
isn't so appropriate for both.

Yes, ultimately it all comes down to picking an arbitrary number, but 
the challenge is that we want to pick a *good* one, and ideally have 
some reasoning behind it. As Will suggested, copying what the mm layer 
does gives us an easy line of reasoning, even if it's just in the form 
of passing the buck. And that's actually quite attractive, since 
otherwise we'd then have to get into the question of what really is the 
latency of building and issuing commands, since that clearly depends on 
how fast the CPU is, and how fast the SMMU is, and how busy the SMMU is, 
and how large the command queue is, and how many other CPUs are also 
contending for the command queue... and very quickly it becomes hard to 
believe that any simple constant can be good for all possible systems.

> So, combining your narrative above that nents_per_pgtable isn't so
> general as we have in the tlbflush for MMU,

FWIW I meant it doesn't generalise well enough to be a common io-pgtable 
interface; I have no issue with it forming the basis of an 
SMMUv3-specific heuristic when it *is* a relevant concept to all the 
pagetable formats SMMUv3 can possibly support.

Thanks,
Robin.
Nicolin Chen Aug. 29, 2023, 10:15 p.m. UTC | #6
On Tue, Aug 29, 2023 at 10:25:10PM +0100, Robin Murphy wrote:

> > > Bleh, apologies, I always confuse myself trying to remember the fiddly
> > > design of io-pgtable data. However, I think this then ends up proving
> > > the opposite point - the number of pages per table only happens to be a
> > > fixed constant for certain formats like LPAE, but does not necessarily
> > > generalise. For instance for a single v7s config it would be 1024 or 256
> > > or 16 depending on what has actually been unmapped.
> > > 
> > > The mechanism as proposed implicitly assumes LPAE format, so I still
> > > think we're better off making that assumption explicit. And at that
> > > point arm-smmu-v3 can then freely admit it already knows the number is
> > > simply 1/8th of the domain page size.
> > 
> > Hmm, I am not getting that "1/8th" part, would you mind elaborating?
> 
> If we know the format is LPAE, then we already know that nearly all
> pagetable levels are one full page, and the PTEs are 64 bits long. No
> magic data conduit required.

Oh, I see!

> > Also, what we need is actually an arbitrary number for max_tlbi_ops.
> > And I think it could be irrelevant to the page size, i.e. either a
> > 4K pgsize or a 64K pgsize could use the same max_tlbi_ops number,
> > because what eventually impacts the latency is the number of loops
> > of building/issuing commands.
> 
> Although there is perhaps a counter-argument for selective invalidation,
> that if you're using 64K pages to improve TLB hit rates vs. 4K, then you
> have more to lose from overinvalidation, so maybe a single threshold
> isn't so appropriate for both.
> 
> Yes, ultimately it all comes down to picking an arbitrary number, but
> the challenge is that we want to pick a *good* one, and ideally have
> some reasoning behind it. As Will suggested, copying what the mm layer
> does gives us an easy line of reasoning, even if it's just in the form
> of passing the buck. And that's actually quite attractive, since
> otherwise we'd then have to get into the question of what really is the
> latency of building and issuing commands, since that clearly depends on
> how fast the CPU is, and how fast the SMMU is, and how busy the SMMU is,
> and how large the command queue is, and how many other CPUs are also
> contending for the command queue... and very quickly it becomes hard to
> believe that any simple constant can be good for all possible systems.

Yea, I had trouble deciding the number in the first place, so the
previous solution ended up with a sysfs node. I do agree that
copying the mm layer solution gives a strong justification for
picking an arbitrary number. My concern here is whether it would
trigger a full-ASID invalidation overly often.

Meanwhile, by re-looking at Will's commit log:
    arm64: tlbi: Set MAX_TLBI_OPS to PTRS_PER_PTE

    In order to reduce the possibility of soft lock-ups, we bound the
    maximum number of TLBI operations performed by a single call to
    flush_tlb_range() to an arbitrary constant of 1024.

    Whilst this does the job of avoiding lock-ups, we can actually be a bit
    smarter by defining this as PTRS_PER_PTE. Due to the structure of our
    page tables, using PTRS_PER_PTE means that an outer loop calling
    flush_tlb_range() for entire table entries will end up performing just a
    single TLBI operation for each entry. As an example, mremap()ing a 1GB
    range mapped using 4k pages now requires only 512 TLBI operations when
    moving the page tables as opposed to 262144 operations (512*512) when
    using the current threshold of 1024.

I found that I am actually not quite getting the calculation at the
end for the comparison between 512 and 262144.

For a 4K pgsize setup, MAX_TLBI_OPS is set to 512, calculated from
4096 / 8. Then, any VA range >= 2MB will trigger a flush_tlb_all().
By setting the threshold to 1024, the 2MB size bumps up to 4MB, i.e.
the condition becomes range >= 4MB.

So, it seems to me that requesting a 1GB invalidation will trigger
a flush_tlb_all() in either case of having a 2MB or a 4MB threshold?

I get that 262144 is the number of pages in a 1GB range, so there
would be 262144 per-page invalidations if there were no threshold
to replace them with a full-address-space invalidation. Yet that
wasn't the case, since we had a 4MB threshold with the arbitrary
1024 for MAX_TLBI_OPS?

> > So, combining your narrative above that nents_per_pgtable isn't so
> > general as we have in the tlbflush for MMU,
> 
> FWIW I meant it doesn't generalise well enough to be a common io-pgtable
> interface; I have no issue with it forming the basis of an
> SMMUv3-specific heuristic when it *is* a relevant concept to all the
> pagetable formats SMMUv3 can possibly support.

OK.

Thanks
Nicolin
Will Deacon Aug. 30, 2023, 9:49 p.m. UTC | #7
On Tue, Aug 29, 2023 at 03:15:52PM -0700, Nicolin Chen wrote:
> Meanwhile, by re-looking at Will's commit log:
>     arm64: tlbi: Set MAX_TLBI_OPS to PTRS_PER_PTE
> 
>     In order to reduce the possibility of soft lock-ups, we bound the
>     maximum number of TLBI operations performed by a single call to
>     flush_tlb_range() to an arbitrary constant of 1024.
> 
>     Whilst this does the job of avoiding lock-ups, we can actually be a bit
>     smarter by defining this as PTRS_PER_PTE. Due to the structure of our
>     page tables, using PTRS_PER_PTE means that an outer loop calling
>     flush_tlb_range() for entire table entries will end up performing just a
>     single TLBI operation for each entry. As an example, mremap()ing a 1GB
>     range mapped using 4k pages now requires only 512 TLBI operations when
>     moving the page tables as opposed to 262144 operations (512*512) when
>     using the current threshold of 1024.
> 
> I found that I am actually not quite getting the calculation at the
> end for the comparison between 512 and 262144.
> 
> For a 4K pgsize setup, MAX_TLBI_OPS is set to 512, calculated from
> 4096 / 8. Then, any VA range >= 2MB will trigger a flush_tlb_all().
> By setting the threshold to 1024, the 2MB size bumps up to 4MB, i.e.
> the condition becomes range >= 4MB.
> 
> So, it seems to me that requesting a 1GB invalidation will trigger
> a flush_tlb_all() in either case of having a 2MB or a 4MB threshold?
> 
> I can get that the 262144 is the number of pages in a 1GB size, so
> the number of per-page invalidations will be 262144 operations if
> there was no threshold to replace with a full-as invalidation. Yet,
> that wasn't the case since we had a 4MB threshold with an arbitrary
> 1024 for MAX_TLBI_OPS?

I think this is because you can't always batch up the entire range as
you'd like due to things like locking concerns. For example,
move_page_tables() can end up invalidating 2MiB at a time, which is
too low to trigger the old threshold and so you end up invalidating every
single pte individually.
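
So for the 1GB mremap() example, roughly:

	1GB / 2MiB                        = 512 calls to flush_tlb_range()
	old threshold (1024 pages = 4MiB) : never reached, so each call issues
	                                    512 per-page TLBIs -> 512 * 512 = 262144
	new threshold (PTRS_PER_PTE = 512): each 2MiB call collapses into a
	                                    single operation   -> 512 in total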

Will
Nicolin Chen Aug. 31, 2023, 5:39 p.m. UTC | #8
On Wed, Aug 30, 2023 at 10:49:59PM +0100, Will Deacon wrote:
 
> On Tue, Aug 29, 2023 at 03:15:52PM -0700, Nicolin Chen wrote:
> > Meanwhile, by re-looking at Will's commit log:
> >     arm64: tlbi: Set MAX_TLBI_OPS to PTRS_PER_PTE
> >
> >     In order to reduce the possibility of soft lock-ups, we bound the
> >     maximum number of TLBI operations performed by a single call to
> >     flush_tlb_range() to an arbitrary constant of 1024.
> >
> >     Whilst this does the job of avoiding lock-ups, we can actually be a bit
> >     smarter by defining this as PTRS_PER_PTE. Due to the structure of our
> >     page tables, using PTRS_PER_PTE means that an outer loop calling
> >     flush_tlb_range() for entire table entries will end up performing just a
> >     single TLBI operation for each entry. As an example, mremap()ing a 1GB
> >     range mapped using 4k pages now requires only 512 TLBI operations when
> >     moving the page tables as opposed to 262144 operations (512*512) when
> >     using the current threshold of 1024.
> >
> > I found that I am actually not quite getting the calculation at the
> > end for the comparison between 512 and 262144.
> >
> > For a 4K pgsize setup, MAX_TLBI_OPS is set to 512, calculated from
> > 4096 / 8. Then, any VA range >= 2MB will trigger a flush_tlb_all().
> > By setting the threshold to 1024, the 2MB size bumps up to 4MB, i.e.
> > the condition becomes range >= 4MB.
> >
> > So, it seems to me that requesting a 1GB invalidation will trigger
> > a flush_tlb_all() in either case of having a 2MB or a 4MB threshold?
> >
> > I can get that the 262144 is the number of pages in a 1GB size, so
> > the number of per-page invalidations will be 262144 operations if
> > there was no threshold to replace with a full-as invalidation. Yet,
> > that wasn't the case since we had a 4MB threshold with an arbitrary
> > 1024 for MAX_TLBI_OPS?
> 
> I think this is because you can't always batch up the entire range as
> you'd like due to things like locking concerns. For example,
> move_page_tables() can end up invalidating 2MiB at a time, which is
> too low to trigger the old threshold and so you end up doing ever single
> pte individually.

Thanks for elaborating! So, I see now that it was about the worst
case where 1GB breaks down into 512 pieces of 2MB range invalidation
ops, i.e. 512 flush_tlb_all ops vs. 262144 flush_tlb_range ops.

Though I have not dug into it enough, I assume that this worst case
could happen to SVA too, since the IOTLB invalidation comes from the
MMU code. But the same worst case might not happen to the non-SVA
pathway, i.e. TLBI ops for the IO page table don't really benefit
from this?

With that in question, I take Robin's remark that it wouldn't be
easy to decide the arbitrary number, so we could just take the worst
case from the SVA pathway as the common threshold.

Then, since SVA uses the CPU page table, perhaps we should highlight
that SMMU sets the threshold by directly reusing the MAX_TLBI_OPS of
the CPU page table rather than calculating it from the IO page
table, though both of them are in the same format?

Thank you
Nicolin
Nicolin Chen Sept. 1, 2023, 12:08 a.m. UTC | #9
Hi Will/Robin,

On Thu, Aug 31, 2023 at 10:39:15AM -0700, Nicolin Chen wrote:
 
> Though I have not dig enough, I assume that this worst case could
> happen to SVA too, since the IOTLB invalidation is from MMU code.
> But the same worst case might not happen to non-SVA pathway, i.e.
> TLBI ops for IO Page Table doesn't really benefit from this?
> 
> With that being questioned, I got Robin's remarks that it wouldn't
> be easy to decide the arbitrary number, so we could just take the
> worst case from SVA pathway as the common threshold.
> 
> Then, SVA uses the CPU page table, so perhaps we should highlight
> that SMMU sets the threshold directly reusing the MAX_TLBI_OPS of
> CPU page table rather than calculating from IO page table, though
> both of them are in the same format?

Our test team encountered a soft lockup in this path today:
--------------------------------------------------------------------
watchdog: BUG: soft lockup - CPU#244 stuck for 26s!
pstate: 83400009 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
pc : arm_smmu_cmdq_issue_cmdlist+0x178/0xa50
lr : arm_smmu_cmdq_issue_cmdlist+0x150/0xa50
sp : ffff8000d83ef290
x29: ffff8000d83ef290 x28: 000000003b9aca00 x27: 0000000000000000
x26: ffff8000d83ef3c0 x25: da86c0812194a0e8 x24: 0000000000000000
x23: 0000000000000040 x22: ffff8000d83ef340 x21: ffff0000c63980c0
x20: 0000000000000001 x19: ffff0000c6398080 x18: 0000000000000000
x17: 0000000000000000 x16: 0000000000000000 x15: ffff3000b4a3bbb0
x14: ffff3000b4a30888 x13: ffff3000b4a3cf60 x12: 0000000000000000
x11: 0000000000000000 x10: 0000000000000000 x9 : ffffc08120e4d6bc
x8 : 0000000000000000 x7 : 0000000000000000 x6 : 0000000000048cfa
x5 : 0000000000000000 x4 : 0000000000000001 x3 : 000000000000000a
x2 : 0000000080000000 x1 : 0000000000000000 x0 : 0000000000000001
Call trace:
 arm_smmu_cmdq_issue_cmdlist+0x178/0xa50
 __arm_smmu_tlb_inv_range+0x118/0x254
 arm_smmu_tlb_inv_range_asid+0x6c/0x130
 arm_smmu_mm_invalidate_range+0xa0/0xa4
 __mmu_notifier_invalidate_range_end+0x88/0x120
 unmap_vmas+0x194/0x1e0
 unmap_region+0xb4/0x144
 do_mas_align_munmap+0x290/0x490
 do_mas_munmap+0xbc/0x124
 __vm_munmap+0xa8/0x19c
 __arm64_sys_munmap+0x28/0x50
 invoke_syscall+0x78/0x11c
 el0_svc_common.constprop.0+0x58/0x1c0
 do_el0_svc+0x34/0x60
 el0_svc+0x2c/0xd4
 el0t_64_sync_handler+0x114/0x140
 el0t_64_sync+0x1a4/0x1a8
--------------------------------------------------------------------


I think it is the same problem that we fixed in tlbflush.h using
MAX_TLBI_OPS. So, I plan to send a cleaner bug fix (cc stable):
--------------------------------------------------------------------
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index a5a63b1c947e..e3ea7d2a6308 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -186,6 +186,9 @@ static void arm_smmu_free_shared_cd(struct arm_smmu_ctx_desc *cd)
        }
 }

+/* Copied from arch/arm64/include/asm/tlbflush.h to avoid similar soft lockups */
+#define MAX_TLBI_OPS   (1 << (PAGE_SHIFT - 3))
+
 static void arm_smmu_mm_invalidate_range(struct mmu_notifier *mn,
                                         struct mm_struct *mm,
                                         unsigned long start, unsigned long end)
@@ -201,9 +204,14 @@ static void arm_smmu_mm_invalidate_range(struct mmu_notifier *mn,
         */
        size = end - start;

-       if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_BTM))
-               arm_smmu_tlb_inv_range_asid(start, size, smmu_mn->cd->asid,
-                                           PAGE_SIZE, false, smmu_domain);
+       if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_BTM)) {
+               if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_RANGE_INV) &&
+                   size >= MAX_TLBI_OPS * PAGE_SIZE)
+                       arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid);
+               else
+                       arm_smmu_tlb_inv_range_asid(start, size, smmu_mn->cd->asid,
+                                                   PAGE_SIZE, false, smmu_domain);
+       }
        arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, start, size);
 }
--------------------------------------------------------------------

What do you think about it?

Thanks
Nic
Robin Murphy Sept. 1, 2023, 6:02 p.m. UTC | #10
On 2023-09-01 01:08, Nicolin Chen wrote:
> Hi Will/Robin,
> 
> On Thu, Aug 31, 2023 at 10:39:15AM -0700, Nicolin Chen wrote:
>   
>> Though I have not dig enough, I assume that this worst case could
>> happen to SVA too, since the IOTLB invalidation is from MMU code.
>> But the same worst case might not happen to non-SVA pathway, i.e.
>> TLBI ops for IO Page Table doesn't really benefit from this?
>>
>> With that being questioned, I got Robin's remarks that it wouldn't
>> be easy to decide the arbitrary number, so we could just take the
>> worst case from SVA pathway as the common threshold.
>>
>> Then, SVA uses the CPU page table, so perhaps we should highlight
>> that SMMU sets the threshold directly reusing the MAX_TLBI_OPS of
>> CPU page table rather than calculating from IO page table, though
>> both of them are in the same format?
> 
> Our test team encountered a soft lockup in this path today:
> --------------------------------------------------------------------
> watchdog: BUG: soft lockup - CPU#244 stuck for 26s!

That's a lot of TLBIs!

> pstate: 83400009 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
> pc : arm_smmu_cmdq_issue_cmdlist+0x178/0xa50
> lr : arm_smmu_cmdq_issue_cmdlist+0x150/0xa50
> sp : ffff8000d83ef290
> x29: ffff8000d83ef290 x28: 000000003b9aca00 x27: 0000000000000000
> x26: ffff8000d83ef3c0 x25: da86c0812194a0e8 x24: 0000000000000000
> x23: 0000000000000040 x22: ffff8000d83ef340 x21: ffff0000c63980c0
> x20: 0000000000000001 x19: ffff0000c6398080 x18: 0000000000000000
> x17: 0000000000000000 x16: 0000000000000000 x15: ffff3000b4a3bbb0
> x14: ffff3000b4a30888 x13: ffff3000b4a3cf60 x12: 0000000000000000
> x11: 0000000000000000 x10: 0000000000000000 x9 : ffffc08120e4d6bc
> x8 : 0000000000000000 x7 : 0000000000000000 x6 : 0000000000048cfa
> x5 : 0000000000000000 x4 : 0000000000000001 x3 : 000000000000000a
> x2 : 0000000080000000 x1 : 0000000000000000 x0 : 0000000000000001
> Call trace:
>   arm_smmu_cmdq_issue_cmdlist+0x178/0xa50
>   __arm_smmu_tlb_inv_range+0x118/0x254
>   arm_smmu_tlb_inv_range_asid+0x6c/0x130
>   arm_smmu_mm_invalidate_range+0xa0/0xa4
>   __mmu_notifier_invalidate_range_end+0x88/0x120
>   unmap_vmas+0x194/0x1e0
>   unmap_region+0xb4/0x144
>   do_mas_align_munmap+0x290/0x490
>   do_mas_munmap+0xbc/0x124
>   __vm_munmap+0xa8/0x19c
>   __arm64_sys_munmap+0x28/0x50
>   invoke_syscall+0x78/0x11c
>   el0_svc_common.constprop.0+0x58/0x1c0
>   do_el0_svc+0x34/0x60
>   el0_svc+0x2c/0xd4
>   el0t_64_sync_handler+0x114/0x140
>   el0t_64_sync+0x1a4/0x1a8
> --------------------------------------------------------------------
> 
> 
> I think it is the same problem that we fixed in tlbflush.h using
> MAX_TLBI_OPS. So, I plan to send a cleaner bug fix (cc stable):
> --------------------------------------------------------------------
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> index a5a63b1c947e..e3ea7d2a6308 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> @@ -186,6 +186,9 @@ static void arm_smmu_free_shared_cd(struct arm_smmu_ctx_desc *cd)
>          }
>   }
> 
> +/* Copied from arch/arm64/include/asm/tlbflush.h to avoid similar soft lockups */
> +#define MAX_TLBI_OPS   (1 << (PAGE_SHIFT - 3))
> +
>   static void arm_smmu_mm_invalidate_range(struct mmu_notifier *mn,
>                                           struct mm_struct *mm,
>                                           unsigned long start, unsigned long end)
> @@ -201,9 +204,14 @@ static void arm_smmu_mm_invalidate_range(struct mmu_notifier *mn,
>           */
>          size = end - start;
> 
> -       if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_BTM))
> -               arm_smmu_tlb_inv_range_asid(start, size, smmu_mn->cd->asid,
> -                                           PAGE_SIZE, false, smmu_domain);
> +       if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_BTM)) {
> +               if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_RANGE_INV) &&
> +                   size >= MAX_TLBI_OPS * PAGE_SIZE)
> +                       arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid);
> +               else
> +                       arm_smmu_tlb_inv_range_asid(start, size, smmu_mn->cd->asid,
> +                                                   PAGE_SIZE, false, smmu_domain);
> +       }
>          arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, start, size);
>   }
> --------------------------------------------------------------------
> 
> What do you think about it?

Looks reasonable to me - I think it's the right shape to foreshadow the 
bigger refactoring we discussed, and I can't object to using 
PAGE_{SIZE,SHIFT} for the calculation when it's specifically in the 
context of SVA.

Thanks,
Robin.
Nicolin Chen Sept. 1, 2023, 6:23 p.m. UTC | #11
On Fri, Sep 01, 2023 at 07:02:19PM +0100, Robin Murphy wrote:

> > Our test team encountered a soft lockup in this path today:
> > --------------------------------------------------------------------
> > watchdog: BUG: soft lockup - CPU#244 stuck for 26s!
> 
> That's a lot of TLBIs!

Well, given the use cases of NVIDIA Grace, I'd expect it to soon be
no surprise that we see more stressful cases :-/

> > I think it is the same problem that we fixed in tlbflush.h using
> > MAX_TLBI_OPS. So, I plan to send a cleaner bug fix (cc stable)

> > What do you think about it?
> 
> Looks reasonable to me - I think it's the right shape to foreshadow the
> bigger refactoring we discussed, 

Cool! Thanks for the quick ack. And I could move the MAX_TLBI_OPS
to the SMMU header so a later change in smmu.c can just use it.

> and I can't object to using
> PAGE_{SIZE,SHIFT} for the calculation when it's specifically in the
> context of SVA.

Yea, the current SVA code passes in PAGE_SIZE as the TLBI granule.

Thanks
Nic
Nicolin Chen Jan. 20, 2024, 7:59 p.m. UTC | #12
Hi Robin/Will,

On Tue, Aug 29, 2023 at 02:25:10PM -0700, Robin Murphy wrote:
> > Also, what we need is actually an arbitrary number for max_tlbi_ops.
> > And I think it could be irrelevant to the page size, i.e. either a
> > 4K pgsize or a 64K pgsize could use the same max_tlbi_ops number,
> > because what eventually impacts the latency is the number of loops
> > of building/issuing commands.
> 
> Although there is perhaps a counter-argument for selective invalidation,
> that if you're using 64K pages to improve TLB hit rates vs. 4K, then you
> have more to lose from overinvalidation, so maybe a single threshold
> isn't so appropriate for both.
> 
> Yes, ultimately it all comes down to picking an arbitrary number, but
> the challenge is that we want to pick a *good* one, and ideally have
> some reasoning behind it. As Will suggested, copying what the mm layer
> does gives us an easy line of reasoning, even if it's just in the form
> of passing the buck. And that's actually quite attractive, since
> otherwise we'd then have to get into the question of what really is the
> latency of building and issuing commands, since that clearly depends on
> how fast the CPU is, and how fast the SMMU is, and how busy the SMMU is,
> and how large the command queue is, and how many other CPUs are also
> contending for the command queue... and very quickly it becomes hard to
> believe that any simple constant can be good for all possible systems.

So, here we have another request to optimize this number further,
even though the merged arbitrary number copied from the MMU side
fixes the soft lockup. The iommu_unmap delay with a 500MB buffer is
not satisfactory on our testing chip, since the max_tlbi_ops
threshold now sits at 512MB for a 64K pgsize (8192 * 64KB).

As Robin remarked, this could really be a case-by-case situation.
So, I wonder if we can reconsider adding a configurable threshold
whose default value matches the current MMU-side setting.

If this is acceptable, what would be the preferable way to expose
the configuration: a per-SMMU or a per-IOMMU-group sysfs node? I am
open to any other option too.
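
For illustration, a per-SMMU knob could look something like the sketch
below (the attribute and the smmu->max_tlbi_ops field are made up here,
just to show the shape; the default would match the current MMU-side value):

static ssize_t max_tlbi_ops_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct arm_smmu_device *smmu = dev_get_drvdata(dev);

	return sysfs_emit(buf, "%lu\n", READ_ONCE(smmu->max_tlbi_ops));
}

static ssize_t max_tlbi_ops_store(struct device *dev,
				  struct device_attribute *attr,
				  const char *buf, size_t count)
{
	struct arm_smmu_device *smmu = dev_get_drvdata(dev);
	unsigned long val;

	if (kstrtoul(buf, 0, &val) || !val)
		return -EINVAL;

	WRITE_ONCE(smmu->max_tlbi_ops, val);
	return count;
}
static DEVICE_ATTR_RW(max_tlbi_ops);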

Also, this would be added to the arm_smmu_inv_range_too_big() in
Jason's patch here:
https://lore.kernel.org/linux-iommu/20240115153152.GA50608@ziepe.ca/

Thanks
Nicolin
Jason Gunthorpe Jan. 22, 2024, 1:01 p.m. UTC | #13
On Sat, Jan 20, 2024 at 11:59:45AM -0800, Nicolin Chen wrote:
> Hi Robin/Will,
> 
> On Tue, Aug 29, 2023 at 02:25:10PM -0700, Robin Murphy wrote:
> > > Also, what we need is actually an arbitrary number for max_tlbi_ops.
> > > And I think it could be irrelevant to the page size, i.e. either a
> > > 4K pgsize or a 64K pgsize could use the same max_tlbi_ops number,
> > > because what eventually impacts the latency is the number of loops
> > > of building/issuing commands.
> > 
> > Although there is perhaps a counter-argument for selective invalidation,
> > that if you're using 64K pages to improve TLB hit rates vs. 4K, then you
> > have more to lose from overinvalidation, so maybe a single threshold
> > isn't so appropriate for both.
> > 
> > Yes, ultimately it all comes down to picking an arbitrary number, but
> > the challenge is that we want to pick a *good* one, and ideally have
> > some reasoning behind it. As Will suggested, copying what the mm layer
> > does gives us an easy line of reasoning, even if it's just in the form
> > of passing the buck. And that's actually quite attractive, since
> > otherwise we'd then have to get into the question of what really is the
> > latency of building and issuing commands, since that clearly depends on
> > how fast the CPU is, and how fast the SMMU is, and how busy the SMMU is,
> > and how large the command queue is, and how many other CPUs are also
> > contending for the command queue... and very quickly it becomes hard to
> > believe that any simple constant can be good for all possible systems.
> 
> So, here we have another request to optimize this number further,
> though the merged arbitrary number copied from MMU side could fix
> the soft lockup. The iommu_unmap delay with a 500MB buffer is not
> quite satisfying on our testing chip, since the threshold now for
> max_tlbi_ops is at 512MB for 64K pgsize (8192 * 64KB).
> 
> As Robin remarked, this could be really a case-by-case situation.
> So, I wonder if we can rethink of adding a configurable threshold
> that has a default value at its current setup matching MMU side.
> 
> If this is acceptable, what can be the preferable way of having a
> configuration: a per-SMMU or a per-IOMMU-group sysfs node? I am
> open for any other option too.

Maybe it should be more dynamic and you get xx ms to push
invalidations otherwise it gives up and does invalidate all?

The busier the system the broader the invalidation?

Or do we need to measure at boot time invalidation performance and set
a threshold that way?
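
Roughly, for the dynamic idea (hand-wavy sketch, the helpers and the
budget are made up):

	ktime_t deadline = ktime_add_ms(ktime_get(), inv_budget_ms);

	while (iova < end) {
		if (ktime_after(ktime_get(), deadline)) {
			/* give up and push one full invalidation */
			invalidate_all();
			return;
		}
		queue_range_inv_cmd(iova, granule);
		iova += granule;
	}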

Also, it seems to me that SVA use cases and, say, DMA API cases are
somewhat different where we may be willing to wait longer for DMA API.

Jason
Nicolin Chen Jan. 22, 2024, 5:24 p.m. UTC | #14
Hi Jason,

Thank you for the ideas!

On Mon, Jan 22, 2024 at 09:01:52AM -0400, Jason Gunthorpe wrote:
> On Sat, Jan 20, 2024 at 11:59:45AM -0800, Nicolin Chen wrote:
> > Hi Robin/Will,
> > 
> > On Tue, Aug 29, 2023 at 02:25:10PM -0700, Robin Murphy wrote:
> > > > Also, what we need is actually an arbitrary number for max_tlbi_ops.
> > > > And I think it could be irrelevant to the page size, i.e. either a
> > > > 4K pgsize or a 64K pgsize could use the same max_tlbi_ops number,
> > > > because what eventually impacts the latency is the number of loops
> > > > of building/issuing commands.
> > > 
> > > Although there is perhaps a counter-argument for selective invalidation,
> > > that if you're using 64K pages to improve TLB hit rates vs. 4K, then you
> > > have more to lose from overinvalidation, so maybe a single threshold
> > > isn't so appropriate for both.
> > > 
> > > Yes, ultimately it all comes down to picking an arbitrary number, but
> > > the challenge is that we want to pick a *good* one, and ideally have
> > > some reasoning behind it. As Will suggested, copying what the mm layer
> > > does gives us an easy line of reasoning, even if it's just in the form
> > > of passing the buck. And that's actually quite attractive, since
> > > otherwise we'd then have to get into the question of what really is the
> > > latency of building and issuing commands, since that clearly depends on
> > > how fast the CPU is, and how fast the SMMU is, and how busy the SMMU is,
> > > and how large the command queue is, and how many other CPUs are also
> > > contending for the command queue... and very quickly it becomes hard to
> > > believe that any simple constant can be good for all possible systems.
> > 
> > So, here we have another request to optimize this number further,
> > though the merged arbitrary number copied from MMU side could fix
> > the soft lockup. The iommu_unmap delay with a 500MB buffer is not
> > quite satisfying on our testing chip, since the threshold now for
> > max_tlbi_ops is at 512MB for 64K pgsize (8192 * 64KB).
> > 
> > As Robin remarked, this could be really a case-by-case situation.
> > So, I wonder if we can rethink of adding a configurable threshold
> > that has a default value at its current setup matching MMU side.
> > 
> > If this is acceptable, what can be the preferable way of having a
> > configuration: a per-SMMU or a per-IOMMU-group sysfs node? I am
> > open for any other option too.
> 
> Maybe it should be more dynamic and you get xx ms to push
> invalidations otherwise it gives up and does invalidate all?
> 
> The busier the system the broader the invalidation?

Yea, I think this could be a good one.

> Or do we need to measure at boot time invalidation performance and set
> a threshold that way?

I see. We can run an invalidation at the default max_tlbi_ops to
measure its delay in msec or usec, and then set that as the "xx ms"
threshold in the first idea.

> Also, it seems to me that SVA use cases and, say, DMA API cases are
> somewhat different where we may be willing to wait longer for DMA API.

Hmm, the lockup that my patch fixed was for an SVA case that
doesn't seem to involve DMA API:
https://lore.kernel.org/linux-iommu/20230901203904.4073-1-nicolinc@nvidia.com/

And the other lockup fix for a non-SVA case from Zhang doesn't
seem to involve DMA API either:
https://lore.kernel.org/linux-iommu/e74ea905-d107-4202-97ca-c2c509e7aa1e@huawei.com/

Maybe we can treat the DMA API a bit differently. But I am not sure
about the justification for letting it wait longer. Mind
elaborating?

Thanks
Nicolin
Jason Gunthorpe Jan. 22, 2024, 5:57 p.m. UTC | #15
On Mon, Jan 22, 2024 at 09:24:08AM -0800, Nicolin Chen wrote:
> > Or do we need to measure at boot time invalidation performance and set
> > a threshold that way?
> 
> I see. We can run an invalidation at default max_tlbi_ops to
> get its delay in msec or usec, and then set as the threshold
> "xx ms" in the idea one.
> 
> > Also, it seems to me that SVA use cases and, say, DMA API cases are
> > somewhat different where we may be willing to wait longer for DMA API.
> 
> Hmm, the lockup that my patch fixed was for an SVA case that
> doesn't seem to involve DMA API:
> https://lore.kernel.org/linux-iommu/20230901203904.4073-1-nicolinc@nvidia.com/
> 
> And the other lockup fix for a non-SVA case from Zhang doesn't
> seem to involve DMA API either:
> https://lore.kernel.org/linux-iommu/e74ea905-d107-4202-97ca-c2c509e7aa1e@huawei.com/
> 
> Maybe we can treat DMA API a bit different. But I am not sure
> about the justification of leaving it to wait longer. Mind
> elaborating?

Well, there are two issues.. The first is the soft lockup, that should
just be reliably prevented. The timer, for instance, is a reasonable
stab at making that universally safe.

Then there is the issue of just raw invalidation performance, where
SVA in particular is linked to the mm, and the longer invalidation takes,
the slower the apps will be. We don't have any idea where future DMA
might hit the cache, so it is hard to know if all invalidation is not
the right thing..

DMA api is often lazy and the active DMA is a bit more predictable, so
perhaps there is more merit in spending more time to narrow the
invalidation.

The other case was vfio unmap for VM tear down, which ideally would
use whole ASID invalidation.

If your issue is softlockup, not performance, then that should be
prevented strongly. Broadly speaking if SVA is pushing too high an
invalidation workload then we need to aggressively trim it, and do so
dynamically. Certainly we should not have a tunable that has to be set
right to avoid soft lockup.

A tunable to improve performance, perhaps, but not to achieve basic
correctness.

Maybe it is really just a simple thing - compute how many invalidation
commands are needed, if they don't all fit in the current queue space,
then do an invalidate all instead?
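
i.e. something like (sketch only; all three helpers below are made up):

	unsigned long nr_cmds = size >> ilog2(granule);

	if (nr_cmds > cmdq_free_space(smmu))
		smmu_tlb_inv_all(smmu_domain);
	else
		smmu_tlb_inv_range(iova, size, granule, smmu_domain);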

Jason
Nicolin Chen Jan. 24, 2024, 12:11 a.m. UTC | #16
On Mon, Jan 22, 2024 at 01:57:00PM -0400, Jason Gunthorpe wrote:
> On Mon, Jan 22, 2024 at 09:24:08AM -0800, Nicolin Chen wrote:
> > > Or do we need to measure at boot time invalidation performance and set
> > > a threshold that way?
> > 
> > I see. We can run an invalidation at default max_tlbi_ops to
> > get its delay in msec or usec, and then set as the threshold
> > "xx ms" in the idea one.
> > 
> > > Also, it seems to me that SVA use cases and, say, DMA API cases are
> > > somewhat different where we may be willing to wait longer for DMA API.
> > 
> > Hmm, the lockup that my patch fixed was for an SVA case that
> > doesn't seem to involve DMA API:
> > https://lore.kernel.org/linux-iommu/20230901203904.4073-1-nicolinc@nvidia.com/
> > 
> > And the other lockup fix for a non-SVA case from Zhang doesn't
> > seem to involve DMA API either:
> > https://lore.kernel.org/linux-iommu/e74ea905-d107-4202-97ca-c2c509e7aa1e@huawei.com/
> > 
> > Maybe we can treat DMA API a bit different. But I am not sure
> > about the justification of leaving it to wait longer. Mind
> > elaborating?
> 
> Well, there are two issues.. The first is the soft lockup, that should
> just be reliably prevented. The timer, for instance, is a reasonable
> stab at making that universally safe.
> 
> Then there is the issue of just raw invalidation performance, where
> SVA particularly is linked to the mm and the longer invalidation takes
> the slower the apps will be. We don't have any idea where future DMA
> might hit the cache, so it is hard to know if all invalidation is not
> the right thing..
> 
> DMA api is often lazy and the active DMA is a bit more predictable, so
> perhaps there is more merit in spending more time to narrow the
> invalidation.
> 
> The other case was vfio unmap for VM tear down, which ideally would
> use whole ASID invalidation.

I see! Then we need a flag to pass in from __iommu_dma_unmap or so:
if the caller is in dma-iommu.c, do a longer per-page invalidation.

> If your issue is softlockup, not performance, then that should be

We have both issues.

> prevented strongly. Broadly speaking if SVA is pushing too high an
> invalidation workload then we need to agressively trim it, and do so
> dynamically. Certainly we should not have a tunable that has to be set
> right to avoid soft lockup.
> 
> A tunable to improve performance, perhaps, but not to achieve basic
> correctness.

So, should we make an optional tunable only for those who care
about performance? Though I think having a tunable would just
fix both issues.

> Maybe it is really just a simple thing - compute how many invalidation
> commands are needed, if they don't all fit in the current queue space,
> then do an invalidate all instead?

The queue could actually have a large space. But one large-size
invalidation would be divided into batches that have to execute
back-to-back. And the batch size is 64 commands in the 64-bit case,
which might be too small as a cap.

Thanks
Nicolin
Nicolin Chen Jan. 25, 2024, 5:09 a.m. UTC | #17
On Wed, Jan 24, 2024 at 02:20:14PM +0800, zhangzekun (A) wrote:
> Also, it seems to me that SVA use cases and, say, DMA API cases are
> somewhat different where we may be willing to wait longer for DMA API.
 
> Hmm, the lockup that my patch fixed was for an SVA case that
> doesn't seem to involve DMA API:
> https://lore.kernel.org/linux-iommu/20230901203904.4073-1-nicolinc@nvidia.com/
> 
> And the other lockup fix for a non-SVA case from Zhang doesn't
> seem to involve DMA API either:
> https://lore.kernel.org/linux-iommu/e74ea905-d107-4202-97ca-c2c509e7aa1e@huawei.com/
> 
> 
> Hi, Nicolin
> 
> These patches do involve the DMA API, because they modify arm_smmu_tlb_inv_walk() which will be called when unmapping a DMA address space.
> 
> iommu_dma_unmap_page
>    __iommu_dma_unmap
>        __iommu_unmap
>            arm_smmu_unmap_pages
>                arm_lpae_unmap_pages
>                    io_pgtable_tlb_flush_walk
>                        arm_smmu_tlb_inv_walk
> 

Oh, thanks for clarifying. I was just looking at your trace log
there, but overlooked other places.
Jason Gunthorpe Jan. 25, 2024, 1:55 p.m. UTC | #18
On Tue, Jan 23, 2024 at 04:11:09PM -0800, Nicolin Chen wrote:
> > prevented strongly. Broadly speaking if SVA is pushing too high an
> > invalidation workload then we need to agressively trim it, and do so
> > dynamically. Certainly we should not have a tunable that has to be set
> > right to avoid soft lockup.
> > 
> > A tunable to improve performance, perhaps, but not to achieve basic
> > correctness.
> 
> So, should we make an optional tunable only for those who care
> about performance? Though I think having a tunable would just
> fix both issues.

When the soft lockup issue is solved you can consider if a tunable is
still interesting..
 
> > Maybe it is really just a simple thing - compute how many invalidation
> > commands are needed, if they don't all fit in the current queue space,
> > then do an invalidate all instead?
> 
> The queue could actually have a large space. But one large-size
> invalidation would be divided into batches that have to execute
> back-to-back. And the batch size is 64 commands in 64-bit case,
> which might be too small as a cap.

Yes, some notable code reorganizing would be needed to implement
something like this

Broadly I'd sketch sort of:

 - Figure out how fast the HW can execute a lot of commands
 - The above should drive some XX maximum number of commands, maybe we
   need to measure at boot, IDK
 - Strongly time bound SVA invalidation:
   * No more than XX commands, if more needed then push invalidate
     all
   * All commands must fit in the available queue space, if more
     needed then push invalidate all
 - The total queue depth must not be larger than YY based on the
   retire rate so that even a full queue will complete invalidation
   below the target time.

A tunable indicating what the SVA time bound target should be might be
appropriate..

Jason
Nicolin Chen Jan. 25, 2024, 5:23 p.m. UTC | #19
On Thu, Jan 25, 2024 at 09:55:37AM -0400, Jason Gunthorpe wrote:
> On Tue, Jan 23, 2024 at 04:11:09PM -0800, Nicolin Chen wrote:
> > > prevented strongly. Broadly speaking if SVA is pushing too high an
> > > invalidation workload then we need to agressively trim it, and do so
> > > dynamically. Certainly we should not have a tunable that has to be set
> > > right to avoid soft lockup.
> > > 
> > > A tunable to improve performance, perhaps, but not to achieve basic
> > > correctness.
> > 
> > So, should we make an optional tunable only for those who care
> > about performance? Though I think having a tunable would just
> > fix both issues.
> 
> When the soft lockup issue is solved you can consider if a tunable is
> still interesting..

Yea, it would be on top of the soft lockup fix. I assume we are
still going with your change: arm_smmu_inv_range_too_big, though
I wonder if we should apply before your rework series to make it
a bug fix..

> > > Maybe it is really just a simple thing - compute how many invalidation
> > > commands are needed, if they don't all fit in the current queue space,
> > > then do an invalidate all instead?
> > 
> > The queue could actually have a large space. But one large-size
> > invalidation would be divided into batches that have to execute
> > back-to-back. And the batch size is 64 commands in 64-bit case,
> > which might be too small as a cap.
> 
> Yes, some notable code reorganizing would be needed to implement
> something like this
> 
> Broadly I'd sketch sort of:
> 
>  - Figure out how fast the HW can execute a lot of commands
>  - The above should drive some XX maximum number of commands, maybe we
>    need to measure at boot, IDK
>  - Strongly time bound SVA invalidation:
>    * No more than XX commands, if more needed then push invalidate
>      all
>    * All commands must fit in the available queue space, if more
>      needed then push invalidate all
>  - The total queue depth must not be larger than YY based on the
>    retire rate so that even a full queue will complete invalidation
>    below the target time.
> 
> A tunable indicating what the SVA time bound target should be might be
> appropriate..

Thanks for listing it out. I will draft something with that, and
should we just confine it to SVA, or to non-DMA callers in general?

Nicolin
Jason Gunthorpe Jan. 25, 2024, 5:47 p.m. UTC | #20
On Thu, Jan 25, 2024 at 09:23:00AM -0800, Nicolin Chen wrote:
> > When the soft lockup issue is solved you can consider if a tunable is
> > still interesting..
> 
> Yea, it would be on top of the soft lockup fix. I assume we are
> still going with your change: arm_smmu_inv_range_too_big, though
> I wonder if we should apply before your rework series to make it
> a bug fix..

It depends what change you settle on..

> > > > Maybe it is really just a simple thing - compute how many invalidation
> > > > commands are needed, if they don't all fit in the current queue space,
> > > > then do an invalidate all instead?
> > > 
> > > The queue could actually have a large space. But one large-size
> > > invalidation would be divided into batches that have to execute
> > > back-to-back. And the batch size is 64 commands in 64-bit case,
> > > which might be too small as a cap.
> > 
> > Yes, some notable code reorganizing would be needed to implement
> > something like this
> > 
> > Broadly I'd sketch sort of:
> > 
> >  - Figure out how fast the HW can execute a lot of commands
> >  - The above should drive some XX maximum number of commands, maybe we
> >    need to measure at boot, IDK
> >  - Strongly time bound SVA invalidation:
> >    * No more than XX commands, if more needed then push invalidate
> >      all
> >    * All commands must fit in the available queue space, if more
> >      needed then push invalidate all
> >  - The total queue depth must not be larger than YY based on the
> >    retire rate so that even a full queue will complete invalidation
> >    below the target time.
> > 
> > A tunable indicating what the SVA time bound target should be might be
> > appropriate..
> 
> Thanks for listing it out. I will draft something with that, and
> should we just confine it to SVA or non DMA callers in general?

Also, how much of this SVA issue is multithreaded? Will multiple
command queues improve anything?

Jason
Nicolin Chen Jan. 25, 2024, 7:55 p.m. UTC | #21
On Thu, Jan 25, 2024 at 01:47:28PM -0400, Jason Gunthorpe wrote:
> On Thu, Jan 25, 2024 at 09:23:00AM -0800, Nicolin Chen wrote:
> > > When the soft lockup issue is solved you can consider if a tunable is
> > > still interesting..
> > 
> > Yea, it would be on top of the soft lockup fix. I assume we are
> > still going with your change: arm_smmu_inv_range_too_big, though
> > I wonder if we should apply before your rework series to make it
> > a bug fix..
> 
> It depends what change you settle on..

I mean your arm_smmu_inv_range_too_big patch. Should it be a bug
fix CCing the stable tree? My previous SVA fix was, by the way.

> > > > > Maybe it is really just a simple thing - compute how many invalidation
> > > > > commands are needed, if they don't all fit in the current queue space,
> > > > > then do an invalidate all instead?
> > > > 
> > > > The queue could actually have a large space. But one large-size
> > > > invalidation would be divided into batches that have to execute
> > > > back-to-back. And the batch size is 64 commands in 64-bit case,
> > > > which might be too small as a cap.
> > > 
> > > Yes, some notable code reorganizing would be needed to implement
> > > something like this
> > > 
> > > Broadly I'd sketch sort of:
> > > 
> > >  - Figure out how fast the HW can execute a lot of commands
> > >  - The above should drive some XX maximum number of commands, maybe we
> > >    need to measure at boot, IDK
> > >  - Strongly time bound SVA invalidation:
> > >    * No more than XX commands, if more needed then push invalidate
> > >      all
> > >    * All commands must fit in the available queue space, if more
> > >      needed then push invalidate all
> > >  - The total queue depth must not be larger than YY based on the
> > >    retire rate so that even a full queue will complete invalidation
> > >    below the target time.
> > > 
> > > A tunable indicating what the SVA time bound target should be might be
> > > appropriate..
> > 
> > Thanks for listing it out. I will draft something with that, and
> > should we just confine it to SVA or non DMA callers in general?
> 
> Also, how much of this SVA issue is multithreaded? Will multiple
> command queues improve anything?

From our measurements, the bottleneck is mostly the SMMU consuming
the commands with a single CMDQ in HW, so multithreading is unlikely
to help. And VCMDQ only provides a multi-queue interface/wrapper for
VM isolation.

Thanks
Nic

Patch

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 72dcdd468cf3..7583d9ecca2b 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -891,6 +891,7 @@  arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
 
 	/* TTBR */
 	cfg->arm_lpae_s1_cfg.ttbr = virt_to_phys(data->pgd);
+	cfg->nents_per_pgtable = 1 << data->bits_per_level;
 	return &data->iop;
 
 out_free_data:
@@ -993,6 +994,7 @@  arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
 
 	/* VTTBR */
 	cfg->arm_lpae_s2_cfg.vttbr = virt_to_phys(data->pgd);
+	cfg->nents_per_pgtable = 1 << data->bits_per_level;
 	return &data->iop;
 
 out_free_data:
@@ -1071,6 +1073,7 @@  arm_mali_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
 					  ARM_MALI_LPAE_TTBR_ADRMODE_TABLE;
 	if (cfg->coherent_walk)
 		cfg->arm_mali_lpae_cfg.transtab |= ARM_MALI_LPAE_TTBR_SHARE_OUTER;
+	cfg->nents_per_pgtable = 1 << data->bits_per_level;
 
 	return &data->iop;
 
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 1b7a44b35616..4b55a327abc1 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -55,6 +55,7 @@  struct iommu_flush_ops {
  *                 tables.
  * @ias:           Input address (iova) size, in bits.
  * @oas:           Output address (paddr) size, in bits.
+ * @nents_per_pgtable: Number of entries per page table.
  * @coherent_walk  A flag to indicate whether or not page table walks made
  *                 by the IOMMU are coherent with the CPU caches.
  * @tlb:           TLB management callbacks for this set of tables.
@@ -96,6 +97,7 @@  struct io_pgtable_cfg {
 	unsigned long			pgsize_bitmap;
 	unsigned int			ias;
 	unsigned int			oas;
+	unsigned int			nents_per_pgtable;
 	bool				coherent_walk;
 	const struct iommu_flush_ops	*tlb;
 	struct device			*iommu_dev;