diff mbox series

[RESEND,RFC,v1,2/5] arm64: Add BBM Level 2 cpu feature

Message ID 20241211160218.41404-3-miko.lenczewski@arm.com (mailing list archive)
State New, archived
Headers show
Series Initial BBML2 support for contpte_convert() | expand

Commit Message

Mikołaj Lenczewski Dec. 11, 2024, 4:01 p.m. UTC
The Break-Before-Make cpu feature supports multiple levels (levels 0-2),
and this commit adds a dedicated BBML2 cpufeature to test against
support for.

In supporting BBM level 2, we open ourselves up to potential TLB
Conflict Abort Exceptions during expected execution, instead of only
in exceptional circumstances. In the case of an abort, it is
implementation defined at what stage the abort is generated, and
the minimal set of required invalidations is also implementation
defined. The maximal set of invalidations is to do a `tlbi vmalle1`
or `tlbi vmalls12e1`, depending on the stage.

Such aborts should not occur on Arm hardware, and were not seen in
benchmarked systems, so unless performance concerns arise, implementing
the abort handlers with the worst-case invalidations seems like an
alright hack.

Signed-off-by: Mikołaj Lenczewski <miko.lenczewski@arm.com>
---
 arch/arm64/include/asm/cpufeature.h | 14 ++++++++++++++
 arch/arm64/kernel/cpufeature.c      |  7 +++++++
 arch/arm64/mm/fault.c               | 27 ++++++++++++++++++++++++++-
 arch/arm64/tools/cpucaps            |  1 +
 4 files changed, 48 insertions(+), 1 deletion(-)

Comments

Marc Zyngier Dec. 12, 2024, 8:25 a.m. UTC | #1
Ah, so this is where this is hiding. I missed it in my review of patch
#1 yesterday.

On Wed, 11 Dec 2024 16:01:38 +0000,
Mikołaj Lenczewski <miko.lenczewski@arm.com> wrote:
> 
> The Break-Before-Make cpu feature supports multiple levels (levels 0-2),
> and this commit adds a dedicated BBML2 cpufeature to test against
> support for.
> 
> In supporting BBM level 2, we open ourselves up to potential TLB
> Conflict Abort Exceptions during expected execution, instead of only
> in exceptional circumstances. In the case of an abort, it is
> implementation defined at what stage the abort is generated, and

*IF* stage-2 is enabled. Also, in the case of the EL2&0 translation
regime, no stage-2 applies, so it can only be a stage-1 abort.

> the minimal set of required invalidations is also implementation
> defined. The maximal set of invalidations is to do a `tlbi vmalle1`
> or `tlbi vmalls12e1`, depending on the stage.
> 
> Such aborts should not occur on Arm hardware, and were not seen in
> benchmarked systems, so unless performance concerns arise, implementing

Which systems? Given that you have deny-listed *all* half recent ARM
Ltd implementations, I'm a bit puzzled.

> the abort handlers with the worst-case invalidations seems like an
> alright hack.
> 
> Signed-off-by: Mikołaj Lenczewski <miko.lenczewski@arm.com>
> ---
>  arch/arm64/include/asm/cpufeature.h | 14 ++++++++++++++
>  arch/arm64/kernel/cpufeature.c      |  7 +++++++
>  arch/arm64/mm/fault.c               | 27 ++++++++++++++++++++++++++-
>  arch/arm64/tools/cpucaps            |  1 +
>  4 files changed, 48 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
> index 8b4e5a3cd24c..a9f2ac335392 100644
> --- a/arch/arm64/include/asm/cpufeature.h
> +++ b/arch/arm64/include/asm/cpufeature.h
> @@ -866,6 +866,20 @@ static __always_inline bool system_supports_mpam_hcr(void)
>  	return alternative_has_cap_unlikely(ARM64_MPAM_HCR);
>  }
>  
> +static inline bool system_supports_bbml2(void)
> +{
> +	/* currently, BBM is only relied on by code touching the userspace page
> +	 * tables, and as such we are guaranteed that caps have been finalised.
> +	 *
> +	 * if later we want to use BBM for kernel mappings, particularly early
> +	 * in the kernel, this may return 0 even if BBML2 is actually supported,
> +	 * which means unnecessary break-before-make sequences, but is still
> +	 * correct

Comment style, capitalisation, punctuation.

> +	 */
> +
> +	return alternative_has_cap_unlikely(ARM64_HAS_BBML2);
> +}
> +
>  int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
>  bool try_emulate_mrs(struct pt_regs *regs, u32 isn);
>  
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index 6ce71f444ed8..7cc94bd5da24 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -2917,6 +2917,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
>  		.matches = has_cpuid_feature,
>  		ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, EVT, IMP)
>  	},
> +	{
> +		.desc = "BBM Level 2 Support",
> +		.capability = ARM64_HAS_BBML2,
> +		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
> +		.matches = has_cpuid_feature,
> +		ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, BBM, 2)
> +	},
>  	{
>  		.desc = "52-bit Virtual Addressing for KVM (LPA2)",
>  		.capability = ARM64_HAS_LPA2,
> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index ef63651099a9..dc119358cbc1 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -844,6 +844,31 @@ static int do_tag_check_fault(unsigned long far, unsigned long esr,
>  	return 0;
>  }
>  
> +static int do_conflict_abort(unsigned long far, unsigned long esr,
> +			     struct pt_regs *regs)
> +{
> +	if (!system_supports_bbml2())
> +		return do_bad(far, esr, regs);
> +
> +	/* if we receive a TLB conflict abort, we know that there are multiple
> +	 * TLB entries that translate the same address range. the minimum set
> +	 * of invalidations to clear these entries is implementation defined.
> +	 * the maximum set is defined as either tlbi(vmalls12e1) or tlbi(alle1).
> +	 *
> +	 * if el2 is enabled and stage 2 translation enabled, this may be
> +	 * raised as a stage 2 abort. if el2 is enabled but stage 2 translation
> +	 * disabled, or if el2 is disabled, it will be raised as a stage 1
> +	 * abort.
> +	 *
> +	 * local_flush_tlb_all() does a tlbi(vmalle1), which is enough to
> +	 * handle a stage 1 abort.

Same comment about comments.

> +	 */
> +
> +	local_flush_tlb_all();

The elephant in the room: if TLBs are in such a sorry state, what
guarantees we can make it this far?

I honestly don't think you can reliably handle a TLB Conflict abort in
the same translation regime as the original fault, given that we don't
know the scope of that fault. You are probably making an educated
guess that it is good enough on the CPUs you know of, but I don't see
anything in the architecture that indicates the "blast radius" of a
TLB conflict.

Which makes me think that your KVM patch is equally broken on nVHE and
hVHE. Such fault should probably be handled while at EL2, not after
returning to EL1.

Thanks,

	M.
Ryan Roberts Dec. 12, 2024, 10:55 a.m. UTC | #2
On 12/12/2024 08:25, Marc Zyngier wrote:
> Ah, so this is where this is hiding. I missed it in my review of patch
> #1 yesterday.
> 
> On Wed, 11 Dec 2024 16:01:38 +0000,
> Mikołaj Lenczewski <miko.lenczewski@arm.com> wrote:
>>
>> The Break-Before-Make cpu feature supports multiple levels (levels 0-2),
>> and this commit adds a dedicated BBML2 cpufeature to test against
>> support for.
>>
>> In supporting BBM level 2, we open ourselves up to potential TLB
>> Conflict Abort Exceptions during expected execution, instead of only
>> in exceptional circumstances. In the case of an abort, it is
>> implementation defined at what stage the abort is generated, and
> 
> *IF* stage-2 is enabled. Also, in the case of the EL2&0 translation
> regime, no stage-2 applies, so it can only be a stage-1 abort.
> 
>> the minimal set of required invalidations is also implementation
>> defined. The maximal set of invalidations is to do a `tlbi vmalle1`
>> or `tlbi vmalls12e1`, depending on the stage.
>>
>> Such aborts should not occur on Arm hardware, and were not seen in
>> benchmarked systems, so unless performance concerns arise, implementing
> 
> Which systems? Given that you have deny-listed *all* half recent ARM
> Ltd implementations, I'm a bit puzzled.
> 
>> the abort handlers with the worst-case invalidations seems like an
>> alright hack.
>>
>> Signed-off-by: Mikołaj Lenczewski <miko.lenczewski@arm.com>
>> ---
>>  arch/arm64/include/asm/cpufeature.h | 14 ++++++++++++++
>>  arch/arm64/kernel/cpufeature.c      |  7 +++++++
>>  arch/arm64/mm/fault.c               | 27 ++++++++++++++++++++++++++-
>>  arch/arm64/tools/cpucaps            |  1 +
>>  4 files changed, 48 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
>> index 8b4e5a3cd24c..a9f2ac335392 100644
>> --- a/arch/arm64/include/asm/cpufeature.h
>> +++ b/arch/arm64/include/asm/cpufeature.h
>> @@ -866,6 +866,20 @@ static __always_inline bool system_supports_mpam_hcr(void)
>>  	return alternative_has_cap_unlikely(ARM64_MPAM_HCR);
>>  }
>>  
>> +static inline bool system_supports_bbml2(void)
>> +{
>> +	/* currently, BBM is only relied on by code touching the userspace page
>> +	 * tables, and as such we are guaranteed that caps have been finalised.
>> +	 *
>> +	 * if later we want to use BBM for kernel mappings, particularly early
>> +	 * in the kernel, this may return 0 even if BBML2 is actually supported,
>> +	 * which means unnecessary break-before-make sequences, but is still
>> +	 * correct
> 
> Comment style, capitalisation, punctuation.
> 
>> +	 */
>> +
>> +	return alternative_has_cap_unlikely(ARM64_HAS_BBML2);
>> +}
>> +
>>  int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
>>  bool try_emulate_mrs(struct pt_regs *regs, u32 isn);
>>  
>> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
>> index 6ce71f444ed8..7cc94bd5da24 100644
>> --- a/arch/arm64/kernel/cpufeature.c
>> +++ b/arch/arm64/kernel/cpufeature.c
>> @@ -2917,6 +2917,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
>>  		.matches = has_cpuid_feature,
>>  		ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, EVT, IMP)
>>  	},
>> +	{
>> +		.desc = "BBM Level 2 Support",
>> +		.capability = ARM64_HAS_BBML2,
>> +		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
>> +		.matches = has_cpuid_feature,
>> +		ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, BBM, 2)
>> +	},
>>  	{
>>  		.desc = "52-bit Virtual Addressing for KVM (LPA2)",
>>  		.capability = ARM64_HAS_LPA2,
>> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
>> index ef63651099a9..dc119358cbc1 100644
>> --- a/arch/arm64/mm/fault.c
>> +++ b/arch/arm64/mm/fault.c
>> @@ -844,6 +844,31 @@ static int do_tag_check_fault(unsigned long far, unsigned long esr,
>>  	return 0;
>>  }
>>  
>> +static int do_conflict_abort(unsigned long far, unsigned long esr,
>> +			     struct pt_regs *regs)
>> +{
>> +	if (!system_supports_bbml2())
>> +		return do_bad(far, esr, regs);
>> +
>> +	/* if we receive a TLB conflict abort, we know that there are multiple
>> +	 * TLB entries that translate the same address range. the minimum set
>> +	 * of invalidations to clear these entries is implementation defined.
>> +	 * the maximum set is defined as either tlbi(vmalls12e1) or tlbi(alle1).
>> +	 *
>> +	 * if el2 is enabled and stage 2 translation enabled, this may be
>> +	 * raised as a stage 2 abort. if el2 is enabled but stage 2 translation
>> +	 * disabled, or if el2 is disabled, it will be raised as a stage 1
>> +	 * abort.
>> +	 *
>> +	 * local_flush_tlb_all() does a tlbi(vmalle1), which is enough to
>> +	 * handle a stage 1 abort.
> 
> Same comment about comments.
> 
>> +	 */
>> +
>> +	local_flush_tlb_all();
> 
> The elephant in the room: if TLBs are in such a sorry state, what
> guarantees we can make it this far?

I'll leave Miko to respond to your other comments, but I wanted to address this
one, since it's pretty fundamental. We went around this loop internally and
concluded that what we are doing is architecturally sound.

The expectation is that a conflict abort can only be generated as a result of
the change in patch 4 (and patch 5). That change makes it possible for the TLB
to end up with a multihit. But crucially that can only happen for user space
memory because that change only operates on user memory. And while the TLB may
detect the conflict at any time, the conflict abort is only permitted to be
reported when an architectural access is prevented by the conflict. So we never
do anything that would allow a conflict for a kernel memory access and a user
memory conflict abort can never be triggered as a result of accessing kernel memory.

Copy/pasting comment from AlexC on the topic, which explains it better than I can:

"""
The intent is certainly that in cases where the hardware detects a TLB conflict
abort, it is only permitted to report it (by generating an exception) if it
applies to an access that is being attempted architecturally. ... that property
can be built from the following two properties:

1. The TLB conflict can only be reported as an Instruction Abort or a Data Abort

2. Those two exception types must be reported synchronously and precisely.
"""

> 
> I honestly don't think you can reliably handle a TLB Conflict abort in
> the same translation regime as the original fault, given that we don't
> know the scope of that fault. You are probably making an educated
> guess that it is good enough on the CPUs you know of, but I don't see
> anything in the architecture that indicates the "blast radius" of a
> TLB conflict.

OK, so I'm claiming that the blast radius is limited to the region of memory
that we are operating on in contpte_collapse() in patch 4. Perhaps we need to go
re-read the ARM and come back with the specific statements that led us to that
conclusion?

Thanks,
Ryan

> 
> Which makes me think that your KVM patch is equally broken on nVHE and
> hVHE. Such fault should probably be handled while at EL2, not after
> returning to EL1.
> 
> Thanks,
> 
> 	M.
>
Marc Zyngier Dec. 12, 2024, 2:26 p.m. UTC | #3
On Thu, 12 Dec 2024 10:55:45 +0000,
Ryan Roberts <ryan.roberts@arm.com> wrote:
> 
> On 12/12/2024 08:25, Marc Zyngier wrote:
> >> +
> >> +	local_flush_tlb_all();
> > 
> > The elephant in the room: if TLBs are in such a sorry state, what
> > guarantees we can make it this far?
> 
> I'll leave Miko to respond to your other comments, but I wanted to address this
> one, since it's pretty fundamental. We went around this loop internally and
> concluded that what we are doing is architecturally sound.
> 
> The expectation is that a conflict abort can only be generated as a result of
> the change in patch 4 (and patch 5). That change makes it possible for the TLB
> to end up with a multihit. But crucially that can only happen for user space
> memory because that change only operates on user memory. And while the TLB may
> detect the conflict at any time, the conflict abort is only permitted to be
> reported when an architectural access is prevented by the conflict. So we never
> do anything that would allow a conflict for a kernel memory access and a user
> memory conflict abort can never be triggered as a result of accessing kernel memory.
> 
> Copy/pasting comment from AlexC on the topic, which explains it better than I can:
> 
> """
> The intent is certainly that in cases where the hardware detects a TLB conflict
> abort, it is only permitted to report it (by generating an exception) if it
> applies to an access that is being attempted architecturally. ... that property
> can be built from the following two properties:
> 
> 1. The TLB conflict can only be reported as an Instruction Abort or a Data Abort
> 
> 2. Those two exception types must be reported synchronously and precisely.
> """

I totally agree with this. The issue is that nothing says that the
abort is in any way related to userspace.

> > 
> > I honestly don't think you can reliably handle a TLB Conflict abort in
> > the same translation regime as the original fault, given that we don't
> > know the scope of that fault. You are probably making an educated
> > guess that it is good enough on the CPUs you know of, but I don't see
> > anything in the architecture that indicates the "blast radius" of a
> > TLB conflict.
> 
> OK, so I'm claiming that the blast radius is limited to the region of memory
> that we are operating on in contpte_collapse() in patch 4. Perhaps we need to go
> re-read the ARM and come back with the specific statements that led us to that
> conclusion?

But we don't know for sure what caused this conflict by the time we
arrive in the handler. It could equally be because we have a glaring
bug somewhere on the kernel side, even if you are *now* only concerned
with userspace.

If anything, this should absolutely check for FAR_EL1 and assert that
this is indeed caused by such change.

Thanks,

	M.
Ryan Roberts Dec. 12, 2024, 3:05 p.m. UTC | #4
On 12/12/2024 14:26, Marc Zyngier wrote:
> On Thu, 12 Dec 2024 10:55:45 +0000,
> Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 12/12/2024 08:25, Marc Zyngier wrote:
>>>> +
>>>> +	local_flush_tlb_all();
>>>
>>> The elephant in the room: if TLBs are in such a sorry state, what
>>> guarantees we can make it this far?
>>
>> I'll leave Miko to respond to your other comments, but I wanted to address this
>> one, since it's pretty fundamental. We went around this loop internally and
>> concluded that what we are doing is architecturally sound.
>>
>> The expectation is that a conflict abort can only be generated as a result of
>> the change in patch 4 (and patch 5). That change makes it possible for the TLB
>> to end up with a multihit. But crucially that can only happen for user space
>> memory because that change only operates on user memory. And while the TLB may
>> detect the conflict at any time, the conflict abort is only permitted to be
>> reported when an architectural access is prevented by the conflict. So we never
>> do anything that would allow a conflict for a kernel memory access and a user
>> memory conflict abort can never be triggered as a result of accessing kernel memory.
>>
>> Copy/pasting comment from AlexC on the topic, which explains it better than I can:
>>
>> """
>> The intent is certainly that in cases where the hardware detects a TLB conflict
>> abort, it is only permitted to report it (by generating an exception) if it
>> applies to an access that is being attempted architecturally. ... that property
>> can be built from the following two properties:
>>
>> 1. The TLB conflict can only be reported as an Instruction Abort or a Data Abort
>>
>> 2. Those two exception types must be reported synchronously and precisely.
>> """
> 
> I totally agree with this. The issue is that nothing says that the
> abort is in any way related to userspace.
> 
>>>
>>> I honestly don't think you can reliably handle a TLB Conflict abort in
>>> the same translation regime as the original fault, given that we don't
>>> know the scope of that fault. You are probably making an educated
>>> guess that it is good enough on the CPUs you know of, but I don't see
>>> anything in the architecture that indicates the "blast radius" of a
>>> TLB conflict.
>>
>> OK, so I'm claiming that the blast radius is limited to the region of memory
>> that we are operating on in contpte_collapse() in patch 4. Perhaps we need to go
>> re-read the ARM and come back with the specific statements that led us to that
>> conclusion?

From the ARM:
"""
RFCPSG: If level 1 or level 2 is supported and the Contiguous bit in a set of
Block descriptors or Page descriptors is changed, then a TLB conflict abort can
be generated because multiple translation table entries might exist within a TLB
that translates the same IA.
"""

Although I guess it's not totally explicit, I've interpretted that as saying
that conflicting TLB entries can only arise for the IA range for which the
contiguous bits have been modified in the translation tables.

Given we are only fiddling with the contiguous bits for user space mappings in
this way, that's why I'm asserting we will only get a conflict abort for user
space mappings... assuming the absence of kernel bugs, anyway...

> 
> But we don't know for sure what caused this conflict by the time we
> arrive in the handler. It could equally be because we have a glaring
> bug somewhere on the kernel side, even if you are *now* only concerned
> with userspace.

OK I see what you are saying; previously a conflict abort would have led to
calling do_bad(), which returns 1, which causes do_mem_abort() to either kill
the kernel or the process depending on the origin of the abort. (although if it
came from kernel due to bug, we're just hoping that the conflict doesn't affect
the path through the handler). With this change, we always assume we can fix it
with the TLBI.

How about this change to ensure we still die for issues originating from the kernel?

if (!user_mode(regs) || !system_supports_bbml2())
		return do_bad(far, esr, regs);

> 
> If anything, this should absolutely check for FAR_EL1 and assert that
> this is indeed caused by such change.

I'm not really sure how we would check this reliably? Without patch 5, the
problem is somewhat constrained; we could have as many changes in flight as
there are CPUs so we could keep a list of all the {mm_struct, VA-range} that are
being modified. But if patch 5 is confirmed to be architecturally sound, then
there is no "terminating tlbi" so there is no bound on the set of {mm_struct,
VA-range}'s that could legitimately cause a conflict abort.

Thanks,
Ryan

> 
> Thanks,
> 
> 	M.
>
Marc Zyngier Dec. 12, 2024, 3:48 p.m. UTC | #5
On Thu, 12 Dec 2024 15:05:24 +0000,
Ryan Roberts <ryan.roberts@arm.com> wrote:
> 
> On 12/12/2024 14:26, Marc Zyngier wrote:
> > On Thu, 12 Dec 2024 10:55:45 +0000,
> > Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>
> >> On 12/12/2024 08:25, Marc Zyngier wrote:
> >>>> +
> >>>> +	local_flush_tlb_all();
> >>>
> >>> The elephant in the room: if TLBs are in such a sorry state, what
> >>> guarantees we can make it this far?
> >>
> >> I'll leave Miko to respond to your other comments, but I wanted to address this
> >> one, since it's pretty fundamental. We went around this loop internally and
> >> concluded that what we are doing is architecturally sound.
> >>
> >> The expectation is that a conflict abort can only be generated as a result of
> >> the change in patch 4 (and patch 5). That change makes it possible for the TLB
> >> to end up with a multihit. But crucially that can only happen for user space
> >> memory because that change only operates on user memory. And while the TLB may
> >> detect the conflict at any time, the conflict abort is only permitted to be
> >> reported when an architectural access is prevented by the conflict. So we never
> >> do anything that would allow a conflict for a kernel memory access and a user
> >> memory conflict abort can never be triggered as a result of accessing kernel memory.
> >>
> >> Copy/pasting comment from AlexC on the topic, which explains it better than I can:
> >>
> >> """
> >> The intent is certainly that in cases where the hardware detects a TLB conflict
> >> abort, it is only permitted to report it (by generating an exception) if it
> >> applies to an access that is being attempted architecturally. ... that property
> >> can be built from the following two properties:
> >>
> >> 1. The TLB conflict can only be reported as an Instruction Abort or a Data Abort
> >>
> >> 2. Those two exception types must be reported synchronously and precisely.
> >> """
> > 
> > I totally agree with this. The issue is that nothing says that the
> > abort is in any way related to userspace.
> > 
> >>>
> >>> I honestly don't think you can reliably handle a TLB Conflict abort in
> >>> the same translation regime as the original fault, given that we don't
> >>> know the scope of that fault. You are probably making an educated
> >>> guess that it is good enough on the CPUs you know of, but I don't see
> >>> anything in the architecture that indicates the "blast radius" of a
> >>> TLB conflict.
> >>
> >> OK, so I'm claiming that the blast radius is limited to the region of memory
> >> that we are operating on in contpte_collapse() in patch 4. Perhaps we need to go
> >> re-read the ARM and come back with the specific statements that led us to that
> >> conclusion?
> 
> From the ARM:
> """
> RFCPSG: If level 1 or level 2 is supported and the Contiguous bit in a set of
> Block descriptors or Page descriptors is changed, then a TLB conflict abort can
> be generated because multiple translation table entries might exist within a TLB
> that translates the same IA.
> """
> 
> Although I guess it's not totally explicit, I've interpretted that as saying
> that conflicting TLB entries can only arise for the IA range for which the
> contiguous bits have been modified in the translation tables.

Right, that's reassuring, thanks for digging that one.

> Given we are only fiddling with the contiguous bits for user space mappings in
> this way, that's why I'm asserting we will only get a conflict abort for user
> space mappings... assuming the absence of kernel bugs, anyway...

For now. But if you dare scanning the list, you'll find a lot of
people willing to do far more than just that. Including changing the
shape of the linear map.

>
> > 
> > But we don't know for sure what caused this conflict by the time we
> > arrive in the handler. It could equally be because we have a glaring
> > bug somewhere on the kernel side, even if you are *now* only concerned
> > with userspace.
> 
> OK I see what you are saying; previously a conflict abort would have led to
> calling do_bad(), which returns 1, which causes do_mem_abort() to either kill
> the kernel or the process depending on the origin of the abort. (although if it
> came from kernel due to bug, we're just hoping that the conflict doesn't affect
> the path through the handler). With this change, we always assume we can fix it
> with the TLBI.
> 
> How about this change to ensure we still die for issues originating from the kernel?
> 
> if (!user_mode(regs) || !system_supports_bbml2())
> 		return do_bad(far, esr, regs);

That wouldn't catch a TLB conflict on get_user(), would it?

> > If anything, this should absolutely check for FAR_EL1 and assert that
> > this is indeed caused by such change.
> 
> I'm not really sure how we would check this reliably? Without patch 5, the
> problem is somewhat constrained; we could have as many changes in flight as
> there are CPUs so we could keep a list of all the {mm_struct, VA-range} that are
> being modified. But if patch 5 is confirmed to be architecturally sound, then
> there is no "terminating tlbi" so there is no bound on the set of {mm_struct,
> VA-range}'s that could legitimately cause a conflict abort.

I didn't mean to imply that we should identify the exact cause of the
abort. I was hoping to simply check that FAR_EL1 reports a userspace
VA. Why wouldn't that work?

	M.
Ryan Roberts Dec. 12, 2024, 4:03 p.m. UTC | #6
On 12/12/2024 15:48, Marc Zyngier wrote:
> On Thu, 12 Dec 2024 15:05:24 +0000,
> Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 12/12/2024 14:26, Marc Zyngier wrote:
>>> On Thu, 12 Dec 2024 10:55:45 +0000,
>>> Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>
>>>> On 12/12/2024 08:25, Marc Zyngier wrote:
>>>>>> +
>>>>>> +	local_flush_tlb_all();
>>>>>
>>>>> The elephant in the room: if TLBs are in such a sorry state, what
>>>>> guarantees we can make it this far?
>>>>
>>>> I'll leave Miko to respond to your other comments, but I wanted to address this
>>>> one, since it's pretty fundamental. We went around this loop internally and
>>>> concluded that what we are doing is architecturally sound.
>>>>
>>>> The expectation is that a conflict abort can only be generated as a result of
>>>> the change in patch 4 (and patch 5). That change makes it possible for the TLB
>>>> to end up with a multihit. But crucially that can only happen for user space
>>>> memory because that change only operates on user memory. And while the TLB may
>>>> detect the conflict at any time, the conflict abort is only permitted to be
>>>> reported when an architectural access is prevented by the conflict. So we never
>>>> do anything that would allow a conflict for a kernel memory access and a user
>>>> memory conflict abort can never be triggered as a result of accessing kernel memory.
>>>>
>>>> Copy/pasting comment from AlexC on the topic, which explains it better than I can:
>>>>
>>>> """
>>>> The intent is certainly that in cases where the hardware detects a TLB conflict
>>>> abort, it is only permitted to report it (by generating an exception) if it
>>>> applies to an access that is being attempted architecturally. ... that property
>>>> can be built from the following two properties:
>>>>
>>>> 1. The TLB conflict can only be reported as an Instruction Abort or a Data Abort
>>>>
>>>> 2. Those two exception types must be reported synchronously and precisely.
>>>> """
>>>
>>> I totally agree with this. The issue is that nothing says that the
>>> abort is in any way related to userspace.
>>>
>>>>>
>>>>> I honestly don't think you can reliably handle a TLB Conflict abort in
>>>>> the same translation regime as the original fault, given that we don't
>>>>> know the scope of that fault. You are probably making an educated
>>>>> guess that it is good enough on the CPUs you know of, but I don't see
>>>>> anything in the architecture that indicates the "blast radius" of a
>>>>> TLB conflict.
>>>>
>>>> OK, so I'm claiming that the blast radius is limited to the region of memory
>>>> that we are operating on in contpte_collapse() in patch 4. Perhaps we need to go
>>>> re-read the ARM and come back with the specific statements that led us to that
>>>> conclusion?
>>
>> From the ARM:
>> """
>> RFCPSG: If level 1 or level 2 is supported and the Contiguous bit in a set of
>> Block descriptors or Page descriptors is changed, then a TLB conflict abort can
>> be generated because multiple translation table entries might exist within a TLB
>> that translates the same IA.
>> """
>>
>> Although I guess it's not totally explicit, I've interpretted that as saying
>> that conflicting TLB entries can only arise for the IA range for which the
>> contiguous bits have been modified in the translation tables.
> 
> Right, that's reassuring, thanks for digging that one.
> 
>> Given we are only fiddling with the contiguous bits for user space mappings in
>> this way, that's why I'm asserting we will only get a conflict abort for user
>> space mappings... assuming the absence of kernel bugs, anyway...
> 
> For now. But if you dare scanning the list, you'll find a lot of
> people willing to do far more than just that. Including changing the
> shape of the linear map.

Ahh. Sorry I don't do a good job of monitoring the lists. But was just having a
conversation with Catalin about exactly this.

> 
>>
>>>
>>> But we don't know for sure what caused this conflict by the time we
>>> arrive in the handler. It could equally be because we have a glaring
>>> bug somewhere on the kernel side, even if you are *now* only concerned
>>> with userspace.
>>
>> OK I see what you are saying; previously a conflict abort would have led to
>> calling do_bad(), which returns 1, which causes do_mem_abort() to either kill
>> the kernel or the process depending on the origin of the abort. (although if it
>> came from kernel due to bug, we're just hoping that the conflict doesn't affect
>> the path through the handler). With this change, we always assume we can fix it
>> with the TLBI.
>>
>> How about this change to ensure we still die for issues originating from the kernel?
>>
>> if (!user_mode(regs) || !system_supports_bbml2())
>> 		return do_bad(far, esr, regs);
> 
> That wouldn't catch a TLB conflict on get_user(), would it?

Oh good point.

> 
>>> If anything, this should absolutely check for FAR_EL1 and assert that
>>> this is indeed caused by such change.
>>
>> I'm not really sure how we would check this reliably? Without patch 5, the
>> problem is somewhat constrained; we could have as many changes in flight as
>> there are CPUs so we could keep a list of all the {mm_struct, VA-range} that are
>> being modified. But if patch 5 is confirmed to be architecturally sound, then
>> there is no "terminating tlbi" so there is no bound on the set of {mm_struct,
>> VA-range}'s that could legitimately cause a conflict abort.
> 
> I didn't mean to imply that we should identify the exact cause of the
> abort. I was hoping to simply check that FAR_EL1 reports a userspace
> VA. Why wouldn't that work?

Ahh gottya! Yes agreed, this sounds like the right approach.

> 
> 	M.
>
Mikołaj Lenczewski Dec. 13, 2024, 4:49 p.m. UTC | #7
On Thu, Dec 12, 2024 at 08:25:53AM +0000, Marc Zyngier wrote:
> Ah, so this is where this is hiding. I missed it in my review of patch
> #1 yesterday.
> 
> On Wed, 11 Dec 2024 16:01:38 +0000,
> Mikołaj Lenczewski <miko.lenczewski@arm.com> wrote:
> > 
> > The Break-Before-Make cpu feature supports multiple levels (levels 0-2),
> > and this commit adds a dedicated BBML2 cpufeature to test against
> > support for.
> > 
> > In supporting BBM level 2, we open ourselves up to potential TLB
> > Conflict Abort Exceptions during expected execution, instead of only
> > in exceptional circumstances. In the case of an abort, it is
> > implementation defined at what stage the abort is generated, and
> 
> *IF* stage-2 is enabled. Also, in the case of the EL2&0 translation
> regime, no stage-2 applies, so it can only be a stage-1 abort.
> 
> > the minimal set of required invalidations is also implementation
> > defined. The maximal set of invalidations is to do a `tlbi vmalle1`
> > or `tlbi vmalls12e1`, depending on the stage.
> > 
> > Such aborts should not occur on Arm hardware, and were not seen in
> > benchmarked systems, so unless performance concerns arise, implementing
> 
> Which systems? Given that you have deny-listed *all* half recent ARM
> Ltd implementations, I'm a bit puzzled.
> 

I had tested on an earlier series of the patchset that didn't introduce
the MIDR checks (has_bbml2() only read the claimed level of support),
but otherwise had the same implementation.

> >
> > +static inline bool system_supports_bbml2(void)
> > +{
> > +	/* currently, BBM is only relied on by code touching the userspace page
> > +	 * tables, and as such we are guaranteed that caps have been finalised.
> > +	 *
> > +	 * if later we want to use BBM for kernel mappings, particularly early
> > +	 * in the kernel, this may return 0 even if BBML2 is actually supported,
> > +	 * which means unnecessary break-before-make sequences, but is still
> > +	 * correct
> 
> Comment style, capitalisation, punctuation.
> 
> > +	if (!system_supports_bbml2())
> > +		return do_bad(far, esr, regs);
> > +
> > +	/* if we receive a TLB conflict abort, we know that there are multiple
> > +	 * TLB entries that translate the same address range. the minimum set
> > +	 * of invalidations to clear these entries is implementation defined.
> > +	 * the maximum set is defined as either tlbi(vmalls12e1) or tlbi(alle1).
> > +	 *
> > +	 * if el2 is enabled and stage 2 translation enabled, this may be
> > +	 * raised as a stage 2 abort. if el2 is enabled but stage 2 translation
> > +	 * disabled, or if el2 is disabled, it will be raised as a stage 1
> > +	 * abort.
> > +	 *
> > +	 * local_flush_tlb_all() does a tlbi(vmalle1), which is enough to
> > +	 * handle a stage 1 abort.
> 
> Same comment about comments.
> 

Will fix.

Kind regard,
Mikołaj
Mikołaj Lenczewski Dec. 13, 2024, 4:53 p.m. UTC | #8
> > 
> > From the ARM:
> > """
> > RFCPSG: If level 1 or level 2 is supported and the Contiguous bit in a set of
> > Block descriptors or Page descriptors is changed, then a TLB conflict abort can
> > be generated because multiple translation table entries might exist within a TLB
> > that translates the same IA.
> > """
> > 
> > Although I guess it's not totally explicit, I've interpretted that as saying
> > that conflicting TLB entries can only arise for the IA range for which the
> > contiguous bits have been modified in the translation tables.
> 
> Right, that's reassuring, thanks for digging that one.
> 

Thanks Ryan, will improve the comment and commit messages to make this
clearer and to include references to this information.

> > > If anything, this should absolutely check for FAR_EL1 and assert that
> > > this is indeed caused by such change.
> > 
> > I'm not really sure how we would check this reliably? Without patch 5, the
> > problem is somewhat constrained; we could have as many changes in flight as
> > there are CPUs so we could keep a list of all the {mm_struct, VA-range} that are
> > being modified. But if patch 5 is confirmed to be architecturally sound, then
> > there is no "terminating tlbi" so there is no bound on the set of {mm_struct,
> > VA-range}'s that could legitimately cause a conflict abort.
> 
> I didn't mean to imply that we should identify the exact cause of the
> abort. I was hoping to simply check that FAR_EL1 reports a userspace
> VA. Why wouldn't that work?
> 

Will do so.

Kind regards,
Mikołaj
Will Deacon Dec. 19, 2024, 4:45 p.m. UTC | #9
On Thu, Dec 12, 2024 at 04:03:52PM +0000, Ryan Roberts wrote:
> >>> If anything, this should absolutely check for FAR_EL1 and assert that
> >>> this is indeed caused by such change.
> >>
> >> I'm not really sure how we would check this reliably? Without patch 5, the
> >> problem is somewhat constrained; we could have as many changes in flight as
> >> there are CPUs so we could keep a list of all the {mm_struct, VA-range} that are
> >> being modified. But if patch 5 is confirmed to be architecturally sound, then
> >> there is no "terminating tlbi" so there is no bound on the set of {mm_struct,
> >> VA-range}'s that could legitimately cause a conflict abort.
> > 
> > I didn't mean to imply that we should identify the exact cause of the
> > abort. I was hoping to simply check that FAR_EL1 reports a userspace
> > VA. Why wouldn't that work?
> 
> Ahh gottya! Yes agreed, this sounds like the right approach.

Please, can we just not bother handling conflict aborts at all outside of
KVM? This is all dead code, it's complicated and it doesn't scale to the
in-kernel use-cases that others want. There's also not been any attempt
to add the pKVM support for handling host-side conflict aborts from what
I can tell.

For now, I would suggest limiting this series just to the KVM support
for handling a broken/malicious guest. If the contpte performance
improvements are worthwhile (I've asked for data), then let's add support
for the CPUs that handle the conflict in hardware (I believe this is far
more common than reporting the abort) so that the in-kernel users can
benefit whilst keeping the code manageable at the same time.

Will
Jonathan Cameron Jan. 2, 2025, 12:07 p.m. UTC | #10
On Thu, 19 Dec 2024 16:45:28 +0000
Will Deacon <will@kernel.org> wrote:

> On Thu, Dec 12, 2024 at 04:03:52PM +0000, Ryan Roberts wrote:
> > >>> If anything, this should absolutely check for FAR_EL1 and assert that
> > >>> this is indeed caused by such change.  
> > >>
> > >> I'm not really sure how we would check this reliably? Without patch 5, the
> > >> problem is somewhat constrained; we could have as many changes in flight as
> > >> there are CPUs so we could keep a list of all the {mm_struct, VA-range} that are
> > >> being modified. But if patch 5 is confirmed to be architecturally sound, then
> > >> there is no "terminating tlbi" so there is no bound on the set of {mm_struct,
> > >> VA-range}'s that could legitimately cause a conflict abort.  
> > > 
> > > I didn't mean to imply that we should identify the exact cause of the
> > > abort. I was hoping to simply check that FAR_EL1 reports a userspace
> > > VA. Why wouldn't that work?  
> > 
> > Ahh gottya! Yes agreed, this sounds like the right approach.  
> 
> Please, can we just not bother handling conflict aborts at all outside of
> KVM? This is all dead code, it's complicated and it doesn't scale to the
> in-kernel use-cases that others want. There's also not been any attempt
> to add the pKVM support for handling host-side conflict aborts from what
> I can tell.
> 
> For now, I would suggest limiting this series just to the KVM support
> for handling a broken/malicious guest. If the contpte performance
> improvements are worthwhile (I've asked for data), then let's add support
> for the CPUs that handle the conflict in hardware (I believe this is far
> more common than reporting the abort) so that the in-kernel users can
> benefit whilst keeping the code manageable at the same time.
> 

Hi All,

Given direction the discussion is going in time to raise a hand.

Huawei has implementations that support BBML2, and might report TLB conflict
abort after changing block size directly until an appropriate TLB invalidation
instruction completes and this Implementation Choice is architecturally compliant.

Jonathan

> Will
>
Marc Zyngier Jan. 2, 2025, 12:30 p.m. UTC | #11
Hi Jonathan,

On Thu, 02 Jan 2025 12:07:04 +0000,
Jonathan Cameron <Jonathan.Cameron@huawei.com> wrote:
> 
> On Thu, 19 Dec 2024 16:45:28 +0000
> Will Deacon <will@kernel.org> wrote:
> 
> > On Thu, Dec 12, 2024 at 04:03:52PM +0000, Ryan Roberts wrote:
> > > >>> If anything, this should absolutely check for FAR_EL1 and assert that
> > > >>> this is indeed caused by such change.  
> > > >>
> > > >> I'm not really sure how we would check this reliably? Without patch 5, the
> > > >> problem is somewhat constrained; we could have as many changes in flight as
> > > >> there are CPUs so we could keep a list of all the {mm_struct, VA-range} that are
> > > >> being modified. But if patch 5 is confirmed to be architecturally sound, then
> > > >> there is no "terminating tlbi" so there is no bound on the set of {mm_struct,
> > > >> VA-range}'s that could legitimately cause a conflict abort.  
> > > > 
> > > > I didn't mean to imply that we should identify the exact cause of the
> > > > abort. I was hoping to simply check that FAR_EL1 reports a userspace
> > > > VA. Why wouldn't that work?  
> > > 
> > > Ahh gottya! Yes agreed, this sounds like the right approach.  
> > 
> > Please, can we just not bother handling conflict aborts at all outside of
> > KVM? This is all dead code, it's complicated and it doesn't scale to the
> > in-kernel use-cases that others want. There's also not been any attempt
> > to add the pKVM support for handling host-side conflict aborts from what
> > I can tell.
> > 
> > For now, I would suggest limiting this series just to the KVM support
> > for handling a broken/malicious guest. If the contpte performance
> > improvements are worthwhile (I've asked for data), then let's add support
> > for the CPUs that handle the conflict in hardware (I believe this is far
> > more common than reporting the abort) so that the in-kernel users can
> > benefit whilst keeping the code manageable at the same time.
> > 
> 
> Hi All,
> 
> Given direction the discussion is going in time to raise a hand.
> 
> Huawei has implementations that support BBML2, and might report TLB conflict
> abort after changing block size directly until an appropriate TLB invalidation
> instruction completes and this Implementation Choice is architecturally compliant.

Compliant, absolutely. That's the letter of the spec. The usefulness
aspect is, however, more debatable, and this is what Will is pointing
out.

Dealing with TLB Conflict aborts is an absolute pain if you need
to handle it within the same Translation Regime and using the same
TTBR as the one that has generated the fault. So at least for the time
being, it might be preferable to only worry about the implementations
that will promise to never generate such an abort and quietly perform
an invalidation behind the kernel's back.

Thanks,

	M.
Will Deacon Jan. 3, 2025, 3:35 p.m. UTC | #12
On Thu, Jan 02, 2025 at 12:30:34PM +0000, Marc Zyngier wrote:
> On Thu, 02 Jan 2025 12:07:04 +0000,
> Jonathan Cameron <Jonathan.Cameron@huawei.com> wrote:
> > On Thu, 19 Dec 2024 16:45:28 +0000
> > Will Deacon <will@kernel.org> wrote:
> > > On Thu, Dec 12, 2024 at 04:03:52PM +0000, Ryan Roberts wrote:
> > > > >>> If anything, this should absolutely check for FAR_EL1 and assert that
> > > > >>> this is indeed caused by such change.  
> > > > >>
> > > > >> I'm not really sure how we would check this reliably? Without patch 5, the
> > > > >> problem is somewhat constrained; we could have as many changes in flight as
> > > > >> there are CPUs so we could keep a list of all the {mm_struct, VA-range} that are
> > > > >> being modified. But if patch 5 is confirmed to be architecturally sound, then
> > > > >> there is no "terminating tlbi" so there is no bound on the set of {mm_struct,
> > > > >> VA-range}'s that could legitimately cause a conflict abort.  
> > > > > 
> > > > > I didn't mean to imply that we should identify the exact cause of the
> > > > > abort. I was hoping to simply check that FAR_EL1 reports a userspace
> > > > > VA. Why wouldn't that work?  
> > > > 
> > > > Ahh gottya! Yes agreed, this sounds like the right approach.  
> > > 
> > > Please, can we just not bother handling conflict aborts at all outside of
> > > KVM? This is all dead code, it's complicated and it doesn't scale to the
> > > in-kernel use-cases that others want. There's also not been any attempt
> > > to add the pKVM support for handling host-side conflict aborts from what
> > > I can tell.
> > > 
> > > For now, I would suggest limiting this series just to the KVM support
> > > for handling a broken/malicious guest. If the contpte performance
> > > improvements are worthwhile (I've asked for data), then let's add support
> > > for the CPUs that handle the conflict in hardware (I believe this is far
> > > more common than reporting the abort) so that the in-kernel users can
> > > benefit whilst keeping the code manageable at the same time.
> > > 
> > 
> > Given direction the discussion is going in time to raise a hand.
> > 
> > Huawei has implementations that support BBML2, and might report TLB conflict
> > abort after changing block size directly until an appropriate TLB invalidation
> > instruction completes and this Implementation Choice is architecturally compliant.
> 
> Compliant, absolutely. That's the letter of the spec. The usefulness
> aspect is, however, more debatable, and this is what Will is pointing
> out.
> 
> Dealing with TLB Conflict aborts is an absolute pain if you need
> to handle it within the same Translation Regime and using the same
> TTBR as the one that has generated the fault. So at least for the time
> being, it might be preferable to only worry about the implementations
> that will promise to never generate such an abort and quietly perform
> an invalidation behind the kernel's back.

Agreed. We're not dropping support for CPUs that don't give us what we'd
like here, we're just not bending over to port and maintain new
optimisations for them. I think that's a reasonable compromise?

That said, thanks for raising this, Jonathan. It's a useful data point
to know that TLB conflict aborts exist in the wild!

Will
Ryan Roberts Jan. 3, 2025, 4 p.m. UTC | #13
On 03/01/2025 15:35, Will Deacon wrote:
> On Thu, Jan 02, 2025 at 12:30:34PM +0000, Marc Zyngier wrote:
>> On Thu, 02 Jan 2025 12:07:04 +0000,
>> Jonathan Cameron <Jonathan.Cameron@huawei.com> wrote:
>>> On Thu, 19 Dec 2024 16:45:28 +0000
>>> Will Deacon <will@kernel.org> wrote:
>>>> On Thu, Dec 12, 2024 at 04:03:52PM +0000, Ryan Roberts wrote:
>>>>>>>> If anything, this should absolutely check for FAR_EL1 and assert that
>>>>>>>> this is indeed caused by such change.  
>>>>>>>
>>>>>>> I'm not really sure how we would check this reliably? Without patch 5, the
>>>>>>> problem is somewhat constrained; we could have as many changes in flight as
>>>>>>> there are CPUs so we could keep a list of all the {mm_struct, VA-range} that are
>>>>>>> being modified. But if patch 5 is confirmed to be architecturally sound, then
>>>>>>> there is no "terminating tlbi" so there is no bound on the set of {mm_struct,
>>>>>>> VA-range}'s that could legitimately cause a conflict abort.  
>>>>>>
>>>>>> I didn't mean to imply that we should identify the exact cause of the
>>>>>> abort. I was hoping to simply check that FAR_EL1 reports a userspace
>>>>>> VA. Why wouldn't that work?  
>>>>>
>>>>> Ahh gottya! Yes agreed, this sounds like the right approach.  
>>>>
>>>> Please, can we just not bother handling conflict aborts at all outside of
>>>> KVM? This is all dead code, it's complicated and it doesn't scale to the
>>>> in-kernel use-cases that others want. There's also not been any attempt
>>>> to add the pKVM support for handling host-side conflict aborts from what
>>>> I can tell.
>>>>
>>>> For now, I would suggest limiting this series just to the KVM support
>>>> for handling a broken/malicious guest. If the contpte performance
>>>> improvements are worthwhile (I've asked for data), then let's add support
>>>> for the CPUs that handle the conflict in hardware (I believe this is far
>>>> more common than reporting the abort) so that the in-kernel users can
>>>> benefit whilst keeping the code manageable at the same time.
>>>>
>>>
>>> Given direction the discussion is going in time to raise a hand.
>>>
>>> Huawei has implementations that support BBML2, and might report TLB conflict
>>> abort after changing block size directly until an appropriate TLB invalidation
>>> instruction completes and this Implementation Choice is architecturally compliant.
>>
>> Compliant, absolutely. That's the letter of the spec. The usefulness
>> aspect is, however, more debatable, and this is what Will is pointing
>> out.
>>
>> Dealing with TLB Conflict aborts is an absolute pain if you need
>> to handle it within the same Translation Regime and using the same
>> TTBR as the one that has generated the fault. So at least for the time
>> being, it might be preferable to only worry about the implementations
>> that will promise to never generate such an abort and quietly perform
>> an invalidation behind the kernel's back.
> 
> Agreed. We're not dropping support for CPUs that don't give us what we'd
> like here, we're just not bending over to port and maintain new
> optimisations for them. I think that's a reasonable compromise?
> 
> That said, thanks for raising this, Jonathan. It's a useful data point
> to know that TLB conflict aborts exist in the wild!

Indeed. Just to make it explicit; if we were to support all architecturally
compliant BBML2 implementations, we would need to drop the final patch in this
series. But since it sounds like we will be taking the approach of only allowing
these optimizations for implementations that never raise conflict abort and
handle it all in HW, it should be safe to keep the optimization in that final
patch. I'll work with Miko to get this bashed into shape and reposted.

Thanks,
Ryan

> 
> Will
Jonathan Cameron Jan. 3, 2025, 6:18 p.m. UTC | #14
On Fri, 3 Jan 2025 16:00:59 +0000
Ryan Roberts <ryan.roberts@arm.com> wrote:

> On 03/01/2025 15:35, Will Deacon wrote:
> > On Thu, Jan 02, 2025 at 12:30:34PM +0000, Marc Zyngier wrote:  
> >> On Thu, 02 Jan 2025 12:07:04 +0000,
> >> Jonathan Cameron <Jonathan.Cameron@huawei.com> wrote:  
> >>> On Thu, 19 Dec 2024 16:45:28 +0000
> >>> Will Deacon <will@kernel.org> wrote:  
> >>>> On Thu, Dec 12, 2024 at 04:03:52PM +0000, Ryan Roberts wrote:  
> >>>>>>>> If anything, this should absolutely check for FAR_EL1 and assert that
> >>>>>>>> this is indeed caused by such change.    
> >>>>>>>
> >>>>>>> I'm not really sure how we would check this reliably? Without patch 5, the
> >>>>>>> problem is somewhat constrained; we could have as many changes in flight as
> >>>>>>> there are CPUs so we could keep a list of all the {mm_struct, VA-range} that are
> >>>>>>> being modified. But if patch 5 is confirmed to be architecturally sound, then
> >>>>>>> there is no "terminating tlbi" so there is no bound on the set of {mm_struct,
> >>>>>>> VA-range}'s that could legitimately cause a conflict abort.    
> >>>>>>
> >>>>>> I didn't mean to imply that we should identify the exact cause of the
> >>>>>> abort. I was hoping to simply check that FAR_EL1 reports a userspace
> >>>>>> VA. Why wouldn't that work?    
> >>>>>
> >>>>> Ahh gottya! Yes agreed, this sounds like the right approach.    
> >>>>
> >>>> Please, can we just not bother handling conflict aborts at all outside of
> >>>> KVM? This is all dead code, it's complicated and it doesn't scale to the
> >>>> in-kernel use-cases that others want. There's also not been any attempt
> >>>> to add the pKVM support for handling host-side conflict aborts from what
> >>>> I can tell.
> >>>>
> >>>> For now, I would suggest limiting this series just to the KVM support
> >>>> for handling a broken/malicious guest. If the contpte performance
> >>>> improvements are worthwhile (I've asked for data), then let's add support
> >>>> for the CPUs that handle the conflict in hardware (I believe this is far
> >>>> more common than reporting the abort) so that the in-kernel users can
> >>>> benefit whilst keeping the code manageable at the same time.
> >>>>  
> >>>
> >>> Given direction the discussion is going in time to raise a hand.
> >>>
> >>> Huawei has implementations that support BBML2, and might report TLB conflict
> >>> abort after changing block size directly until an appropriate TLB invalidation
> >>> instruction completes and this Implementation Choice is architecturally compliant.  
> >>
> >> Compliant, absolutely. That's the letter of the spec. The usefulness
> >> aspect is, however, more debatable, and this is what Will is pointing
> >> out.
> >>
> >> Dealing with TLB Conflict aborts is an absolute pain if you need
> >> to handle it within the same Translation Regime and using the same
> >> TTBR as the one that has generated the fault. So at least for the time
> >> being, it might be preferable to only worry about the implementations
> >> that will promise to never generate such an abort and quietly perform
> >> an invalidation behind the kernel's back.  
> > 
> > Agreed. We're not dropping support for CPUs that don't give us what we'd
> > like here, we're just not bending over to port and maintain new
> > optimisations for them. I think that's a reasonable compromise?

Subject to usual maintainability vs performance questions sure.
Given we are the ones with the implementation, it's perhaps up to us
to prove the added complexity for a given optimization is worth the hassle
(maybe leaning on Arm to help out ;)  We have some activity going on
around this, but are unfortunately not ready to share.

> > 
> > That said, thanks for raising this, Jonathan. It's a useful data point
> > to know that TLB conflict aborts exist in the wild!  

My work here is done ;)

> 
> Indeed. Just to make it explicit; if we were to support all architecturally
> compliant BBML2 implementations, we would need to drop the final patch in this
> series. But since it sounds like we will be taking the approach of only allowing
> these optimizations for implementations that never raise conflict abort and
> handle it all in HW, it should be safe to keep the optimization in that final
> patch. I'll work with Miko to get this bashed into shape and reposted.

Obviously I'd want perf numbers to justify it (working on that) but I'd like
to keep on the table the option of patch 5 being the only part that is dependent
on being non conflict aborting hardware. I think even that is a performance
question rather than a correctness one - it simply widens the window in which
we might see a fault and have to dump the TLB. (I may well have missed something
though).

As a side note on that last patch, it is easy to conceive of a BBML2
solution that doesn't do conflict aborts, but for which it is still a performance
nightmare to not flush. As a fictional implementation, where our CPUs get a conflict
abort, we could instead have stalled the core and pushed the abort info to a management
controller, and flushed the whole TLB to resolve (plus probably the CPU pipeline).
If sufficiently rare that's not a totally stupid implementation (subject to some
optimizations). It is basically offloading what we are going to do in software on
a conflict abort with somewhat similar performance cost making widening the window
a very bad idea.

So the proposed allow list might need to be rather more nuanced than "can we get
a fault?"  We all love per uarch performance related opt ins.

Jonathan

> 
> Thanks,
> Ryan
> 
> > 
> > Will  
> 
>
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 8b4e5a3cd24c..a9f2ac335392 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -866,6 +866,20 @@  static __always_inline bool system_supports_mpam_hcr(void)
 	return alternative_has_cap_unlikely(ARM64_MPAM_HCR);
 }
 
+static inline bool system_supports_bbml2(void)
+{
+	/* currently, BBM is only relied on by code touching the userspace page
+	 * tables, and as such we are guaranteed that caps have been finalised.
+	 *
+	 * if later we want to use BBM for kernel mappings, particularly early
+	 * in the kernel, this may return 0 even if BBML2 is actually supported,
+	 * which means unnecessary break-before-make sequences, but is still
+	 * correct
+	 */
+
+	return alternative_has_cap_unlikely(ARM64_HAS_BBML2);
+}
+
 int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
 bool try_emulate_mrs(struct pt_regs *regs, u32 isn);
 
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 6ce71f444ed8..7cc94bd5da24 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2917,6 +2917,13 @@  static const struct arm64_cpu_capabilities arm64_features[] = {
 		.matches = has_cpuid_feature,
 		ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, EVT, IMP)
 	},
+	{
+		.desc = "BBM Level 2 Support",
+		.capability = ARM64_HAS_BBML2,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.matches = has_cpuid_feature,
+		ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, BBM, 2)
+	},
 	{
 		.desc = "52-bit Virtual Addressing for KVM (LPA2)",
 		.capability = ARM64_HAS_LPA2,
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index ef63651099a9..dc119358cbc1 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -844,6 +844,31 @@  static int do_tag_check_fault(unsigned long far, unsigned long esr,
 	return 0;
 }
 
+static int do_conflict_abort(unsigned long far, unsigned long esr,
+			     struct pt_regs *regs)
+{
+	if (!system_supports_bbml2())
+		return do_bad(far, esr, regs);
+
+	/* if we receive a TLB conflict abort, we know that there are multiple
+	 * TLB entries that translate the same address range. the minimum set
+	 * of invalidations to clear these entries is implementation defined.
+	 * the maximum set is defined as either tlbi(vmalls12e1) or tlbi(alle1).
+	 *
+	 * if el2 is enabled and stage 2 translation enabled, this may be
+	 * raised as a stage 2 abort. if el2 is enabled but stage 2 translation
+	 * disabled, or if el2 is disabled, it will be raised as a stage 1
+	 * abort.
+	 *
+	 * local_flush_tlb_all() does a tlbi(vmalle1), which is enough to
+	 * handle a stage 1 abort.
+	 */
+
+	local_flush_tlb_all();
+
+	return 0;
+}
+
 static const struct fault_info fault_info[] = {
 	{ do_bad,		SIGKILL, SI_KERNEL,	"ttbr address size fault"	},
 	{ do_bad,		SIGKILL, SI_KERNEL,	"level 1 address size fault"	},
@@ -893,7 +918,7 @@  static const struct fault_info fault_info[] = {
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 45"			},
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 46"			},
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 47"			},
-	{ do_bad,		SIGKILL, SI_KERNEL,	"TLB conflict abort"		},
+	{ do_conflict_abort,	SIGKILL, SI_KERNEL,	"TLB conflict abort"		},
 	{ do_bad,		SIGKILL, SI_KERNEL,	"Unsupported atomic hardware update fault"	},
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 50"			},
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 51"			},
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index eb17f59e543c..4ee0fbb7765b 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -26,6 +26,7 @@  HAS_ECV
 HAS_ECV_CNTPOFF
 HAS_EPAN
 HAS_EVT
+HAS_BBML2
 HAS_FPMR
 HAS_FGT
 HAS_FPSIMD