diff mbox series

[v19,10/11] KVM: arm64: nvhe: Disable branch generation in nVHE guests

Message ID 20250202-arm-brbe-v19-v19-10-1c1300802385@kernel.org (mailing list archive)
State New
Headers show
Series arm64/perf: Enable branch stack sampling | expand

Commit Message

Rob Herring Feb. 3, 2025, 12:43 a.m. UTC
From: Anshuman Khandual <anshuman.khandual@arm.com>

While BRBE can record branches within guests, the host recording
branches in guests is not supported by perf. Therefore, BRBE needs to be
disabled on guest entry and restored on exit.

For nVHE, this requires explicit handling for guests. Before
entering a guest, save the BRBE state and disable the it. When
returning to the host, restore the state.

For VHE, it is not necessary. We initialize
BRBCR_EL1.{E1BRE,E0BRE}=={0,0} at boot time, and HCR_EL2.TGE==1 while
running in the host. We configure BRBCR_EL2.{E2BRE,E0HBRE} to enable
branch recording in the host. When entering the guest, we set
HCR_EL2.TGE==0 which means BRBCR_EL1 is used instead of BRBCR_EL2.
Consequently for VHE, BRBE recording is disabled at EL1 and EL0 when
running a guest.

Should recording in guests (by the host) ever be desired, the perf ABI
will need to be extended to distinguish guest addresses (struct
perf_branch_entry.priv) for starters. BRBE records would also need to be
invalidated on guest entry/exit as guest/host EL1 and EL0 records can't
be distinguished.

Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Co-developed-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
v19:
 - Rework due to v6.14 debug flag changes
 - Redo commit message
---
 arch/arm64/include/asm/kvm_host.h  |  2 ++
 arch/arm64/kvm/debug.c             |  4 ++++
 arch/arm64/kvm/hyp/nvhe/debug-sr.c | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 38 insertions(+)

Comments

Anshuman Khandual Feb. 3, 2025, 9:16 a.m. UTC | #1
On 2/3/25 06:13, Rob Herring (Arm) wrote:
> From: Anshuman Khandual <anshuman.khandual@arm.com>
> 
> While BRBE can record branches within guests, the host recording
> branches in guests is not supported by perf. Therefore, BRBE needs to be
> disabled on guest entry and restored on exit.
> 
> For nVHE, this requires explicit handling for guests. Before
> entering a guest, save the BRBE state and disable the it. When
> returning to the host, restore the state.
> 
> For VHE, it is not necessary. We initialize
> BRBCR_EL1.{E1BRE,E0BRE}=={0,0} at boot time, and HCR_EL2.TGE==1 while
> running in the host. We configure BRBCR_EL2.{E2BRE,E0HBRE} to enable
> branch recording in the host. When entering the guest, we set
> HCR_EL2.TGE==0 which means BRBCR_EL1 is used instead of BRBCR_EL2.
> Consequently for VHE, BRBE recording is disabled at EL1 and EL0 when
> running a guest.
> 
> Should recording in guests (by the host) ever be desired, the perf ABI
> will need to be extended to distinguish guest addresses (struct
> perf_branch_entry.priv) for starters. BRBE records would also need to be
> invalidated on guest entry/exit as guest/host EL1 and EL0 records can't
> be distinguished.
> 
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> Signed-off-by: Mark Rutland <mark.rutland@arm.com>
> Co-developed-by: Rob Herring (Arm) <robh@kernel.org>
> Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
> ---
> v19:
>  - Rework due to v6.14 debug flag changes
>  - Redo commit message
> ---
>  arch/arm64/include/asm/kvm_host.h  |  2 ++
>  arch/arm64/kvm/debug.c             |  4 ++++
>  arch/arm64/kvm/hyp/nvhe/debug-sr.c | 32 ++++++++++++++++++++++++++++++++
>  3 files changed, 38 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 7cfa024de4e3..4fc246a1ee6b 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -619,6 +619,7 @@ struct kvm_host_data {
>  #define KVM_HOST_DATA_FLAG_HOST_SME_ENABLED		3
>  #define KVM_HOST_DATA_FLAG_TRBE_ENABLED			4
>  #define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED	5
> +#define KVM_HOST_DATA_FLAG_HAS_BRBE			6

Although there is some variation in these feature names above, but seems
like KVM_HOST_DATA_FLAG_HAS_BRBE is an appropriate one for BRBE handling.

>  	unsigned long flags;
>  
>  	struct kvm_cpu_context host_ctxt;
> @@ -662,6 +663,7 @@ struct kvm_host_data {
>  		u64 trfcr_el1;
>  		/* Values of trap registers for the host before guest entry. */
>  		u64 mdcr_el2;
> +		u64 brbcr_el1;
>  	} host_debug_state;
>  
>  	/* Guest trace filter value */
> diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
> index 0e4c805e7e89..bc6015108a68 100644
> --- a/arch/arm64/kvm/debug.c
> +++ b/arch/arm64/kvm/debug.c
> @@ -81,6 +81,10 @@ void kvm_init_host_debug_data(void)
>  	    !(read_sysreg_s(SYS_PMBIDR_EL1) & PMBIDR_EL1_P))
>  		host_data_set_flag(HAS_SPE);
>  
> +	/* Check if we have BRBE implemented and available at the host */
> +	if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_BRBE_SHIFT))
> +		host_data_set_flag(HAS_BRBE);
> +
>  	if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceFilt_SHIFT)) {
>  		/* Force disable trace in protected mode in case of no TRBE */
>  		if (is_protected_kvm_enabled())
> diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
> index 2f4a4f5036bb..2a1c0f49792b 100644
> --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c
> +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
> @@ -92,12 +92,42 @@ static void __trace_switch_to_host(void)
>  			  *host_data_ptr(host_debug_state.trfcr_el1));
>  }
>  
> +static void __debug_save_brbe(u64 *brbcr_el1)
> +{
> +	*brbcr_el1 = 0;
> +
> +	/* Check if the BRBE is enabled */
> +	if (!(read_sysreg_el1(SYS_BRBCR) & (BRBCR_ELx_E0BRE | BRBCR_ELx_ExBRE)))
> +		return;
> +
> +	/*
> +	 * Prohibit branch record generation while we are in guest.
> +	 * Since access to BRBCR_EL1 is trapped, the guest can't
> +	 * modify the filtering set by the host.
> +	 */
> +	*brbcr_el1 = read_sysreg_el1(SYS_BRBCR);
> +	write_sysreg_el1(0, SYS_BRBCR);
> +}
> +
> +static void __debug_restore_brbe(u64 brbcr_el1)
> +{
> +	if (!brbcr_el1)
> +		return;
> +
> +	/* Restore BRBE controls */
> +	write_sysreg_el1(brbcr_el1, SYS_BRBCR);
> +}
> +
>  void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu)
>  {
>  	/* Disable and flush SPE data generation */
>  	if (host_data_test_flag(HAS_SPE))
>  		__debug_save_spe(host_data_ptr(host_debug_state.pmscr_el1));
>  
> +	/* Disable BRBE branch records */
> +	if (host_data_test_flag(HAS_BRBE))
> +		__debug_save_brbe(host_data_ptr(host_debug_state.brbcr_el1));
> +
>  	if (__trace_needs_switch())
>  		__trace_switch_to_guest();
>  }
> @@ -111,6 +141,8 @@ void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu)
>  {
>  	if (host_data_test_flag(HAS_SPE))
>  		__debug_restore_spe(*host_data_ptr(host_debug_state.pmscr_el1));
> +	if (host_data_test_flag(HAS_BRBE))
> +		__debug_restore_brbe(*host_data_ptr(host_debug_state.brbcr_el1));
>  	if (__trace_needs_switch())
>  		__trace_switch_to_host();
>  }
> 

LGTM
James Clark Feb. 3, 2025, 11:28 a.m. UTC | #2
On 03/02/2025 12:43 am, Rob Herring (Arm) wrote:
> From: Anshuman Khandual <anshuman.khandual@arm.com>
> 
> While BRBE can record branches within guests, the host recording
> branches in guests is not supported by perf. Therefore, BRBE needs to be
> disabled on guest entry and restored on exit.

I don't think this is strictly true. You only need a Perf session in the 
guest to records sideband events. That allows you to make sense of the 
userspace addresses, but by then you might as well record BRBE in the 
guest in the first place. See [1] for an example.

With kernel addresses it might be even easier as all you need is 
--guestvmlinux, --guestkallsyms etc and no sideband events.

[1]: 
https://lore.kernel.org/all/20220711093218.10967-25-adrian.hunter@intel.com/

> 
> For nVHE, this requires explicit handling for guests. Before
> entering a guest, save the BRBE state and disable the it. When
> returning to the host, restore the state.
> 
> For VHE, it is not necessary. We initialize
> BRBCR_EL1.{E1BRE,E0BRE}=={0,0} at boot time, and HCR_EL2.TGE==1 while
> running in the host. We configure BRBCR_EL2.{E2BRE,E0HBRE} to enable
> branch recording in the host. When entering the guest, we set
> HCR_EL2.TGE==0 which means BRBCR_EL1 is used instead of BRBCR_EL2.
> Consequently for VHE, BRBE recording is disabled at EL1 and EL0 when
> running a guest.
> 
> Should recording in guests (by the host) ever be desired, the perf ABI
> will need to be extended to distinguish guest addresses (struct
> perf_branch_entry.priv) for starters. 

There's already this which would be enough (if every entry in the branch 
buffer matches it):

   sample->cpumode == PERF_RECORD_MISC_GUEST_KERNEL
   sample->cpumode == PERF_RECORD_MISC_GUEST_USER

But I don't think we need all the extra complexity. Just let the guest 
use all of BRBE and then there isn't really a use case that's not 
supported. I assume a lot of these workflows were added for trace 
because it's not supported in guests, but I don't think that applies to 
BRBE so we can skip them and go straight to full BRBE in guest support. 
As a later change obviously, these comments are more about the commit 
message.

James
Leo Yan Feb. 13, 2025, 5:03 p.m. UTC | #3
On Sun, Feb 02, 2025 at 06:43:04PM -0600, Rob Herring (Arm) wrote:

[...]

> +static void __debug_save_brbe(u64 *brbcr_el1)
> +{
> +       *brbcr_el1 = 0;
> +
> +       /* Check if the BRBE is enabled */
> +       if (!(read_sysreg_el1(SYS_BRBCR) & (BRBCR_ELx_E0BRE | BRBCR_ELx_ExBRE)))
> +               return;
> +
> +       /*
> +        * Prohibit branch record generation while we are in guest.
> +        * Since access to BRBCR_EL1 is trapped, the guest can't
> +        * modify the filtering set by the host.
> +        */
> +       *brbcr_el1 = read_sysreg_el1(SYS_BRBCR);
> +       write_sysreg_el1(0, SYS_BRBCR);
> +}

Should flush branch record and use isb() before exit host kernel?

I see inconsistence between the function above and BRBE's disable
function. Here it clears E0BRE / ExBRE bits for disabling BRBE, but the
BRBE driver sets the PAUSED bit in BRBFCR_EL1 for disabling BRBE.

Thanks,
Leo
Rob Herring Feb. 13, 2025, 11:16 p.m. UTC | #4
On Thu, Feb 13, 2025 at 11:03 AM Leo Yan <leo.yan@arm.com> wrote:
>
> On Sun, Feb 02, 2025 at 06:43:04PM -0600, Rob Herring (Arm) wrote:
>
> [...]
>
> > +static void __debug_save_brbe(u64 *brbcr_el1)
> > +{
> > +       *brbcr_el1 = 0;
> > +
> > +       /* Check if the BRBE is enabled */
> > +       if (!(read_sysreg_el1(SYS_BRBCR) & (BRBCR_ELx_E0BRE | BRBCR_ELx_ExBRE)))
> > +               return;
> > +
> > +       /*
> > +        * Prohibit branch record generation while we are in guest.
> > +        * Since access to BRBCR_EL1 is trapped, the guest can't
> > +        * modify the filtering set by the host.
> > +        */
> > +       *brbcr_el1 = read_sysreg_el1(SYS_BRBCR);
> > +       write_sysreg_el1(0, SYS_BRBCR);
> > +}
>
> Should flush branch record and use isb() before exit host kernel?

I don't think so. The isb()'s in the other cases appear to be related
to ordering WRT memory buffers. BRBE is just registers. I would assume
that there's some barrier before we switch to the guest.

> I see inconsistence between the function above and BRBE's disable
> function. Here it clears E0BRE / ExBRE bits for disabling BRBE, but the
> BRBE driver sets the PAUSED bit in BRBFCR_EL1 for disabling BRBE.

Indeed. This works, but the enabled check won't work. I'm going to add
clearing BRBCR to brbe_disable(), and this part will stay the same.

Rob
Leo Yan Feb. 14, 2025, 9:55 a.m. UTC | #5
On Thu, Feb 13, 2025 at 05:16:45PM -0600, Rob Herring wrote:

[...]

> > > +static void __debug_save_brbe(u64 *brbcr_el1)
> > > +{
> > > +       *brbcr_el1 = 0;
> > > +
> > > +       /* Check if the BRBE is enabled */
> > > +       if (!(read_sysreg_el1(SYS_BRBCR) & (BRBCR_ELx_E0BRE | BRBCR_ELx_ExBRE)))
> > > +               return;
> > > +
> > > +       /*
> > > +        * Prohibit branch record generation while we are in guest.
> > > +        * Since access to BRBCR_EL1 is trapped, the guest can't
> > > +        * modify the filtering set by the host.
> > > +        */
> > > +       *brbcr_el1 = read_sysreg_el1(SYS_BRBCR);
> > > +       write_sysreg_el1(0, SYS_BRBCR);
> > > +}
> >
> > Should flush branch record and use isb() before exit host kernel?
> 
> I don't think so. The isb()'s in the other cases appear to be related
> to ordering WRT memory buffers. BRBE is just registers. I would assume
> that there's some barrier before we switch to the guest.

Given BRBCR is a system register, my understanding is the followd ISB
can ensure the writing BRBCR has finished and take effect.  As a result,
it is promised that the branch record has been stopped.

However, with isb() it is not necessarily to say the branch records have
been flushed to the buffer.  The purpose at here is just to stop record.
The BRBE driver will take care the flush issue when it reads records.

I agreed that it is likely barriers in the followed switch flow can assure
the writing BRBCR to take effect.  It might be good to add a comment for
easier maintenance.

> > I see inconsistence between the function above and BRBE's disable
> > function. Here it clears E0BRE / ExBRE bits for disabling BRBE, but the
> > BRBE driver sets the PAUSED bit in BRBFCR_EL1 for disabling BRBE.
> 
> Indeed. This works, but the enabled check won't work. I'm going to add
> clearing BRBCR to brbe_disable(), and this part will stay the same.

Seems to me, a right logic would be:

- In BRBE driver, the brbe_disable() function should clear E0BRE and
  ExBRE bits in BRBCR.  It can make sure the BRBE is totally disabled
  when a perf session is terminated.

- For a kvm context switching, it is good to use PAUSED bit.  If a host
  is branch record enabled, this is a light way for temporarily pause
  branch record for the switched VM.

Thanks,
Leo
Rob Herring Feb. 18, 2025, 2:17 p.m. UTC | #6
On Fri, Feb 14, 2025 at 3:55 AM Leo Yan <leo.yan@arm.com> wrote:
>
> On Thu, Feb 13, 2025 at 05:16:45PM -0600, Rob Herring wrote:
>
> [...]
>
> > > > +static void __debug_save_brbe(u64 *brbcr_el1)
> > > > +{
> > > > +       *brbcr_el1 = 0;
> > > > +
> > > > +       /* Check if the BRBE is enabled */
> > > > +       if (!(read_sysreg_el1(SYS_BRBCR) & (BRBCR_ELx_E0BRE | BRBCR_ELx_ExBRE)))
> > > > +               return;
> > > > +
> > > > +       /*
> > > > +        * Prohibit branch record generation while we are in guest.
> > > > +        * Since access to BRBCR_EL1 is trapped, the guest can't
> > > > +        * modify the filtering set by the host.
> > > > +        */
> > > > +       *brbcr_el1 = read_sysreg_el1(SYS_BRBCR);
> > > > +       write_sysreg_el1(0, SYS_BRBCR);
> > > > +}
> > >
> > > Should flush branch record and use isb() before exit host kernel?
> >
> > I don't think so. The isb()'s in the other cases appear to be related
> > to ordering WRT memory buffers. BRBE is just registers. I would assume
> > that there's some barrier before we switch to the guest.
>
> Given BRBCR is a system register, my understanding is the followd ISB
> can ensure the writing BRBCR has finished and take effect.  As a result,
> it is promised that the branch record has been stopped.
>
> However, with isb() it is not necessarily to say the branch records have
> been flushed to the buffer.  The purpose at here is just to stop record.
> The BRBE driver will take care the flush issue when it reads records.
>
> I agreed that it is likely barriers in the followed switch flow can assure
> the writing BRBCR to take effect.  It might be good to add a comment for
> easier maintenance.
>
> > > I see inconsistence between the function above and BRBE's disable
> > > function. Here it clears E0BRE / ExBRE bits for disabling BRBE, but the
> > > BRBE driver sets the PAUSED bit in BRBFCR_EL1 for disabling BRBE.
> >
> > Indeed. This works, but the enabled check won't work. I'm going to add
> > clearing BRBCR to brbe_disable(), and this part will stay the same.
>
> Seems to me, a right logic would be:
>
> - In BRBE driver, the brbe_disable() function should clear E0BRE and
>   ExBRE bits in BRBCR.  It can make sure the BRBE is totally disabled
>   when a perf session is terminated.
>
> - For a kvm context switching, it is good to use PAUSED bit.  If a host
>   is branch record enabled, this is a light way for temporarily pause
>   branch record for the switched VM.

We have to read BRBCR to see if it is enabled as PAUSED is unknown out
of reset and the driver may not exist to initialize it. Either way, it
is a register read and write, so same overhead for both.

Rob
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 7cfa024de4e3..4fc246a1ee6b 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -619,6 +619,7 @@  struct kvm_host_data {
 #define KVM_HOST_DATA_FLAG_HOST_SME_ENABLED		3
 #define KVM_HOST_DATA_FLAG_TRBE_ENABLED			4
 #define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED	5
+#define KVM_HOST_DATA_FLAG_HAS_BRBE			6
 	unsigned long flags;
 
 	struct kvm_cpu_context host_ctxt;
@@ -662,6 +663,7 @@  struct kvm_host_data {
 		u64 trfcr_el1;
 		/* Values of trap registers for the host before guest entry. */
 		u64 mdcr_el2;
+		u64 brbcr_el1;
 	} host_debug_state;
 
 	/* Guest trace filter value */
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index 0e4c805e7e89..bc6015108a68 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -81,6 +81,10 @@  void kvm_init_host_debug_data(void)
 	    !(read_sysreg_s(SYS_PMBIDR_EL1) & PMBIDR_EL1_P))
 		host_data_set_flag(HAS_SPE);
 
+	/* Check if we have BRBE implemented and available at the host */
+	if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_BRBE_SHIFT))
+		host_data_set_flag(HAS_BRBE);
+
 	if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceFilt_SHIFT)) {
 		/* Force disable trace in protected mode in case of no TRBE */
 		if (is_protected_kvm_enabled())
diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
index 2f4a4f5036bb..2a1c0f49792b 100644
--- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
@@ -92,12 +92,42 @@  static void __trace_switch_to_host(void)
 			  *host_data_ptr(host_debug_state.trfcr_el1));
 }
 
+static void __debug_save_brbe(u64 *brbcr_el1)
+{
+	*brbcr_el1 = 0;
+
+	/* Check if the BRBE is enabled */
+	if (!(read_sysreg_el1(SYS_BRBCR) & (BRBCR_ELx_E0BRE | BRBCR_ELx_ExBRE)))
+		return;
+
+	/*
+	 * Prohibit branch record generation while we are in guest.
+	 * Since access to BRBCR_EL1 is trapped, the guest can't
+	 * modify the filtering set by the host.
+	 */
+	*brbcr_el1 = read_sysreg_el1(SYS_BRBCR);
+	write_sysreg_el1(0, SYS_BRBCR);
+}
+
+static void __debug_restore_brbe(u64 brbcr_el1)
+{
+	if (!brbcr_el1)
+		return;
+
+	/* Restore BRBE controls */
+	write_sysreg_el1(brbcr_el1, SYS_BRBCR);
+}
+
 void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu)
 {
 	/* Disable and flush SPE data generation */
 	if (host_data_test_flag(HAS_SPE))
 		__debug_save_spe(host_data_ptr(host_debug_state.pmscr_el1));
 
+	/* Disable BRBE branch records */
+	if (host_data_test_flag(HAS_BRBE))
+		__debug_save_brbe(host_data_ptr(host_debug_state.brbcr_el1));
+
 	if (__trace_needs_switch())
 		__trace_switch_to_guest();
 }
@@ -111,6 +141,8 @@  void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu)
 {
 	if (host_data_test_flag(HAS_SPE))
 		__debug_restore_spe(*host_data_ptr(host_debug_state.pmscr_el1));
+	if (host_data_test_flag(HAS_BRBE))
+		__debug_restore_brbe(*host_data_ptr(host_debug_state.brbcr_el1));
 	if (__trace_needs_switch())
 		__trace_switch_to_host();
 }