diff mbox series

KVM: arm64: pmu: Resync EL0 state on counter rotation

Message ID 20230811180520.131727-1-maz@kernel.org (mailing list archive)
State New, archived
Headers show
Series KVM: arm64: pmu: Resync EL0 state on counter rotation | expand

Commit Message

Marc Zyngier Aug. 11, 2023, 6:05 p.m. UTC
Huang Shijie reports that, when profiling a guest from the host
with a number of events that exceeds the number of available
counters, the reported counts are wildly inaccurate. Without
the counter oversubscription, the reported counts are correct.

Their investigation indicates that upon counter rotation (which
takes place on the back of a timer interrupt), we fail to
re-apply the guest EL0 enabling, leading to the counting of host
events instead of guest events.

In order to solve this, add yet another hook between the host PMU
driver and KVM, re-applying the guest EL0 configuration if the
right conditions apply (the host is VHE, we are in interrupt
context, and we interrupted a running vcpu). This triggers a new
vcpu request which will apply the correct configuration on guest
reentry.

With this, we have the correct counts, even when the counters are
oversubscribed.

Reported-by: Huang Shijie <shijie@os.amperecomputing.com>
Suggested-by: Oliver Upton <oliver.upton@linux.dev>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20230809013953.7692-1-shijie@os.amperecomputing.com
---
 arch/arm64/include/asm/kvm_host.h |  1 +
 arch/arm64/kvm/arm.c              |  3 +++
 arch/arm64/kvm/pmu.c              | 18 ++++++++++++++++++
 drivers/perf/arm_pmuv3.c          |  2 ++
 include/kvm/arm_pmu.h             |  2 ++
 5 files changed, 26 insertions(+)

Comments

Shijie Huang Aug. 14, 2023, 2:58 a.m. UTC | #1
Hi Marc,

在 2023/8/12 2:05, Marc Zyngier 写道:
> Huang Shijie reports that, when profiling a guest from the host
> with a number of events that exceeds the number of available
> counters, the reported counts are wildly inaccurate. Without
> the counter oversubscription, the reported counts are correct.
>
> Their investigation indicates that upon counter rotation (which
> takes place on the back of a timer interrupt), we fail to
> re-apply the guest EL0 enabling, leading to the counting of host
> events instead of guest events.
>
> In order to solve this, add yet another hook between the host PMU
> driver and KVM, re-applying the guest EL0 configuration if the
> right conditions apply (the host is VHE, we are in interrupt
> context, and we interrupted a running vcpu). This triggers a new
> vcpu request which will apply the correct configuration on guest
> reentry.
>
> With this, we have the correct counts, even when the counters are
> oversubscribed.
>
> Reported-by: Huang Shijie <shijie@os.amperecomputing.com>
> Suggested-by: Oliver Upton <oliver.upton@linux.dev>
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Link: https://lore.kernel.org/r/20230809013953.7692-1-shijie@os.amperecomputing.com
> ---
>   arch/arm64/include/asm/kvm_host.h |  1 +
>   arch/arm64/kvm/arm.c              |  3 +++
>   arch/arm64/kvm/pmu.c              | 18 ++++++++++++++++++
>   drivers/perf/arm_pmuv3.c          |  2 ++
>   include/kvm/arm_pmu.h             |  2 ++
>   5 files changed, 26 insertions(+)
>
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index d3dd05bbfe23..553040e0e375 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -49,6 +49,7 @@
>   #define KVM_REQ_RELOAD_GICv4	KVM_ARCH_REQ(4)
>   #define KVM_REQ_RELOAD_PMU	KVM_ARCH_REQ(5)
>   #define KVM_REQ_SUSPEND		KVM_ARCH_REQ(6)
> +#define KVM_REQ_RESYNC_PMU_EL0	KVM_ARCH_REQ(7)
>   
>   #define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
>   				     KVM_DIRTY_LOG_INITIALLY_SET)
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 72dc53a75d1c..978b0411082f 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -803,6 +803,9 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu)
>   			kvm_pmu_handle_pmcr(vcpu,
>   					    __vcpu_sys_reg(vcpu, PMCR_EL0));
>   
> +		if (kvm_check_request(KVM_REQ_RESYNC_PMU_EL0, vcpu))
> +			kvm_vcpu_pmu_restore_guest(vcpu);
> +
>   		if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
>   			return kvm_vcpu_suspend(vcpu);
>   
> diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
> index 121f1a14c829..0eea225fd09a 100644
> --- a/arch/arm64/kvm/pmu.c
> +++ b/arch/arm64/kvm/pmu.c
> @@ -236,3 +236,21 @@ bool kvm_set_pmuserenr(u64 val)
>   	ctxt_sys_reg(hctxt, PMUSERENR_EL0) = val;
>   	return true;
>   }
> +
> +/*
> + * If we interrupted the guest to update the host PMU context, make
> + * sure we re-apply the guest EL0 state.
> + */
> +void kvm_vcpu_pmu_resync_el0(void)
> +{
> +	struct kvm_vcpu *vcpu;
> +
> +	if (!has_vhe() || !in_interrupt())
> +		return;
> +
> +	vcpu = kvm_get_running_vcpu();
> +	if (!vcpu)
> +		return;
> +
> +	kvm_make_request(KVM_REQ_RESYNC_PMU_EL0, vcpu);
> +}
> diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
> index 08b3a1bf0ef6..6a3d8176f54a 100644
> --- a/drivers/perf/arm_pmuv3.c
> +++ b/drivers/perf/arm_pmuv3.c
> @@ -772,6 +772,8 @@ static void armv8pmu_start(struct arm_pmu *cpu_pmu)
>   
>   	/* Enable all counters */
>   	armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
> +
> +	kvm_vcpu_pmu_resync_el0();
>   }

I read the perf code again, and I think it may not be a good idea to do this in
armv8pmu_start.

    Assume we install a new perf event on CPU "x" from CPU 0, while a VM
guest is running on CPU "x":

     perf_event_open() --> perf_install_in_context() -->

     call this function in  IPI interrupt: ___perf_install_in_context().

    armv8pmu_start() will be called in ___perf_install_in_context() in IPI.

    so kvm_vcpu_pmu_resync_el0() will _make_ a request by meeting the 
conditions:

              1.) in interrupt context.

              2.) a guest is running on this CPU.


But actually, the request should not be sent out.

Please correct me if I am wrong.

IMHO, the best solution is to add a hook in the perf/core code, and make
the request there.

I will send my v3 patch.


Thanks

Huang Shijie






>   
>   static void armv8pmu_stop(struct arm_pmu *cpu_pmu)
> diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
> index 847da6fc2713..3a8a70a60794 100644
> --- a/include/kvm/arm_pmu.h
> +++ b/include/kvm/arm_pmu.h
> @@ -74,6 +74,7 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu);
>   struct kvm_pmu_events *kvm_get_pmu_events(void);
>   void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu);
>   void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu);
> +void kvm_vcpu_pmu_resync_el0(void);
>   
>   #define kvm_vcpu_has_pmu(vcpu)					\
>   	(test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
> @@ -171,6 +172,7 @@ static inline u8 kvm_arm_pmu_get_pmuver_limit(void)
>   {
>   	return 0;
>   }
> +static inline void kvm_vcpu_pmu_resync_el0(void) {}
>   
>   #endif
>
kernel test robot Aug. 14, 2023, 5:03 a.m. UTC | #2
Hi Marc,

kernel test robot noticed the following build errors:

[auto build test ERROR on kvmarm/next]
[also build test ERROR on arm64/for-next/core soc/for-next linus/master v6.5-rc6 next-20230809]
[cannot apply to arm/for-next arm/fixes]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Marc-Zyngier/KVM-arm64-pmu-Resync-EL0-state-on-counter-rotation/20230812-020712
base:   https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git next
patch link:    https://lore.kernel.org/r/20230811180520.131727-1-maz%40kernel.org
patch subject: [PATCH] KVM: arm64: pmu: Resync EL0 state on counter rotation
config: arm-allmodconfig (https://download.01.org/0day-ci/archive/20230814/202308141209.Xtm2r0tL-lkp@intel.com/config)
compiler: arm-linux-gnueabi-gcc (GCC) 12.3.0
reproduce: (https://download.01.org/0day-ci/archive/20230814/202308141209.Xtm2r0tL-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202308141209.Xtm2r0tL-lkp@intel.com/

All errors (new ones prefixed by >>):

   drivers/perf/arm_pmuv3.c:144:51: note: in expansion of macro 'ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD'
     144 |         [C(L1D)][C(OP_READ)][C(RESULT_MISS)]    = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD,
         |                                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:122:65: warning: initialized field overwritten [-Woverride-init]
     122 | #define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR                       0x0041
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:145:51: note: in expansion of macro 'ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR'
     145 |         [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR,
         |                                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:122:65: note: (near initialization for 'armv8_vulcan_perf_cache_map[0][1][0]')
     122 | #define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR                       0x0041
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:145:51: note: in expansion of macro 'ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR'
     145 |         [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR,
         |                                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:124:65: warning: initialized field overwritten [-Woverride-init]
     124 | #define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR                0x0043
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:146:51: note: in expansion of macro 'ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR'
     146 |         [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]   = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR,
         |                                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:124:65: note: (near initialization for 'armv8_vulcan_perf_cache_map[0][1][1]')
     124 | #define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR                0x0043
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:146:51: note: in expansion of macro 'ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR'
     146 |         [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]   = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR,
         |                                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:133:65: warning: initialized field overwritten [-Woverride-init]
     133 | #define ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD                         0x004E
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:148:51: note: in expansion of macro 'ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD'
     148 |         [C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD,
         |                                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:133:65: note: (near initialization for 'armv8_vulcan_perf_cache_map[3][0][0]')
     133 | #define ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD                         0x004E
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:148:51: note: in expansion of macro 'ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD'
     148 |         [C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD,
         |                                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:134:65: warning: initialized field overwritten [-Woverride-init]
     134 | #define ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR                         0x004F
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:149:52: note: in expansion of macro 'ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR'
     149 |         [C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR,
         |                                                    ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:134:65: note: (near initialization for 'armv8_vulcan_perf_cache_map[3][1][0]')
     134 | #define ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR                         0x004F
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:149:52: note: in expansion of macro 'ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR'
     149 |         [C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR,
         |                                                    ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:131:65: warning: initialized field overwritten [-Woverride-init]
     131 | #define ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD                  0x004C
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:150:51: note: in expansion of macro 'ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD'
     150 |         [C(DTLB)][C(OP_READ)][C(RESULT_MISS)]   = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD,
         |                                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:131:65: note: (near initialization for 'armv8_vulcan_perf_cache_map[3][0][1]')
     131 | #define ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD                  0x004C
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:150:51: note: in expansion of macro 'ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD'
     150 |         [C(DTLB)][C(OP_READ)][C(RESULT_MISS)]   = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD,
         |                                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:132:65: warning: initialized field overwritten [-Woverride-init]
     132 | #define ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR                  0x004D
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:151:51: note: in expansion of macro 'ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR'
     151 |         [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]  = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR,
         |                                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:132:65: note: (near initialization for 'armv8_vulcan_perf_cache_map[3][1][1]')
     132 | #define ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR                  0x004D
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:151:51: note: in expansion of macro 'ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR'
     151 |         [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]  = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR,
         |                                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:148:65: warning: initialized field overwritten [-Woverride-init]
     148 | #define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD                      0x0060
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:153:51: note: in expansion of macro 'ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD'
     153 |         [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD,
         |                                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:148:65: note: (near initialization for 'armv8_vulcan_perf_cache_map[6][0][0]')
     148 | #define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD                      0x0060
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:153:51: note: in expansion of macro 'ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD'
     153 |         [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD,
         |                                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:149:65: warning: initialized field overwritten [-Woverride-init]
     149 | #define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR                      0x0061
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:154:52: note: in expansion of macro 'ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR'
     154 |         [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR,
         |                                                    ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:149:65: note: (near initialization for 'armv8_vulcan_perf_cache_map[6][1][0]')
     149 | #define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR                      0x0061
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:154:52: note: in expansion of macro 'ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR'
     154 |         [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR,
         |                                                    ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   drivers/perf/arm_pmuv3.c: In function 'armv8pmu_start':
>> drivers/perf/arm_pmuv3.c:776:9: error: implicit declaration of function 'kvm_vcpu_pmu_resync_el0' [-Werror=implicit-function-declaration]
     776 |         kvm_vcpu_pmu_resync_el0();
         |         ^~~~~~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors


vim +/kvm_vcpu_pmu_resync_el0 +776 drivers/perf/arm_pmuv3.c

   758	
   759	static void armv8pmu_start(struct arm_pmu *cpu_pmu)
   760	{
   761		struct perf_event_context *ctx;
   762		int nr_user = 0;
   763	
   764		ctx = perf_cpu_task_ctx();
   765		if (ctx)
   766			nr_user = ctx->nr_user;
   767	
   768		if (sysctl_perf_user_access && nr_user)
   769			armv8pmu_enable_user_access(cpu_pmu);
   770		else
   771			armv8pmu_disable_user_access();
   772	
   773		/* Enable all counters */
   774		armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
   775	
 > 776		kvm_vcpu_pmu_resync_el0();
   777	}
   778
Leo Yan Aug. 14, 2023, 7:16 a.m. UTC | #3
On Fri, Aug 11, 2023 at 07:05:20PM +0100, Marc Zyngier wrote:
> Huang Shijie reports that, when profiling a guest from the host
> with a number of events that exceeds the number of available
> counters, the reported counts are wildly inaccurate. Without
> the counter oversubscription, the reported counts are correct.
> 
> Their investigation indicates that upon counter rotation (which
> takes place on the back of a timer interrupt), we fail to
> re-apply the guest EL0 enabling, leading to the counting of host
> events instead of guest events.

It seems to me that it's not clear why the counter rotation will cause
the issue.

In the example shared by Shijie in [1], the cycle counter is enabled for
both host and guest, and cycle counter is a dedicated event which does
not share a counter with other events.  Even if there is counter rotation,
it should not impact the cycle counter.

I mean if we cannot explain clearly for this part, we don't find the
root cause, and this patch (and Shijie's patch) just walks around the
issue.

Thanks,
Leo

[1] https://lore.kernel.org/lkml/20230810072906.4007-1-shijie@os.amperecomputing.com/
Shijie Huang Aug. 14, 2023, 8:12 a.m. UTC | #4
Hi Leo,

在 2023/8/14 15:16, Leo Yan 写道:
> On Fri, Aug 11, 2023 at 07:05:20PM +0100, Marc Zyngier wrote:
>> Huang Shijie reports that, when profiling a guest from the host
>> with a number of events that exceeds the number of available
>> counters, the reported counts are wildly inaccurate. Without
>> the counter oversubscription, the reported counts are correct.
>>
>> Their investigation indicates that upon counter rotation (which
>> takes place on the back of a timer interrupt), we fail to
>> re-apply the guest EL0 enabling, leading to the counting of host
>> events instead of guest events.
> Seems to me, it's not clear for why the counter rotation will cause
> the issue.
>
> In the example shared by Shijie in [1], the cycle counter is enabled for
> both host and guest, and cycle counter is a dedicated event which does
> not share counter with other events.  Even there have counter rotation,
> it should not impact the cycle counter.

Just take a simple case:

    perf stat -e cycles:G,cycles:H, e2,e3,e4,e5,e6,e7 ....


Assume we have 8 events, but the PMU only provides 7 counters (cycle + 6 normal)

1.) The initial:

          event 0 (cycles:G) ---> used cycle counter

          event 1 (cycles:H)  ---> used counter 1

           event 2 ---> used counter 2

           event 3 ---> used counter 3

            event 4 ---> used counter 4

            event 5 ---> used counter 5

            event 6---> used counter 6

  2.) After the event rotation, event 0 will be put at the tail of the
list; please see rotate_ctx()

In the first round, it will look like this:

          event 1(cycles:H) ---> used cycle counter

          event 2 ---> used counter 1

         event 3 ---> used counter 2

         event 4 ---> used counter 3

         event 5 ---> used counter 4

          event 6 ---> used counter 5

          event 7 ---> used counter 6


  3.) Rotate it again, and event 1 will be at the tail.

      In the second round, it will look like this:

         event 0 (cycles:G) ---> used cycle counter

        event 2 ---> used counter 1

         event 3 ---> used counter 2

         event 4 ---> used counter 3

         event 5 ---> used counter 4

          event 6 ---> used counter 5

          event 7 ---> used counter 6


  4.) Rotate it again; in the third round, it will look like this:

          event 0 (cycles:G) ---> used cycle counter

        event 3 ---> used counter 1

         event 4 ---> used counter 2

         event 5 ---> used counter 3

         event 6 ---> used counter 4

          event 7 ---> used counter 5

          event 2(cycles:H) ---> used counter 6


....

So it will impact the cycle counter..:)


Thanks

Huang Shijie
Leo Yan Aug. 14, 2023, 8:47 a.m. UTC | #5
Hi Shijie,

On Mon, Aug 14, 2023 at 04:12:23PM +0800, Shijie Huang wrote:

[...]

> > > Their investigation indicates that upon counter rotation (which
> > > takes place on the back of a timer interrupt), we fail to
> > > re-apply the guest EL0 enabling, leading to the counting of host
> > > events instead of guest events.
> >
> > Seems to me, it's not clear for why the counter rotation will cause
> > the issue.
> > 
> > In the example shared by Shijie in [1], the cycle counter is enabled for
> > both host and guest, and cycle counter is a dedicated event which does
> > not share counter with other events.  Even there have counter rotation,
> > it should not impact the cycle counter.
> 
> Just take a simple case:
> 
>    perf stat -e cycles:G,cycles:H, e2,e3,e4,e5,e6,e7 ....
> 
> 
> Assume we have 8 events, but PMU only privides 7 counters(cycle + 6 normal)

Thanks for the detailed info, now I understand it.

It seems to me, based on Marc's patch, that we need to apply the change
below.  With the code below, we don't need to change the perf core code,
and we can resolve it as a common issue for Arm PMU drivers.


diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index 121f1a14c829..8f9673cdadec 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -38,14 +38,20 @@ struct kvm_pmu_events *kvm_get_pmu_events(void)
 void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
 {
 	struct kvm_pmu_events *pmu = kvm_get_pmu_events();
+	int resync;
 
 	if (!kvm_arm_support_pmu_v3() || !pmu || !kvm_pmu_switch_needed(attr))
 		return;
 
+	resync = pmu->events_guest != set;
+
 	if (!attr->exclude_host)
 		pmu->events_host |= set;
 	if (!attr->exclude_guest)
 		pmu->events_guest |= set;
+
+	if (resync)
+		kvm_vcpu_pmu_resync_el0();
 }
 
 /*
@@ -60,6 +66,8 @@ void kvm_clr_pmu_events(u32 clr)
 
 	pmu->events_host &= ~clr;
 	pmu->events_guest &= ~clr;
+
+	kvm_vcpu_pmu_resync_el0();
 }
Shijie Huang Aug. 14, 2023, 9:15 a.m. UTC | #6
Hi Leo,

在 2023/8/14 16:47, Leo Yan 写道:
> Hi Shijie,
>
> On Mon, Aug 14, 2023 at 04:12:23PM +0800, Shijie Huang wrote:
>
> [...]
>
>>>> Their investigation indicates that upon counter rotation (which
>>>> takes place on the back of a timer interrupt), we fail to
>>>> re-apply the guest EL0 enabling, leading to the counting of host
>>>> events instead of guest events.
>>> Seems to me, it's not clear for why the counter rotation will cause
>>> the issue.
>>>
>>> In the example shared by Shijie in [1], the cycle counter is enabled for
>>> both host and guest, and cycle counter is a dedicated event which does
>>> not share counter with other events.  Even there have counter rotation,
>>> it should not impact the cycle counter.
>> Just take a simple case:
>>
>>     perf stat -e cycles:G,cycles:H, e2,e3,e4,e5,e6,e7 ....
>>
>>
>> Assume we have 8 events, but PMU only privides 7 counters(cycle + 6 normal)
> Thanks for the detailed info, now I understand it.
>
> Seems to me, based on Marc's patch, we need to apply below change.  In
> below code, we don't need to change the perf core code and we can
> resolve it as a common issue for Arm PMU drivers.

I am not sure which one is better. :)

My own latest solution is v3:

http://lists.infradead.org/pipermail/linux-arm-kernel/2023-August/859377.html


Thanks

Huang Shijie

>
> diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
> index 121f1a14c829..8f9673cdadec 100644
> --- a/arch/arm64/kvm/pmu.c
> +++ b/arch/arm64/kvm/pmu.c
> @@ -38,14 +38,20 @@ struct kvm_pmu_events *kvm_get_pmu_events(void)
>   void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
>   {
>   	struct kvm_pmu_events *pmu = kvm_get_pmu_events();
> +	int resync;
>   
>   	if (!kvm_arm_support_pmu_v3() || !pmu || !kvm_pmu_switch_needed(attr))
>   		return;
>   
> +	resync = pmu->events_guest != set;
> +
>   	if (!attr->exclude_host)
>   		pmu->events_host |= set;
>   	if (!attr->exclude_guest)
>   		pmu->events_guest |= set;
> +
> +	if (resync)
> +		kvm_vcpu_pmu_resync_el0();
>   }
>   
>   /*
> @@ -60,6 +66,8 @@ void kvm_clr_pmu_events(u32 clr)
>   
>   	pmu->events_host &= ~clr;
>   	pmu->events_guest &= ~clr;
> +
> +	kvm_vcpu_pmu_resync_el0();
>   }
>
Shijie Huang Aug. 14, 2023, 9:29 a.m. UTC | #7
Hi Leo,

在 2023/8/14 16:47, Leo Yan 写道:
> Hi Shijie,
>
> On Mon, Aug 14, 2023 at 04:12:23PM +0800, Shijie Huang wrote:
>
> [...]
>
>>>> Their investigation indicates that upon counter rotation (which
>>>> takes place on the back of a timer interrupt), we fail to
>>>> re-apply the guest EL0 enabling, leading to the counting of host
>>>> events instead of guest events.
>>> Seems to me, it's not clear for why the counter rotation will cause
>>> the issue.
>>>
>>> In the example shared by Shijie in [1], the cycle counter is enabled for
>>> both host and guest, and cycle counter is a dedicated event which does
>>> not share counter with other events.  Even there have counter rotation,
>>> it should not impact the cycle counter.
>> Just take a simple case:
>>
>>     perf stat -e cycles:G,cycles:H, e2,e3,e4,e5,e6,e7 ....
>>
>>
>> Assume we have 8 events, but PMU only privides 7 counters(cycle + 6 normal)
> Thanks for the detailed info, now I understand it.
>
> Seems to me, based on Marc's patch, we need to apply below change.  In
> below code, we don't need to change the perf core code and we can
> resolve it as a common issue for Arm PMU drivers.
>
>
> diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
> index 121f1a14c829..8f9673cdadec 100644
> --- a/arch/arm64/kvm/pmu.c
> +++ b/arch/arm64/kvm/pmu.c
> @@ -38,14 +38,20 @@ struct kvm_pmu_events *kvm_get_pmu_events(void)
>   void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
>   {
>   	struct kvm_pmu_events *pmu = kvm_get_pmu_events();
> +	int resync;
>   
>   	if (!kvm_arm_support_pmu_v3() || !pmu || !kvm_pmu_switch_needed(attr))
>   		return;
>   
> +	resync = pmu->events_guest != set;

If we set two events in the guest, the resync will be set.

For example:

            perf stat -e cycles:Gu, cycles:Gk


If so, this is not reasonable...


Thanks

Huang Shijie

> +
>   	if (!attr->exclude_host)
>   		pmu->events_host |= set;
>   	if (!attr->exclude_guest)
>   		pmu->events_guest |= set;
> +
> +	if (resync)
> +		kvm_vcpu_pmu_resync_el0();
>   }
>   
>   /*
> @@ -60,6 +66,8 @@ void kvm_clr_pmu_events(u32 clr)
>   
>   	pmu->events_host &= ~clr;
>   	pmu->events_guest &= ~clr;
> +
> +	kvm_vcpu_pmu_resync_el0();
>   }
>
Leo Yan Aug. 14, 2023, 10:01 a.m. UTC | #8
Hi Shijie,

On Mon, Aug 14, 2023 at 05:29:54PM +0800, Shijie Huang wrote:


[...]

> > Seems to me, based on Marc's patch, we need to apply below change.  In
> > below code, we don't need to change the perf core code and we can
> > resolve it as a common issue for Arm PMU drivers.
> > 
> > 
> > diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
> > index 121f1a14c829..8f9673cdadec 100644
> > --- a/arch/arm64/kvm/pmu.c
> > +++ b/arch/arm64/kvm/pmu.c
> > @@ -38,14 +38,20 @@ struct kvm_pmu_events *kvm_get_pmu_events(void)
> >   void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
> >   {
> >   	struct kvm_pmu_events *pmu = kvm_get_pmu_events();
> > +	int resync;
> >   	if (!kvm_arm_support_pmu_v3() || !pmu || !kvm_pmu_switch_needed(attr))
> >   		return;
> > +	resync = pmu->events_guest != set;
> 
> If we set two events in guest, the resync will set
> 
> For example:
> 
>            perf stat -e cycles:Gu, cycles:Gk
> 
> 
> If so, this is not reasonble...

You mean that if we set two guest events, kvm_vcpu_pmu_resync_el0() will
be invoked twice, and the second call is not reasonable, right?
I can accept this since I personally think it should not introduce
much of a performance penalty.

I understand your preference to call kvm_vcpu_pmu_resync_el0() from
the perf core layer, but this is not a common issue for all PMU events
across arches.  Furthermore, even if the perf core rotates events, it
doesn't necessarily mean we must restore events for the guest in the
case where no event is enabled for the guest.

Thanks,
Leo
Shijie Huang Aug. 15, 2023, 2:59 a.m. UTC | #9
Hi Leo,

在 2023/8/14 18:01, Leo Yan 写道:
> Hi Shijie,
>
> On Mon, Aug 14, 2023 at 05:29:54PM +0800, Shijie Huang wrote:
>
>
> [...]
>
>>> Seems to me, based on Marc's patch, we need to apply below change.  In
>>> below code, we don't need to change the perf core code and we can
>>> resolve it as a common issue for Arm PMU drivers.
>>>
>>>
>>> diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
>>> index 121f1a14c829..8f9673cdadec 100644
>>> --- a/arch/arm64/kvm/pmu.c
>>> +++ b/arch/arm64/kvm/pmu.c
>>> @@ -38,14 +38,20 @@ struct kvm_pmu_events *kvm_get_pmu_events(void)
>>>    void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
>>>    {
>>>    	struct kvm_pmu_events *pmu = kvm_get_pmu_events();
>>> +	int resync;
>>>    	if (!kvm_arm_support_pmu_v3() || !pmu || !kvm_pmu_switch_needed(attr))
>>>    		return;
>>> +	resync = pmu->events_guest != set;
>> If we set two events in guest, the resync will set
>>
>> For example:
>>
>>             perf stat -e cycles:Gu, cycles:Gk
>>
>>
>> If so, this is not reasonble...
> You mean if set two guest events, the kvm_vcpu_pmu_resync_el0() will
> be invoked twice, and the second calling is not reasonable, right?

IMHO, even the first time is not reasonable. why call 
kvm_vcpu_pmu_resync_el0() when event rotation

does not happen?



> I can accept this since I personally think this should not introduce
> much performance penalty.
>
> I understand your preference to call kvm_vcpu_pmu_resync_el0() from
> perf core layer, but this is not a common issue for all PMU events and
> crossing arches.  Furthermore, even perf core rotates events, it's not

If we can find a better way to fix it in PMU code, I am okay too. :)

I tried to fix it in PMU code, but I am not satisfied with it.


> necessarily mean we must restore events for guest in the case there
> have no event is enabled for guest.

Not only for events in guest, but also for the events in the host too.

In the kvm_vcpu_pmu_restore_guest(), it also disable the EL0 for host 
events.


Thanks

Huang Shijie

>
> Thanks,
> Leo
Marc Zyngier Aug. 15, 2023, 6:27 a.m. UTC | #10
On Mon, 14 Aug 2023 03:58:32 +0100,
Shijie Huang <shijie@amperemail.onmicrosoft.com> wrote:
> 
> Hi Marc,
> 
> 在 2023/8/12 2:05, Marc Zyngier 写道:
> > Huang Shijie reports that, when profiling a guest from the host
> > with a number of events that exceeds the number of available
> > counters, the reported counts are wildly inaccurate. Without
> > the counter oversubscription, the reported counts are correct.
> > 
> > Their investigation indicates that upon counter rotation (which
> > takes place on the back of a timer interrupt), we fail to
> > re-apply the guest EL0 enabling, leading to the counting of host
> > events instead of guest events.
> > 
> > In order to solve this, add yet another hook between the host PMU
> > driver and KVM, re-applying the guest EL0 configuration if the
> > right conditions apply (the host is VHE, we are in interrupt
> > context, and we interrupted a running vcpu). This triggers a new
> > vcpu request which will apply the correct configuration on guest
> > reentry.
> > 
> > With this, we have the correct counts, even when the counters are
> > oversubscribed.
> > 
> > Reported-by: Huang Shijie <shijie@os.amperecomputing.com>
> > Suggested-by: Oliver Upton <oliver.upton@linux.dev>
> > Signed-off-by: Marc Zyngier <maz@kernel.org>
> > Cc: Mark Rutland <mark.rutland@arm.com>
> > Cc: Will Deacon <will@kernel.org>
> > Link: https://lore.kernel.org/r/20230809013953.7692-1-shijie@os.amperecomputing.com
> > ---
> >   arch/arm64/include/asm/kvm_host.h |  1 +
> >   arch/arm64/kvm/arm.c              |  3 +++
> >   arch/arm64/kvm/pmu.c              | 18 ++++++++++++++++++
> >   drivers/perf/arm_pmuv3.c          |  2 ++
> >   include/kvm/arm_pmu.h             |  2 ++
> >   5 files changed, 26 insertions(+)
> > 
> > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > index d3dd05bbfe23..553040e0e375 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -49,6 +49,7 @@
> >   #define KVM_REQ_RELOAD_GICv4	KVM_ARCH_REQ(4)
> >   #define KVM_REQ_RELOAD_PMU	KVM_ARCH_REQ(5)
> >   #define KVM_REQ_SUSPEND		KVM_ARCH_REQ(6)
> > +#define KVM_REQ_RESYNC_PMU_EL0	KVM_ARCH_REQ(7)
> >     #define KVM_DIRTY_LOG_MANUAL_CAPS
> > (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
> >   				     KVM_DIRTY_LOG_INITIALLY_SET)
> > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> > index 72dc53a75d1c..978b0411082f 100644
> > --- a/arch/arm64/kvm/arm.c
> > +++ b/arch/arm64/kvm/arm.c
> > @@ -803,6 +803,9 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu)
> >   			kvm_pmu_handle_pmcr(vcpu,
> >   					    __vcpu_sys_reg(vcpu, PMCR_EL0));
> >   +		if (kvm_check_request(KVM_REQ_RESYNC_PMU_EL0, vcpu))
> > +			kvm_vcpu_pmu_restore_guest(vcpu);
> > +
> >   		if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
> >   			return kvm_vcpu_suspend(vcpu);
> >   diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
> > index 121f1a14c829..0eea225fd09a 100644
> > --- a/arch/arm64/kvm/pmu.c
> > +++ b/arch/arm64/kvm/pmu.c
> > @@ -236,3 +236,21 @@ bool kvm_set_pmuserenr(u64 val)
> >   	ctxt_sys_reg(hctxt, PMUSERENR_EL0) = val;
> >   	return true;
> >   }
> > +
> > +/*
> > + * If we interrupted the guest to update the host PMU context, make
> > + * sure we re-apply the guest EL0 state.
> > + */
> > +void kvm_vcpu_pmu_resync_el0(void)
> > +{
> > +	struct kvm_vcpu *vcpu;
> > +
> > +	if (!has_vhe() || !in_interrupt())
> > +		return;
> > +
> > +	vcpu = kvm_get_running_vcpu();
> > +	if (!vcpu)
> > +		return;
> > +
> > +	kvm_make_request(KVM_REQ_RESYNC_PMU_EL0, vcpu);
> > +}
> > diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
> > index 08b3a1bf0ef6..6a3d8176f54a 100644
> > --- a/drivers/perf/arm_pmuv3.c
> > +++ b/drivers/perf/arm_pmuv3.c
> > @@ -772,6 +772,8 @@ static void armv8pmu_start(struct arm_pmu *cpu_pmu)
> >     	/* Enable all counters */
> >   	armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
> > +
> > +	kvm_vcpu_pmu_resync_el0();
> >   }
> 
> I read the perf code again, and find it maybe not good to do it in
> armv8pmu_start.
> 
>    Assume we install a new perf event to a CPU "x" from CPU 0,  a VM
> guest is running on CPU "x":
> 
>     perf_event_open() --> perf_install_in_context() -->
> 
>     call this function in  IPI interrupt: ___perf_install_in_context().
> 
>    armv8pmu_start() will be called in ___perf_install_in_context() in IPI.
> 
>    so kvm_vcpu_pmu_resync_el0() will _make_ a request by meeting the
> conditions:
> 
>              1.) in interrupt context.
> 
>              2.) a guest is running on this CPU.
> 
> 
> But in actually, the request should not send out.

Why shouldn't it be applied? This isn't supposed to be always
necessary, but it needs to be applied whenever there is a possibility
for counters to be updated behind our back, something that is a pretty
rare event anyway.

> Please correct me if I am wrong.
> 
> IMHO, the best solution is add  a hook in the perf/core code, and make
> the request there.

I disagree. I'm still completely opposed to anything that will add
such a hook in the core perf code, specially as a weak symbol. The
interactions must be strictly between the PMUv3 driver and KVM,
because they are the only parties involved here.

I will *not* take such a patch.

	M.
Marc Zyngier Aug. 15, 2023, 6:32 a.m. UTC | #11
On Mon, 14 Aug 2023 08:16:27 +0100,
Leo Yan <leo.yan@linaro.org> wrote:
> 
> On Fri, Aug 11, 2023 at 07:05:20PM +0100, Marc Zyngier wrote:
> > Huang Shijie reports that, when profiling a guest from the host
> > with a number of events that exceeds the number of available
> > counters, the reported counts are wildly inaccurate. Without
> > the counter oversubscription, the reported counts are correct.
> > 
> > Their investigation indicates that upon counter rotation (which
> > takes place on the back of a timer interrupt), we fail to
> > re-apply the guest EL0 enabling, leading to the counting of host
> > events instead of guest events.
> 
> Seems to me, it's not clear for why the counter rotation will cause
> the issue.

Maybe unclear to you, but rather clear to me (and most people else on
Cc).

> 
> In the example shared by Shijie in [1], the cycle counter is enabled
> for both host and guest

No. You're misreading the example. We're profiling the guest from the
host, and the guest has no PMU access.

> and cycle counter is a dedicated event
> which does not share counter with other events.  Even there have
> counter rotation, it should not impact the cycle counter.

Who says that we're counting cycles using the cycle counter? This is
an event like any other, and it can be counted on any counter.

> 
> I mean if we cannot explain clearly for this part, we don't find the
> root cause, and this patch (and Shijie's patch) just walks around the
> issue.

We have the root cause. You just need to think a bit harder.

	M.
Shijie Huang Aug. 15, 2023, 7:24 a.m. UTC | #12
Hi Marc,

在 2023/8/15 14:27, Marc Zyngier 写道:
> On Mon, 14 Aug 2023 03:58:32 +0100,
> Shijie Huang <shijie@amperemail.onmicrosoft.com> wrote:
>> Hi Marc,
>>
>> 在 2023/8/12 2:05, Marc Zyngier 写道:
>>> Huang Shijie reports that, when profiling a guest from the host
>>> with a number of events that exceeds the number of available
>>> counters, the reported counts are wildly inaccurate. Without
>>> the counter oversubscription, the reported counts are correct.
>>>
>>> Their investigation indicates that upon counter rotation (which
>>> takes place on the back of a timer interrupt), we fail to
>>> re-apply the guest EL0 enabling, leading to the counting of host
>>> events instead of guest events.
>>>
>>> In order to solve this, add yet another hook between the host PMU
>>> driver and KVM, re-applying the guest EL0 configuration if the
>>> right conditions apply (the host is VHE, we are in interrupt
>>> context, and we interrupted a running vcpu). This triggers a new
>>> vcpu request which will apply the correct configuration on guest
>>> reentry.
>>>
>>> With this, we have the correct counts, even when the counters are
>>> oversubscribed.
>>>
>>> Reported-by: Huang Shijie <shijie@os.amperecomputing.com>
>>> Suggested-by: Oliver Upton <oliver.upton@linux.dev>
>>> Signed-off-by: Marc Zyngier <maz@kernel.org>
>>> Cc: Mark Rutland <mark.rutland@arm.com>
>>> Cc: Will Deacon <will@kernel.org>
>>> Link: https://lore.kernel.org/r/20230809013953.7692-1-shijie@os.amperecomputing.com
>>> ---
>>>    arch/arm64/include/asm/kvm_host.h |  1 +
>>>    arch/arm64/kvm/arm.c              |  3 +++
>>>    arch/arm64/kvm/pmu.c              | 18 ++++++++++++++++++
>>>    drivers/perf/arm_pmuv3.c          |  2 ++
>>>    include/kvm/arm_pmu.h             |  2 ++
>>>    5 files changed, 26 insertions(+)
>>>
>>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>>> index d3dd05bbfe23..553040e0e375 100644
>>> --- a/arch/arm64/include/asm/kvm_host.h
>>> +++ b/arch/arm64/include/asm/kvm_host.h
>>> @@ -49,6 +49,7 @@
>>>    #define KVM_REQ_RELOAD_GICv4	KVM_ARCH_REQ(4)
>>>    #define KVM_REQ_RELOAD_PMU	KVM_ARCH_REQ(5)
>>>    #define KVM_REQ_SUSPEND		KVM_ARCH_REQ(6)
>>> +#define KVM_REQ_RESYNC_PMU_EL0	KVM_ARCH_REQ(7)
>>>      #define KVM_DIRTY_LOG_MANUAL_CAPS
>>> (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
>>>    				     KVM_DIRTY_LOG_INITIALLY_SET)
>>> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
>>> index 72dc53a75d1c..978b0411082f 100644
>>> --- a/arch/arm64/kvm/arm.c
>>> +++ b/arch/arm64/kvm/arm.c
>>> @@ -803,6 +803,9 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu)
>>>    			kvm_pmu_handle_pmcr(vcpu,
>>>    					    __vcpu_sys_reg(vcpu, PMCR_EL0));
>>>    +		if (kvm_check_request(KVM_REQ_RESYNC_PMU_EL0, vcpu))
>>> +			kvm_vcpu_pmu_restore_guest(vcpu);
>>> +
>>>    		if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
>>>    			return kvm_vcpu_suspend(vcpu);
>>>    diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
>>> index 121f1a14c829..0eea225fd09a 100644
>>> --- a/arch/arm64/kvm/pmu.c
>>> +++ b/arch/arm64/kvm/pmu.c
>>> @@ -236,3 +236,21 @@ bool kvm_set_pmuserenr(u64 val)
>>>    	ctxt_sys_reg(hctxt, PMUSERENR_EL0) = val;
>>>    	return true;
>>>    }
>>> +
>>> +/*
>>> + * If we interrupted the guest to update the host PMU context, make
>>> + * sure we re-apply the guest EL0 state.
>>> + */
>>> +void kvm_vcpu_pmu_resync_el0(void)
>>> +{
>>> +	struct kvm_vcpu *vcpu;
>>> +
>>> +	if (!has_vhe() || !in_interrupt())
>>> +		return;
>>> +
>>> +	vcpu = kvm_get_running_vcpu();
>>> +	if (!vcpu)
>>> +		return;
>>> +
>>> +	kvm_make_request(KVM_REQ_RESYNC_PMU_EL0, vcpu);
>>> +}
>>> diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
>>> index 08b3a1bf0ef6..6a3d8176f54a 100644
>>> --- a/drivers/perf/arm_pmuv3.c
>>> +++ b/drivers/perf/arm_pmuv3.c
>>> @@ -772,6 +772,8 @@ static void armv8pmu_start(struct arm_pmu *cpu_pmu)
>>>      	/* Enable all counters */
>>>    	armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
>>> +
>>> +	kvm_vcpu_pmu_resync_el0();
>>>    }
>> I read the perf code again, and find it maybe not good to do it in
>> armv8pmu_start.
>>
>>     Assume we install a new perf event to a CPU "x" from CPU 0,  a VM
>> guest is running on CPU "x":
>>
>>      perf_event_open() --> perf_install_in_context() -->
>>
>>      call this function in  IPI interrupt: ___perf_install_in_context().
>>
>>     armv8pmu_start() will be called in ___perf_install_in_context() in IPI.
>>
>>     so kvm_vcpu_pmu_resync_el0() will _make_ a request by meeting the
>> conditions:
>>
>>               1.) in interrupt context.
>>
>>               2.) a guest is running on this CPU.
>>
>>
>> But in actually, the request should not send out.
> Why shouldn't it be applied? This isn't supposed to be always
> necessary, but it needs to be applied whenever there is a possibility
> for counters to be updated behind our back, something that is a pretty
> rare event anyway.

okay.


>> Please correct me if I am wrong.
>>
>> IMHO, the best solution is add  a hook in the perf/core code, and make
>> the request there.
> I disagree. I'm still completely opposed to anything that will add
> such a hook in the core perf code, specially as a weak symbol. The
> interactions must be strictly between the PMUv3 driver and KVM,
> because they are the only parties involved here.
>
> I will *not* take such a patch.

okay, please ignore my v3 patch.

Tested-by: Huang Shijie <shijie@os.amperecomputing.com>


Thanks

Huang Shijie

>
> 	M.
>
Leo Yan Aug. 16, 2023, 3:04 a.m. UTC | #13
On Tue, Aug 15, 2023 at 07:32:40AM +0100, Marc Zyngier wrote:
> On Mon, 14 Aug 2023 08:16:27 +0100,
> Leo Yan <leo.yan@linaro.org> wrote:
> > 
> > On Fri, Aug 11, 2023 at 07:05:20PM +0100, Marc Zyngier wrote:
> > > Huang Shijie reports that, when profiling a guest from the host
> > > with a number of events that exceeds the number of available
> > > counters, the reported counts are wildly inaccurate. Without
> > > the counter oversubscription, the reported counts are correct.
> > > 
> > > Their investigation indicates that upon counter rotation (which
> > > takes place on the back of a timer interrupt), we fail to
> > > re-apply the guest EL0 enabling, leading to the counting of host
> > > events instead of guest events.
> > 
> > Seems to me, it's not clear for why the counter rotation will cause
> > the issue.
> 
> Maybe unclear to you, but rather clear to me (and most people else on
> Cc).

I have to admit this is true.

> > In the example shared by Shijie in [1], the cycle counter is enabled
> > for both host and guest
> 
> No. You're misreading the example. We're profiling the guest from the
> host, and the guest has no PMU access.
> 
> > and cycle counter is a dedicated event
> > which does not share counter with other events.  Even there have
> > counter rotation, it should not impact the cycle counter.
> 
> Who says that we're counting cycles using the cycle counter? This is
> an event like any other, and it can be counted on any counter.

Sorry for noise.

> > I mean if we cannot explain clearly for this part, we don't find the
> > root cause, and this patch (and Shijie's patch) just walks around the
> > issue.
> 
> We have the root cause. You just need to think a bit harder.

Let me elaborate a bit more for my concern.  The question is how we can
know exactly whether the host and the guest have different counter
enabling?

Shijie's patch relies on perf event rotation to trigger syncing for
PMU PMEVTYPER and PMCCFILTR registers.  The perf event rotation will
enable and disable some events, but it doesn't mean the host and the
guest enable different counters.  If we use the perf event rotation to
trigger syncing, it must introduce redundant operations.

In your patch, it resyncs the PMU registers in the function
armv8pmu_start(), this function is invoked not only when start PMU
event, it also is invoked in PMU interrupt handler (see
armv8pmu_handle_irq()), this also will lead to redundant syncing if
we use the perf record command for PMU event sampling:

  perf record -e cycles:G,cycles:H -d -d -- sleep 10

This is why I think we should trigger the syncing in the function
kvm_set_pmu_events(), where we can know exactly the event mismatching
between the host and the guest.  At the beginning it has checked the
difference between the host and the guest by calling
kvm_pmu_switch_needed(attr), thus we don't need to add more condition
checking and directly call kvm_vcpu_pmu_resync_el0().

diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index 121f1a14c829..99adcdbb6a5d 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -46,6 +46,12 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
                pmu->events_host |= set;
        if (!attr->exclude_guest)
                pmu->events_guest |= set;
+
+       /*
+        * The host and the guest enable different events for EL0,
+        * resync it.
+        */
+       kvm_vcpu_pmu_resync_el0();
 }

Thanks,
Leo
Shijie Huang Aug. 16, 2023, 3:31 a.m. UTC | #14
Hi Leo,

在 2023/8/16 11:04, Leo Yan 写道:
> On Tue, Aug 15, 2023 at 07:32:40AM +0100, Marc Zyngier wrote:
>> On Mon, 14 Aug 2023 08:16:27 +0100,
>> Leo Yan <leo.yan@linaro.org> wrote:
>>> On Fri, Aug 11, 2023 at 07:05:20PM +0100, Marc Zyngier wrote:
>>>> Huang Shijie reports that, when profiling a guest from the host
>>>> with a number of events that exceeds the number of available
>>>> counters, the reported counts are wildly inaccurate. Without
>>>> the counter oversubscription, the reported counts are correct.
>>>>
>>>> Their investigation indicates that upon counter rotation (which
>>>> takes place on the back of a timer interrupt), we fail to
>>>> re-apply the guest EL0 enabling, leading to the counting of host
>>>> events instead of guest events.
>>> Seems to me, it's not clear for why the counter rotation will cause
>>> the issue.
>> Maybe unclear to you, but rather clear to me (and most people else on
>> Cc).
> I have to admit this is true.
>
>>> In the example shared by Shijie in [1], the cycle counter is enabled
>>> for both host and guest
>> No. You're misreading the example. We're profiling the guest from the
>> host, and the guest has no PMU access.
>>
>>> and cycle counter is a dedicated event
>>> which does not share counter with other events.  Even there have
>>> counter rotation, it should not impact the cycle counter.
>> Who says that we're counting cycles using the cycle counter? This is
>> an event like any other, and it can be counted on any counter.
> Sorry for noise.
>
>>> I mean if we cannot explain clearly for this part, we don't find the
>>> root cause, and this patch (and Shijie's patch) just walks around the
>>> issue.
>> We have the root cause. You just need to think a bit harder.
> Let me elaborate a bit more for my concern.  The question is how we can
> know exactly whether the host and the guest have different counter
> enabling?
>
> Shijie's patch relies on perf event rotation to trigger syncing for
> PMU PMEVTYPER and PMCCFILTR registers.  The perf event rotation will
> enable and disable some events, but it doesn't mean the host and the
> guest enable different counters.  If we use the perf event rotation to

In the event rotation, it does mean the host and guest use different 
counters.

Please read my previoust email which shows an example. :)


Thanks

Huang Shijie


> trigger syncing, there must introduce redundant operations.
>
>
kernel test robot Aug. 16, 2023, 9:15 p.m. UTC | #15
Hi Marc,

kernel test robot noticed the following build errors:

[auto build test ERROR on kvmarm/next]
[also build test ERROR on arm/for-next arm64/for-next/core soc/for-next linus/master arm/fixes v6.5-rc6 next-20230816]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Marc-Zyngier/KVM-arm64-pmu-Resync-EL0-state-on-counter-rotation/20230812-020712
base:   https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git next
patch link:    https://lore.kernel.org/r/20230811180520.131727-1-maz%40kernel.org
patch subject: [PATCH] KVM: arm64: pmu: Resync EL0 state on counter rotation
config: arm-randconfig-r046-20230817 (https://download.01.org/0day-ci/archive/20230817/202308170409.yogZXrWD-lkp@intel.com/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a)
reproduce: (https://download.01.org/0day-ci/archive/20230817/202308170409.yogZXrWD-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202308170409.yogZXrWD-lkp@intel.com/

All errors (new ones prefixed by >>):

         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:141:2: note: previous initialization is here
     141 |         PERF_CACHE_MAP_ALL_UNSUPPORTED,
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmu.h:45:31: note: expanded from macro 'PERF_CACHE_MAP_ALL_UNSUPPORTED'
      45 |                 [0 ... C(RESULT_MAX) - 1] = CACHE_OP_UNSUPPORTED,       \
         |                                             ^~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmu.h:37:31: note: expanded from macro 'CACHE_OP_UNSUPPORTED'
      37 | #define CACHE_OP_UNSUPPORTED            0xFFFF
         |                                         ^~~~~~
   drivers/perf/arm_pmuv3.c:148:44: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
     148 |         [C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD,
         |                                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:133:44: note: expanded from macro 'ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD'
     133 | #define ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD                         0x004E
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:141:2: note: previous initialization is here
     141 |         PERF_CACHE_MAP_ALL_UNSUPPORTED,
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmu.h:45:31: note: expanded from macro 'PERF_CACHE_MAP_ALL_UNSUPPORTED'
      45 |                 [0 ... C(RESULT_MAX) - 1] = CACHE_OP_UNSUPPORTED,       \
         |                                             ^~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmu.h:37:31: note: expanded from macro 'CACHE_OP_UNSUPPORTED'
      37 | #define CACHE_OP_UNSUPPORTED            0xFFFF
         |                                         ^~~~~~
   drivers/perf/arm_pmuv3.c:149:45: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
     149 |         [C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR,
         |                                                    ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:134:44: note: expanded from macro 'ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR'
     134 | #define ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR                         0x004F
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:141:2: note: previous initialization is here
     141 |         PERF_CACHE_MAP_ALL_UNSUPPORTED,
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmu.h:45:31: note: expanded from macro 'PERF_CACHE_MAP_ALL_UNSUPPORTED'
      45 |                 [0 ... C(RESULT_MAX) - 1] = CACHE_OP_UNSUPPORTED,       \
         |                                             ^~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmu.h:37:31: note: expanded from macro 'CACHE_OP_UNSUPPORTED'
      37 | #define CACHE_OP_UNSUPPORTED            0xFFFF
         |                                         ^~~~~~
   drivers/perf/arm_pmuv3.c:150:42: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
     150 |         [C(DTLB)][C(OP_READ)][C(RESULT_MISS)]   = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD,
         |                                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:131:50: note: expanded from macro 'ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD'
     131 | #define ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD                  0x004C
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:141:2: note: previous initialization is here
     141 |         PERF_CACHE_MAP_ALL_UNSUPPORTED,
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmu.h:45:31: note: expanded from macro 'PERF_CACHE_MAP_ALL_UNSUPPORTED'
      45 |                 [0 ... C(RESULT_MAX) - 1] = CACHE_OP_UNSUPPORTED,       \
         |                                             ^~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmu.h:37:31: note: expanded from macro 'CACHE_OP_UNSUPPORTED'
      37 | #define CACHE_OP_UNSUPPORTED            0xFFFF
         |                                         ^~~~~~
   drivers/perf/arm_pmuv3.c:151:43: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
     151 |         [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]  = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR,
         |                                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:132:50: note: expanded from macro 'ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR'
     132 | #define ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR                  0x004D
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:141:2: note: previous initialization is here
     141 |         PERF_CACHE_MAP_ALL_UNSUPPORTED,
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmu.h:45:31: note: expanded from macro 'PERF_CACHE_MAP_ALL_UNSUPPORTED'
      45 |                 [0 ... C(RESULT_MAX) - 1] = CACHE_OP_UNSUPPORTED,       \
         |                                             ^~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmu.h:37:31: note: expanded from macro 'CACHE_OP_UNSUPPORTED'
      37 | #define CACHE_OP_UNSUPPORTED            0xFFFF
         |                                         ^~~~~~
   drivers/perf/arm_pmuv3.c:153:44: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
     153 |         [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD,
         |                                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:148:46: note: expanded from macro 'ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD'
     148 | #define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD                      0x0060
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:141:2: note: previous initialization is here
     141 |         PERF_CACHE_MAP_ALL_UNSUPPORTED,
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmu.h:45:31: note: expanded from macro 'PERF_CACHE_MAP_ALL_UNSUPPORTED'
      45 |                 [0 ... C(RESULT_MAX) - 1] = CACHE_OP_UNSUPPORTED,       \
         |                                             ^~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmu.h:37:31: note: expanded from macro 'CACHE_OP_UNSUPPORTED'
      37 | #define CACHE_OP_UNSUPPORTED            0xFFFF
         |                                         ^~~~~~
   drivers/perf/arm_pmuv3.c:154:45: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
     154 |         [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR,
         |                                                    ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmuv3.h:149:46: note: expanded from macro 'ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR'
     149 | #define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR                      0x0061
         |                                                                 ^~~~~~
   drivers/perf/arm_pmuv3.c:141:2: note: previous initialization is here
     141 |         PERF_CACHE_MAP_ALL_UNSUPPORTED,
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmu.h:45:31: note: expanded from macro 'PERF_CACHE_MAP_ALL_UNSUPPORTED'
      45 |                 [0 ... C(RESULT_MAX) - 1] = CACHE_OP_UNSUPPORTED,       \
         |                                             ^~~~~~~~~~~~~~~~~~~~
   include/linux/perf/arm_pmu.h:37:31: note: expanded from macro 'CACHE_OP_UNSUPPORTED'
      37 | #define CACHE_OP_UNSUPPORTED            0xFFFF
         |                                         ^~~~~~
>> drivers/perf/arm_pmuv3.c:776:2: error: call to undeclared function 'kvm_vcpu_pmu_resync_el0'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
     776 |         kvm_vcpu_pmu_resync_el0();
         |         ^
   55 warnings and 1 error generated.


vim +/kvm_vcpu_pmu_resync_el0 +776 drivers/perf/arm_pmuv3.c

   758	
   759	static void armv8pmu_start(struct arm_pmu *cpu_pmu)
   760	{
   761		struct perf_event_context *ctx;
   762		int nr_user = 0;
   763	
   764		ctx = perf_cpu_task_ctx();
   765		if (ctx)
   766			nr_user = ctx->nr_user;
   767	
   768		if (sysctl_perf_user_access && nr_user)
   769			armv8pmu_enable_user_access(cpu_pmu);
   770		else
   771			armv8pmu_disable_user_access();
   772	
   773		/* Enable all counters */
   774		armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
   775	
 > 776		kvm_vcpu_pmu_resync_el0();
   777	}
   778
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index d3dd05bbfe23..553040e0e375 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -49,6 +49,7 @@ 
 #define KVM_REQ_RELOAD_GICv4	KVM_ARCH_REQ(4)
 #define KVM_REQ_RELOAD_PMU	KVM_ARCH_REQ(5)
 #define KVM_REQ_SUSPEND		KVM_ARCH_REQ(6)
+#define KVM_REQ_RESYNC_PMU_EL0	KVM_ARCH_REQ(7)
 
 #define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
 				     KVM_DIRTY_LOG_INITIALLY_SET)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 72dc53a75d1c..978b0411082f 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -803,6 +803,9 @@  static int check_vcpu_requests(struct kvm_vcpu *vcpu)
 			kvm_pmu_handle_pmcr(vcpu,
 					    __vcpu_sys_reg(vcpu, PMCR_EL0));
 
+		if (kvm_check_request(KVM_REQ_RESYNC_PMU_EL0, vcpu))
+			kvm_vcpu_pmu_restore_guest(vcpu);
+
 		if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
 			return kvm_vcpu_suspend(vcpu);
 
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index 121f1a14c829..0eea225fd09a 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -236,3 +236,21 @@  bool kvm_set_pmuserenr(u64 val)
 	ctxt_sys_reg(hctxt, PMUSERENR_EL0) = val;
 	return true;
 }
+
+/*
+ * If we interrupted the guest to update the host PMU context, make
+ * sure we re-apply the guest EL0 state.
+ */
+void kvm_vcpu_pmu_resync_el0(void)
+{
+	struct kvm_vcpu *vcpu;
+
+	if (!has_vhe() || !in_interrupt())
+		return;
+
+	vcpu = kvm_get_running_vcpu();
+	if (!vcpu)
+		return;
+
+	kvm_make_request(KVM_REQ_RESYNC_PMU_EL0, vcpu);
+}
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 08b3a1bf0ef6..6a3d8176f54a 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -772,6 +772,8 @@  static void armv8pmu_start(struct arm_pmu *cpu_pmu)
 
 	/* Enable all counters */
 	armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
+
+	kvm_vcpu_pmu_resync_el0();
 }
 
 static void armv8pmu_stop(struct arm_pmu *cpu_pmu)
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 847da6fc2713..3a8a70a60794 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -74,6 +74,7 @@  int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu);
 struct kvm_pmu_events *kvm_get_pmu_events(void);
 void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu);
 void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu);
+void kvm_vcpu_pmu_resync_el0(void);
 
 #define kvm_vcpu_has_pmu(vcpu)					\
 	(test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
@@ -171,6 +172,7 @@  static inline u8 kvm_arm_pmu_get_pmuver_limit(void)
 {
 	return 0;
 }
+static inline void kvm_vcpu_pmu_resync_el0(void) {}
 
 #endif