diff mbox series

[PATCHv3] perf kvm: add kvm-stat for arm64

Message ID 20200917003645.689665-1-sergey.senozhatsky@gmail.com (mailing list archive)
State New, archived
Headers show
Series [PATCHv3] perf kvm: add kvm-stat for arm64 | expand

Commit Message

Sergey Senozhatsky Sept. 17, 2020, 12:36 a.m. UTC
Add support for perf kvm stat on arm64 platform.

Example:
 # perf kvm stat report

Analyze events for all VMs, all VCPUs:

    VM-EXIT    Samples  Samples%     Time%    Min Time    Max Time         Avg time

   DABT_LOW     661867    98.91%    40.45%      2.19us   3364.65us      6.24us ( +-   0.34% )
        IRQ       4598     0.69%    57.44%      2.89us   3397.59us   1276.27us ( +-   1.61% )
        WFx       1475     0.22%     1.71%      2.22us   3388.63us    118.31us ( +-   8.69% )
   IABT_LOW       1018     0.15%     0.38%      2.22us   2742.07us     38.29us ( +-  12.55% )
      SYS64        180     0.03%     0.01%      2.07us    112.91us      6.57us ( +-  14.95% )
      HVC64         17     0.00%     0.01%      2.19us    322.35us     42.95us ( +-  58.98% )

Total Samples:669155, Total events handled time:10216387.86us.

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
---
v3: report ARM_EXCEPTION_IL exceptions (Leo)
v2: reworked the patch after offline discussion with Suleiman

 tools/perf/arch/arm64/Makefile                |  1 +
 tools/perf/arch/arm64/util/Build              |  1 +
 .../arch/arm64/util/arm64_exception_types.h   | 92 +++++++++++++++++++
 tools/perf/arch/arm64/util/kvm-stat.c         | 86 +++++++++++++++++
 4 files changed, 180 insertions(+)
 create mode 100644 tools/perf/arch/arm64/util/arm64_exception_types.h
 create mode 100644 tools/perf/arch/arm64/util/kvm-stat.c

Comments

Leo Yan Sept. 17, 2020, 8:47 a.m. UTC | #1
On Thu, Sep 17, 2020 at 09:36:45AM +0900, Sergey Senozhatsky wrote:
> Add support for perf kvm stat on arm64 platform.
> 
> Example:
>  # perf kvm stat report
> 
> Analyze events for all VMs, all VCPUs:
> 
>     VM-EXIT    Samples  Samples%     Time%    Min Time    Max Time         Avg time
> 
>    DABT_LOW     661867    98.91%    40.45%      2.19us   3364.65us      6.24us ( +-   0.34% )
>         IRQ       4598     0.69%    57.44%      2.89us   3397.59us   1276.27us ( +-   1.61% )
>         WFx       1475     0.22%     1.71%      2.22us   3388.63us    118.31us ( +-   8.69% )
>    IABT_LOW       1018     0.15%     0.38%      2.22us   2742.07us     38.29us ( +-  12.55% )
>       SYS64        180     0.03%     0.01%      2.07us    112.91us      6.57us ( +-  14.95% )
>       HVC64         17     0.00%     0.01%      2.19us    322.35us     42.95us ( +-  58.98% )
> 
> Total Samples:669155, Total events handled time:10216387.86us.
> 
> Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
> ---
> v3: report ARM_EXCEPTION_IL exceptions (Leo)
> v2: reworked the patch after offline discussion with Suleiman
> 
>  tools/perf/arch/arm64/Makefile                |  1 +
>  tools/perf/arch/arm64/util/Build              |  1 +
>  .../arch/arm64/util/arm64_exception_types.h   | 92 +++++++++++++++++++
>  tools/perf/arch/arm64/util/kvm-stat.c         | 86 +++++++++++++++++
>  4 files changed, 180 insertions(+)
>  create mode 100644 tools/perf/arch/arm64/util/arm64_exception_types.h
>  create mode 100644 tools/perf/arch/arm64/util/kvm-stat.c
> 
> diff --git a/tools/perf/arch/arm64/Makefile b/tools/perf/arch/arm64/Makefile
> index dbef716a1913..fab3095fb5d0 100644
> --- a/tools/perf/arch/arm64/Makefile
> +++ b/tools/perf/arch/arm64/Makefile
> @@ -4,6 +4,7 @@ PERF_HAVE_DWARF_REGS := 1
>  endif
>  PERF_HAVE_JITDUMP := 1
>  PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
> +HAVE_KVM_STAT_SUPPORT := 1
>  
>  #
>  # Syscall table generation for perf
> diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build
> index 5c13438c7bd4..4cba12f4b741 100644
> --- a/tools/perf/arch/arm64/util/Build
> +++ b/tools/perf/arch/arm64/util/Build
> @@ -1,6 +1,7 @@
>  perf-y += header.o
>  perf-y += machine.o
>  perf-y += perf_regs.o
> +perf-y += kvm-stat.o
>  perf-$(CONFIG_DWARF)     += dwarf-regs.o
>  perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
>  perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
> diff --git a/tools/perf/arch/arm64/util/arm64_exception_types.h b/tools/perf/arch/arm64/util/arm64_exception_types.h
> new file mode 100644
> index 000000000000..27c981ebe401
> --- /dev/null
> +++ b/tools/perf/arch/arm64/util/arm64_exception_types.h
> @@ -0,0 +1,92 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#ifndef ARCH_PERF_ARM64_EXCEPTION_TYPES_H
> +#define ARCH_PERF_ARM64_EXCEPTION_TYPES_H
> +
> +/* Per asm/virt.h */
> +#define HVC_STUB_ERR		  0xbadca11
> +
> +/* Per asm/kvm_asm.h */
> +#define ARM_EXCEPTION_IRQ		0
> +#define ARM_EXCEPTION_EL1_SERROR	1
> +#define ARM_EXCEPTION_TRAP		2
> +#define ARM_EXCEPTION_IL		3
> +/* The hyp-stub will return this for any kvm_call_hyp() call */
> +#define ARM_EXCEPTION_HYP_GONE		HVC_STUB_ERR
> +
> +#define kvm_arm_exception_type					\
> +	{ARM_EXCEPTION_IRQ,		"IRQ"		},	\
> +	{ARM_EXCEPTION_EL1_SERROR,	"SERROR"	},	\
> +	{ARM_EXCEPTION_TRAP,		"TRAP"		},	\
> +	{ARM_EXCEPTION_IL,		"ILLEGAL"	},	\
> +	{ARM_EXCEPTION_HYP_GONE,	"HYP_GONE"	}
> +
> +/* Per asm/esr.h */
> +#define ESR_ELx_EC_UNKNOWN	(0x00)
> +#define ESR_ELx_EC_WFx		(0x01)
> +/* Unallocated EC: 0x02 */
> +#define ESR_ELx_EC_CP15_32	(0x03)
> +#define ESR_ELx_EC_CP15_64	(0x04)
> +#define ESR_ELx_EC_CP14_MR	(0x05)
> +#define ESR_ELx_EC_CP14_LS	(0x06)
> +#define ESR_ELx_EC_FP_ASIMD	(0x07)
> +#define ESR_ELx_EC_CP10_ID	(0x08)	/* EL2 only */
> +#define ESR_ELx_EC_PAC		(0x09)	/* EL2 and above */
> +/* Unallocated EC: 0x0A - 0x0B */
> +#define ESR_ELx_EC_CP14_64	(0x0C)
> +/* Unallocated EC: 0x0d */
> +#define ESR_ELx_EC_ILL		(0x0E)
> +/* Unallocated EC: 0x0F - 0x10 */
> +#define ESR_ELx_EC_SVC32	(0x11)
> +#define ESR_ELx_EC_HVC32	(0x12)	/* EL2 only */
> +#define ESR_ELx_EC_SMC32	(0x13)	/* EL2 and above */
> +/* Unallocated EC: 0x14 */
> +#define ESR_ELx_EC_SVC64	(0x15)
> +#define ESR_ELx_EC_HVC64	(0x16)	/* EL2 and above */
> +#define ESR_ELx_EC_SMC64	(0x17)	/* EL2 and above */
> +#define ESR_ELx_EC_SYS64	(0x18)
> +#define ESR_ELx_EC_SVE		(0x19)
> +#define ESR_ELx_EC_ERET		(0x1a)	/* EL2 only */
> +/* Unallocated EC: 0x1b - 0x1E */
> +#define ESR_ELx_EC_IMP_DEF	(0x1f)	/* EL3 only */
> +#define ESR_ELx_EC_IABT_LOW	(0x20)
> +#define ESR_ELx_EC_IABT_CUR	(0x21)
> +#define ESR_ELx_EC_PC_ALIGN	(0x22)
> +/* Unallocated EC: 0x23 */
> +#define ESR_ELx_EC_DABT_LOW	(0x24)
> +#define ESR_ELx_EC_DABT_CUR	(0x25)
> +#define ESR_ELx_EC_SP_ALIGN	(0x26)
> +/* Unallocated EC: 0x27 */
> +#define ESR_ELx_EC_FP_EXC32	(0x28)
> +/* Unallocated EC: 0x29 - 0x2B */
> +#define ESR_ELx_EC_FP_EXC64	(0x2C)
> +/* Unallocated EC: 0x2D - 0x2E */
> +#define ESR_ELx_EC_SERROR	(0x2F)
> +#define ESR_ELx_EC_BREAKPT_LOW	(0x30)
> +#define ESR_ELx_EC_BREAKPT_CUR	(0x31)
> +#define ESR_ELx_EC_SOFTSTP_LOW	(0x32)
> +#define ESR_ELx_EC_SOFTSTP_CUR	(0x33)
> +#define ESR_ELx_EC_WATCHPT_LOW	(0x34)
> +#define ESR_ELx_EC_WATCHPT_CUR	(0x35)
> +/* Unallocated EC: 0x36 - 0x37 */
> +#define ESR_ELx_EC_BKPT32	(0x38)
> +/* Unallocated EC: 0x39 */
> +#define ESR_ELx_EC_VECTOR32	(0x3A)	/* EL2 only */
> +/* Unallocated EC: 0x3B */
> +#define ESR_ELx_EC_BRK64	(0x3C)
> +/* Unallocated EC: 0x3D - 0x3F */
> +#define ESR_ELx_EC_MAX		(0x3F)
> +
> +#define ECN(x) { ESR_ELx_EC_##x, #x }
> +
> +#define kvm_arm_exception_class \
> +	ECN(UNKNOWN), ECN(WFx), ECN(CP15_32), ECN(CP15_64), ECN(CP14_MR), \
> +	ECN(CP14_LS), ECN(FP_ASIMD), ECN(CP10_ID), ECN(PAC), ECN(CP14_64), \
> +	ECN(SVC64), ECN(HVC64), ECN(SMC64), ECN(SYS64), ECN(SVE), \
> +	ECN(IMP_DEF), ECN(IABT_LOW), ECN(IABT_CUR), \
> +	ECN(PC_ALIGN), ECN(DABT_LOW), ECN(DABT_CUR), \
> +	ECN(SP_ALIGN), ECN(FP_EXC32), ECN(FP_EXC64), ECN(SERROR), \
> +	ECN(BREAKPT_LOW), ECN(BREAKPT_CUR), ECN(SOFTSTP_LOW), \
> +	ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \
> +	ECN(BKPT32), ECN(VECTOR32), ECN(BRK64)
> +
> +#endif /* ARCH_PERF_ARM64_EXCEPTION_TYPES_H */
> diff --git a/tools/perf/arch/arm64/util/kvm-stat.c b/tools/perf/arch/arm64/util/kvm-stat.c
> new file mode 100644
> index 000000000000..32e58576f186
> --- /dev/null
> +++ b/tools/perf/arch/arm64/util/kvm-stat.c
> @@ -0,0 +1,86 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <errno.h>
> +#include <memory.h>
> +#include "../../util/evsel.h"
> +#include "../../util/kvm-stat.h"
> +#include "arm64_exception_types.h"
> +#include "debug.h"
> +
> +define_exit_reasons_table(arm64_exit_reasons, kvm_arm_exception_type);
> +define_exit_reasons_table(arm64_trap_exit_reasons, kvm_arm_exception_class);
> +
> +const char *kvm_trap_exit_reason = "esr_ec";
> +const char *vcpu_id_str = "id";
> +const int decode_str_len = 20;
> +const char *kvm_exit_reason = "ret";
> +const char *kvm_entry_trace = "kvm:kvm_entry";
> +const char *kvm_exit_trace = "kvm:kvm_exit";
> +
> +const char *kvm_events_tp[] = {
> +	"kvm:kvm_entry",
> +	"kvm:kvm_exit",
> +	NULL,
> +};
> +
> +static void event_get_key(struct evsel *evsel,
> +			  struct perf_sample *sample,
> +			  struct event_key *key)
> +{
> +	key->info = 0;
> +	key->key = perf_evsel__intval(evsel, sample, kvm_exit_reason);

The perf/core branch no longer has the perf_evsel__intval() API; it
has been replaced with evsel__intval(), so please

s/perf_evsel__intval/evsel__intval

It would be good to rebase the patch on Arnaldo's perf/core branch:
  $ git clone --single-branch --branch perf/core \
       https://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git

> +	key->exit_reasons = arm64_exit_reasons;
> +
> +	/*
> +	 * TRAP exceptions carry exception class info in esr_ec field
> +	 * and, hence, we need to use a different exit_reasons table to
> +	 * properly decode event's esr_ec.
> +	 */
> +	if (key->key == ARM_EXCEPTION_TRAP) {
> +		key->key = perf_evsel__intval(evsel, sample,

Ditto.

> +					      kvm_trap_exit_reason);

Otherwise, this patch looks good to me, and I have tested it with the
commands below:

  $ perf kvm stat record
  $ perf kvm stat report

Reviewed-by: Leo Yan <leo.yan@linaro.org>
Tested-by: Leo Yan <leo.yan@linaro.org>

Thanks,
Leo

> +		key->exit_reasons = arm64_trap_exit_reasons;
> +	}
> +}
> +
> +static bool event_begin(struct evsel *evsel,
> +			struct perf_sample *sample __maybe_unused,
> +			struct event_key *key __maybe_unused)
> +{
> +	return !strcmp(evsel->name, kvm_entry_trace);
> +}
> +
> +static bool event_end(struct evsel *evsel,
> +		      struct perf_sample *sample,
> +		      struct event_key *key)
> +{
> +	if (!strcmp(evsel->name, kvm_exit_trace)) {
> +		event_get_key(evsel, sample, key);
> +		return true;
> +	}
> +	return false;
> +}
> +
> +static struct kvm_events_ops exit_events = {
> +	.is_begin_event = event_begin,
> +	.is_end_event	= event_end,
> +	.decode_key	= exit_event_decode_key,
> +	.name		= "VM-EXIT"
> +};
> +
> +struct kvm_reg_events_ops kvm_reg_events_ops[] = {
> +	{
> +		.name	= "vmexit",
> +		.ops	= &exit_events,
> +	},
> +	{ NULL },
> +};
> +
> +const char * const kvm_skip_events[] = {
> +	NULL,
> +};
> +
> +int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
> +{
> +	kvm->exit_reasons_isa = "arm64";
> +	return 0;
> +}
> -- 
> 2.28.0
>
Sergey Senozhatsky Sept. 17, 2020, 9:54 a.m. UTC | #2
On (20/09/17 16:47), Leo Yan wrote:
[..]
> > +static void event_get_key(struct evsel *evsel,
> > +			  struct perf_sample *sample,
> > +			  struct event_key *key)
> > +{
> > +	key->info = 0;
> > +	key->key = perf_evsel__intval(evsel, sample, kvm_exit_reason);
> 
> Now the perf/core branch doesn't have API perf_evsel__intval(), and it
> has been replaced with evsel__intval(); so please
> 
> s/perf_evsel__intval/evsel__intval

ACK.
The kernel I'm using still has perf_evsel__intval().

> > +	if (key->key == ARM_EXCEPTION_TRAP) {
> > +		key->key = perf_evsel__intval(evsel, sample,
> 
> Ditto.

ACK. Sorry about that.

> Otherwise, this patch is good for me and I have tested this patch with
> below commands:
> 
>   $ perf kvm stat record
>   $ perf kvm stat report

"perf kvm stat live" should also work.

> Reviewed-by: Leo Yan <leo.yan@linaro.org>
> Tested-by: Leo Yan <leo.yan@linaro.org>

Thanks!

	-ss
Leo Yan Sept. 17, 2020, 10:09 a.m. UTC | #3
[ + Marc ]

On Thu, Sep 17, 2020 at 09:36:45AM +0900, Sergey Senozhatsky wrote:

[...]

> diff --git a/tools/perf/arch/arm64/util/kvm-stat.c b/tools/perf/arch/arm64/util/kvm-stat.c
> new file mode 100644
> index 000000000000..32e58576f186
> --- /dev/null
> +++ b/tools/perf/arch/arm64/util/kvm-stat.c
> @@ -0,0 +1,86 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <errno.h>
> +#include <memory.h>
> +#include "../../util/evsel.h"
> +#include "../../util/kvm-stat.h"
> +#include "arm64_exception_types.h"
> +#include "debug.h"
> +
> +define_exit_reasons_table(arm64_exit_reasons, kvm_arm_exception_type);
> +define_exit_reasons_table(arm64_trap_exit_reasons, kvm_arm_exception_class);
> +
> +const char *kvm_trap_exit_reason = "esr_ec";
> +const char *vcpu_id_str = "id";

On Arm64, the ftrace tracepoint "kvm_entry" doesn't contain the field "id"
or the field "vcpu_id", so the "id" always reads as 0; that value is
recorded into Perf's structure vcpu_event_record::vcpu_id and assigned
to the perf thread's private data "thread::private".

With the current code, this will not mix up different vcpus' samples,
because the samples are analyzed per thread context; but since every
thread's "vcpu_id" is zero, all samples are accounted to "vcpu_id=0"
and the option "--vcpu" cannot print correct results:


  $ perf kvm stat report --vcpu 4

  Analyze events for all VMs, VCPU 4:

             VM-EXIT    Samples  Samples%     Time%    Min Time    Max Time         Avg time

  Total Samples:0, Total events handled time:0.00us.


This is an issue I observed: if we want to support the option "--vcpu",
it seems we need to change the ftrace event "kvm_entry", but this will
break the ABI.

Essentially, this issue is caused by different archs using different
formats for the ftrace event "kvm_entry": on x86 it contains the field
"vcpu_id", but arm64 only records "vcpu_pc".

@Marc, @Will, do you have any suggestion for this?  Do you think it's
feasible to add a new field "vcpu_id" into the tracepoint "kvm_entry"
for Arm64's version?

Thanks,
Leo
Leo Yan Sept. 17, 2020, 10:12 a.m. UTC | #4
Adding Marc this time, sorry for the spam.

On Thu, Sep 17, 2020 at 06:09:50PM +0800, Leo Yan wrote:
> [ + Marc ]
> 
> On Thu, Sep 17, 2020 at 09:36:45AM +0900, Sergey Senozhatsky wrote:
> 
> [...]
> 
> > diff --git a/tools/perf/arch/arm64/util/kvm-stat.c b/tools/perf/arch/arm64/util/kvm-stat.c
> > new file mode 100644
> > index 000000000000..32e58576f186
> > --- /dev/null
> > +++ b/tools/perf/arch/arm64/util/kvm-stat.c
> > @@ -0,0 +1,86 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +#include <errno.h>
> > +#include <memory.h>
> > +#include "../../util/evsel.h"
> > +#include "../../util/kvm-stat.h"
> > +#include "arm64_exception_types.h"
> > +#include "debug.h"
> > +
> > +define_exit_reasons_table(arm64_exit_reasons, kvm_arm_exception_type);
> > +define_exit_reasons_table(arm64_trap_exit_reasons, kvm_arm_exception_class);
> > +
> > +const char *kvm_trap_exit_reason = "esr_ec";
> > +const char *vcpu_id_str = "id";
> 
> On Arm64, ftrace tracepoint "kvm_entry" doesn't contain the field "id"
> or field "vcpu_id", thus it always reads out the "id" is 0 and it is
> recorded into Perf's structure vcpu_event_record::vcpu_id and assigned
> to perf thread's private data "thread::private".
> 
> With current code, it will not mess up different vcpus' samples because
> now the samples are analyzed based on thread context, but since all
> threads' "vcpu_id" is zero, thus all samples are accounted for
> "vcpu_id=0" and cannot print out correct result with option "--vcpu":
> 
> 
>   $ perf kvm stat report --vcpu 4
> 
>   Analyze events for all VMs, VCPU 4:
> 
>              VM-EXIT    Samples  Samples%     Time%    Min Time    Max Time         Avg time
> 
>   Total Samples:0, Total events handled time:0.00us.
> 
> 
> This is an issue I observed, if we want to support option "--vcpu",
> seems we need to change ftrace event for "kvm_entry", but this will
> break ABI.
> 
> Essentially, this issue is caused by different archs using different
> format for ftrace event "kvm_entry", on x86 it contains feild
> "vcpu_id" but arm64 only just records "vcpu_pc".
> 
> @Marc, @Will, do you have any suggestion for this?  Do you think it's
> feasible to add a new field "vcpu_id" into the tracepoint "kvm_entry"
> for Arm64's version?
> 
> Thanks,
> Leo
Marc Zyngier Sept. 17, 2020, 10:21 a.m. UTC | #5
On 2020-09-17 11:12, Leo Yan wrote:
> Add Marc at this time, sorry for spamming.
> 
> On Thu, Sep 17, 2020 at 06:09:50PM +0800, Leo Yan wrote:
>> [ + Marc ]
>> 
>> On Thu, Sep 17, 2020 at 09:36:45AM +0900, Sergey Senozhatsky wrote:
>> 
>> [...]
>> 
>> > diff --git a/tools/perf/arch/arm64/util/kvm-stat.c b/tools/perf/arch/arm64/util/kvm-stat.c
>> > new file mode 100644
>> > index 000000000000..32e58576f186
>> > --- /dev/null
>> > +++ b/tools/perf/arch/arm64/util/kvm-stat.c
>> > @@ -0,0 +1,86 @@
>> > +// SPDX-License-Identifier: GPL-2.0
>> > +#include <errno.h>
>> > +#include <memory.h>
>> > +#include "../../util/evsel.h"
>> > +#include "../../util/kvm-stat.h"
>> > +#include "arm64_exception_types.h"
>> > +#include "debug.h"
>> > +
>> > +define_exit_reasons_table(arm64_exit_reasons, kvm_arm_exception_type);
>> > +define_exit_reasons_table(arm64_trap_exit_reasons, kvm_arm_exception_class);
>> > +
>> > +const char *kvm_trap_exit_reason = "esr_ec";
>> > +const char *vcpu_id_str = "id";
>> 
>> On Arm64, ftrace tracepoint "kvm_entry" doesn't contain the field "id"
>> or field "vcpu_id", thus it always reads out the "id" is 0 and it is
>> recorded into Perf's structure vcpu_event_record::vcpu_id and assigned
>> to perf thread's private data "thread::private".
>> 
>> With current code, it will not mess up different vcpus' samples 
>> because
>> now the samples are analyzed based on thread context, but since all
>> threads' "vcpu_id" is zero, thus all samples are accounted for
>> "vcpu_id=0" and cannot print out correct result with option "--vcpu":
>> 
>> 
>>   $ perf kvm stat report --vcpu 4
>> 
>>   Analyze events for all VMs, VCPU 4:
>> 
>>              VM-EXIT    Samples  Samples%     Time%    Min Time    Max 
>> Time         Avg time
>> 
>>   Total Samples:0, Total events handled time:0.00us.
>> 
>> 
>> This is an issue I observed, if we want to support option "--vcpu",
>> seems we need to change ftrace event for "kvm_entry", but this will
>> break ABI.
>> 
>> Essentially, this issue is caused by different archs using different
>> format for ftrace event "kvm_entry", on x86 it contains feild
>> "vcpu_id" but arm64 only just records "vcpu_pc".
>> 
>> @Marc, @Will, do you have any suggestion for this?  Do you think it's
>> feasible to add a new field "vcpu_id" into the tracepoint "kvm_entry"
>> for Arm64's version?

The question really is: how will you handle the ABI breakage?
I don't see a good solution for it, apart from having a *separate*
tracepoint that collects all the information you need. And even that is
really ugly.

         M.
Sergey Senozhatsky Sept. 17, 2020, 11 a.m. UTC | #6
On (20/09/17 11:21), Marc Zyngier wrote:
> > > On Arm64, ftrace tracepoint "kvm_entry" doesn't contain the field "id"
> > > or field "vcpu_id", thus it always reads out the "id" is 0 and it is

Right, Leo, I saw that. I put "id", because there is ... well, there is
nothing we can put there. The trace points on arm64, unlike on s390 or x86,
don't contain info which can let us map trace-event to vcpu_id (even
(void *)vcpu would have been rather helpful).

> > > Essentially, this issue is caused by different archs using different
> > > format for ftrace event "kvm_entry", on x86 it contains feild
> > > "vcpu_id" but arm64 only just records "vcpu_pc".

Exactly. I wish trace-points were less of pain-points.

So 'perf kvm stat' on arm64 works, but it's not as feature rich as on other
platforms; at the same time it's better than nothing, I suppose.

	-ss
Leo Yan Sept. 17, 2020, 11:42 a.m. UTC | #7
On Thu, Sep 17, 2020 at 11:21:15AM +0100, Marc Zyngier wrote:

[...]

> > > > +const char *vcpu_id_str = "id";
> > > 
> > > On Arm64, ftrace tracepoint "kvm_entry" doesn't contain the field "id"
> > > or field "vcpu_id", thus it always reads out the "id" is 0 and it is
> > > recorded into Perf's structure vcpu_event_record::vcpu_id and assigned
> > > to perf thread's private data "thread::private".
> > > 
> > > With current code, it will not mess up different vcpus' samples
> > > because
> > > now the samples are analyzed based on thread context, but since all
> > > threads' "vcpu_id" is zero, thus all samples are accounted for
> > > "vcpu_id=0" and cannot print out correct result with option "--vcpu":
> > > 
> > > 
> > >   $ perf kvm stat report --vcpu 4
> > > 
> > >   Analyze events for all VMs, VCPU 4:
> > > 
> > >              VM-EXIT    Samples  Samples%     Time%    Min Time
> > > Max Time         Avg time
> > > 
> > >   Total Samples:0, Total events handled time:0.00us.
> > > 
> > > 
> > > This is an issue I observed, if we want to support option "--vcpu",
> > > seems we need to change ftrace event for "kvm_entry", but this will
> > > break ABI.
> > > 
> > > Essentially, this issue is caused by different archs using different
> > > format for ftrace event "kvm_entry", on x86 it contains feild
> > > "vcpu_id" but arm64 only just records "vcpu_pc".
> > > 
> > > @Marc, @Will, do you have any suggestion for this?  Do you think it's
> > > feasible to add a new field "vcpu_id" into the tracepoint "kvm_entry"
> > > for Arm64's version?
> 
> The question really is: how will you handle the ABI breackage?
> I don't see a good solution for it, apart from having a *separate*
> tracepoint that collects all the information you need. And even that is
> really ugly.

I searched a bit and found that in practice it's not impossible to add new
parameters to an existing tracepoint, e.g. [1][2] are two examples that add
new parameters to existing tracepoints and have been merged into the
mainline kernel.  IIUC, if we keep the old parameters of a tracepoint,
this avoids breaking the ABI for any apps that already use the tracepoint,
so adding a new parameter to the tracepoint should be safe.

If you agree with this, I'd like to suggest applying the change below.
What do you think about this?


diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 46dc3d75cf13..d9f9b8e1df77 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -736,7 +736,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 		/**************************************************************
 		 * Enter the guest
 		 */
-		trace_kvm_entry(*vcpu_pc(vcpu));
+		trace_kvm_entry(vcpu->vcpu_id, *vcpu_pc(vcpu));
 		guest_enter_irqoff();
 
 		ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);
diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h
index 4691053c5ee4..e1d3e7a67e8b 100644
--- a/arch/arm64/kvm/trace_arm.h
+++ b/arch/arm64/kvm/trace_arm.h
@@ -12,18 +12,20 @@
  * Tracepoints for entry/exit to guest
  */
 TRACE_EVENT(kvm_entry,
-	TP_PROTO(unsigned long vcpu_pc),
-	TP_ARGS(vcpu_pc),
+	TP_PROTO(unsigned int vcpu_id, unsigned long vcpu_pc),
+	TP_ARGS(vcpu_id, vcpu_pc),
 
 	TP_STRUCT__entry(
+		__field(	unsigned int,	vcpu_id		)
 		__field(	unsigned long,	vcpu_pc		)
 	),
 
 	TP_fast_assign(
+		__entry->vcpu_id		= vcpu_id;
 		__entry->vcpu_pc		= vcpu_pc;
 	),
 
-	TP_printk("PC: 0x%08lx", __entry->vcpu_pc)
+	TP_printk("vcpu: %u, PC: 0x%08lx", __entry->vcpu_id, __entry->vcpu_pc)
 );
 
 TRACE_EVENT(kvm_exit,


Thanks,
Leo

[1] https://lkml.org/lkml/2019/2/26/282
[2] https://lore.kernel.org/linux-mm/20191106080037.GA59367@google.com/t/
Marc Zyngier Sept. 17, 2020, 11:53 a.m. UTC | #8
On 2020-09-17 12:42, Leo Yan wrote:
> On Thu, Sep 17, 2020 at 11:21:15AM +0100, Marc Zyngier wrote:
> 
> [...]
> 
>> > > > +const char *vcpu_id_str = "id";
>> > >
>> > > On Arm64, ftrace tracepoint "kvm_entry" doesn't contain the field "id"
>> > > or field "vcpu_id", thus it always reads out the "id" is 0 and it is
>> > > recorded into Perf's structure vcpu_event_record::vcpu_id and assigned
>> > > to perf thread's private data "thread::private".
>> > >
>> > > With current code, it will not mess up different vcpus' samples
>> > > because
>> > > now the samples are analyzed based on thread context, but since all
>> > > threads' "vcpu_id" is zero, thus all samples are accounted for
>> > > "vcpu_id=0" and cannot print out correct result with option "--vcpu":
>> > >
>> > >
>> > >   $ perf kvm stat report --vcpu 4
>> > >
>> > >   Analyze events for all VMs, VCPU 4:
>> > >
>> > >              VM-EXIT    Samples  Samples%     Time%    Min Time
>> > > Max Time         Avg time
>> > >
>> > >   Total Samples:0, Total events handled time:0.00us.
>> > >
>> > >
>> > > This is an issue I observed, if we want to support option "--vcpu",
>> > > seems we need to change ftrace event for "kvm_entry", but this will
>> > > break ABI.
>> > >
>> > > Essentially, this issue is caused by different archs using different
>> > > format for ftrace event "kvm_entry", on x86 it contains feild
>> > > "vcpu_id" but arm64 only just records "vcpu_pc".
>> > >
>> > > @Marc, @Will, do you have any suggestion for this?  Do you think it's
>> > > feasible to add a new field "vcpu_id" into the tracepoint "kvm_entry"
>> > > for Arm64's version?
>> 
>> The question really is: how will you handle the ABI breackage?
>> I don't see a good solution for it, apart from having a *separate*
>> tracepoint that collects all the information you need. And even that 
>> is
>> really ugly.
> 
> I searched a bit and found in practice it's not impossible to add new
> parameters for existed tracepoint, e.g. [1][2] are two examples to add
> new parameters for existed tracepoints and have been merged into
> mainline kernel.  IIUC, we keep the old parameters for a tracepoint
> so this can avoid to break ABI if any apps have used this tracepoint,
> and adding a new parameter for the tracepoint should be safe.
> 
> If you agree with this, I'd like to suggest to apply below change.
> How about you think for this?
> 
> 
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 46dc3d75cf13..d9f9b8e1df77 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -736,7 +736,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
>  		/**************************************************************
>  		 * Enter the guest
>  		 */
> -		trace_kvm_entry(*vcpu_pc(vcpu));
> +		trace_kvm_entry(vcpu->vcpu_id, *vcpu_pc(vcpu));
>  		guest_enter_irqoff();
> 
>  		ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);
> diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h
> index 4691053c5ee4..e1d3e7a67e8b 100644
> --- a/arch/arm64/kvm/trace_arm.h
> +++ b/arch/arm64/kvm/trace_arm.h
> @@ -12,18 +12,20 @@
>   * Tracepoints for entry/exit to guest
>   */
>  TRACE_EVENT(kvm_entry,
> -	TP_PROTO(unsigned long vcpu_pc),
> -	TP_ARGS(vcpu_pc),
> +	TP_PROTO(unsigned int vcpu_id, unsigned long vcpu_pc),
> +	TP_ARGS(vcpu_id, vcpu_pc),
> 
>  	TP_STRUCT__entry(
> +		__field(	unsigned int,	vcpu_id		)
>  		__field(	unsigned long,	vcpu_pc		)
>  	),
> 
>  	TP_fast_assign(
> +		__entry->vcpu_id		= vcpu_id;
>  		__entry->vcpu_pc		= vcpu_pc;
>  	),
> 
> -	TP_printk("PC: 0x%08lx", __entry->vcpu_pc)
> +	TP_printk("vcpu: %u, PC: 0x%08lx", __entry->vcpu_id, 
> __entry->vcpu_pc)
>  );
> 
>  TRACE_EVENT(kvm_exit,
> 

How is that not breaking the ABI? You are adding a new field, and anything
that expects to read 'PC: 0x.....' at the beginning of the line now fails.
The examples you give are also blatant ABI breakages. Just because it is
done somewhere else doesn't make it valid.

Anything that can be parsed by userspace is ABI. If you don't believe me,
please read the entertaining discussion we had when we tried to drop
Bogomips from /proc/cpuinfo.

So unless you get me Linus' stamp of approval for this, it's not happening.
Feel free to add a *new* tracepoint instead.

         M.
Leo Yan Sept. 17, 2020, 12:52 p.m. UTC | #9
On Thu, Sep 17, 2020 at 12:53:02PM +0100, Marc Zyngier wrote:
> On 2020-09-17 12:42, Leo Yan wrote:
> > On Thu, Sep 17, 2020 at 11:21:15AM +0100, Marc Zyngier wrote:
> > 
> > [...]
> > 
> > > > > > +const char *vcpu_id_str = "id";
> > > > >
> > > > > On Arm64, ftrace tracepoint "kvm_entry" doesn't contain the field "id"
> > > > > or field "vcpu_id", thus it always reads out the "id" is 0 and it is
> > > > > recorded into Perf's structure vcpu_event_record::vcpu_id and assigned
> > > > > to perf thread's private data "thread::private".
> > > > >
> > > > > With current code, it will not mess up different vcpus' samples
> > > > > because
> > > > > now the samples are analyzed based on thread context, but since all
> > > > > threads' "vcpu_id" is zero, thus all samples are accounted for
> > > > > "vcpu_id=0" and cannot print out correct result with option "--vcpu":
> > > > >
> > > > >
> > > > >   $ perf kvm stat report --vcpu 4
> > > > >
> > > > >   Analyze events for all VMs, VCPU 4:
> > > > >
> > > > >              VM-EXIT    Samples  Samples%     Time%    Min Time
> > > > > Max Time         Avg time
> > > > >
> > > > >   Total Samples:0, Total events handled time:0.00us.
> > > > >
> > > > >
> > > > > This is an issue I observed, if we want to support option "--vcpu",
> > > > > seems we need to change ftrace event for "kvm_entry", but this will
> > > > > break ABI.
> > > > >
> > > > > Essentially, this issue is caused by different archs using different
> > > > > format for ftrace event "kvm_entry", on x86 it contains feild
> > > > > "vcpu_id" but arm64 only just records "vcpu_pc".
> > > > >
> > > > > @Marc, @Will, do you have any suggestion for this?  Do you think it's
> > > > > feasible to add a new field "vcpu_id" into the tracepoint "kvm_entry"
> > > > > for Arm64's version?
> > > 
> > > The question really is: how will you handle the ABI breackage?
> > > I don't see a good solution for it, apart from having a *separate*
> > > tracepoint that collects all the information you need. And even that
> > > is
> > > really ugly.
> > 
> > I searched a bit and found in practice it's not impossible to add new
> > parameters for existed tracepoint, e.g. [1][2] are two examples to add
> > new parameters for existed tracepoints and have been merged into
> > mainline kernel.  IIUC, we keep the old parameters for a tracepoint
> > so this can avoid to break ABI if any apps have used this tracepoint,
> > and adding a new parameter for the tracepoint should be safe.
> > 
> > If you agree with this, I'd like to suggest to apply below change.
> > How about you think for this?
> > 
> > 
> > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> > index 46dc3d75cf13..d9f9b8e1df77 100644
> > --- a/arch/arm64/kvm/arm.c
> > +++ b/arch/arm64/kvm/arm.c
> > @@ -736,7 +736,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
> >  		/**************************************************************
> >  		 * Enter the guest
> >  		 */
> > -		trace_kvm_entry(*vcpu_pc(vcpu));
> > +		trace_kvm_entry(vcpu->vcpu_id, *vcpu_pc(vcpu));
> >  		guest_enter_irqoff();
> > 
> >  		ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);
> > diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h
> > index 4691053c5ee4..e1d3e7a67e8b 100644
> > --- a/arch/arm64/kvm/trace_arm.h
> > +++ b/arch/arm64/kvm/trace_arm.h
> > @@ -12,18 +12,20 @@
> >   * Tracepoints for entry/exit to guest
> >   */
> >  TRACE_EVENT(kvm_entry,
> > -	TP_PROTO(unsigned long vcpu_pc),
> > -	TP_ARGS(vcpu_pc),
> > +	TP_PROTO(unsigned int vcpu_id, unsigned long vcpu_pc),
> > +	TP_ARGS(vcpu_id, vcpu_pc),
> > 
> >  	TP_STRUCT__entry(
> > +		__field(	unsigned int,	vcpu_id		)
> >  		__field(	unsigned long,	vcpu_pc		)
> >  	),
> > 
> >  	TP_fast_assign(
> > +		__entry->vcpu_id		= vcpu_id;
> >  		__entry->vcpu_pc		= vcpu_pc;
> >  	),
> > 
> > -	TP_printk("PC: 0x%08lx", __entry->vcpu_pc)
> > +	TP_printk("vcpu: %u, PC: 0x%08lx", __entry->vcpu_id, __entry->vcpu_pc)
> >  );
> > 
> >  TRACE_EVENT(kvm_exit,
> > 
> 
> How is that not breaking the ABI? You are adding a new field, and anything
> that expect to read 'PC: 0x.....' at the beginning of the line now fails.
> > The examples you give are also blatant ABI breakages. Just because it is done
> > somewhere else doesn't make it valid.
> 
> Anything that can be parsed by userspace is ABI. If you don't believe me,
> please read the entertaining discussion we had when we tried to drop
> Bogomips from /proc/cpuinfo.

The discussion thread was too long [1] to read all replies :)

... but I understand we should be very careful about ABI breakage.

> So unless you get me Linus' stamp of approval for this, it's not happening.
> Feel free to add a *new* tracepoint instead.

Okay, thanks for the info and suggestions.

Thanks,
Leo

[1] https://lkml.org/lkml/2015/1/4/132
Leo Yan Sept. 17, 2020, 1:08 p.m. UTC | #10
Hi Sergey,

On Thu, Sep 17, 2020 at 08:00:28PM +0900, Sergey Senozhatsky wrote:
> On (20/09/17 11:21), Marc Zyngier wrote:
> > > > On Arm64, ftrace tracepoint "kvm_entry" doesn't contain the field "id"
> > > > or field "vcpu_id", thus it always reads out the "id" is 0 and it is
> 
> Right, Leo, I saw that. I put "id", because there is ... well, there is
> nothing we can put there. The trace points on arm64, unlike on s390 or x86,
> don't contain info which can let us map trace-event to vcpu_id (even
> (void *)vcpu would have been rather helpful).
> 
> > > > Essentially, this issue is caused by different archs using different
> > > > format for ftrace event "kvm_entry", on x86 it contains field
> > > > "vcpu_id" but arm64 only just records "vcpu_pc".
> 
> Exactly. I wish trace-points were less of pain-points.
> 
> So 'perf kvm stat' on arm64 works, but it's not as feature rich as on other
> platforms; at the same time it's better than nothing, I suppose.

Agreed, and I don't want to introduce complexity here.  So it's fine
with me to merge your patch v4 first and later use a new patch set
to support "vcpu_id".

As you might have seen the discussion with Marc for new trace events
with "vcpu_id", another way is to add new trace events with "vcpu_id"
and then enable "perf kvm stat" based on new trace events.

Either way is okay with me, unless the maintainers think differently
about this.

Thanks,
Leo
Sergey Senozhatsky Sept. 18, 2020, 12:32 a.m. UTC | #11
On (20/09/17 12:53), Marc Zyngier wrote:
> Feel free to add a *new* tracepoint instead.

Wouldn't we want a whole bunch of new tracepoints in this case?
(almost all of the existing ones with the extra vcpu_id field).
Right now we have 3 types of events:
- events with no vcpu at all        // nil
- events with vcpu_pc               // "0x%016lx", __entry->vcpu_pc
- events with (void *)vcpu          // "vcpu: %p", __entry->vcpu

It might be helpful if we could filter out events by vcpu_id.
But this, basically, doubles the number of events in the ringbuffer.

	-ss
Marc Zyngier Sept. 18, 2020, 8:20 a.m. UTC | #12
On 2020-09-18 01:32, Sergey Senozhatsky wrote:
> On (20/09/17 12:53), Marc Zyngier wrote:
>> Feel free to add a *new* tracepoint instead.
> 
> Wouldn't we want a whole bunch of new tracepoints in this case?

Yes. I don't have a better solution as long as tracepoints are ABI.
Get someone to sign-off on it, and I'll happily change them.

> (almost all of the existing ones with the extra vcpu_id field).
> Right now we have 3 types of events:
> - events with no vcpu at all        // nil
> - events with vcpu_pc               // "0x%016lx", __entry->vcpu_pc
> - events with (void *)vcpu          // "vcpu: %p", __entry->vcpu
> 
> It might be helpful if we could filter out events by vcpu_id.
> But this, basically, doubles the number of events in the ringbuffer.

Only if you enable them both, right? You define new tracepoints that
do whatever you need them to do (hopefully in a cross-architecture
compliant way), and have perf to only use the new ones on arm64.
How would that double the number of events in the buffer?

         M.
Sergey Senozhatsky Sept. 18, 2020, 10:35 a.m. UTC | #13
On (20/09/18 09:20), Marc Zyngier wrote:
> On 2020-09-18 01:32, Sergey Senozhatsky wrote:
> > On (20/09/17 12:53), Marc Zyngier wrote:
> > > Feel free to add a *new* tracepoint instead.
> > 
> > Wouldn't we want a whole bunch of new tracepoints in this case?
> 
> Yes. I don't have a better solution as long as tracepoints are ABI.

Well, no one does.

> Get someone to sign-off on it, and I'll happily change them.

Sorry, I'm not sure I understand this sentence.

> > (almost all of the existing ones with the extra vcpu_id field).
> > Right now we have 3 types of events:
> > - events with no vcpu at all        // nil
> > - events with vcpu_pc               // "0x%016lx", __entry->vcpu_pc
> > - events with (void *)vcpu          // "vcpu: %p", __entry->vcpu
> > 
> > It might be helpful if we could filter out events by vcpu_id.
> > But this, basically, doubles the number of events in the ringbuffer.
> 
> Only if you enable them both, right?
[..]
> How would that double the number of events in the buffer?

Yes. I assume that many scripts do something like "capture kvm:* events",
so new and old events are enabled. Unless we want to keep new events in
something like kvm2:* namespace (which is unlikely to happen, I guess).

And `sudo ./perf stat -e 'kvm:*'` is not unseen. In fact, this is
literally the first thing mentioned at https://www.linux-kvm.org/page/Perf_events

So if we'll have something like

	trace_kvm_foo(vcpu);
+	trace_kvm_foo2(vcpu->id, vcpu);

for all arm64 kvm events, then we double the number of arm64 kvm:* events
in the ringbuffer, don't we? Maybe this is not a gigantic issue, but who
knows.

	-ss
Marc Zyngier Sept. 18, 2020, 10:55 a.m. UTC | #14
On 2020-09-18 11:35, Sergey Senozhatsky wrote:
> On (20/09/18 09:20), Marc Zyngier wrote:
>> On 2020-09-18 01:32, Sergey Senozhatsky wrote:
>> > On (20/09/17 12:53), Marc Zyngier wrote:
>> > > Feel free to add a *new* tracepoint instead.
>> >
>> > Wouldn't we want a whole bunch of new tracepoints in this case?
>> 
>> Yes. I don't have a better solution as long as tracepoints are ABI.
> 
> Well, no one does.
> 
>> Get someone to sign-off on it, and I'll happily change them.
> 
> Sorry, I'm not sure I understand this sentence.

What I meant is that the only way to make changes to existing 
tracepoints
would be to get someone like Linus to approve them.

It's all rhetorical anyway, so let's move on.

> 
>> > (almost all of the existing ones with the extra vcpu_id field).
>> > Right now we have 3 types of events:
>> > - events with no vcpu at all        // nil
>> > - events with vcpu_pc               // "0x%016lx", __entry->vcpu_pc
>> > - events with (void *)vcpu          // "vcpu: %p", __entry->vcpu
>> >
>> > It might be helpful if we could filter out events by vcpu_id.
>> > But this, basically, doubles the number of events in the ringbuffer.
>> 
>> Only if you enable them both, right?
> [..]
>> How would that double the number of events in the buffer?
> 
> Yes. I assume that many scripts do something like "capture kvm:* 
> events",
> so new and old events are enabled. Unless we want to keep new events in
> something like kvm2:* namespace (which is unlikely to happen, I guess).

I really don't mind. I actually like the namespacing, as it gives us
a notion of versioning, something tracepoints lack.. And it gives an
opportunity to argue about the name of the namespace.

> 
> And `sudo ./perf stat -e 'kvm:*'` is not unseen. In fact, this is
> literally the first thing mentioned at
> https://www.linux-kvm.org/page/Perf_events
> 
> So if we'll have something like
> 
> 	trace_kvm_foo(vcpu);
> +	trace_kvm_foo2(vcpu->id, vcpu);
> 
> for all arm64 kvm events, then we double the number of arm64 kvm:* 
> events
> in the ringbuffer, don't we? Maybe this is not a gigantic issue, but 
> who
> knows.

I don't think it's a problem, but I'm more in favour of the namespace
approach.

         M.
diff mbox series

Patch

diff --git a/tools/perf/arch/arm64/Makefile b/tools/perf/arch/arm64/Makefile
index dbef716a1913..fab3095fb5d0 100644
--- a/tools/perf/arch/arm64/Makefile
+++ b/tools/perf/arch/arm64/Makefile
@@ -4,6 +4,7 @@  PERF_HAVE_DWARF_REGS := 1
 endif
 PERF_HAVE_JITDUMP := 1
 PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
+HAVE_KVM_STAT_SUPPORT := 1
 
 #
 # Syscall table generation for perf
diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build
index 5c13438c7bd4..4cba12f4b741 100644
--- a/tools/perf/arch/arm64/util/Build
+++ b/tools/perf/arch/arm64/util/Build
@@ -1,6 +1,7 @@ 
 perf-y += header.o
 perf-y += machine.o
 perf-y += perf_regs.o
+perf-y += kvm-stat.o
 perf-$(CONFIG_DWARF)     += dwarf-regs.o
 perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
 perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
diff --git a/tools/perf/arch/arm64/util/arm64_exception_types.h b/tools/perf/arch/arm64/util/arm64_exception_types.h
new file mode 100644
index 000000000000..27c981ebe401
--- /dev/null
+++ b/tools/perf/arch/arm64/util/arm64_exception_types.h
@@ -0,0 +1,92 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#ifndef ARCH_PERF_ARM64_EXCEPTION_TYPES_H
+#define ARCH_PERF_ARM64_EXCEPTION_TYPES_H
+
+/* Per asm/virt.h */
+#define HVC_STUB_ERR		  0xbadca11
+
+/* Per asm/kvm_asm.h */
+#define ARM_EXCEPTION_IRQ		0
+#define ARM_EXCEPTION_EL1_SERROR	1
+#define ARM_EXCEPTION_TRAP		2
+#define ARM_EXCEPTION_IL		3
+/* The hyp-stub will return this for any kvm_call_hyp() call */
+#define ARM_EXCEPTION_HYP_GONE		HVC_STUB_ERR
+
+#define kvm_arm_exception_type					\
+	{ARM_EXCEPTION_IRQ,		"IRQ"		},	\
+	{ARM_EXCEPTION_EL1_SERROR,	"SERROR"	},	\
+	{ARM_EXCEPTION_TRAP,		"TRAP"		},	\
+	{ARM_EXCEPTION_IL,		"ILLEGAL"	},	\
+	{ARM_EXCEPTION_HYP_GONE,	"HYP_GONE"	}
+
+/* Per asm/esr.h */
+#define ESR_ELx_EC_UNKNOWN	(0x00)
+#define ESR_ELx_EC_WFx		(0x01)
+/* Unallocated EC: 0x02 */
+#define ESR_ELx_EC_CP15_32	(0x03)
+#define ESR_ELx_EC_CP15_64	(0x04)
+#define ESR_ELx_EC_CP14_MR	(0x05)
+#define ESR_ELx_EC_CP14_LS	(0x06)
+#define ESR_ELx_EC_FP_ASIMD	(0x07)
+#define ESR_ELx_EC_CP10_ID	(0x08)	/* EL2 only */
+#define ESR_ELx_EC_PAC		(0x09)	/* EL2 and above */
+/* Unallocated EC: 0x0A - 0x0B */
+#define ESR_ELx_EC_CP14_64	(0x0C)
+/* Unallocated EC: 0x0d */
+#define ESR_ELx_EC_ILL		(0x0E)
+/* Unallocated EC: 0x0F - 0x10 */
+#define ESR_ELx_EC_SVC32	(0x11)
+#define ESR_ELx_EC_HVC32	(0x12)	/* EL2 only */
+#define ESR_ELx_EC_SMC32	(0x13)	/* EL2 and above */
+/* Unallocated EC: 0x14 */
+#define ESR_ELx_EC_SVC64	(0x15)
+#define ESR_ELx_EC_HVC64	(0x16)	/* EL2 and above */
+#define ESR_ELx_EC_SMC64	(0x17)	/* EL2 and above */
+#define ESR_ELx_EC_SYS64	(0x18)
+#define ESR_ELx_EC_SVE		(0x19)
+#define ESR_ELx_EC_ERET		(0x1a)	/* EL2 only */
+/* Unallocated EC: 0x1b - 0x1E */
+#define ESR_ELx_EC_IMP_DEF	(0x1f)	/* EL3 only */
+#define ESR_ELx_EC_IABT_LOW	(0x20)
+#define ESR_ELx_EC_IABT_CUR	(0x21)
+#define ESR_ELx_EC_PC_ALIGN	(0x22)
+/* Unallocated EC: 0x23 */
+#define ESR_ELx_EC_DABT_LOW	(0x24)
+#define ESR_ELx_EC_DABT_CUR	(0x25)
+#define ESR_ELx_EC_SP_ALIGN	(0x26)
+/* Unallocated EC: 0x27 */
+#define ESR_ELx_EC_FP_EXC32	(0x28)
+/* Unallocated EC: 0x29 - 0x2B */
+#define ESR_ELx_EC_FP_EXC64	(0x2C)
+/* Unallocated EC: 0x2D - 0x2E */
+#define ESR_ELx_EC_SERROR	(0x2F)
+#define ESR_ELx_EC_BREAKPT_LOW	(0x30)
+#define ESR_ELx_EC_BREAKPT_CUR	(0x31)
+#define ESR_ELx_EC_SOFTSTP_LOW	(0x32)
+#define ESR_ELx_EC_SOFTSTP_CUR	(0x33)
+#define ESR_ELx_EC_WATCHPT_LOW	(0x34)
+#define ESR_ELx_EC_WATCHPT_CUR	(0x35)
+/* Unallocated EC: 0x36 - 0x37 */
+#define ESR_ELx_EC_BKPT32	(0x38)
+/* Unallocated EC: 0x39 */
+#define ESR_ELx_EC_VECTOR32	(0x3A)	/* EL2 only */
+/* Unallocated EC: 0x3B */
+#define ESR_ELx_EC_BRK64	(0x3C)
+/* Unallocated EC: 0x3D - 0x3F */
+#define ESR_ELx_EC_MAX		(0x3F)
+
+#define ECN(x) { ESR_ELx_EC_##x, #x }
+
+#define kvm_arm_exception_class \
+	ECN(UNKNOWN), ECN(WFx), ECN(CP15_32), ECN(CP15_64), ECN(CP14_MR), \
+	ECN(CP14_LS), ECN(FP_ASIMD), ECN(CP10_ID), ECN(PAC), ECN(CP14_64), \
+	ECN(SVC64), ECN(HVC64), ECN(SMC64), ECN(SYS64), ECN(SVE), \
+	ECN(IMP_DEF), ECN(IABT_LOW), ECN(IABT_CUR), \
+	ECN(PC_ALIGN), ECN(DABT_LOW), ECN(DABT_CUR), \
+	ECN(SP_ALIGN), ECN(FP_EXC32), ECN(FP_EXC64), ECN(SERROR), \
+	ECN(BREAKPT_LOW), ECN(BREAKPT_CUR), ECN(SOFTSTP_LOW), \
+	ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \
+	ECN(BKPT32), ECN(VECTOR32), ECN(BRK64)
+
+#endif /* ARCH_PERF_ARM64_EXCEPTION_TYPES_H */
diff --git a/tools/perf/arch/arm64/util/kvm-stat.c b/tools/perf/arch/arm64/util/kvm-stat.c
new file mode 100644
index 000000000000..32e58576f186
--- /dev/null
+++ b/tools/perf/arch/arm64/util/kvm-stat.c
@@ -0,0 +1,86 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <memory.h>
+#include "../../util/evsel.h"
+#include "../../util/kvm-stat.h"
+#include "arm64_exception_types.h"
+#include "debug.h"
+
+define_exit_reasons_table(arm64_exit_reasons, kvm_arm_exception_type);
+define_exit_reasons_table(arm64_trap_exit_reasons, kvm_arm_exception_class);
+
+const char *kvm_trap_exit_reason = "esr_ec";
+const char *vcpu_id_str = "id";
+const int decode_str_len = 20;
+const char *kvm_exit_reason = "ret";
+const char *kvm_entry_trace = "kvm:kvm_entry";
+const char *kvm_exit_trace = "kvm:kvm_exit";
+
+const char *kvm_events_tp[] = {
+	"kvm:kvm_entry",
+	"kvm:kvm_exit",
+	NULL,
+};
+
+static void event_get_key(struct evsel *evsel,
+			  struct perf_sample *sample,
+			  struct event_key *key)
+{
+	key->info = 0;
+	key->key = perf_evsel__intval(evsel, sample, kvm_exit_reason);
+	key->exit_reasons = arm64_exit_reasons;
+
+	/*
+	 * TRAP exceptions carry exception class info in esr_ec field
+	 * and, hence, we need to use a different exit_reasons table to
+	 * properly decode event's esr_ec.
+	 */
+	if (key->key == ARM_EXCEPTION_TRAP) {
+		key->key = perf_evsel__intval(evsel, sample,
+					      kvm_trap_exit_reason);
+		key->exit_reasons = arm64_trap_exit_reasons;
+	}
+}
+
+static bool event_begin(struct evsel *evsel,
+			struct perf_sample *sample __maybe_unused,
+			struct event_key *key __maybe_unused)
+{
+	return !strcmp(evsel->name, kvm_entry_trace);
+}
+
+static bool event_end(struct evsel *evsel,
+		      struct perf_sample *sample,
+		      struct event_key *key)
+{
+	if (!strcmp(evsel->name, kvm_exit_trace)) {
+		event_get_key(evsel, sample, key);
+		return true;
+	}
+	return false;
+}
+
+static struct kvm_events_ops exit_events = {
+	.is_begin_event = event_begin,
+	.is_end_event	= event_end,
+	.decode_key	= exit_event_decode_key,
+	.name		= "VM-EXIT"
+};
+
+struct kvm_reg_events_ops kvm_reg_events_ops[] = {
+	{
+		.name	= "vmexit",
+		.ops	= &exit_events,
+	},
+	{ NULL },
+};
+
+const char * const kvm_skip_events[] = {
+	NULL,
+};
+
+int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
+{
+	kvm->exit_reasons_isa = "arm64";
+	return 0;
+}