diff mbox series

[kvm-unit-tests,v2] x86: Add tests for Guest Processor Event Based Sampling (PEBS)

Message ID 20220728112119.58173-1-likexu@tencent.com (mailing list archive)
State New, archived
Headers show
Series [kvm-unit-tests,v2] x86: Add tests for Guest Processor Event Based Sampling (PEBS) | expand

Commit Message

Like Xu July 28, 2022, 11:21 a.m. UTC
From: Like Xu <likexu@tencent.com>

This unit test is intended to test KVM's support for
Processor Event Based Sampling (PEBS), which is another
PMU feature on Intel processors (starting from Ice Lake Server).

If a bit in PEBS_ENABLE is set to 1, its corresponding counter will
write at least one PEBS record (including partial state of the vcpu
at the time of the current hardware event) to the guest memory on
counter overflow, and trigger an interrupt at a specific DS state.
The format of a PEBS record can be configured by another register.

These tests cover most usage scenarios, including some specially
constructed ones that are not typical behaviour of the Linux PEBS
driver. This lowers the threshold for others to understand this
feature and opens up more exploration of the KVM implementation or
the hw feature itself.

Signed-off-by: Like Xu <likexu@tencent.com>
---
v1 -> v2 Changelog:
- replace unions with local and header helpers; (Sean)
- replace "return 0" with "return report_summary()"; (Sean)
- split checks up to provide more information if cap is not advertised; (Sean)
v1: https://lore.kernel.org/kvm/20220721103549.49543-9-likexu@tencent.com/

 lib/x86/msr.h       |   1 +
 x86/Makefile.x86_64 |   1 +
 x86/pmu_pebs.c      | 486 ++++++++++++++++++++++++++++++++++++++++++++
 x86/unittests.cfg   |   7 +
 4 files changed, 495 insertions(+)
 create mode 100644 x86/pmu_pebs.c

Comments

Sean Christopherson Oct. 5, 2022, 7:14 p.m. UTC | #1
On Thu, Jul 28, 2022, Like Xu wrote:
> +#include "vm.h"
> +#include "types.h"
> +#include "processor.h"
> +#include "vmalloc.h"
> +#include "alloc_page.h"
> +
> +#define PC_VECTOR	32

PC?

> +
> +#define	X86_FEATURE_PDCM		(CPUID(0x1, 0, ECX, 15))

This belongs in lib/x86/processor.h, e.g. it'll also be used for the pmu_lbr tests.

> +#define PERF_CAP_PEBS_FORMAT           0xf00
> +#define PMU_CAP_FW_WRITES	(1ULL << 13)
> +#define PMU_CAP_PEBS_BASELINE	(1ULL << 14)
> +
> +#define INTEL_PMC_IDX_FIXED				       32
> +
> +#define GLOBAL_STATUS_BUFFER_OVF_BIT		62
> +#define GLOBAL_STATUS_BUFFER_OVF	BIT_ULL(GLOBAL_STATUS_BUFFER_OVF_BIT)
> +
> +#define EVNTSEL_USR_SHIFT       16
> +#define EVNTSEL_OS_SHIFT        17
> +#define EVNTSEL_EN_SHIF         22
> +
> +#define EVNTSEL_EN      (1 << EVNTSEL_EN_SHIF)
> +#define EVNTSEL_USR     (1 << EVNTSEL_USR_SHIFT)
> +#define EVNTSEL_OS      (1 << EVNTSEL_OS_SHIFT)
> +
> +#define PEBS_DATACFG_MEMINFO	BIT_ULL(0)
> +#define PEBS_DATACFG_GP	BIT_ULL(1)
> +#define PEBS_DATACFG_XMMS	BIT_ULL(2)
> +#define PEBS_DATACFG_LBRS	BIT_ULL(3)
> +
> +#define ICL_EVENTSEL_ADAPTIVE				(1ULL << 34)
> +#define PEBS_DATACFG_LBR_SHIFT	24
> +#define MAX_NUM_LBR_ENTRY	32

Given all the PMU stuff coming in, I think we need e.g. lib/x86/pmu.h to hold all
of the hardware-defined stuff, e.g. #defines and structs that are dictated by
hardware.

> +static inline u8 pebs_format(void)
> +{
> +	return (perf_cap & PERF_CAP_PEBS_FORMAT ) >> 8;
> +}
> +
> +static inline bool pebs_has_baseline(void)
> +{
> +	return perf_cap & PMU_CAP_PEBS_BASELINE;
> +}

These types of accessors can also go in pmu.h.  The easy thing is to just re-read
PERF_CAPABILITIES every time; the overhead of the VM-Exit to emulate the RDMSR
isn't meaningful in the grand scheme of the test.

> +static void pebs_enable(u64 bitmask, u64 pebs_data_cfg)
> +{
> +	static struct debug_store *ds;
> +	u64 baseline_extra_ctrl, fixed_ctr_ctrl = 0;
> +	unsigned int idx;
> +
> +	if (pebs_has_baseline())

This function can snapshot pebs_has_baseline() to avoid RDMSR on every touch.

> +		wrmsr(MSR_PEBS_DATA_CFG, pebs_data_cfg);
> +
> +	ds = (struct debug_store *)ds_bufer;
> +	ds->pebs_index = ds->pebs_buffer_base = (unsigned long)pebs_buffer;
> +	ds->pebs_absolute_maximum = (unsigned long)pebs_buffer + PAGE_SIZE;
> +	ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
> +		get_adaptive_pebs_record_size(pebs_data_cfg);
> +
> +	for (idx = 0; idx < pmu_nr_fixed_counters(); idx++) {
> +		if (!(BIT_ULL(INTEL_PMC_IDX_FIXED + idx) & bitmask))
> +			continue;
> +		baseline_extra_ctrl = pebs_has_baseline() ?
> +			(1ULL << (INTEL_PMC_IDX_FIXED + idx * 4)) : 0;

Init baseline_extra_ctrl to zero outside of the loop, then this can avoid the
ternary operator:

		if (has_baseline)
			baseline_extra_ctrl = BIT(INTEL_PMC_IDX_FIXED + idx * 4);

> +		wrmsr(MSR_CORE_PERF_FIXED_CTR0 + idx, ctr_start_val);

Helpers (or macros?) to read/write counter MSRs would improve readability.

> +		fixed_ctr_ctrl |= (0xbULL << (idx * 4) | baseline_extra_ctrl);
> +	}
> +	if (fixed_ctr_ctrl)
> +		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, fixed_ctr_ctrl);
> +
> +	for (idx = 0; idx < max_nr_gp_events; idx++) {
> +		if (!(BIT_ULL(idx) & bitmask))
> +			continue;
> +		baseline_extra_ctrl = pebs_has_baseline() ?
> +			ICL_EVENTSEL_ADAPTIVE : 0;

Same thing as above, rely on the "has_baseline" not changing to avoid the ternary
operator.

> +		wrmsr(MSR_P6_EVNTSEL0 + idx,

Add a helper/macro instead of manually indexing?

> +		      EVNTSEL_EN | EVNTSEL_OS | EVNTSEL_USR |
> +		      intel_arch_events[idx] | baseline_extra_ctrl);
> +		wrmsr(gp_counter_base + idx, ctr_start_val);

Continuing the theme of code reuse, please add a lib/pmu.c and move common code
and variables there, e.g. tests shouldn't need to manually compute gp_counter_base.
A common "PMU init" routine would allow the library to provide helpers for accessing
GP counters too.

> +	}
> +
> +	wrmsr(MSR_IA32_DS_AREA,  (unsigned long)ds_bufer);
> +	wrmsr(MSR_IA32_PEBS_ENABLE, bitmask);
> +	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, bitmask);
> +}
> +
> +static void pmu_env_cleanup(void)

This is probably a good candidate for library code.  And maybe reset_pmu() or so
to provide a hint that this is often called _before_ running tests?

> +{
> +	unsigned int idx;
> +
> +	memset(ds_bufer, 0x0, PAGE_SIZE);
> +	memset(pebs_buffer, 0x0, PAGE_SIZE);
> +	wrmsr(MSR_IA32_PEBS_ENABLE, 0);
> +	wrmsr(MSR_IA32_DS_AREA,  0);
> +	if (pebs_has_baseline())
> +		wrmsr(MSR_PEBS_DATA_CFG, 0);
> +
> +	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
> +
> +	wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
> +	for (idx = 0; idx < pmu_nr_fixed_counters(); idx++) {

Curly braces aren't necessary.  And rather than call a function every time,
add a global struct in the library to track the PMU capabilities.

> +		wrmsr(MSR_CORE_PERF_FIXED_CTR0 + idx, 0);
> +	}
> +
> +	for (idx = 0; idx < pmu_nr_gp_counters(); idx++) {
> +		wrmsr(MSR_P6_EVNTSEL0 + idx, 0);
> +		wrmsr(MSR_IA32_PERFCTR0 + idx, 0);
> +	}
> +
> +	wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, rdmsr(MSR_CORE_PERF_GLOBAL_STATUS));
> +}
> +
> +static inline void pebs_disable_1(void)
> +{
> +	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
> +}
> +
> +static inline void pebs_disable_2(void)
> +{
> +	wrmsr(MSR_IA32_PEBS_ENABLE, 0);
> +	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
> +}
> +
> +static void pebs_disable(unsigned int idx)
> +{
> +	if (idx % 2) {

Curly braces unnecessary.  That said, the helpers do not help.  It's much easier
to do:

	/* comment goes here */
	if (idx % 2)
		wrmsr(MSR_IA32_PEBS_ENABLE, 0);

	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);

Please add a comment, it's not at all obvious to me (non-PMU person) what this
code is doing.

> +		pebs_disable_1();
> +	} else {
> +		pebs_disable_2();
> +	}
> +}
> +
> +static void check_pebs_records(u64 bitmask, u64 pebs_data_cfg)
> +{
> +	struct pebs_basic *pebs_rec = (struct pebs_basic *)pebs_buffer;
> +	struct debug_store *ds = (struct debug_store *)ds_bufer;
> +	unsigned int pebs_record_size = get_adaptive_pebs_record_size(pebs_data_cfg);
> +	unsigned int count = 0;
> +	bool expected, pebs_idx_match, pebs_size_match, data_cfg_match;
> +	void *vernier;
> +
> +	expected = (ds->pebs_index == ds->pebs_buffer_base) && !pebs_rec->format_size;
> +	if (!(rdmsr(MSR_CORE_PERF_GLOBAL_STATUS) & GLOBAL_STATUS_BUFFER_OVF)) {
> +		report(expected, "No OVF irq, none PEBS records.");
> +		return;
> +	}
> +
> +	if (expected) {
> +		report(!expected, "A OVF irq, but none PEBS records.");
> +		return;
> +	}
> +
> +	expected = ds->pebs_index >= ds->pebs_interrupt_threshold;
> +	vernier = (void *)pebs_buffer;

Heh, I have zero clue what vernier means here.  Dictionary says:

  a small movable graduated scale for obtaining fractional parts of
  subdivisions on a fixed main scale of a barometer, sextant, or other measuring
  instrument.

but that doesn't help me understand what this is doing.

> +	do {
> +		pebs_rec = (struct pebs_basic *)vernier;
> +		pebs_record_size = pebs_rec->format_size >> 48;

Add a #define instead of open coding a magic number.

> +		pebs_idx_match =
> +			pebs_rec->applicable_counters & bitmask;
> +		pebs_size_match =
> +			pebs_record_size == get_adaptive_pebs_record_size(pebs_data_cfg);
> +		data_cfg_match =
> +			(pebs_rec->format_size & 0xffffffffffff) == pebs_data_cfg;

Please use GENMASK_ULL.

> +		expected = pebs_idx_match && pebs_size_match && data_cfg_match;
> +		report(expected,
> +		       "PEBS record (written seq %d) is verified (inclduing size, counters and cfg).", count);
> +		vernier = vernier + pebs_record_size;
> +		count++;
> +	} while (expected && (void *)vernier < (void *)ds->pebs_index);
> +
> +	if (!expected) {
> +		if (!pebs_idx_match)
> +			printf("FAIL: The applicable_counters (0x%lx) doesn't match with pmc_bitmask (0x%lx).\n",
> +			       pebs_rec->applicable_counters, bitmask);
> +		if (!pebs_size_match)
> +			printf("FAIL: The pebs_record_size (%d) doesn't match with MSR_PEBS_DATA_CFG (%d).\n",
> +			       pebs_record_size, get_adaptive_pebs_record_size(pebs_data_cfg));
> +		if (!data_cfg_match)
> +			printf("FAIL: The pebs_data_cfg (0x%lx) doesn't match with MSR_PEBS_DATA_CFG (0x%lx).\n",
> +			       pebs_rec->format_size & 0xffffffffffff, pebs_data_cfg);
> +	}
> +}
> +
> +static void check_one_counter(enum pmc_type type,
> +			      unsigned int idx, u64 pebs_data_cfg)
> +{
> +	report_prefix_pushf("%s counter %d (0x%lx)",
> +			    type == FIXED ? "Extended Fixed" : "GP", idx, ctr_start_val);
> +	pmu_env_cleanup();
> +	pebs_enable(BIT_ULL(type == FIXED ? INTEL_PMC_IDX_FIXED + idx : idx), pebs_data_cfg);

Please avoid burying ternary operators like this, it makes the code hard to follow
because it's difficult to visually identify what goes with what.  You can also
avoid copy+paste...

	u64 pebs_bit = BIT_ULL(type == FIXED ? INTEL_PMC_IDX_FIXED + idx : idx);

	...

	pebs_enable(pebs_bit, pebs_data_cfg);

	...

	check_pebs_records(pebs_bit, pebs_data_cfg);

> +	workload();
> +	pebs_disable(idx);
> +	check_pebs_records(BIT_ULL(type == FIXED ? INTEL_PMC_IDX_FIXED + idx : idx), pebs_data_cfg);
> +	report_prefix_pop();
> +}
> +
> +static void check_multiple_counters(u64 bitmask, u64 pebs_data_cfg)
> +{
> +	pmu_env_cleanup();
> +	pebs_enable(bitmask, pebs_data_cfg);
> +	workload2();


> +	pebs_disable(0);

Too much magic.  Looks like the intent is to trigger writes to both MSRs, but why?

> +	check_pebs_records(bitmask, pebs_data_cfg);
> +}
> +
> +static void check_pebs_counters(u64 pebs_data_cfg)
> +{
> +	unsigned int idx;
> +	u64 bitmask = 0;
> +
> +	for (idx = 0; idx < pmu_nr_fixed_counters(); idx++)
> +		check_one_counter(FIXED, idx, pebs_data_cfg);
> +
> +	for (idx = 0; idx < max_nr_gp_events; idx++)
> +		check_one_counter(GP, idx, pebs_data_cfg);
> +
> +	for (idx = 0; idx < pmu_nr_fixed_counters(); idx++)
> +		bitmask |= BIT_ULL(INTEL_PMC_IDX_FIXED + idx);
> +	for (idx = 0; idx < max_nr_gp_events; idx += 2)
> +		bitmask |= BIT_ULL(idx);
> +	report_prefix_pushf("Multiple (0x%lx)", bitmask);
> +	check_multiple_counters(bitmask, pebs_data_cfg);
> +	report_prefix_pop();
> +}
> +
> +int main(int ac, char **av)
> +{
> +	unsigned int i, j;
> +
> +	setup_vm();
> +
> +	max_nr_gp_events = MIN(pmu_nr_gp_counters(), ARRAY_SIZE(intel_arch_events));
> +
> +	printf("PMU version: %d\n", pmu_version());
> +	if (this_cpu_has(X86_FEATURE_PDCM))
> +		perf_cap = rdmsr(MSR_IA32_PERF_CAPABILITIES);
> +
> +	if (perf_cap & PMU_CAP_FW_WRITES)
> +		gp_counter_base = MSR_IA32_PMC0;
> +
> +	if (!is_intel()) {
> +		report_skip("PEBS is only supported on Intel CPUs (ICX or later)");

State exactly what check failed so that the user doesn't need to look at the code
to understand exactly what failed.  E.g. the "ICX or later" can be interpreted as
"the check failed because it's not ICX+", but that's not what the code does.

		report_skip("PEBS requires Intel ICX or later, non-Intel detected");

> +		return report_summary();
> +	} else if (pmu_version() < 2) {
> +		report_skip("Architectural PMU version is not high enough");

Again, unnecessarily vague.  Don't make the user read the code, provide all the info
in the error message.

		report_skip("PEBS requires PMU version 2, reported version is %d",
			    pmu_version());
		
> +		return report_summary();
> +	} else if (!pebs_format()) {
> +		report_skip("PEBS not enumerated in PERF_CAPABILITIES");
> +		return report_summary();
> +	} else if (rdmsr(MSR_IA32_MISC_ENABLE) & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL) {
> +		report_skip("PEBS unavailable according to MISC_ENABLE");
> +		return report_summary();
> +	}
> +
> +	printf("PEBS format: %d\n", pebs_format());
> +	printf("PEBS GP counters: %d\n", pmu_nr_gp_counters());
> +	printf("PEBS Fixed counters: %d\n", pmu_nr_fixed_counters());
> +	printf("PEBS baseline (Adaptive PEBS): %d\n", pebs_has_baseline());
> +
> +	printf("Known reasons for none PEBS records:\n");
> +	printf("1. The selected event does not support PEBS;\n");
> +	printf("2. From a core pmu perspective, the vCPU and pCPU models are not same;\n");
> +	printf("3. Guest counter has not yet overflowed or been cross-mapped by the host;\n");

Printing this every time the test is run is confusing.  If the goal is to help
users debug failures, then a comment will probably suffice.  

> diff --git a/x86/unittests.cfg b/x86/unittests.cfg
> index 01d775e..d55db99 100644
> --- a/x86/unittests.cfg
> +++ b/x86/unittests.cfg
> @@ -198,6 +198,13 @@ check = /sys/module/kvm/parameters/ignore_msrs=N
>  check = /proc/sys/kernel/nmi_watchdog=0
>  accel = kvm
>  
> +[pmu_pebs]
> +arch = x86_64
> +file = pmu_pebs.flat
> +extra_params = -cpu host,migratable=no
> +check = /proc/sys/kernel/nmi_watchdog=0
> +accel = kvm

In a separate commit, add a group for this and all other PMU tests:

  groups = pmu

so that it's easy to run all PMU tests, e.g. when making PMU KVM changes.
Like Xu Oct. 18, 2022, 9:01 a.m. UTC | #2
Most of the comments will be addressed in the next version.

On 6/10/2022 3:14 am, Sean Christopherson wrote:
> On Thu, Jul 28, 2022, Like Xu wrote:
>> +#include "vm.h"
>> +#include "types.h"
>> +#include "processor.h"
>> +#include "vmalloc.h"
>> +#include "alloc_page.h"
>> +
>> +#define PC_VECTOR	32
> 
> PC?

It is part of legacy code; maybe it stands for "performance counter vector"?
It will be reused in the new lib/pmu.h.

> 
>> +
>> +#define	X86_FEATURE_PDCM		(CPUID(0x1, 0, ECX, 15))
> 
> This belongs in lib/x86/processor.h, e.g. it'll also be used for the pmu_lbr tests.

Applied.

> 
>> +#define PERF_CAP_PEBS_FORMAT           0xf00
>> +#define PMU_CAP_FW_WRITES	(1ULL << 13)
>> +#define PMU_CAP_PEBS_BASELINE	(1ULL << 14)
>> +
>> +#define INTEL_PMC_IDX_FIXED				       32
>> +
>> +#define GLOBAL_STATUS_BUFFER_OVF_BIT		62
>> +#define GLOBAL_STATUS_BUFFER_OVF	BIT_ULL(GLOBAL_STATUS_BUFFER_OVF_BIT)
>> +
>> +#define EVNTSEL_USR_SHIFT       16
>> +#define EVNTSEL_OS_SHIFT        17
>> +#define EVNTSEL_EN_SHIF         22
>> +
>> +#define EVNTSEL_EN      (1 << EVNTSEL_EN_SHIF)
>> +#define EVNTSEL_USR     (1 << EVNTSEL_USR_SHIFT)
>> +#define EVNTSEL_OS      (1 << EVNTSEL_OS_SHIFT)
>> +
>> +#define PEBS_DATACFG_MEMINFO	BIT_ULL(0)
>> +#define PEBS_DATACFG_GP	BIT_ULL(1)
>> +#define PEBS_DATACFG_XMMS	BIT_ULL(2)
>> +#define PEBS_DATACFG_LBRS	BIT_ULL(3)
>> +
>> +#define ICL_EVENTSEL_ADAPTIVE				(1ULL << 34)
>> +#define PEBS_DATACFG_LBR_SHIFT	24
>> +#define MAX_NUM_LBR_ENTRY	32
> 
> Given all the PMU stuff coming in, I think we need e.g. lib/x86/pmu.h to hold all
> of the hardware-defined stuff, e.g. #defines and structs that are dictated by
> hardware.

Applied.

> 
>> +static inline u8 pebs_format(void)
>> +{
>> +	return (perf_cap & PERF_CAP_PEBS_FORMAT ) >> 8;
>> +}
>> +
>> +static inline bool pebs_has_baseline(void)
>> +{
>> +	return perf_cap & PMU_CAP_PEBS_BASELINE;
>> +}
> 
> These types of accessors can also go in pmu.h.  The easy thing is to just re-read
> PERF_CAPABILITIES every time, the overhead of the VM-Exit to emulate the RDMSR
> isn't meaningless in the grand scheme of the test.

More helpers will be added into lib/pmu.h.

> 
>> +static void pebs_enable(u64 bitmask, u64 pebs_data_cfg)
>> +{
>> +	static struct debug_store *ds;
>> +	u64 baseline_extra_ctrl, fixed_ctr_ctrl = 0;
>> +	unsigned int idx;
>> +
>> +	if (pebs_has_baseline())
> 
> This function can snapshot pebs_has_baseline() to avoid RDMSR on every touch.

Applied.

> 
>> +		wrmsr(MSR_PEBS_DATA_CFG, pebs_data_cfg);
>> +
>> +	ds = (struct debug_store *)ds_bufer;
>> +	ds->pebs_index = ds->pebs_buffer_base = (unsigned long)pebs_buffer;
>> +	ds->pebs_absolute_maximum = (unsigned long)pebs_buffer + PAGE_SIZE;
>> +	ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
>> +		get_adaptive_pebs_record_size(pebs_data_cfg);
>> +
>> +	for (idx = 0; idx < pmu_nr_fixed_counters(); idx++) {
>> +		if (!(BIT_ULL(INTEL_PMC_IDX_FIXED + idx) & bitmask))
>> +			continue;
>> +		baseline_extra_ctrl = pebs_has_baseline() ?
>> +			(1ULL << (INTEL_PMC_IDX_FIXED + idx * 4)) : 0;
> 
> Init baseline_extra_ctrl to zero outside of the loop, then this can avoid the
> ternary operator:

Fine with me, but why is the C ternary operator not welcome?

> 
> 		if (has_baseline)
> 			baseline_extra_ctrl = BIT(INTEL_PMC_IDX_FIXED + idx * 4);
> 
>> +		wrmsr(MSR_CORE_PERF_FIXED_CTR0 + idx, ctr_start_val);
> 
> Helpers (or macros?) to read/write counter MSRs would improve readability.

Emm, write_fixed_counter_value(idx, value);

> 
>> +		fixed_ctr_ctrl |= (0xbULL << (idx * 4) | baseline_extra_ctrl);
>> +	}
>> +	if (fixed_ctr_ctrl)
>> +		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, fixed_ctr_ctrl);
>> +
>> +	for (idx = 0; idx < max_nr_gp_events; idx++) {
>> +		if (!(BIT_ULL(idx) & bitmask))
>> +			continue;
>> +		baseline_extra_ctrl = pebs_has_baseline() ?
>> +			ICL_EVENTSEL_ADAPTIVE : 0;
> 
> Same thing as above, rely on the "has_baseline" not changing to avoid the ternary
> operator.
> 
>> +		wrmsr(MSR_P6_EVNTSEL0 + idx,
> 
> Add a helper/macro instead of manually indexing?

Emm, write_gp_event_select(...)

> 
>> +		      EVNTSEL_EN | EVNTSEL_OS | EVNTSEL_USR |
>> +		      intel_arch_events[idx] | baseline_extra_ctrl);
>> +		wrmsr(gp_counter_base + idx, ctr_start_val);
> 
> Continuing the theme of code reuse, please add a lib/pmu.c and move common code
> and variables there, e.g. tests shouldn't need to manually compute gp_counter_base.
> A common "PMU init" routine would allow the library to provide helpers for accessing
> GP counters too.

Applied, and some global variables are added to lib/pmu.c.

> 
>> +	}
>> +
>> +	wrmsr(MSR_IA32_DS_AREA,  (unsigned long)ds_bufer);
>> +	wrmsr(MSR_IA32_PEBS_ENABLE, bitmask);
>> +	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, bitmask);
>> +}
>> +
>> +static void pmu_env_cleanup(void)
> 
> This is probably a good candidate for library code.  And maybe reset_pmu() or so
> to provide a hint that this is often called _before_ running tests?

It will be renamed to reset_pebs() in the pebs specific src file.

> 
>> +{
>> +	unsigned int idx;
>> +
>> +	memset(ds_bufer, 0x0, PAGE_SIZE);
>> +	memset(pebs_buffer, 0x0, PAGE_SIZE);
>> +	wrmsr(MSR_IA32_PEBS_ENABLE, 0);
>> +	wrmsr(MSR_IA32_DS_AREA,  0);
>> +	if (pebs_has_baseline())
>> +		wrmsr(MSR_PEBS_DATA_CFG, 0);
>> +
>> +	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
>> +
>> +	wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
>> +	for (idx = 0; idx < pmu_nr_fixed_counters(); idx++) {
> 
> Curly braces aren't necessary.  And rather than call a function every time,
> add a global struct in the library to track the PMU capabilities.

The this_cpu_perf_capabilities() may help.
And, reset_all_{gp, fixed}_counters() are added into lib/pmu.h

> 
>> +		wrmsr(MSR_CORE_PERF_FIXED_CTR0 + idx, 0);
>> +	}
>> +
>> +	for (idx = 0; idx < pmu_nr_gp_counters(); idx++) {
>> +		wrmsr(MSR_P6_EVNTSEL0 + idx, 0);
>> +		wrmsr(MSR_IA32_PERFCTR0 + idx, 0);
>> +	}
>> +
>> +	wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, rdmsr(MSR_CORE_PERF_GLOBAL_STATUS));
>> +}
>> +
>> +static inline void pebs_disable_1(void)
>> +{
>> +	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
>> +}
>> +
>> +static inline void pebs_disable_2(void)
>> +{
>> +	wrmsr(MSR_IA32_PEBS_ENABLE, 0);
>> +	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
>> +}
>> +
>> +static void pebs_disable(unsigned int idx)
>> +{
>> +	if (idx % 2) {
> 
> Curly braces unnecessary.  That said, the helpers do not help.  It's much easier
> to do:
> 
> 	/* comment goes here */
> 	if (idx % 2)
> 		wrmsr(MSR_IA32_PEBS_ENABLE, 0);
> 
> 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);

Applied.

> 
> Please add a comment, it's not at all obvious to me (non-PMU person) what this
> code is doing.
> 
>> +		pebs_disable_1();
>> +	} else {
>> +		pebs_disable_2();
>> +	}
>> +}

If we only clear the PEBS_ENABLE bit, the counter will continue to increment.
If the counter overflows in this very tiny time window, no PEBS record will be
generated, only a normal counter irq. Test both ways of disabling to cover this fully.

>> +
>> +static void check_pebs_records(u64 bitmask, u64 pebs_data_cfg)
>> +{
>> +	struct pebs_basic *pebs_rec = (struct pebs_basic *)pebs_buffer;
>> +	struct debug_store *ds = (struct debug_store *)ds_bufer;
>> +	unsigned int pebs_record_size = get_adaptive_pebs_record_size(pebs_data_cfg);
>> +	unsigned int count = 0;
>> +	bool expected, pebs_idx_match, pebs_size_match, data_cfg_match;
>> +	void *vernier;
>> +
>> +	expected = (ds->pebs_index == ds->pebs_buffer_base) && !pebs_rec->format_size;
>> +	if (!(rdmsr(MSR_CORE_PERF_GLOBAL_STATUS) & GLOBAL_STATUS_BUFFER_OVF)) {
>> +		report(expected, "No OVF irq, none PEBS records.");
>> +		return;
>> +	}
>> +
>> +	if (expected) {
>> +		report(!expected, "A OVF irq, but none PEBS records.");
>> +		return;
>> +	}
>> +
>> +	expected = ds->pebs_index >= ds->pebs_interrupt_threshold;
>> +	vernier = (void *)pebs_buffer;
> 
> Heh, I have zero clue what vernier means here.  Dictionary says:
> 
>    a small movable graduated scale for obtaining fractional parts of
>    subdivisions on a fixed main scale of a barometer, sextant, or other measuring
>    instrument.
> 
> but that doesn't help me understand what this is doing.

Rename it to "cur_record".

> 
>> +	do {
>> +		pebs_rec = (struct pebs_basic *)vernier;
>> +		pebs_record_size = pebs_rec->format_size >> 48;
> 
> Add a #define instead of open coding a magic number.

/* bits [63:48] provide the size of the current record in bytes */
#define	RECORD_SIZE_OFFSET	48

> 
>> +		pebs_idx_match =
>> +			pebs_rec->applicable_counters & bitmask;
>> +		pebs_size_match =
>> +			pebs_record_size == get_adaptive_pebs_record_size(pebs_data_cfg);
>> +		data_cfg_match =
>> +			(pebs_rec->format_size & 0xffffffffffff) == pebs_data_cfg;
> 
> Please use GENMASK_ULL.

GENMASK_ULL(47, 0)

> 
>> +		expected = pebs_idx_match && pebs_size_match && data_cfg_match;
>> +		report(expected,
>> +		       "PEBS record (written seq %d) is verified (inclduing size, counters and cfg).", count);
>> +		vernier = vernier + pebs_record_size;
>> +		count++;
>> +	} while (expected && (void *)vernier < (void *)ds->pebs_index);
>> +
>> +	if (!expected) {
>> +		if (!pebs_idx_match)
>> +			printf("FAIL: The applicable_counters (0x%lx) doesn't match with pmc_bitmask (0x%lx).\n",
>> +			       pebs_rec->applicable_counters, bitmask);
>> +		if (!pebs_size_match)
>> +			printf("FAIL: The pebs_record_size (%d) doesn't match with MSR_PEBS_DATA_CFG (%d).\n",
>> +			       pebs_record_size, get_adaptive_pebs_record_size(pebs_data_cfg));
>> +		if (!data_cfg_match)
>> +			printf("FAIL: The pebs_data_cfg (0x%lx) doesn't match with MSR_PEBS_DATA_CFG (0x%lx).\n",
>> +			       pebs_rec->format_size & 0xffffffffffff, pebs_data_cfg);
>> +	}
>> +}
>> +
>> +static void check_one_counter(enum pmc_type type,
>> +			      unsigned int idx, u64 pebs_data_cfg)
>> +{
>> +	report_prefix_pushf("%s counter %d (0x%lx)",
>> +			    type == FIXED ? "Extended Fixed" : "GP", idx, ctr_start_val);
>> +	pmu_env_cleanup();
>> +	pebs_enable(BIT_ULL(type == FIXED ? INTEL_PMC_IDX_FIXED + idx : idx), pebs_data_cfg);
> 
> Please avoid burying ternary operators like this, it makes the code hard to follow
> because it's difficult to visually identify what goes with what.  You can also
> avoid copy+paste...
> 
> 	int pebs_bit = BIT_ULL(type == FIXED ? INTEL_PMC_IDX_FIXED + idx : idx);
> 
> 	...
> 
> 	pebs_enable(pebs_bit, pebs_data_cfg);
> 
> 	...
> 
> 	check_pebs_records(pebs_bits, pebs_data_cfg);

Applied.

> 
>> +	workload();
>> +	pebs_disable(idx);
>> +	check_pebs_records(BIT_ULL(type == FIXED ? INTEL_PMC_IDX_FIXED + idx : idx), pebs_data_cfg);
>> +	report_prefix_pop();
>> +}
>> +
>> +static void check_multiple_counters(u64 bitmask, u64 pebs_data_cfg)
>> +{
>> +	pmu_env_cleanup();
>> +	pebs_enable(bitmask, pebs_data_cfg);
>> +	workload2();
> 
> 
>> +	pebs_disable(0);
> 
> Too much magic.  Looks like the intent is to trigger writes to both MSRs, but why?

In this case, more than one PEBS record will be generated (from more than one
counter).

> 
>> +	check_pebs_records(bitmask, pebs_data_cfg);
>> +}
>> +
>> +static void check_pebs_counters(u64 pebs_data_cfg)
>> +{
>> +	unsigned int idx;
>> +	u64 bitmask = 0;
>> +
>> +	for (idx = 0; idx < pmu_nr_fixed_counters(); idx++)
>> +		check_one_counter(FIXED, idx, pebs_data_cfg);
>> +
>> +	for (idx = 0; idx < max_nr_gp_events; idx++)
>> +		check_one_counter(GP, idx, pebs_data_cfg);
>> +
>> +	for (idx = 0; idx < pmu_nr_fixed_counters(); idx++)
>> +		bitmask |= BIT_ULL(INTEL_PMC_IDX_FIXED + idx);
>> +	for (idx = 0; idx < max_nr_gp_events; idx += 2)
>> +		bitmask |= BIT_ULL(idx);
>> +	report_prefix_pushf("Multiple (0x%lx)", bitmask);
>> +	check_multiple_counters(bitmask, pebs_data_cfg);
>> +	report_prefix_pop();
>> +}
>> +
>> +int main(int ac, char **av)
>> +{
>> +	unsigned int i, j;
>> +
>> +	setup_vm();
>> +
>> +	max_nr_gp_events = MIN(pmu_nr_gp_counters(), ARRAY_SIZE(intel_arch_events));
>> +
>> +	printf("PMU version: %d\n", pmu_version());
>> +	if (this_cpu_has(X86_FEATURE_PDCM))
>> +		perf_cap = rdmsr(MSR_IA32_PERF_CAPABILITIES);
>> +
>> +	if (perf_cap & PMU_CAP_FW_WRITES)
>> +		gp_counter_base = MSR_IA32_PMC0;
>> +
>> +	if (!is_intel()) {
>> +		report_skip("PEBS is only supported on Intel CPUs (ICX or later)");
> 
> State exactly what check failed so that the user doesn't need to look at the code
> to understand exactly what failed.  E.g. the "ICX or later" can be interpreted as
> "the check failed because it's not ICX+", but that's not what the code does.

Applied.

> 
> 		report_skip("PEBS requires Intel ICX or later, non-Intel detected");
> 
>> +		return report_summary();
>> +	} else if (pmu_version() < 2) {
>> +		report_skip("Architectural PMU version is not high enough");
> 
> Again, unnecessarily vague.  Don't make the user read the code, provide all the info
> in the error message.
> 
> 		report_skip("PEBS required PMU version 2, reported version is %d",
> 			    pmu_version());

Applied.

> 		
>> +		return report_summary();
>> +	} else if (!pebs_format()) {
>> +		report_skip("PEBS not enumerated in PERF_CAPABILITIES");
>> +		return report_summary();
>> +	} else if (rdmsr(MSR_IA32_MISC_ENABLE) & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL) {
>> +		report_skip("PEBS unavailable according to MISC_ENABLE");
>> +		return report_summary();
>> +	}
>> +
>> +	printf("PEBS format: %d\n", pebs_format());
>> +	printf("PEBS GP counters: %d\n", pmu_nr_gp_counters());
>> +	printf("PEBS Fixed counters: %d\n", pmu_nr_fixed_counters());
>> +	printf("PEBS baseline (Adaptive PEBS): %d\n", pebs_has_baseline());
>> +
>> +	printf("Known reasons for none PEBS records:\n");
>> +	printf("1. The selected event does not support PEBS;\n");
>> +	printf("2. From a core pmu perspective, the vCPU and pCPU models are not same;\n");
>> +	printf("3. Guest counter has not yet overflowed or been cross-mapped by the host;\n");
> 
> Printing this every time the test is run is confusing.  If the goal is to help
> users debug failures, then a comment will probably suffice.

Applied.

> 
>> diff --git a/x86/unittests.cfg b/x86/unittests.cfg
>> index 01d775e..d55db99 100644
>> --- a/x86/unittests.cfg
>> +++ b/x86/unittests.cfg
>> @@ -198,6 +198,13 @@ check = /sys/module/kvm/parameters/ignore_msrs=N
>>   check = /proc/sys/kernel/nmi_watchdog=0
>>   accel = kvm
>>   
>> +[pmu_pebs]
>> +arch = x86_64
>> +file = pmu_pebs.flat
>> +extra_params = -cpu host,migratable=no
>> +check = /proc/sys/kernel/nmi_watchdog=0
>> +accel = kvm
> 
> In a separate commit, add a group for this and all other PMU tests
> 
>    groups = pmu
> 
> so that it's easy to run all PMU tests, e.g. when making PMU KVM changes.
Sean Christopherson Oct. 21, 2022, 6:51 p.m. UTC | #3
On Tue, Oct 18, 2022, Like Xu wrote:
> Most of the comments will be addressed in the next version.
> 
> On 6/10/2022 3:14 am, Sean Christopherson wrote:
> > On Thu, Jul 28, 2022, Like Xu wrote:
> > > +#include "vm.h"
> > > +#include "types.h"
> > > +#include "processor.h"
> > > +#include "vmalloc.h"
> > > +#include "alloc_page.h"
> > > +
> > > +#define PC_VECTOR	32
> > 
> > PC?
> 
> Part of legacy code, may be "performance counter vector" ?

Ah, it comes from the "LVT Performance Counter Register".

> It will be reused in the new lib/pmu.h.

Any objection to renaming it to PMI_VECTOR?  That's much more familiar for KVM
developers and it's still correct, e.g. it's the PMI vector that's programmed into
the LVT PC register.
diff mbox series

Patch

diff --git a/lib/x86/msr.h b/lib/x86/msr.h
index fa1c0c8..252e041 100644
--- a/lib/x86/msr.h
+++ b/lib/x86/msr.h
@@ -52,6 +52,7 @@ 
 #define MSR_IA32_MCG_CTL		0x0000017b
 
 #define MSR_IA32_PEBS_ENABLE		0x000003f1
+#define MSR_PEBS_DATA_CFG		0x000003f2
 #define MSR_IA32_DS_AREA		0x00000600
 #define MSR_IA32_PERF_CAPABILITIES	0x00000345
 
diff --git a/x86/Makefile.x86_64 b/x86/Makefile.x86_64
index 8f9463c..bd827fe 100644
--- a/x86/Makefile.x86_64
+++ b/x86/Makefile.x86_64
@@ -33,6 +33,7 @@  tests += $(TEST_DIR)/vmware_backdoors.$(exe)
 tests += $(TEST_DIR)/rdpru.$(exe)
 tests += $(TEST_DIR)/pks.$(exe)
 tests += $(TEST_DIR)/pmu_lbr.$(exe)
+tests += $(TEST_DIR)/pmu_pebs.$(exe)
 
 ifeq ($(CONFIG_EFI),y)
 tests += $(TEST_DIR)/amd_sev.$(exe)
diff --git a/x86/pmu_pebs.c b/x86/pmu_pebs.c
new file mode 100644
index 0000000..db4ecbf
--- /dev/null
+++ b/x86/pmu_pebs.c
@@ -0,0 +1,486 @@ 
+#include "x86/msr.h"
+#include "x86/processor.h"
+#include "x86/isr.h"
+#include "x86/apic.h"
+#include "x86/apic-defs.h"
+#include "x86/desc.h"
+#include "alloc.h"
+
+#include "vm.h"
+#include "types.h"
+#include "processor.h"
+#include "vmalloc.h"
+#include "alloc_page.h"
+
+#define PC_VECTOR	32	/* vector main() registers cnt_overflow() on */
+
+#define	X86_FEATURE_PDCM		(CPUID(0x1, 0, ECX, 15))	/* gates the PERF_CAPABILITIES read in main() */
+
+#define PERF_CAP_PEBS_FORMAT	0xf00		/* perf_cap bits 11:8, see pebs_format() */
+#define PMU_CAP_FW_WRITES	(1ULL << 13)
+#define PMU_CAP_PEBS_BASELINE	(1ULL << 14)
+
+#define INTEL_PMC_IDX_FIXED	32	/* position of fixed counter 0 in the enable bitmasks */
+
+#define GLOBAL_STATUS_BUFFER_OVF_BIT		62
+#define GLOBAL_STATUS_BUFFER_OVF	BIT_ULL(GLOBAL_STATUS_BUFFER_OVF_BIT)
+
+#define EVNTSEL_USR_SHIFT	16
+#define EVNTSEL_OS_SHIFT	17
+#define EVNTSEL_EN_SHIFT	22
+
+#define EVNTSEL_EN	(1 << EVNTSEL_EN_SHIFT)
+#define EVNTSEL_USR	(1 << EVNTSEL_USR_SHIFT)
+#define EVNTSEL_OS	(1 << EVNTSEL_OS_SHIFT)
+
+#define PEBS_DATACFG_MEMINFO	BIT_ULL(0)
+#define PEBS_DATACFG_GP	BIT_ULL(1)
+#define PEBS_DATACFG_XMMS	BIT_ULL(2)
+#define PEBS_DATACFG_LBRS	BIT_ULL(3)
+
+#define ICL_EVENTSEL_ADAPTIVE	(1ULL << 34)	/* OR-ed into a GP event select when baseline is advertised */
+#define PEBS_DATACFG_LBR_SHIFT	24
+#define MAX_NUM_LBR_ENTRY	32
+
+static u64 gp_counter_base = MSR_IA32_PERFCTR0;	/* MSR of GP counter 0; main() switches it to MSR_IA32_PMC0 when PMU_CAP_FW_WRITES is set */
+static unsigned int max_nr_gp_events;	/* MIN(GP counters, ARRAY_SIZE(intel_arch_events)), set in main() */
+static unsigned long *ds_bufer;	/* page used as struct debug_store, see alloc_buffers() */
+static unsigned long *pebs_buffer;	/* page installed as the PEBS record buffer by pebs_enable() */
+static u64 ctr_start_val;	/* value pebs_enable() loads into every selected counter */
+static u64 perf_cap;	/* MSR_IA32_PERF_CAPABILITIES, left 0 when PDCM is not reported */
+
+struct debug_store {	/* layout of the ds_bufer page whose address pebs_enable() writes to MSR_IA32_DS_AREA */
+	u64	bts_buffer_base;	/* the bts_* fields are never written by this test */
+	u64	bts_index;
+	u64	bts_absolute_maximum;
+	u64	bts_interrupt_threshold;
+	u64	pebs_buffer_base;	/* address of pebs_buffer */
+	u64	pebs_index;	/* starts at pebs_buffer_base; check_pebs_records() walks records up to it */
+	u64	pebs_absolute_maximum;	/* pebs_buffer + PAGE_SIZE */
+	u64	pebs_interrupt_threshold;	/* pebs_buffer_base + size of one record */
+	u64	pebs_event_reset[64];	/* never written by this test */
+};
+
+struct pebs_basic {	/* start of every record; check_pebs_records() reads records through this type */
+	u64 format_size;	/* bits 63:48 are checked as the record size, bits 47:0 against pebs_data_cfg */
+	u64 ip;	/* not examined by this test */
+	u64 applicable_counters;	/* must share at least one bit with the enabled bitmask */
+	u64 tsc;	/* not examined by this test */
+};
+
+struct pebs_meminfo {	/* only its size is used: added to the record size when PEBS_DATACFG_MEMINFO is set */
+	u64 address;
+	u64 aux;
+	u64 latency;
+	u64 tsx_tuning;
+};
+
+struct pebs_gprs {	/* only its size is used: added to the record size when PEBS_DATACFG_GP is set */
+	u64 flags, ip, ax, cx, dx, bx, sp, bp, si, di;
+	u64 r8, r9, r10, r11, r12, r13, r14, r15;
+};
+
+struct pebs_xmm {	/* only its size is used: added to the record size when PEBS_DATACFG_XMMS is set */
+	u64 xmm[16*2];	/* 16 registers, two u64 entries for each register */
+};
+
+struct lbr_entry {	/* only its size is used: MAX_NUM_LBR_ENTRY of these are added when PEBS_DATACFG_LBRS is set */
+	u64 from;
+	u64 to;
+	u64 info;
+};
+
+enum pmc_type {	/* counter class handed to check_one_counter() */
+	GP = 0,	/* general-purpose counter: bit idx of the enable bitmask */
+	FIXED,	/* fixed counter: bit INTEL_PMC_IDX_FIXED + idx of the enable bitmask */
+};
+
+static uint32_t intel_arch_events[] = {	/* entry idx is OR-ed into the event select of GP counter idx by pebs_enable() */
+	0x00c4, /* PERF_COUNT_HW_BRANCH_INSTRUCTIONS */
+	0x00c5, /* PERF_COUNT_HW_BRANCH_MISSES */
+	0x0300, /* PERF_COUNT_HW_REF_CPU_CYCLES */
+	0x003c, /* PERF_COUNT_HW_CPU_CYCLES */
+	0x00c0, /* PERF_COUNT_HW_INSTRUCTIONS */
+	0x013c, /* PERF_COUNT_HW_BUS_CYCLES */
+	0x4f2e, /* PERF_COUNT_HW_CACHE_REFERENCES */
+	0x412e, /* PERF_COUNT_HW_CACHE_MISSES */
+};
+
+static u64 pebs_data_cfgs[] = {	/* MSR_PEBS_DATA_CFG values main() iterates over when PEBS baseline is advertised */
+	PEBS_DATACFG_MEMINFO,
+	PEBS_DATACFG_GP,
+	PEBS_DATACFG_XMMS,
+	PEBS_DATACFG_LBRS | ((MAX_NUM_LBR_ENTRY -1) << PEBS_DATACFG_LBR_SHIFT),	/* LBR group, with MAX_NUM_LBR_ENTRY - 1 placed at bit PEBS_DATACFG_LBR_SHIFT */
+};
+
+/* Trying every possible start value would take too long, so main() iterates over a few typical ones. */
+static u64 counter_start_values[] = {
+	/* case where the PEBS counter does not overflow at all */
+	0,
+	0xfffffffffff0,	/* 0x10 below 2^48; NOTE(review): the original gives no separate rationale for this value */
+	/* ordinary counter overflow, expected to produce PEBS records (2 below 2^48) */
+	0xfffffffffffe,
+	/* checks whether emulated instructions trigger PEBS (1 below 2^48) */
+	0xffffffffffff,
+};
+
+static inline u8 pebs_format(void)	/* PEBS format field of perf_cap (bits 11:8) */
+{
+	return (u8)((perf_cap & PERF_CAP_PEBS_FORMAT) >> 8);
+}
+
+static inline bool pebs_has_baseline(void)	/* true when perf_cap has PMU_CAP_PEBS_BASELINE set */
+{
+	return (perf_cap & PMU_CAP_PEBS_BASELINE) != 0;
+}
+
+/*
+ * Expected size in bytes of one PEBS record for @pebs_data_cfg: the basic
+ * group plus every optional group selected, or the basic group alone when
+ * PEBS baseline is not advertised.  LBRs count as MAX_NUM_LBR_ENTRY entries.
+ */
+static unsigned int get_adaptive_pebs_record_size(u64 pebs_data_cfg)
+{
+	unsigned int bytes = sizeof(struct pebs_basic);
+
+	if (pebs_has_baseline()) {
+		bytes += (pebs_data_cfg & PEBS_DATACFG_MEMINFO) ? sizeof(struct pebs_meminfo) : 0;
+		bytes += (pebs_data_cfg & PEBS_DATACFG_GP) ? sizeof(struct pebs_gprs) : 0;
+		bytes += (pebs_data_cfg & PEBS_DATACFG_XMMS) ? sizeof(struct pebs_xmm) : 0;
+		bytes += (pebs_data_cfg & PEBS_DATACFG_LBRS) ?
+			 MAX_NUM_LBR_ENTRY * sizeof(struct lbr_entry) : 0;
+	}
+	return bytes;
+}
+
+static void cnt_overflow(isr_regs_t *regs)	/* handler main() installs on PC_VECTOR; @regs is unused */
+{
+	apic_write(APIC_EOI, 0);	/* signal end-of-interrupt; nothing else is done here */
+}
+
+/* 8 never-taken jne (the cmp sets ZF), then 6 CPUID(0xa).  The branch target is a numeric
+ * local label, so inlined or duplicated copies of this asm cannot clash on a symbol name. */
+static inline void workload(void)
+{
+	asm volatile(
+		"mov $0x0, %%eax\n"
+		"cmp $0x0, %%eax\n"
+		"jne 1f\n"
+		"jne 1f\n"
+		"jne 1f\n"
+		"jne 1f\n"
+		"mov $0x0, %%eax\n"
+		"cmp $0x0, %%eax\n"
+		"jne 1f\n"
+		"jne 1f\n"
+		"jne 1f\n"
+		"jne 1f\n"
+		"mov $0xa, %%eax\n"
+		"cpuid\n"
+		"mov $0xa, %%eax\n"
+		"cpuid\n"
+		"mov $0xa, %%eax\n"
+		"cpuid\n"
+		"mov $0xa, %%eax\n"
+		"cpuid\n"
+		"mov $0xa, %%eax\n"
+		"cpuid\n"
+		"mov $0xa, %%eax\n"
+		"cpuid\n"
+		"1:\n"
+		: : : "eax", "ebx", "ecx", "edx");
+}
+
+/* Same instruction mix as workload(); run by check_multiple_counters().  The branch target
+ * is a numeric local label, so inlined or duplicated copies cannot clash on a symbol name. */
+static inline void workload2(void)
+{
+	asm volatile(
+		"mov $0x0, %%eax\n"
+		"cmp $0x0, %%eax\n"
+		"jne 1f\n"
+		"jne 1f\n"
+		"jne 1f\n"
+		"jne 1f\n"
+		"mov $0x0, %%eax\n"
+		"cmp $0x0, %%eax\n"
+		"jne 1f\n"
+		"jne 1f\n"
+		"jne 1f\n"
+		"jne 1f\n"
+		"mov $0xa, %%eax\n"
+		"cpuid\n"
+		"mov $0xa, %%eax\n"
+		"cpuid\n"
+		"mov $0xa, %%eax\n"
+		"cpuid\n"
+		"mov $0xa, %%eax\n"
+		"cpuid\n"
+		"mov $0xa, %%eax\n"
+		"cpuid\n"
+		"mov $0xa, %%eax\n"
+		"cpuid\n"
+		"1:\n"
+		: : : "eax", "ebx", "ecx", "edx");
+}
+
+static void alloc_buffers(void)	/* one zeroed page each for the DS area and the PEBS records */
+{
+	ds_bufer = alloc_page();	/* NOTE(review): the result is not checked for NULL */
+	force_4k_page(ds_bufer);	/* presumably ensures the page is mapped with a 4K mapping — helper not shown here */
+	memset(ds_bufer, 0x0, PAGE_SIZE);
+
+	pebs_buffer = alloc_page();	/* NOTE(review): the result is not checked for NULL */
+	force_4k_page(pebs_buffer);
+	memset(pebs_buffer, 0x0, PAGE_SIZE);
+}
+
+static void free_buffers(void)	/* release whichever of the two pages from alloc_buffers() is non-NULL */
+{
+	if (ds_bufer)
+		free_page(ds_bufer);	/* the pointer itself is not reset afterwards */
+
+	if (pebs_buffer)
+		free_page(pebs_buffer);	/* the pointer itself is not reset afterwards */
+}
+
+/* Arm PEBS on the counters in @bitmask (GP: bit idx, fixed: bit INTEL_PMC_IDX_FIXED + idx):
+ * empty the DS area, preload the counters with ctr_start_val, then enable them. */
+static void pebs_enable(u64 bitmask, u64 pebs_data_cfg)
+{
+	struct debug_store *ds = (struct debug_store *)ds_bufer;
+	u64 baseline_extra_ctrl, fixed_ctr_ctrl = 0;
+	unsigned int idx;
+
+	if (pebs_has_baseline())
+		wrmsr(MSR_PEBS_DATA_CFG, pebs_data_cfg);
+	ds->pebs_index = ds->pebs_buffer_base = (unsigned long)pebs_buffer;
+	ds->pebs_absolute_maximum = (unsigned long)pebs_buffer + PAGE_SIZE;
+	ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
+		get_adaptive_pebs_record_size(pebs_data_cfg);
+
+	for (idx = 0; idx < pmu_nr_fixed_counters(); idx++) {
+		if (!(BIT_ULL(INTEL_PMC_IDX_FIXED + idx) & bitmask))
+			continue;
+		baseline_extra_ctrl = pebs_has_baseline() ?
+			(1ULL << (INTEL_PMC_IDX_FIXED + idx * 4)) : 0;
+		wrmsr(MSR_CORE_PERF_FIXED_CTR0 + idx, ctr_start_val);
+		fixed_ctr_ctrl |= (0xbULL << (idx * 4) | baseline_extra_ctrl);
+	}
+	if (fixed_ctr_ctrl)
+		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, fixed_ctr_ctrl);
+
+	for (idx = 0; idx < max_nr_gp_events; idx++) {
+		if (!(BIT_ULL(idx) & bitmask))
+			continue;
+		baseline_extra_ctrl = pebs_has_baseline() ?
+			ICL_EVENTSEL_ADAPTIVE : 0;
+		wrmsr(MSR_P6_EVNTSEL0 + idx,
+		      EVNTSEL_EN | EVNTSEL_OS | EVNTSEL_USR |
+		      intel_arch_events[idx] | baseline_extra_ctrl);
+		wrmsr(gp_counter_base + idx, ctr_start_val);
+	}
+
+	wrmsr(MSR_IA32_DS_AREA,  (unsigned long)ds_bufer);
+	wrmsr(MSR_IA32_PEBS_ENABLE, bitmask);
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, bitmask);
+}
+
+static void pmu_env_cleanup(void)	/* zero both buffers and the PMU MSRs this test programs */
+{
+	unsigned int idx;
+
+	memset(ds_bufer, 0x0, PAGE_SIZE);
+	memset(pebs_buffer, 0x0, PAGE_SIZE);
+	wrmsr(MSR_IA32_PEBS_ENABLE, 0);
+	wrmsr(MSR_IA32_DS_AREA,  0);
+	if (pebs_has_baseline())	/* MSR_PEBS_DATA_CFG is only touched when baseline is advertised */
+		wrmsr(MSR_PEBS_DATA_CFG, 0);
+
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+	wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
+	for (idx = 0; idx < pmu_nr_fixed_counters(); idx++) {
+		wrmsr(MSR_CORE_PERF_FIXED_CTR0 + idx, 0);
+	}
+
+	for (idx = 0; idx < pmu_nr_gp_counters(); idx++) {	/* every GP counter, not only the first max_nr_gp_events */
+		wrmsr(MSR_P6_EVNTSEL0 + idx, 0);
+		wrmsr(MSR_IA32_PERFCTR0 + idx, 0);
+	}
+
+	wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, rdmsr(MSR_CORE_PERF_GLOBAL_STATUS));	/* write back the status bits currently set — presumably clears them */
+}
+
+static inline void pebs_disable_1(void)	/* disable flavour used by pebs_disable() for odd idx */
+{
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);	/* MSR_IA32_PEBS_ENABLE is left as programmed */
+}
+
+static inline void pebs_disable_2(void)	/* disable flavour used by pebs_disable() for even idx */
+{
+	wrmsr(MSR_IA32_PEBS_ENABLE, 0);	/* PEBS is switched off first ... */
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);	/* ... then GLOBAL_CTRL is cleared */
+}
+
+/* End a measurement.  Even @idx clears PEBS_ENABLE and then GLOBAL_CTRL (pebs_disable_2);
+ * odd @idx clears GLOBAL_CTRL only (pebs_disable_1) — presumably to exercise both ways. */
+static void pebs_disable(unsigned int idx)
+{
+	if (!(idx % 2))
+		pebs_disable_2();
+	else
+		pebs_disable_1();
+}
+
+/* Verify one PEBS run.  If GLOBAL_STATUS.BUFFER_OVF is clear the buffer must be empty;
+ * if set, each record up to ds->pebs_index must name a counter in @bitmask, have the
+ * size expected for @pebs_data_cfg and carry that cfg in its low 48 bits. */
+static void check_pebs_records(u64 bitmask, u64 pebs_data_cfg)
+{
+	struct pebs_basic *pebs_rec = (struct pebs_basic *)pebs_buffer;
+	struct debug_store *ds = (struct debug_store *)ds_bufer;
+	unsigned int expected_size = get_adaptive_pebs_record_size(pebs_data_cfg);
+	unsigned int pebs_record_size, count = 0;
+	bool expected, pebs_idx_match, pebs_size_match, data_cfg_match;
+	void *vernier;
+
+	expected = (ds->pebs_index == ds->pebs_buffer_base) && !pebs_rec->format_size;
+	if (!(rdmsr(MSR_CORE_PERF_GLOBAL_STATUS) & GLOBAL_STATUS_BUFFER_OVF)) {
+		report(expected, "No OVF irq, none PEBS records.");
+		return;
+	}
+
+	if (expected) {	/* overflow flagged but nothing written: always reported as a failure */
+		report(!expected, "A OVF irq, but none PEBS records.");
+		return;
+	}
+
+	vernier = (void *)pebs_buffer;
+	do {
+		pebs_rec = (struct pebs_basic *)vernier;
+		pebs_record_size = pebs_rec->format_size >> 48;
+		pebs_idx_match = pebs_rec->applicable_counters & bitmask;
+		pebs_size_match = pebs_record_size == expected_size;
+		data_cfg_match =
+			(pebs_rec->format_size & 0xffffffffffff) == pebs_data_cfg;
+		expected = pebs_idx_match && pebs_size_match && data_cfg_match;
+		report(expected,
+		       "PEBS record (written seq %d) is verified (including size, counters and cfg).", count);
+		vernier = vernier + pebs_record_size;
+		count++;
+	} while (expected && (void *)vernier < (void *)ds->pebs_index);
+
+	if (!expected) {	/* the walk stopped at the first bad record; say which checks failed */
+		if (!pebs_idx_match)
+			printf("FAIL: The applicable_counters (0x%lx) doesn't match with pmc_bitmask (0x%lx).\n",
+			       pebs_rec->applicable_counters, bitmask);
+		if (!pebs_size_match)
+			printf("FAIL: The pebs_record_size (%d) doesn't match with MSR_PEBS_DATA_CFG (%d).\n",
+			       pebs_record_size, expected_size);
+		if (!data_cfg_match)
+			printf("FAIL: The pebs_data_cfg (0x%lx) doesn't match with MSR_PEBS_DATA_CFG (0x%lx).\n",
+			       pebs_rec->format_size & 0xffffffffffff, pebs_data_cfg);
+	}
+}
+
+static void check_one_counter(enum pmc_type type, unsigned int idx, u64 pebs_data_cfg)
+{
+	u64 pmc_bit = BIT_ULL(type == FIXED ? INTEL_PMC_IDX_FIXED + idx : idx);	/* the single counter under test */
+
+	report_prefix_pushf("%s counter %d (0x%lx)", type == FIXED ? "Extended Fixed" : "GP", idx, ctr_start_val);
+	pmu_env_cleanup();	/* start from zeroed buffers and PMU MSRs */
+	pebs_enable(pmc_bit, pebs_data_cfg);
+	workload();
+	pebs_disable(idx);	/* idx parity selects the disable flavour */
+	check_pebs_records(pmc_bit, pebs_data_cfg);
+	report_prefix_pop();
+}
+
+static void check_multiple_counters(u64 bitmask, u64 pebs_data_cfg)	/* one reset/arm/run/verify cycle with every counter in @bitmask enabled */
+{
+	pmu_env_cleanup();
+	pebs_enable(bitmask, pebs_data_cfg);
+	workload2();
+	pebs_disable(0);	/* 0 is even, so pebs_disable_2() is used */
+	check_pebs_records(bitmask, pebs_data_cfg);	/* the report prefix is pushed by the caller */
+}
+
+/* Test every fixed and GP counter on its own, then all fixed counters plus the
+ * even-numbered GP counters together in one run. */
+static void check_pebs_counters(u64 pebs_data_cfg)
+{
+	u64 multi_mask = 0;
+	unsigned int i;
+
+	for (i = 0; i < pmu_nr_fixed_counters(); i++)
+		check_one_counter(FIXED, i, pebs_data_cfg);
+	for (i = 0; i < max_nr_gp_events; i++)
+		check_one_counter(GP, i, pebs_data_cfg);
+	for (i = 0; i < pmu_nr_fixed_counters(); i++)
+		multi_mask |= BIT_ULL(INTEL_PMC_IDX_FIXED + i);
+	for (i = 0; i < max_nr_gp_events; i += 2)
+		multi_mask |= BIT_ULL(i);
+	report_prefix_pushf("Multiple (0x%lx)", multi_mask);
+	check_multiple_counters(multi_mask, pebs_data_cfg);
+	report_prefix_pop();
+}
+
+/* Skip unless PEBS is usable (Intel, PMU version >= 2, PEBS format enumerated, not marked
+ * unavailable in MISC_ENABLE); else check all counters for each start value and data cfg. */
+int main(int ac, char **av)
+{
+	unsigned int v, c;
+
+	setup_vm();
+	max_nr_gp_events = MIN(pmu_nr_gp_counters(), ARRAY_SIZE(intel_arch_events));
+
+	printf("PMU version: %d\n", pmu_version());
+	if (this_cpu_has(X86_FEATURE_PDCM))
+		perf_cap = rdmsr(MSR_IA32_PERF_CAPABILITIES);
+	if (perf_cap & PMU_CAP_FW_WRITES)
+		gp_counter_base = MSR_IA32_PMC0;
+
+	if (!is_intel()) {
+		report_skip("PEBS is only supported on Intel CPUs (ICX or later)");
+		return report_summary();
+	}
+	if (pmu_version() < 2) {
+		report_skip("Architectural PMU version is not high enough");
+		return report_summary();
+	}
+	if (!pebs_format()) {
+		report_skip("PEBS not enumerated in PERF_CAPABILITIES");
+		return report_summary();
+	}
+	if (rdmsr(MSR_IA32_MISC_ENABLE) & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL) {
+		report_skip("PEBS unavailable according to MISC_ENABLE");
+		return report_summary();
+	}
+
+	printf("PEBS format: %d\n", pebs_format());
+	printf("PEBS GP counters: %d\n", pmu_nr_gp_counters());
+	printf("PEBS Fixed counters: %d\n", pmu_nr_fixed_counters());
+	printf("PEBS baseline (Adaptive PEBS): %d\n", pebs_has_baseline());
+	printf("Known reasons for none PEBS records:\n");
+	printf("1. The selected event does not support PEBS;\n");
+	printf("2. From a core pmu perspective, the vCPU and pCPU models are not same;\n");
+	printf("3. Guest counter has not yet overflowed or been cross-mapped by the host;\n");
+
+	handle_irq(PC_VECTOR, cnt_overflow);
+	alloc_buffers();
+
+	for (v = 0; v < ARRAY_SIZE(counter_start_values); v++) {
+		ctr_start_val = counter_start_values[v];
+		check_pebs_counters(0);
+		if (!pebs_has_baseline())
+			continue;
+		for (c = 0; c < ARRAY_SIZE(pebs_data_cfgs); c++) {
+			report_prefix_pushf("Adaptive (0x%lx)", pebs_data_cfgs[c]);
+			check_pebs_counters(pebs_data_cfgs[c]);
+			report_prefix_pop();
+		}
+	}
+	free_buffers();
+
+	return report_summary();
+}
diff --git a/x86/unittests.cfg b/x86/unittests.cfg
index 01d775e..d55db99 100644
--- a/x86/unittests.cfg
+++ b/x86/unittests.cfg
@@ -198,6 +198,13 @@  check = /sys/module/kvm/parameters/ignore_msrs=N
 check = /proc/sys/kernel/nmi_watchdog=0
 accel = kvm
 
+[pmu_pebs]
+arch = x86_64
+file = pmu_pebs.flat
+extra_params = -cpu host,migratable=no
+check = /proc/sys/kernel/nmi_watchdog=0
+accel = kvm
+
 [pmu_emulation]
 file = pmu.flat
 arch = x86_64