[v6,06/16] KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS

Message ID 20210511024214.280733-7-like.xu@linux.intel.com (mailing list archive)
State New, archived
Series KVM: x86/pmu: Add *basic* support to enable guest PEBS via DS

Commit Message

Like Xu May 11, 2021, 2:42 a.m. UTC
If IA32_PERF_CAPABILITIES.PEBS_BASELINE [bit 14] is set, the
IA32_PEBS_ENABLE MSR exists and all architecturally enumerated fixed
and general purpose counters have corresponding bits in IA32_PEBS_ENABLE
that enable generation of PEBS records. The general-purpose counter bits
start at bit IA32_PEBS_ENABLE[0], and the fixed counter bits start at
bit IA32_PEBS_ENABLE[32].
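
Expressed as a sketch (the helper macros are hypothetical; only the bit
positions come from the architectural description above):

	/* Hypothetical helpers mirroring the IA32_PEBS_ENABLE layout. */
	#define PEBS_ENABLE_GP(n)	(1ULL << (n))		/* GP counter n    */
	#define PEBS_ENABLE_FIXED(n)	(1ULL << (32 + (n)))	/* fixed counter n */

	/* e.g. generate PEBS records from GP counter 0 and fixed counter 1: */
	u64 pebs_enable = PEBS_ENABLE_GP(0) | PEBS_ENABLE_FIXED(1);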

When guest PEBS is enabled, the IA32_PEBS_ENABLE MSR will be added to
the perf_guest_switch_msr() list and atomically switched during VMX
transitions, just like the CORE_PERF_GLOBAL_CTRL MSR.
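
For reference, the consumer of these switch entries on the KVM/VMX side
looks roughly like this (a sketch of atomic_switch_perf_msrs() as of
v5.12; this series additionally threads a data pointer through
perf_guest_get_msrs()):

	static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
	{
		int i, nr_msrs;
		struct perf_guest_switch_msr *msrs;

		/* Ask perf which MSRs must be swapped around VM-entry/exit. */
		msrs = perf_guest_get_msrs(&nr_msrs);
		if (!msrs)
			return;

		for (i = 0; i < nr_msrs; i++)
			if (msrs[i].host == msrs[i].guest)
				clear_atomic_switch_msr(vmx, msrs[i].msr);
			else
				add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
						      msrs[i].host, false);
	}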

Based on whether the platform supports x86_pmu.pebs_vmx, this patch
also refactors how MSRs are added to arr[] in intel_guest_get_msrs()
for extensibility.

Originally-by: Andi Kleen <ak@linux.intel.com>
Co-developed-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Co-developed-by: Luwei Kang <luwei.kang@intel.com>
Signed-off-by: Luwei Kang <luwei.kang@intel.com>
Signed-off-by: Like Xu <like.xu@linux.intel.com>
---
 arch/x86/events/intel/core.c     | 60 +++++++++++++++++++++-----------
 arch/x86/include/asm/kvm_host.h  |  3 ++
 arch/x86/include/asm/msr-index.h |  6 ++++
 arch/x86/kvm/vmx/pmu_intel.c     | 31 +++++++++++++++++
 4 files changed, 79 insertions(+), 21 deletions(-)

Comments

Peter Zijlstra May 17, 2021, 8:32 a.m. UTC | #1
On Tue, May 11, 2021 at 10:42:04AM +0800, Like Xu wrote:
> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
> index 2f89fd599842..c791765f4761 100644
> --- a/arch/x86/events/intel/core.c
> +++ b/arch/x86/events/intel/core.c
> @@ -3898,31 +3898,49 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)
>  	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>  	struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
>  	u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
> +	u64 pebs_mask = (x86_pmu.flags & PMU_FL_PEBS_ALL) ?
> +		cpuc->pebs_enabled : (cpuc->pebs_enabled & PEBS_COUNTER_MASK);
> +
> +	*nr = 0;
> +	arr[(*nr)++] = (struct perf_guest_switch_msr){
> +		.msr = MSR_CORE_PERF_GLOBAL_CTRL,
> +		.host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask,
> +		.guest = intel_ctrl & (~cpuc->intel_ctrl_host_mask | ~pebs_mask),
> +	};
>  
> +	if (!x86_pmu.pebs)
> +		return arr;
>  
> +	/*
> +	 * If PMU counter has PEBS enabled it is not enough to
> +	 * disable counter on a guest entry since PEBS memory
> +	 * write can overshoot guest entry and corrupt guest
> +	 * memory. Disabling PEBS solves the problem.
> +	 *
> +	 * Don't do this if the CPU already enforces it.
> +	 */
> +	if (x86_pmu.pebs_no_isolation) {
> +		arr[(*nr)++] = (struct perf_guest_switch_msr){
> +			.msr = MSR_IA32_PEBS_ENABLE,
> +			.host = cpuc->pebs_enabled,
> +			.guest = 0,
> +		};
> +		return arr;
>  	}
>  
> +	if (!x86_pmu.pebs_vmx)
> +		return arr;
> +
> +	arr[*nr] = (struct perf_guest_switch_msr){
> +		.msr = MSR_IA32_PEBS_ENABLE,
> +		.host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask,
> +		.guest = pebs_mask & ~cpuc->intel_ctrl_host_mask,
> +	};
> +
> +	/* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */
> +	arr[0].guest |= arr[*nr].guest;
> +
> +	++(*nr);
>  	return arr;
>  }

ISTR saying I was confused as heck by this function; I still don't see
clarifying comments :/

What's .host and .guest?
Peter Zijlstra May 17, 2021, 8:33 a.m. UTC | #2
On Tue, May 11, 2021 at 10:42:04AM +0800, Like Xu wrote:
> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
> index 2f89fd599842..c791765f4761 100644
> --- a/arch/x86/events/intel/core.c
> +++ b/arch/x86/events/intel/core.c
> @@ -3898,31 +3898,49 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)
>  	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>  	struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
>  	u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
> +	u64 pebs_mask = (x86_pmu.flags & PMU_FL_PEBS_ALL) ?
> +		cpuc->pebs_enabled : (cpuc->pebs_enabled & PEBS_COUNTER_MASK);

> -	if (x86_pmu.flags & PMU_FL_PEBS_ALL)
> -		arr[0].guest &= ~cpuc->pebs_enabled;
> -	else
> -		arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
> -	*nr = 1;

Instead of endlessly mucking about with branches, do we want something
like this instead?

---
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 2521d03de5e0..bcfba11196c8 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2819,10 +2819,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
 	 * counters from the GLOBAL_STATUS mask and we always process PEBS
 	 * events via drain_pebs().
 	 */
-	if (x86_pmu.flags & PMU_FL_PEBS_ALL)
-		status &= ~cpuc->pebs_enabled;
-	else
-		status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
+	status &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable);
 
 	/*
 	 * PEBS overflow sets bit 62 in the global status register
@@ -3862,10 +3859,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
 	arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
 	arr[0].host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
 	arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask;
-	if (x86_pmu.flags & PMU_FL_PEBS_ALL)
-		arr[0].guest &= ~cpuc->pebs_enabled;
-	else
-		arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
+	arr[0].guest &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable);
 	*nr = 1;
 
 	if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) {
@@ -5546,6 +5540,7 @@ __init int intel_pmu_init(void)
 	x86_pmu.events_mask_len		= eax.split.mask_length;
 
 	x86_pmu.max_pebs_events		= min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
+	x86_pmu.pebs_capable		= PEBS_COUNTER_MASK;
 
 	/*
 	 * Quirk: v2 perfmon does not report fixed-purpose events, so
@@ -5730,6 +5725,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.pebs_aliases = NULL;
 		x86_pmu.pebs_prec_dist = true;
 		x86_pmu.lbr_pt_coexist = true;
+		x86_pmu.pebs_capable = ~0ULL;
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.flags |= PMU_FL_PEBS_ALL;
 		x86_pmu.get_event_constraints = glp_get_event_constraints;
@@ -6080,6 +6076,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.pebs_aliases = NULL;
 		x86_pmu.pebs_prec_dist = true;
 		x86_pmu.pebs_block = true;
+		x86_pmu.pebs_capable = ~0ULL;
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
 		x86_pmu.flags |= PMU_FL_PEBS_ALL;
@@ -6123,6 +6120,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.pebs_aliases = NULL;
 		x86_pmu.pebs_prec_dist = true;
 		x86_pmu.pebs_block = true;
+		x86_pmu.pebs_capable = ~0ULL;
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
 		x86_pmu.flags |= PMU_FL_PEBS_ALL;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 27fa85e7d4fd..6f3cf81ccb1b 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -805,6 +805,7 @@ struct x86_pmu {
 	void		(*pebs_aliases)(struct perf_event *event);
 	unsigned long	large_pebs_flags;
 	u64		rtm_abort_event;
+	u64		pebs_capable;
 
 	/*
 	 * Intel LBR
Xu, Like May 18, 2021, 8:13 a.m. UTC | #3
On 2021/5/17 16:33, Peter Zijlstra wrote:
> On Tue, May 11, 2021 at 10:42:04AM +0800, Like Xu wrote:
>> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
>> index 2f89fd599842..c791765f4761 100644
>> --- a/arch/x86/events/intel/core.c
>> +++ b/arch/x86/events/intel/core.c
>> @@ -3898,31 +3898,49 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)
>>   	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>>   	struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
>>   	u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
>> +	u64 pebs_mask = (x86_pmu.flags & PMU_FL_PEBS_ALL) ?
>> +		cpuc->pebs_enabled : (cpuc->pebs_enabled & PEBS_COUNTER_MASK);
>> -	if (x86_pmu.flags & PMU_FL_PEBS_ALL)
>> -		arr[0].guest &= ~cpuc->pebs_enabled;
>> -	else
>> -		arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
>> -	*nr = 1;
> Instead of endlessly mucking about with branches, do we want something
> like this instead?

Fine by me. How about the following commit message for your patch below:

x86/perf/core: Add pebs_capable to store valid PEBS_COUNTER_MASK value

The valid PEBS counter mask is recomputed frequently in
intel_guest_get_msrs(), so cache it in the new pebs_capable field
instead of endlessly mucking about with branches.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

>
> ---
> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
> index 2521d03de5e0..bcfba11196c8 100644
> --- a/arch/x86/events/intel/core.c
> +++ b/arch/x86/events/intel/core.c
> @@ -2819,10 +2819,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
>   	 * counters from the GLOBAL_STATUS mask and we always process PEBS
>   	 * events via drain_pebs().
>   	 */
> -	if (x86_pmu.flags & PMU_FL_PEBS_ALL)
> -		status &= ~cpuc->pebs_enabled;
> -	else
> -		status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
> +	status &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable);
>   
>   	/*
>   	 * PEBS overflow sets bit 62 in the global status register
> @@ -3862,10 +3859,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
>   	arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
>   	arr[0].host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
>   	arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask;
> -	if (x86_pmu.flags & PMU_FL_PEBS_ALL)
> -		arr[0].guest &= ~cpuc->pebs_enabled;
> -	else
> -		arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
> +	arr[0].guest &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable);
>   	*nr = 1;
>   
>   	if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) {
> @@ -5546,6 +5540,7 @@ __init int intel_pmu_init(void)
>   	x86_pmu.events_mask_len		= eax.split.mask_length;
>   
>   	x86_pmu.max_pebs_events		= min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
> +	x86_pmu.pebs_capable		= PEBS_COUNTER_MASK;
>   
>   	/*
>   	 * Quirk: v2 perfmon does not report fixed-purpose events, so
> @@ -5730,6 +5725,7 @@ __init int intel_pmu_init(void)
>   		x86_pmu.pebs_aliases = NULL;
>   		x86_pmu.pebs_prec_dist = true;
>   		x86_pmu.lbr_pt_coexist = true;
> +		x86_pmu.pebs_capable = ~0ULL;
>   		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
>   		x86_pmu.flags |= PMU_FL_PEBS_ALL;
>   		x86_pmu.get_event_constraints = glp_get_event_constraints;
> @@ -6080,6 +6076,7 @@ __init int intel_pmu_init(void)
>   		x86_pmu.pebs_aliases = NULL;
>   		x86_pmu.pebs_prec_dist = true;
>   		x86_pmu.pebs_block = true;
> +		x86_pmu.pebs_capable = ~0ULL;
>   		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
>   		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
>   		x86_pmu.flags |= PMU_FL_PEBS_ALL;
> @@ -6123,6 +6120,7 @@ __init int intel_pmu_init(void)
>   		x86_pmu.pebs_aliases = NULL;
>   		x86_pmu.pebs_prec_dist = true;
>   		x86_pmu.pebs_block = true;
> +		x86_pmu.pebs_capable = ~0ULL;
>   		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
>   		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
>   		x86_pmu.flags |= PMU_FL_PEBS_ALL;
> diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
> index 27fa85e7d4fd..6f3cf81ccb1b 100644
> --- a/arch/x86/events/perf_event.h
> +++ b/arch/x86/events/perf_event.h
> @@ -805,6 +805,7 @@ struct x86_pmu {
>   	void		(*pebs_aliases)(struct perf_event *event);
>   	unsigned long	large_pebs_flags;
>   	u64		rtm_abort_event;
> +	u64		pebs_capable;
>   
>   	/*
>   	 * Intel LBR
Xu, Like May 18, 2021, 8:44 a.m. UTC | #4
On 2021/5/17 16:32, Peter Zijlstra wrote:
> On Tue, May 11, 2021 at 10:42:04AM +0800, Like Xu wrote:
>> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
>> index 2f89fd599842..c791765f4761 100644
>> --- a/arch/x86/events/intel/core.c
>> +++ b/arch/x86/events/intel/core.c
>> @@ -3898,31 +3898,49 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)
>>   	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>>   	struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
>>   	u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
>> +	u64 pebs_mask = (x86_pmu.flags & PMU_FL_PEBS_ALL) ?
>> +		cpuc->pebs_enabled : (cpuc->pebs_enabled & PEBS_COUNTER_MASK);
>> +
>> +	*nr = 0;
>> +	arr[(*nr)++] = (struct perf_guest_switch_msr){
>> +		.msr = MSR_CORE_PERF_GLOBAL_CTRL,
>> +		.host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask,
>> +		.guest = intel_ctrl & (~cpuc->intel_ctrl_host_mask | ~pebs_mask),
>> +	};
>>   
>> +	if (!x86_pmu.pebs)
>> +		return arr;
>>   
>> +	/*
>> +	 * If PMU counter has PEBS enabled it is not enough to
>> +	 * disable counter on a guest entry since PEBS memory
>> +	 * write can overshoot guest entry and corrupt guest
>> +	 * memory. Disabling PEBS solves the problem.
>> +	 *
>> +	 * Don't do this if the CPU already enforces it.
>> +	 */
>> +	if (x86_pmu.pebs_no_isolation) {
>> +		arr[(*nr)++] = (struct perf_guest_switch_msr){
>> +			.msr = MSR_IA32_PEBS_ENABLE,
>> +			.host = cpuc->pebs_enabled,
>> +			.guest = 0,
>> +		};
>> +		return arr;
>>   	}
>>   
>> +	if (!x86_pmu.pebs_vmx)
>> +		return arr;
>> +
>> +	arr[*nr] = (struct perf_guest_switch_msr){
>> +		.msr = MSR_IA32_PEBS_ENABLE,
>> +		.host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask,
>> +		.guest = pebs_mask & ~cpuc->intel_ctrl_host_mask,
>> +	};
>> +
>> +	/* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */
>> +	arr[0].guest |= arr[*nr].guest;
>> +
>> +	++(*nr);
>>   	return arr;
>>   }
> ISTR saying I was confused as heck by this function; I still don't see
> clarifying comments :/
>
> What's .host and .guest?

Will adding the following comments help you?

+/*
+ * Currently, the only caller of this function is atomic_switch_perf_msrs().
+ * The host perf context helps to prepare the values of the real hardware for
+ * a set of MSRs that need to be switched atomically across a VMX transition.
+ *
+ * For example, the pseudocode needed to add a new msr should look like:
+ *
+ * arr[(*nr)++] = (struct perf_guest_switch_msr){
+ *     .msr = the hardware msr address,
+ *     .host = the value the hardware has when it doesn't run a guest,
+ *     .guest = the value the hardware has when it runs a guest,
+ * };
+ *
+ * These values have nothing to do with the emulated values the guest sees
+ * when it uses {RD,WR}MSR, which should be handled in the KVM context.
+ */
  static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)
Peter Zijlstra May 18, 2021, 1:42 p.m. UTC | #5
On Tue, May 18, 2021 at 04:44:13PM +0800, Xu, Like wrote:
> Will adding the following comments help you?
> 
> +/*
> + * Currently, the only caller of this function is atomic_switch_perf_msrs().
> + * The host perf context helps to prepare the values of the real hardware for
> + * a set of MSRs that need to be switched atomically across a VMX transition.
> + *
> + * For example, the pseudocode needed to add a new msr should look like:
> + *
> + * arr[(*nr)++] = (struct perf_guest_switch_msr){
> + *     .msr = the hardware msr address,
> + *     .host = the value the hardware has when it doesn't run a guest,
> + *     .guest = the value the hardware has when it runs a guest,

So personally I think the .host and .guest naming is terrible here,
because both values are host values. But I don't know enough about virt
to know if there's accepted nomenclature for this.

> + * };
> + *
> + * These values have nothing to do with the emulated values the guest sees
> + * when it uses {RD,WR}MSR, which should be handled in the KVM context.
> + */
>  static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)

Yes, now at least one can understand wth this function does, even though
the actual naming is still horrible. Thanks!

Additionally, would it make sense to add a pointer to the KVM code that
does the emulation for each MSR listed in this function?
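
One possible form for such a pointer, sketched against this series (the
emulation for MSR_IA32_PEBS_ENABLE lives in intel_pmu_get_msr() and
intel_pmu_set_msr() in arch/x86/kvm/vmx/pmu_intel.c, per the patch
below):

	/*
	 * The guest-visible (emulated) value of each MSR listed here is
	 * handled separately by KVM, e.g. MSR_IA32_PEBS_ENABLE in
	 * intel_pmu_{get,set}_msr() in arch/x86/kvm/vmx/pmu_intel.c.
	 */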
Patch

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 2f89fd599842..c791765f4761 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3898,31 +3898,49 @@  static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
 	u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
+	u64 pebs_mask = (x86_pmu.flags & PMU_FL_PEBS_ALL) ?
+		cpuc->pebs_enabled : (cpuc->pebs_enabled & PEBS_COUNTER_MASK);
+
+	*nr = 0;
+	arr[(*nr)++] = (struct perf_guest_switch_msr){
+		.msr = MSR_CORE_PERF_GLOBAL_CTRL,
+		.host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask,
+		.guest = intel_ctrl & (~cpuc->intel_ctrl_host_mask | ~pebs_mask),
+	};
 
-	arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
-	arr[0].host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
-	arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask;
-	if (x86_pmu.flags & PMU_FL_PEBS_ALL)
-		arr[0].guest &= ~cpuc->pebs_enabled;
-	else
-		arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
-	*nr = 1;
+	if (!x86_pmu.pebs)
+		return arr;
 
-	if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) {
-		/*
-		 * If PMU counter has PEBS enabled it is not enough to
-		 * disable counter on a guest entry since PEBS memory
-		 * write can overshoot guest entry and corrupt guest
-		 * memory. Disabling PEBS solves the problem.
-		 *
-		 * Don't do this if the CPU already enforces it.
-		 */
-		arr[1].msr = MSR_IA32_PEBS_ENABLE;
-		arr[1].host = cpuc->pebs_enabled;
-		arr[1].guest = 0;
-		*nr = 2;
+	/*
+	 * If PMU counter has PEBS enabled it is not enough to
+	 * disable counter on a guest entry since PEBS memory
+	 * write can overshoot guest entry and corrupt guest
+	 * memory. Disabling PEBS solves the problem.
+	 *
+	 * Don't do this if the CPU already enforces it.
+	 */
+	if (x86_pmu.pebs_no_isolation) {
+		arr[(*nr)++] = (struct perf_guest_switch_msr){
+			.msr = MSR_IA32_PEBS_ENABLE,
+			.host = cpuc->pebs_enabled,
+			.guest = 0,
+		};
+		return arr;
 	}
 
+	if (!x86_pmu.pebs_vmx)
+		return arr;
+
+	arr[*nr] = (struct perf_guest_switch_msr){
+		.msr = MSR_IA32_PEBS_ENABLE,
+		.host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask,
+		.guest = pebs_mask & ~cpuc->intel_ctrl_host_mask,
+	};
+
+	/* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */
+	arr[0].guest |= arr[*nr].guest;
+
+	++(*nr);
 	return arr;
 }
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 49b421bd3dd8..0a42079560ac 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -473,6 +473,9 @@  struct kvm_pmu {
 	DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX);
 	DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX);
 
+	u64 pebs_enable;
+	u64 pebs_enable_mask;
+
 	/*
 	 * The gate to release perf_events not marked in
 	 * pmc_in_use only once in a vcpu time slice.
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 742d89a00721..1ab3f280f3a9 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -189,6 +189,12 @@ 
 #define PERF_CAP_PT_IDX			16
 
 #define MSR_PEBS_LD_LAT_THRESHOLD	0x000003f6
+#define PERF_CAP_PEBS_TRAP             BIT_ULL(6)
+#define PERF_CAP_ARCH_REG              BIT_ULL(7)
+#define PERF_CAP_PEBS_FORMAT           0xf00
+#define PERF_CAP_PEBS_BASELINE         BIT_ULL(14)
+#define PERF_CAP_PEBS_MASK	(PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
+				 PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE)
 
 #define MSR_IA32_RTIT_CTL		0x00000570
 #define RTIT_CTL_TRACEEN		BIT(0)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index ac7fe714e6c1..9938b485c31c 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -220,6 +220,9 @@  static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
 	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
 		ret = pmu->version > 1;
 		break;
+	case MSR_IA32_PEBS_ENABLE:
+		ret = vcpu->arch.perf_capabilities & PERF_CAP_PEBS_FORMAT;
+		break;
 	default:
 		ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
 			get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
@@ -367,6 +370,9 @@  static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
 		msr_info->data = pmu->global_ovf_ctrl;
 		return 0;
+	case MSR_IA32_PEBS_ENABLE:
+		msr_info->data = pmu->pebs_enable;
+		return 0;
 	default:
 		if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
 		    (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
@@ -427,6 +433,14 @@  static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 0;
 		}
 		break;
+	case MSR_IA32_PEBS_ENABLE:
+		if (pmu->pebs_enable == data)
+			return 0;
+		if (!(data & pmu->pebs_enable_mask)) {
+			pmu->pebs_enable = data;
+			return 0;
+		}
+		break;
 	default:
 		if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
 		    (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
@@ -479,6 +493,7 @@  static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 	pmu->version = 0;
 	pmu->reserved_bits = 0xffffffff00200000ull;
 	pmu->fixed_ctr_ctrl_mask = ~0ull;
+	pmu->pebs_enable_mask = ~0ull;
 
 	entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
 	if (!entry)
@@ -545,6 +560,22 @@  static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 
 	if (lbr_desc->records.nr)
 		bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1);
+
+	if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_FORMAT) {
+		if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_BASELINE) {
+			pmu->pebs_enable_mask = ~pmu->global_ctrl;
+			pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
+			for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+				pmu->fixed_ctr_ctrl_mask &=
+					~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4));
+			}
+		} else {
+			pmu->pebs_enable_mask =
+				~((1ull << pmu->nr_arch_gp_counters) - 1);
+		}
+	} else {
+		vcpu->arch.perf_capabilities &= ~PERF_CAP_PEBS_MASK;
+	}
 }
 
 static void intel_pmu_init(struct kvm_vcpu *vcpu)
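
As a worked example of the last hunk above (a sketch; the numbers assume
a guest CPUID reporting 8 GP counters and no PEBS baseline):

	/* nr_arch_gp_counters = 8, !PERF_CAP_PEBS_BASELINE: */
	pmu->pebs_enable_mask = ~((1ull << 8) - 1);	/* = ~0xffULL */

	/*
	 * intel_pmu_set_msr() then rejects any IA32_PEBS_ENABLE write
	 * that sets bits outside bits 0-7.
	 */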