diff mbox series

[v5,06/16] KVM: x86/pmu: Reprogram PEBS event to emulate guest PEBS counter

Message ID 20210415032016.166201-7-like.xu@linux.intel.com (mailing list archive)
State New
Headers show
Series KVM: x86/pmu: Add basic support to enable guest PEBS via DS | expand

Commit Message

Like Xu April 15, 2021, 3:20 a.m. UTC
When a guest counter is configured as a PEBS counter through
IA32_PEBS_ENABLE, a guest PEBS event will be reprogrammed by
configuring a non-zero precision level in the perf_event_attr.

The guest PEBS overflow PMI bit is set in the guest
GLOBAL_STATUS MSR when the PEBS facility generates a PEBS
overflow PMI based on the guest IA32_DS_AREA MSR.

Even with the same counter index and the same event code and
mask, guest PEBS events will not be reused for non-PEBS events.

Originally-by: Andi Kleen <ak@linux.intel.com>
Co-developed-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Like Xu <like.xu@linux.intel.com>
---
 arch/x86/kvm/pmu.c | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

Comments

Liuxiangdong April 19, 2021, 8:11 a.m. UTC | #1
On 2021/4/15 11:20, Like Xu wrote:
> When a guest counter is configured as a PEBS counter through
> IA32_PEBS_ENABLE, a guest PEBS event will be reprogrammed by
> configuring a non-zero precision level in the perf_event_attr.
>
> The guest PEBS overflow PMI bit would be set in the guest
> GLOBAL_STATUS MSR when PEBS facility generates a PEBS
> overflow PMI based on guest IA32_DS_AREA MSR.
>
> Even with the same counter index and the same event code and
> mask, guest PEBS events will not be reused for non-PEBS events.
>
> Originally-by: Andi Kleen <ak@linux.intel.com>
> Co-developed-by: Kan Liang <kan.liang@linux.intel.com>
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> Signed-off-by: Like Xu <like.xu@linux.intel.com>
> ---
>   arch/x86/kvm/pmu.c | 34 ++++++++++++++++++++++++++++++++--
>   1 file changed, 32 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index 827886c12c16..0f86c1142f17 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -74,11 +74,21 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event,
>   {
>   	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
>   	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
> +	bool skip_pmi = false;
>   
>   	if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
> -		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
> +		if (perf_event->attr.precise_ip) {
> +			/* Indicate PEBS overflow PMI to guest. */
> +			skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
> +						      (unsigned long *)&pmu->global_status);
> +		} else {
> +			__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
> +		}
>   		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
>   
> +		if (skip_pmi)
> +			return;
> +
>   		/*
>   		 * Inject PMI. If vcpu was in a guest mode during NMI PMI
>   		 * can be ejected on a guest mode re-entry. Otherwise we can't
> @@ -99,6 +109,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
>   				  bool exclude_kernel, bool intr,
>   				  bool in_tx, bool in_tx_cp)
>   {
> +	struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu);
>   	struct perf_event *event;
>   	struct perf_event_attr attr = {
>   		.type = type,
> @@ -110,6 +121,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
>   		.exclude_kernel = exclude_kernel,
>   		.config = config,
>   	};
> +	bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);
>   

pebs_enable is defined in patch 07, but it is used here (in patch 06).
Maybe the order of the patches can be changed in the next version if necessary.

>   	attr.sample_period = get_sample_period(pmc, pmc->counter);
>   
> @@ -124,9 +136,23 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
>   		attr.sample_period = 0;
>   		attr.config |= HSW_IN_TX_CHECKPOINTED;
>   	}
> +	if (pebs) {
> +		/*
> +		 * The non-zero precision level of guest event makes the ordinary
> +		 * guest event becomes a guest PEBS event and triggers the host
> +		 * PEBS PMI handler to determine whether the PEBS overflow PMI
> +		 * comes from the host counters or the guest.
> +		 *
> +		 * For most PEBS hardware events, the difference in the software
> +		 * precision levels of guest and host PEBS events will not affect
> +		 * the accuracy of the PEBS profiling result, because the "event IP"
> +		 * in the PEBS record is calibrated on the guest side.
> +		 */
> +		attr.precise_ip = 1;
> +	}
>   
>   	event = perf_event_create_kernel_counter(&attr, -1, current,
> -						 intr ? kvm_perf_overflow_intr :
> +						 (intr || pebs) ? kvm_perf_overflow_intr :
>   						 kvm_perf_overflow, pmc);
>   	if (IS_ERR(event)) {
>   		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
> @@ -161,6 +187,10 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
>   			      get_sample_period(pmc, pmc->counter)))
>   		return false;
>   
> +	if (!test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) &&
> +	    pmc->perf_event->attr.precise_ip)
> +		return false;
> +
>   	/* reuse perf_event to serve as pmc_reprogram_counter() does*/
>   	perf_event_enable(pmc->perf_event);
>
Xu, Like April 19, 2021, 8:17 a.m. UTC | #2
On 2021/4/19 16:11, Liuxiangdong wrote:
>
>
> On 2021/4/15 11:20, Like Xu wrote:
>> When a guest counter is configured as a PEBS counter through
>> IA32_PEBS_ENABLE, a guest PEBS event will be reprogrammed by
>> configuring a non-zero precision level in the perf_event_attr.
>>
>> The guest PEBS overflow PMI bit would be set in the guest
>> GLOBAL_STATUS MSR when PEBS facility generates a PEBS
>> overflow PMI based on guest IA32_DS_AREA MSR.
>>
>> Even with the same counter index and the same event code and
>> mask, guest PEBS events will not be reused for non-PEBS events.
>>
>> Originally-by: Andi Kleen <ak@linux.intel.com>
>> Co-developed-by: Kan Liang <kan.liang@linux.intel.com>
>> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
>> Signed-off-by: Like Xu <like.xu@linux.intel.com>
>> ---
>>   arch/x86/kvm/pmu.c | 34 ++++++++++++++++++++++++++++++++--
>>   1 file changed, 32 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
>> index 827886c12c16..0f86c1142f17 100644
>> --- a/arch/x86/kvm/pmu.c
>> +++ b/arch/x86/kvm/pmu.c
>> @@ -74,11 +74,21 @@ static void kvm_perf_overflow_intr(struct perf_event 
>> *perf_event,
>>   {
>>       struct kvm_pmc *pmc = perf_event->overflow_handler_context;
>>       struct kvm_pmu *pmu = pmc_to_pmu(pmc);
>> +    bool skip_pmi = false;
>>         if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
>> -        __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
>> +        if (perf_event->attr.precise_ip) {
>> +            /* Indicate PEBS overflow PMI to guest. */
>> +            skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
>> +                              (unsigned long *)&pmu->global_status);
>> +        } else {
>> +            __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
>> +        }
>>           kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
>>   +        if (skip_pmi)
>> +            return;
>> +
>>           /*
>>            * Inject PMI. If vcpu was in a guest mode during NMI PMI
>>            * can be ejected on a guest mode re-entry. Otherwise we can't
>> @@ -99,6 +109,7 @@ static void pmc_reprogram_counter(struct kvm_pmc 
>> *pmc, u32 type,
>>                     bool exclude_kernel, bool intr,
>>                     bool in_tx, bool in_tx_cp)
>>   {
>> +    struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu);
>>       struct perf_event *event;
>>       struct perf_event_attr attr = {
>>           .type = type,
>> @@ -110,6 +121,7 @@ static void pmc_reprogram_counter(struct kvm_pmc 
>> *pmc, u32 type,
>>           .exclude_kernel = exclude_kernel,
>>           .config = config,
>>       };
>> +    bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);
>
> pebs_enable is defined in patch 07, but used here(in patch 06).
> Maybe we can change the patches order in next patch version if necessary.

Thanks! I'll fix it.

>
>>       attr.sample_period = get_sample_period(pmc, pmc->counter);
>>   @@ -124,9 +136,23 @@ static void pmc_reprogram_counter(struct kvm_pmc 
>> *pmc, u32 type,
>>           attr.sample_period = 0;
>>           attr.config |= HSW_IN_TX_CHECKPOINTED;
>>       }
>> +    if (pebs) {
>> +        /*
>> +         * The non-zero precision level of guest event makes the ordinary
>> +         * guest event becomes a guest PEBS event and triggers the host
>> +         * PEBS PMI handler to determine whether the PEBS overflow PMI
>> +         * comes from the host counters or the guest.
>> +         *
>> +         * For most PEBS hardware events, the difference in the software
>> +         * precision levels of guest and host PEBS events will not affect
>> +         * the accuracy of the PEBS profiling result, because the 
>> "event IP"
>> +         * in the PEBS record is calibrated on the guest side.
>> +         */
>> +        attr.precise_ip = 1;
>> +    }
>>         event = perf_event_create_kernel_counter(&attr, -1, current,
>> -                         intr ? kvm_perf_overflow_intr :
>> +                         (intr || pebs) ? kvm_perf_overflow_intr :
>>                            kvm_perf_overflow, pmc);
>>       if (IS_ERR(event)) {
>>           pr_debug_ratelimited("kvm_pmu: event creation failed %ld for 
>> pmc->idx = %d\n",
>> @@ -161,6 +187,10 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
>>                     get_sample_period(pmc, pmc->counter)))
>>           return false;
>>   +    if (!test_bit(pmc->idx, (unsigned long 
>> *)&pmc_to_pmu(pmc)->pebs_enable) &&
>> +        pmc->perf_event->attr.precise_ip)
>> +        return false;
>> +
>>       /* reuse perf_event to serve as pmc_reprogram_counter() does*/
>>       perf_event_enable(pmc->perf_event);
>
diff mbox series

Patch

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 827886c12c16..0f86c1142f17 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -74,11 +74,21 @@  static void kvm_perf_overflow_intr(struct perf_event *perf_event,
 {
 	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
 	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+	bool skip_pmi = false;
 
 	if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
-		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+		if (perf_event->attr.precise_ip) {
+			/* Indicate PEBS overflow PMI to guest. */
+			skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
+						      (unsigned long *)&pmu->global_status);
+		} else {
+			__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+		}
 		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
 
+		if (skip_pmi)
+			return;
+
 		/*
 		 * Inject PMI. If vcpu was in a guest mode during NMI PMI
 		 * can be ejected on a guest mode re-entry. Otherwise we can't
@@ -99,6 +109,7 @@  static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
 				  bool exclude_kernel, bool intr,
 				  bool in_tx, bool in_tx_cp)
 {
+	struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu);
 	struct perf_event *event;
 	struct perf_event_attr attr = {
 		.type = type,
@@ -110,6 +121,7 @@  static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
 		.exclude_kernel = exclude_kernel,
 		.config = config,
 	};
+	bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);
 
 	attr.sample_period = get_sample_period(pmc, pmc->counter);
 
@@ -124,9 +136,23 @@  static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
 		attr.sample_period = 0;
 		attr.config |= HSW_IN_TX_CHECKPOINTED;
 	}
+	if (pebs) {
+		/*
+		 * The non-zero precision level of guest event makes the ordinary
+		 * guest event becomes a guest PEBS event and triggers the host
+		 * PEBS PMI handler to determine whether the PEBS overflow PMI
+		 * comes from the host counters or the guest.
+		 *
+		 * For most PEBS hardware events, the difference in the software
+		 * precision levels of guest and host PEBS events will not affect
+		 * the accuracy of the PEBS profiling result, because the "event IP"
+		 * in the PEBS record is calibrated on the guest side.
+		 */
+		attr.precise_ip = 1;
+	}
 
 	event = perf_event_create_kernel_counter(&attr, -1, current,
-						 intr ? kvm_perf_overflow_intr :
+						 (intr || pebs) ? kvm_perf_overflow_intr :
 						 kvm_perf_overflow, pmc);
 	if (IS_ERR(event)) {
 		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
@@ -161,6 +187,10 @@  static bool pmc_resume_counter(struct kvm_pmc *pmc)
 			      get_sample_period(pmc, pmc->counter)))
 		return false;
 
+	if (!test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) &&
+	    pmc->perf_event->attr.precise_ip)
+		return false;
+
 	/* reuse perf_event to serve as pmc_reprogram_counter() does*/
 	perf_event_enable(pmc->perf_event);