
[v2,30/54] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

Message ID 20240506053020.3911940-31-mizhang@google.com (mailing list archive)
State New
Series Mediated Passthrough vPMU 2.0 for x86

Commit Message

Mingwei Zhang May 6, 2024, 5:29 a.m. UTC
Implement the save/restore of PMU state for the passthrough PMU on Intel.
In passthrough mode, KVM exclusively owns the PMU hardware whenever control
flow enters the scope of the passthrough PMU. Thus, KVM needs to save the
host PMU state and take full hardware PMU ownership. Conversely, the host
regains ownership of the PMU hardware from KVM when control flow leaves the
scope of the passthrough PMU.

Implement PMU context switches for Intel CPUs and opportunistically use
rdpmcl() instead of rdmsrl() when reading counters, since the former has
lower latency on Intel CPUs.

Co-developed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Mingwei Zhang <mizhang@google.com>
---
 arch/x86/kvm/pmu.c           | 46 ++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/pmu_intel.c | 41 +++++++++++++++++++++++++++++++-
 2 files changed, 86 insertions(+), 1 deletion(-)

Comments

Mi, Dapeng May 14, 2024, 8:08 a.m. UTC | #1
On 5/6/2024 1:29 PM, Mingwei Zhang wrote:
> Implement the save/restore of PMU state for the passthrough PMU on Intel.
> In passthrough mode, KVM exclusively owns the PMU hardware whenever control
> flow enters the scope of the passthrough PMU. Thus, KVM needs to save the
> host PMU state and take full hardware PMU ownership. Conversely, the host
> regains ownership of the PMU hardware from KVM when control flow leaves the
> scope of the passthrough PMU.
>
> Implement PMU context switches for Intel CPUs and opportunistically use
> rdpmcl() instead of rdmsrl() when reading counters, since the former has
> lower latency on Intel CPUs.

It looks like the rdpmcl() optimization has been removed from this patch,
right? The description doesn't match the code.


>
> Co-developed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
> Signed-off-by: Mingwei Zhang <mizhang@google.com>

The commit tags don't follow the rules in the Linux documentation; we need
to change them in the next version.

https://docs.kernel.org/process/submitting-patches.html#:~:text=Co%2Ddeveloped%2Dby%3A%20states,work%20on%20a%20single%20patch.
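
For reference, that document requires Co-developed-by: to be immediately
followed by the co-author's own Signed-off-by:, with the submitter's
Signed-off-by: coming last, e.g.:

	Co-developed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
	Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
	Signed-off-by: Mingwei Zhang <mizhang@google.com>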

> ---
>  arch/x86/kvm/pmu.c           | 46 ++++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/vmx/pmu_intel.c | 41 +++++++++++++++++++++++++++++++-
>  2 files changed, 86 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index 782b564bdf96..13197472e31d 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -1068,14 +1068,60 @@ void kvm_pmu_passthrough_pmu_msrs(struct kvm_vcpu *vcpu)
>  
>  void kvm_pmu_save_pmu_context(struct kvm_vcpu *vcpu)
>  {
> +	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +	struct kvm_pmc *pmc;
> +	u32 i;
> +
>  	lockdep_assert_irqs_disabled();
>  
>  	static_call_cond(kvm_x86_pmu_save_pmu_context)(vcpu);
> +
> +	/*
> +	 * Clear the hardware selector MSR contents and counter values to avoid
> +	 * information leakage, and to keep these guest GP counters from being
> +	 * accidentally enabled if the host enables global ctrl while running.
> +	 */
> +	for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
> +		pmc = &pmu->gp_counters[i];
> +		rdmsrl(pmc->msr_counter, pmc->counter);

I understand we want to use common code to manipulate PMU MSRs as much as
possible, but I don't think we should sacrifice performance: rdpmcl() has
better performance than rdmsrl(). If AMD CPUs don't support the rdpmc
instruction, I think we should move this into vendor-specific
xxx_save/restore_pmu_context() helpers.
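
Roughly like the following inside intel_save_guest_pmu_context() (an
untested sketch; it assumes the host's rdpmcl() helper and
INTEL_PMC_FIXED_RDPMC_BASE for the fixed-counter RDPMC index encoding, and
reuses the pmu/pmc/i locals from the common code above):

	/* Untested sketch: read the counters via RDPMC instead of RDMSR. */
	for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
		pmc = &pmu->gp_counters[i];
		rdpmcl(i, pmc->counter);		/* GP counter i */
		if (pmc->counter)
			wrmsrl(pmc->msr_counter, 0);
	}

	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
		pmc = &pmu->fixed_counters[i];
		rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter);
		if (pmc->counter)
			wrmsrl(pmc->msr_counter, 0);
	}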


> +		rdmsrl(pmc->msr_eventsel, pmc->eventsel);
> +		if (pmc->counter)
> +			wrmsrl(pmc->msr_counter, 0);
> +		if (pmc->eventsel)
> +			wrmsrl(pmc->msr_eventsel, 0);
> +	}
> +
> +	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> +		pmc = &pmu->fixed_counters[i];
> +		rdmsrl(pmc->msr_counter, pmc->counter);
> +		if (pmc->counter)
> +			wrmsrl(pmc->msr_counter, 0);
> +	}
>  }
>  
>  void kvm_pmu_restore_pmu_context(struct kvm_vcpu *vcpu)
>  {
> +	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +	struct kvm_pmc *pmc;
> +	int i;
> +
>  	lockdep_assert_irqs_disabled();
>  
>  	static_call_cond(kvm_x86_pmu_restore_pmu_context)(vcpu);
> +
> +	/*
> +	 * No need to zero out unexposed GP/fixed counters/selectors, since
> +	 * RDPMC in this case will be intercepted. Guest accesses to these
> +	 * counters and selectors will cause a #GP.
> +	 */
> +	for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
> +		pmc = &pmu->gp_counters[i];
> +		wrmsrl(pmc->msr_counter, pmc->counter);
> +		wrmsrl(pmc->msr_eventsel, pmc->eventsel);
> +	}
> +
> +	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> +		pmc = &pmu->fixed_counters[i];
> +		wrmsrl(pmc->msr_counter, pmc->counter);
> +	}
>  }
> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> index 7852ba25a240..a23cf9ca224e 100644
> --- a/arch/x86/kvm/vmx/pmu_intel.c
> +++ b/arch/x86/kvm/vmx/pmu_intel.c
> @@ -572,7 +572,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
>  	}
>  
>  	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> -		pmu->fixed_counters[i].msr_eventsel = MSR_CORE_PERF_FIXED_CTR_CTRL;
> +		pmu->fixed_counters[i].msr_eventsel = 0;
Why initialize msr_eventsel to 0 instead of the real MSR address here?
>  		pmu->fixed_counters[i].msr_counter = MSR_CORE_PERF_FIXED_CTR0 + i;
>  	}
>  }
> @@ -799,6 +799,43 @@ static void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu)
>  	vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, MSR_TYPE_RW, msr_intercept);
>  }
>  
> +static void intel_save_guest_pmu_context(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +
> +	/* Global ctrl register is already saved at VM-exit. */
> +	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
> +	/* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */
> +	if (pmu->global_status)
> +		wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
> +
> +	rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
> +	/*
> +	 * Clear the hardware FIXED_CTR_CTRL MSR to avoid information leakage,
> +	 * and to keep these guest fixed counters from being accidentally
> +	 * enabled if the host enables global ctrl while it is running.
> +	 */
> +	if (pmu->fixed_ctr_ctrl)
> +		wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
> +}
> +
> +static void intel_restore_guest_pmu_context(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +	u64 global_status, toggle;
> +
> +	/* Unconditionally clear the host's global_ctrl MSR. */
> +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
> +	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
> +	toggle = pmu->global_status ^ global_status;
> +	if (global_status & toggle)
> +		wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status & toggle);
> +	if (pmu->global_status & toggle)
> +		wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status & toggle);
> +
> +	wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
> +}
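
The toggle logic above is subtle enough to deserve a walkthrough
(illustrative values only; recall that GLOBAL_STATUS is read-only, so bits
are cleared by writing GLOBAL_OVF_CTRL and set by writing
GLOBAL_STATUS_SET):

	/*
	 * Example: pmu->global_status     = 0x1 (guest saw PMC0 overflow)
	 *          hardware global_status = 0x2 (stale host PMC1 overflow)
	 *          toggle = 0x1 ^ 0x2     = 0x3 (bits that disagree)
	 *
	 *          global_status & toggle      = 0x2 -> cleared via
	 *              GLOBAL_OVF_CTRL, dropping the stale host bit;
	 *          pmu->global_status & toggle = 0x1 -> set via
	 *              GLOBAL_STATUS_SET, restoring the guest's view.
	 */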
> +
>  struct kvm_pmu_ops intel_pmu_ops __initdata = {
>  	.rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
>  	.msr_idx_to_pmc = intel_msr_idx_to_pmc,
> @@ -812,6 +849,8 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = {
>  	.cleanup = intel_pmu_cleanup,
>  	.is_rdpmc_passthru_allowed = intel_is_rdpmc_passthru_allowed,
>  	.passthrough_pmu_msrs = intel_passthrough_pmu_msrs,
> +	.save_pmu_context = intel_save_guest_pmu_context,
> +	.restore_pmu_context = intel_restore_guest_pmu_context,
>  	.EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
>  	.MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC,
>  	.MIN_NR_GP_COUNTERS = 1,

Patch

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 782b564bdf96..13197472e31d 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -1068,14 +1068,60 @@  void kvm_pmu_passthrough_pmu_msrs(struct kvm_vcpu *vcpu)
 
 void kvm_pmu_save_pmu_context(struct kvm_vcpu *vcpu)
 {
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct kvm_pmc *pmc;
+	u32 i;
+
 	lockdep_assert_irqs_disabled();
 
 	static_call_cond(kvm_x86_pmu_save_pmu_context)(vcpu);
+
+	/*
+	 * Clear the hardware selector MSR contents and counter values to avoid
+	 * information leakage, and to keep these guest GP counters from being
+	 * accidentally enabled if the host enables global ctrl while running.
+	 */
+	for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
+		pmc = &pmu->gp_counters[i];
+		rdmsrl(pmc->msr_counter, pmc->counter);
+		rdmsrl(pmc->msr_eventsel, pmc->eventsel);
+		if (pmc->counter)
+			wrmsrl(pmc->msr_counter, 0);
+		if (pmc->eventsel)
+			wrmsrl(pmc->msr_eventsel, 0);
+	}
+
+	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+		pmc = &pmu->fixed_counters[i];
+		rdmsrl(pmc->msr_counter, pmc->counter);
+		if (pmc->counter)
+			wrmsrl(pmc->msr_counter, 0);
+	}
 }
 
 void kvm_pmu_restore_pmu_context(struct kvm_vcpu *vcpu)
 {
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct kvm_pmc *pmc;
+	int i;
+
 	lockdep_assert_irqs_disabled();
 
 	static_call_cond(kvm_x86_pmu_restore_pmu_context)(vcpu);
+
+	/*
+	 * No need to zero out unexposed GP/fixed counters/selectors, since
+	 * RDPMC in this case will be intercepted. Guest accesses to these
+	 * counters and selectors will cause a #GP.
+	 */
+	for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
+		pmc = &pmu->gp_counters[i];
+		wrmsrl(pmc->msr_counter, pmc->counter);
+		wrmsrl(pmc->msr_eventsel, pmc->eventsel);
+	}
+
+	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+		pmc = &pmu->fixed_counters[i];
+		wrmsrl(pmc->msr_counter, pmc->counter);
+	}
 }
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 7852ba25a240..a23cf9ca224e 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -572,7 +572,7 @@  static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 	}
 
 	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
-		pmu->fixed_counters[i].msr_eventsel = MSR_CORE_PERF_FIXED_CTR_CTRL;
+		pmu->fixed_counters[i].msr_eventsel = 0;
 		pmu->fixed_counters[i].msr_counter = MSR_CORE_PERF_FIXED_CTR0 + i;
 	}
 }
@@ -799,6 +799,43 @@  static void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu)
 	vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, MSR_TYPE_RW, msr_intercept);
 }
 
+static void intel_save_guest_pmu_context(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+	/* Global ctrl register is already saved at VM-exit. */
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
+	/* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */
+	if (pmu->global_status)
+		wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
+
+	rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
+	/*
+	 * Clear the hardware FIXED_CTR_CTRL MSR to avoid information leakage,
+	 * and to keep these guest fixed counters from being accidentally
+	 * enabled if the host enables global ctrl while it is running.
+	 */
+	if (pmu->fixed_ctr_ctrl)
+		wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
+}
+
+static void intel_restore_guest_pmu_context(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	u64 global_status, toggle;
+
+	/* Unconditionally clear the host's global_ctrl MSR. */
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
+	toggle = pmu->global_status ^ global_status;
+	if (global_status & toggle)
+		wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status & toggle);
+	if (pmu->global_status & toggle)
+		wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status & toggle);
+
+	wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
+}
+
 struct kvm_pmu_ops intel_pmu_ops __initdata = {
 	.rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
 	.msr_idx_to_pmc = intel_msr_idx_to_pmc,
@@ -812,6 +849,8 @@  struct kvm_pmu_ops intel_pmu_ops __initdata = {
 	.cleanup = intel_pmu_cleanup,
 	.is_rdpmc_passthru_allowed = intel_is_rdpmc_passthru_allowed,
 	.passthrough_pmu_msrs = intel_passthrough_pmu_msrs,
+	.save_pmu_context = intel_save_guest_pmu_context,
+	.restore_pmu_context = intel_restore_guest_pmu_context,
 	.EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
 	.MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC,
 	.MIN_NR_GP_COUNTERS = 1,