diff mbox series

[RFC,41/41] KVM: nVMX: Add nested virtualization support for passthrough PMU

Message ID 20240126085444.324918-42-xiong.y.zhang@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series KVM: x86/pmu: Introduce passthrough vPM | expand

Commit Message

Xiong Zhang Jan. 26, 2024, 8:54 a.m. UTC
From: Mingwei Zhang <mizhang@google.com>

Add nested virtualization support for passthrough PMU by combining the MSR
interception bitmaps of vmcs01 and vmcs12. Readers may argue even without
this patch, nested virtualization works for passthrough PMU because L1 will
see Perfmon v2 and will have to use legacy vPMU implementation if it is
Linux. However, any assumption made on L1 may be invalid, e.g., L1 may not
even be Linux.

If both L0 and L1 pass through PMU MSRs, the correct behavior is to allow
MSR access from L2 directly touch HW MSRs, since both L0 and L1 passthrough
the access.

However, in current implementation, if without adding anything for nested,
KVM always set MSR interception bits in vmcs02. This leads to the fact that
L0 will emulate all MSR read/writes for L2, leading to errors, since the
current passthrough vPMU never implements set_msr() and get_msr() for any
counter access except counter accesses from the VMM side.

So fix the issue by setting up the correct MSR interception for PMU MSRs.

Signed-off-by: Mingwei Zhang <mizhang@google.com>
---
 arch/x86/kvm/vmx/nested.c | 52 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

Comments

Sean Christopherson April 11, 2024, 11:21 p.m. UTC | #1
On Fri, Jan 26, 2024, Xiong Zhang wrote:
> From: Mingwei Zhang <mizhang@google.com>
> 
> Add nested virtualization support for passthrough PMU by combining the MSR
> interception bitmaps of vmcs01 and vmcs12. Readers may argue even without
> this patch, nested virtualization works for passthrough PMU because L1 will
> see Perfmon v2 and will have to use legacy vPMU implementation if it is
> Linux. However, any assumption made on L1 may be invalid, e.g., L1 may not
> even be Linux.
> 
> If both L0 and L1 pass through PMU MSRs, the correct behavior is to allow
> MSR access from L2 directly touch HW MSRs, since both L0 and L1 passthrough
> the access.
> 
> However, in current implementation, if without adding anything for nested,
> KVM always set MSR interception bits in vmcs02. This leads to the fact that
> L0 will emulate all MSR read/writes for L2, leading to errors, since the
> current passthrough vPMU never implements set_msr() and get_msr() for any
> counter access except counter accesses from the VMM side.
> 
> So fix the issue by setting up the correct MSR interception for PMU MSRs.
> 
> Signed-off-by: Mingwei Zhang <mizhang@google.com>
> ---
>  arch/x86/kvm/vmx/nested.c | 52 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 52 insertions(+)
> 
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index c5ec0ef51ff7..95e1c78152da 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -561,6 +561,55 @@ static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
>  						   msr_bitmap_l0, msr);
>  }
>  
> +/* Pass PMU MSRs to nested VM if L0 and L1 are set to passthrough. */
> +static void nested_vmx_set_passthru_pmu_intercept_for_msr(struct kvm_vcpu *vcpu,

Heh, 50 instances of passthrough, and then someone decides to shave a few characters
with passthru :-)  Long live mediated PMU!!!

> +							  unsigned long *msr_bitmap_l1,
> +							  unsigned long *msr_bitmap_l0)
> +{
> +	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	int i;
> +
> +	for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
> +		nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,
> +						 msr_bitmap_l0,
> +						 MSR_ARCH_PERFMON_EVENTSEL0 + i,
> +						 MSR_TYPE_RW);
> +		nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,
> +						 msr_bitmap_l0,
> +						 MSR_IA32_PERFCTR0 + i,
> +						 MSR_TYPE_RW);
> +		nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,
> +						 msr_bitmap_l0,
> +						 MSR_IA32_PMC0 + i,
> +						 MSR_TYPE_RW);
> +	}
> +
> +	for (i = 0; i < vcpu_to_pmu(vcpu)->nr_arch_fixed_counters; i++) {

Curly braces aren't needed, and this can use "pmu" instead of "vcpu_to_pmu".

> +		nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,
> +						 msr_bitmap_l0,
> +						 MSR_CORE_PERF_FIXED_CTR0 + i,
> +						 MSR_TYPE_RW);
> +	}
> +	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,
> +					 msr_bitmap_l0,
> +					 MSR_CORE_PERF_FIXED_CTR_CTRL,
> +					 MSR_TYPE_RW);
> +
> +	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,
> +					 msr_bitmap_l0,
> +					 MSR_CORE_PERF_GLOBAL_STATUS,
> +					 MSR_TYPE_RW);
> +	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,
> +					 msr_bitmap_l0,
> +					 MSR_CORE_PERF_GLOBAL_CTRL,
> +					 MSR_TYPE_RW);
> +	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,
> +					 msr_bitmap_l0,
> +					 MSR_CORE_PERF_GLOBAL_OVF_CTRL,
> +					 MSR_TYPE_RW);
> +}
> +
>  /*
>   * Merge L0's and L1's MSR bitmap, return false to indicate that
>   * we do not use the hardware.
> @@ -660,6 +709,9 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
>  	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
>  					 MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
>  
> +	if (is_passthrough_pmu_enabled(vcpu))
> +		nested_vmx_set_passthru_pmu_intercept_for_msr(vcpu, msr_bitmap_l1, msr_bitmap_l0);

More code that's probably cleaner if the helper handles the PMU type.

	nested_vmx_set_pmu_msr_intercepts(vcpu, msr_bitmap_l1, msr_bitmap_l0);

and then

	if (!enable_mediated_pmu || !pmu->version)
		return;

> +
>  	kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
>  
>  	vmx->nested.force_msr_bitmap_recalc = false;
> -- 
> 2.34.1
>
diff mbox series

Patch

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index c5ec0ef51ff7..95e1c78152da 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -561,6 +561,55 @@  static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
 						   msr_bitmap_l0, msr);
 }
 
+/* Pass PMU MSRs to nested VM if L0 and L1 are set to passthrough. */
+static void nested_vmx_set_passthru_pmu_intercept_for_msr(struct kvm_vcpu *vcpu,
+							  unsigned long *msr_bitmap_l1,
+							  unsigned long *msr_bitmap_l0)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int i;
+
+	for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
+		nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,
+						 msr_bitmap_l0,
+						 MSR_ARCH_PERFMON_EVENTSEL0 + i,
+						 MSR_TYPE_RW);
+		nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,
+						 msr_bitmap_l0,
+						 MSR_IA32_PERFCTR0 + i,
+						 MSR_TYPE_RW);
+		nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,
+						 msr_bitmap_l0,
+						 MSR_IA32_PMC0 + i,
+						 MSR_TYPE_RW);
+	}
+
+	for (i = 0; i < vcpu_to_pmu(vcpu)->nr_arch_fixed_counters; i++) {
+		nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,
+						 msr_bitmap_l0,
+						 MSR_CORE_PERF_FIXED_CTR0 + i,
+						 MSR_TYPE_RW);
+	}
+	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,
+					 msr_bitmap_l0,
+					 MSR_CORE_PERF_FIXED_CTR_CTRL,
+					 MSR_TYPE_RW);
+
+	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,
+					 msr_bitmap_l0,
+					 MSR_CORE_PERF_GLOBAL_STATUS,
+					 MSR_TYPE_RW);
+	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,
+					 msr_bitmap_l0,
+					 MSR_CORE_PERF_GLOBAL_CTRL,
+					 MSR_TYPE_RW);
+	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,
+					 msr_bitmap_l0,
+					 MSR_CORE_PERF_GLOBAL_OVF_CTRL,
+					 MSR_TYPE_RW);
+}
+
 /*
  * Merge L0's and L1's MSR bitmap, return false to indicate that
  * we do not use the hardware.
@@ -660,6 +709,9 @@  static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
 					 MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
 
+	if (is_passthrough_pmu_enabled(vcpu))
+		nested_vmx_set_passthru_pmu_intercept_for_msr(vcpu, msr_bitmap_l1, msr_bitmap_l0);
+
 	kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
 
 	vmx->nested.force_msr_bitmap_recalc = false;