Message ID | 20240126085444.324918-24-xiong.y.zhang@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM: x86/pmu: Introduce passthrough vPMU | expand
On Fri, Jan 26, 2024, Xiong Zhang wrote: > static void intel_save_pmu_context(struct kvm_vcpu *vcpu) > { > + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); > + struct kvm_pmc *pmc; > + u32 i; > + > + if (pmu->version != 2) { > + pr_warn("only PerfMon v2 is supported for passthrough PMU"); > + return; > + } > + > + /* Global ctrl register is already saved at VM-exit. */ > + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status); > + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */ > + if (pmu->global_status) > + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status); > + > + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { > + pmc = &pmu->gp_counters[i]; > + rdpmcl(i, pmc->counter); > + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel); > + /* > + * Clear hardware PERFMON_EVENTSELx and its counter to avoid > + * leakage and also avoid this guest GP counter get accidentally > + * enabled during host running when host enable global ctrl. > + */ > + if (pmc->eventsel) > + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0); > + if (pmc->counter) > + wrmsrl(MSR_IA32_PMC0 + i, 0); > + } > + > + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); > + /* > + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and > + * also avoid these guest fixed counters get accidentially enabled > + * during host running when host enable global ctrl. > + */ > + if (pmu->fixed_ctr_ctrl) > + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); > + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { > + pmc = &pmu->fixed_counters[i]; > + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter); > + if (pmc->counter) > + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0); > + } For the next RFC, please make that it includes AMD support. Mostly because I'm pretty all of this code can be in common x86. The fixed counters are ugly, but pmu->nr_arch_fixed_counters is guaranteed to '0' on AMD, so it's _just_ ugly, i.e. not functionally problematic.
On Fri, Jan 26, 2024, Xiong Zhang wrote: > From: Dapeng Mi <dapeng1.mi@linux.intel.com> > > Implement the save/restore of PMU state for pasthrough PMU in Intel. In > passthrough mode, KVM owns exclusively the PMU HW when control flow goes to > the scope of passthrough PMU. Thus, KVM needs to save the host PMU state > and gains the full HW PMU ownership. On the contrary, host regains the > ownership of PMU HW from KVM when control flow leaves the scope of > passthrough PMU. > > Implement PMU context switches for Intel CPUs and opptunistically use > rdpmcl() instead of rdmsrl() when reading counters since the former has > lower latency in Intel CPUs. > > Co-developed-by: Mingwei Zhang <mizhang@google.com> > Signed-off-by: Mingwei Zhang <mizhang@google.com> > Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com> > --- > arch/x86/kvm/vmx/pmu_intel.c | 73 ++++++++++++++++++++++++++++++++++++ > 1 file changed, 73 insertions(+) > > diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c > index 0d58fe7d243e..f79bebe7093d 100644 > --- a/arch/x86/kvm/vmx/pmu_intel.c > +++ b/arch/x86/kvm/vmx/pmu_intel.c > @@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu) > > static void intel_save_pmu_context(struct kvm_vcpu *vcpu) I would prefer there be a "guest" in there somewhere, e.g. intel_save_guest_pmu_context(). > { > + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); > + struct kvm_pmc *pmc; > + u32 i; > + > + if (pmu->version != 2) { > + pr_warn("only PerfMon v2 is supported for passthrough PMU"); > + return; > + } > + > + /* Global ctrl register is already saved at VM-exit. */ > + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status); > + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. 
*/ > + if (pmu->global_status) > + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status); > + > + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { > + pmc = &pmu->gp_counters[i]; > + rdpmcl(i, pmc->counter); > + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel); > + /* > + * Clear hardware PERFMON_EVENTSELx and its counter to avoid > + * leakage and also avoid this guest GP counter get accidentally > + * enabled during host running when host enable global ctrl. > + */ > + if (pmc->eventsel) > + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0); > + if (pmc->counter) > + wrmsrl(MSR_IA32_PMC0 + i, 0); This doesn't make much sense. The kernel already has full access to the guest, I don't see what is gained by zeroing out the MSRs just to hide them from perf. Similarly, if perf enables a counter if PERF_GLOBAL_CTRL without first restoring the event selector, we gots problems. Same thing for the fixed counters below. Can't this just be? for (i = 0; i < pmu->nr_arch_gp_counters; i++) rdpmcl(i, pmu->gp_counters[i].counter); for (i = 0; i < pmu->nr_arch_fixed_counters; i++) rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmu->fixed_counters[i].counter); > + } > + > + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); > + /* > + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and > + * also avoid these guest fixed counters get accidentially enabled > + * during host running when host enable global ctrl. 
> + */ > + if (pmu->fixed_ctr_ctrl) > + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); > + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { > + pmc = &pmu->fixed_counters[i]; > + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter); > + if (pmc->counter) > + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0); > + } > } > > static void intel_restore_pmu_context(struct kvm_vcpu *vcpu) > { > + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); > + struct kvm_pmc *pmc; > + u64 global_status; > + int i; > + > + if (pmu->version != 2) { > + pr_warn("only PerfMon v2 is supported for passthrough PMU"); > + return; > + } > + > + /* Clear host global_ctrl and global_status MSR if non-zero. */ > + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now? > + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status); > + if (global_status) > + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status); This seems especially silly, isn't the full MSR being written below? Or am I misunderstanding how these things work? > + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status); > + > + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { > + pmc = &pmu->gp_counters[i]; > + wrmsrl(MSR_IA32_PMC0 + i, pmc->counter); > + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel); > + } > + > + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); > + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { > + pmc = &pmu->fixed_counters[i]; > + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter); > + } > } > > struct kvm_pmu_ops intel_pmu_ops __initdata = { > -- > 2.34.1 >
On Thu, Apr 11, 2024 at 2:44 PM Sean Christopherson <seanjc@google.com> wrote: > > On Fri, Jan 26, 2024, Xiong Zhang wrote: > > From: Dapeng Mi <dapeng1.mi@linux.intel.com> > > > > Implement the save/restore of PMU state for pasthrough PMU in Intel. In > > passthrough mode, KVM owns exclusively the PMU HW when control flow goes to > > the scope of passthrough PMU. Thus, KVM needs to save the host PMU state > > and gains the full HW PMU ownership. On the contrary, host regains the > > ownership of PMU HW from KVM when control flow leaves the scope of > > passthrough PMU. > > > > Implement PMU context switches for Intel CPUs and opptunistically use > > rdpmcl() instead of rdmsrl() when reading counters since the former has > > lower latency in Intel CPUs. > > > > Co-developed-by: Mingwei Zhang <mizhang@google.com> > > Signed-off-by: Mingwei Zhang <mizhang@google.com> > > Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com> > > --- > > arch/x86/kvm/vmx/pmu_intel.c | 73 ++++++++++++++++++++++++++++++++++++ > > 1 file changed, 73 insertions(+) > > > > diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c > > index 0d58fe7d243e..f79bebe7093d 100644 > > --- a/arch/x86/kvm/vmx/pmu_intel.c > > +++ b/arch/x86/kvm/vmx/pmu_intel.c > > @@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu) > > > > static void intel_save_pmu_context(struct kvm_vcpu *vcpu) > > I would prefer there be a "guest" in there somewhere, e.g. intel_save_guest_pmu_context(). > > > { > > + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); > > + struct kvm_pmc *pmc; > > + u32 i; > > + > > + if (pmu->version != 2) { > > + pr_warn("only PerfMon v2 is supported for passthrough PMU"); > > + return; > > + } > > + > > + /* Global ctrl register is already saved at VM-exit. */ > > + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status); > > + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. 
*/ > > + if (pmu->global_status) > > + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status); > > + > > + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { > > + pmc = &pmu->gp_counters[i]; > > + rdpmcl(i, pmc->counter); > > + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel); > > + /* > > + * Clear hardware PERFMON_EVENTSELx and its counter to avoid > > + * leakage and also avoid this guest GP counter get accidentally > > + * enabled during host running when host enable global ctrl. > > + */ > > + if (pmc->eventsel) > > + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0); > > + if (pmc->counter) > > + wrmsrl(MSR_IA32_PMC0 + i, 0); > > This doesn't make much sense. The kernel already has full access to the guest, > I don't see what is gained by zeroing out the MSRs just to hide them from perf. > > Similarly, if perf enables a counter if PERF_GLOBAL_CTRL without first restoring > the event selector, we gots problems. > > Same thing for the fixed counters below. Can't this just be? > > for (i = 0; i < pmu->nr_arch_gp_counters; i++) > rdpmcl(i, pmu->gp_counters[i].counter); > > for (i = 0; i < pmu->nr_arch_fixed_counters; i++) > rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, > pmu->fixed_counters[i].counter); > > > + } > > + > > + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); > > + /* > > + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and > > + * also avoid these guest fixed counters get accidentially enabled > > + * during host running when host enable global ctrl. 
> > + */ > > + if (pmu->fixed_ctr_ctrl) > > + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); > > + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { > > + pmc = &pmu->fixed_counters[i]; > > + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter); > > + if (pmc->counter) > > + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0); > > + } > > } > > > > static void intel_restore_pmu_context(struct kvm_vcpu *vcpu) > > { > > + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); > > + struct kvm_pmc *pmc; > > + u64 global_status; > > + int i; > > + > > + if (pmu->version != 2) { > > + pr_warn("only PerfMon v2 is supported for passthrough PMU"); > > + return; > > + } > > + > > + /* Clear host global_ctrl and global_status MSR if non-zero. */ > > + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); > > Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now? > > > + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status); > > + if (global_status) > > + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status); > > This seems especially silly, isn't the full MSR being written below? Or am I > misunderstanding how these things work? LOL! You expect CPU design to follow basic logic?!? Writing a 1 to a bit in IA32_PERF_GLOBAL_STATUS_SET sets the corresponding bit in IA32_PERF_GLOBAL_STATUS to 1. Writing a 0 to a bit in to IA32_PERF_GLOBAL_STATUS_SET is a nop. To clear a bit in IA32_PERF_GLOBAL_STATUS, you need to write a 1 to the corresponding bit in IA32_PERF_GLOBAL_STATUS_RESET (aka IA32_PERF_GLOBAL_OVF_CTRL). 
> > + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status); > > + > > + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { > > + pmc = &pmu->gp_counters[i]; > > + wrmsrl(MSR_IA32_PMC0 + i, pmc->counter); > > + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel); > > + } > > + > > + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); > > + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { > > + pmc = &pmu->fixed_counters[i]; > > + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter); > > + } > > } > > > > struct kvm_pmu_ops intel_pmu_ops __initdata = { > > -- > > 2.34.1 > >
On Thu, Apr 11, 2024, Jim Mattson wrote: > On Thu, Apr 11, 2024 at 2:44 PM Sean Christopherson <seanjc@google.com> wrote: > > > + /* Clear host global_ctrl and global_status MSR if non-zero. */ > > > + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); > > > > Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now? > > > > > + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status); > > > + if (global_status) > > > + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status); > > > > This seems especially silly, isn't the full MSR being written below? Or am I > > misunderstanding how these things work? > > LOL! You expect CPU design to follow basic logic?!? > > Writing a 1 to a bit in IA32_PERF_GLOBAL_STATUS_SET sets the > corresponding bit in IA32_PERF_GLOBAL_STATUS to 1. > > Writing a 0 to a bit in to IA32_PERF_GLOBAL_STATUS_SET is a nop. > > To clear a bit in IA32_PERF_GLOBAL_STATUS, you need to write a 1 to > the corresponding bit in IA32_PERF_GLOBAL_STATUS_RESET (aka > IA32_PERF_GLOBAL_OVF_CTRL). If only C had a way to annotate what the code is doing. :-) > > > + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status); IIUC, that means this should be: if (pmu->global_status) wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status); or even better: toggle = pmu->global_status ^ global_status; if (global_status & toggle) wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status & toggle); if (pmu->global_status & toggle) wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status & toggle);
On 4/12/2024 5:26 AM, Sean Christopherson wrote: > On Fri, Jan 26, 2024, Xiong Zhang wrote: >> static void intel_save_pmu_context(struct kvm_vcpu *vcpu) >> { >> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); >> + struct kvm_pmc *pmc; >> + u32 i; >> + >> + if (pmu->version != 2) { >> + pr_warn("only PerfMon v2 is supported for passthrough PMU"); >> + return; >> + } >> + >> + /* Global ctrl register is already saved at VM-exit. */ >> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status); >> + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */ >> + if (pmu->global_status) >> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status); >> + >> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { >> + pmc = &pmu->gp_counters[i]; >> + rdpmcl(i, pmc->counter); >> + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel); >> + /* >> + * Clear hardware PERFMON_EVENTSELx and its counter to avoid >> + * leakage and also avoid this guest GP counter get accidentally >> + * enabled during host running when host enable global ctrl. >> + */ >> + if (pmc->eventsel) >> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0); >> + if (pmc->counter) >> + wrmsrl(MSR_IA32_PMC0 + i, 0); >> + } >> + >> + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); >> + /* >> + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and >> + * also avoid these guest fixed counters get accidentially enabled >> + * during host running when host enable global ctrl. >> + */ >> + if (pmu->fixed_ctr_ctrl) >> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); >> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { >> + pmc = &pmu->fixed_counters[i]; >> + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter); >> + if (pmc->counter) >> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0); >> + } > For the next RFC, please make that it includes AMD support. Mostly because I'm > pretty all of this code can be in common x86. 
The fixed counters are ugly, > but pmu->nr_arch_fixed_counters is guaranteed to '0' on AMD, so it's _just_ ugly, > i.e. not functionally problematic. Sure. I believe Mingwei would integrate the AMD support patches in the next version. Yeah, I agree there could be a part of the code which can be put into common x86/pmu, but there is still some vendor-specific code, so we still keep a vendor-specific callback.
On 4/12/2024 5:44 AM, Sean Christopherson wrote: > On Fri, Jan 26, 2024, Xiong Zhang wrote: >> From: Dapeng Mi <dapeng1.mi@linux.intel.com> >> >> Implement the save/restore of PMU state for pasthrough PMU in Intel. In >> passthrough mode, KVM owns exclusively the PMU HW when control flow goes to >> the scope of passthrough PMU. Thus, KVM needs to save the host PMU state >> and gains the full HW PMU ownership. On the contrary, host regains the >> ownership of PMU HW from KVM when control flow leaves the scope of >> passthrough PMU. >> >> Implement PMU context switches for Intel CPUs and opptunistically use >> rdpmcl() instead of rdmsrl() when reading counters since the former has >> lower latency in Intel CPUs. >> >> Co-developed-by: Mingwei Zhang <mizhang@google.com> >> Signed-off-by: Mingwei Zhang <mizhang@google.com> >> Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com> >> --- >> arch/x86/kvm/vmx/pmu_intel.c | 73 ++++++++++++++++++++++++++++++++++++ >> 1 file changed, 73 insertions(+) >> >> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c >> index 0d58fe7d243e..f79bebe7093d 100644 >> --- a/arch/x86/kvm/vmx/pmu_intel.c >> +++ b/arch/x86/kvm/vmx/pmu_intel.c >> @@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu) >> >> static void intel_save_pmu_context(struct kvm_vcpu *vcpu) > I would prefer there be a "guest" in there somewhere, e.g. intel_save_guest_pmu_context(). Yeah. It looks clearer. > >> { >> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); >> + struct kvm_pmc *pmc; >> + u32 i; >> + >> + if (pmu->version != 2) { >> + pr_warn("only PerfMon v2 is supported for passthrough PMU"); >> + return; >> + } >> + >> + /* Global ctrl register is already saved at VM-exit. */ >> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status); >> + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. 
*/ >> + if (pmu->global_status) >> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status); >> + >> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { >> + pmc = &pmu->gp_counters[i]; >> + rdpmcl(i, pmc->counter); >> + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel); >> + /* >> + * Clear hardware PERFMON_EVENTSELx and its counter to avoid >> + * leakage and also avoid this guest GP counter get accidentally >> + * enabled during host running when host enable global ctrl. >> + */ >> + if (pmc->eventsel) >> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0); >> + if (pmc->counter) >> + wrmsrl(MSR_IA32_PMC0 + i, 0); > This doesn't make much sense. The kernel already has full access to the guest, > I don't see what is gained by zeroing out the MSRs just to hide them from perf. It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters. Considering this case, Guest uses GP counter 2, but Host doesn't use it. So if the EVENTSEL2 MSR is not cleared here, the GP counter 2 would be enabled unexpectedly on host later since Host perf always enable all validate bits in PERF_GLOBAL_CTRL MSR. That would cause issues. Yeah, the clearing for PMCx MSR should be unnecessary . > > Similarly, if perf enables a counter if PERF_GLOBAL_CTRL without first restoring > the event selector, we gots problems. > > Same thing for the fixed counters below. Can't this just be? > > for (i = 0; i < pmu->nr_arch_gp_counters; i++) > rdpmcl(i, pmu->gp_counters[i].counter); > > for (i = 0; i < pmu->nr_arch_fixed_counters; i++) > rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, > pmu->fixed_counters[i].counter); > >> + } >> + >> + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); >> + /* >> + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and >> + * also avoid these guest fixed counters get accidentially enabled >> + * during host running when host enable global ctrl. 
>> + */ >> + if (pmu->fixed_ctr_ctrl) >> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); >> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { >> + pmc = &pmu->fixed_counters[i]; >> + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter); >> + if (pmc->counter) >> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0); >> + } >> } >> >> static void intel_restore_pmu_context(struct kvm_vcpu *vcpu) >> { >> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); >> + struct kvm_pmc *pmc; >> + u64 global_status; >> + int i; >> + >> + if (pmu->version != 2) { >> + pr_warn("only PerfMon v2 is supported for passthrough PMU"); >> + return; >> + } >> + >> + /* Clear host global_ctrl and global_status MSR if non-zero. */ >> + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); > Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now? As previous comments say, host perf always enable all counters in PERF_GLOBAL_CTRL by default. The reason to clear PERF_GLOBAL_CTRL here is to ensure all counters in disabled state and the later counter manipulation (writing MSR) won't cause any race condition or unexpected behavior on HW. > >> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status); >> + if (global_status) >> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status); > This seems especially silly, isn't the full MSR being written below? Or am I > misunderstanding how these things work? I think Jim's comment has already explain why we need to do this. 
> >> + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status); >> + >> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { >> + pmc = &pmu->gp_counters[i]; >> + wrmsrl(MSR_IA32_PMC0 + i, pmc->counter); >> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel); >> + } >> + >> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); >> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { >> + pmc = &pmu->fixed_counters[i]; >> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter); >> + } >> } >> >> struct kvm_pmu_ops intel_pmu_ops __initdata = { >> -- >> 2.34.1 >>
On 4/12/2024 7:31 AM, Sean Christopherson wrote: > On Thu, Apr 11, 2024, Jim Mattson wrote: >> On Thu, Apr 11, 2024 at 2:44 PM Sean Christopherson <seanjc@google.com> wrote: >>>> + /* Clear host global_ctrl and global_status MSR if non-zero. */ >>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); >>> Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now? >>> >>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status); >>>> + if (global_status) >>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status); >>> This seems especially silly, isn't the full MSR being written below? Or am I >>> misunderstanding how these things work? >> LOL! You expect CPU design to follow basic logic?!? >> >> Writing a 1 to a bit in IA32_PERF_GLOBAL_STATUS_SET sets the >> corresponding bit in IA32_PERF_GLOBAL_STATUS to 1. >> >> Writing a 0 to a bit in to IA32_PERF_GLOBAL_STATUS_SET is a nop. >> >> To clear a bit in IA32_PERF_GLOBAL_STATUS, you need to write a 1 to >> the corresponding bit in IA32_PERF_GLOBAL_STATUS_RESET (aka >> IA32_PERF_GLOBAL_OVF_CTRL). > If only C had a way to annotate what the code is doing. :-) > >>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status); > IIUC, that means this should be: > > if (pmu->global_status) > wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status); > > or even better: > > toggle = pmu->global_status ^ global_status; > if (global_status & toggle) > wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status & toggle); > if (pmu->global_status & toggle) > wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status & toggle); Thanks, it looks better. Since PMU v4+, the MSR CORE_PERF_GLOBAL_OVF_CTRL is renamed to CORE_PERF_GLOBAL_STATUS_RESET with supporting more bits. CORE_PERF_GLOBAL_STATUS_RESET looks more easily understand just from name than CORE_PERF_GLOBAL_OVF_CTRL, I would prefer use this name in next version.
On Sat, Apr 13, 2024, Mi, Dapeng wrote: > > On 4/12/2024 5:44 AM, Sean Christopherson wrote: > > On Fri, Jan 26, 2024, Xiong Zhang wrote: > > > From: Dapeng Mi <dapeng1.mi@linux.intel.com> > > > > > > Implement the save/restore of PMU state for pasthrough PMU in Intel. In > > > passthrough mode, KVM owns exclusively the PMU HW when control flow goes to > > > the scope of passthrough PMU. Thus, KVM needs to save the host PMU state > > > and gains the full HW PMU ownership. On the contrary, host regains the > > > ownership of PMU HW from KVM when control flow leaves the scope of > > > passthrough PMU. > > > > > > Implement PMU context switches for Intel CPUs and opptunistically use > > > rdpmcl() instead of rdmsrl() when reading counters since the former has > > > lower latency in Intel CPUs. > > > > > > Co-developed-by: Mingwei Zhang <mizhang@google.com> > > > Signed-off-by: Mingwei Zhang <mizhang@google.com> > > > Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com> > > > --- > > > arch/x86/kvm/vmx/pmu_intel.c | 73 ++++++++++++++++++++++++++++++++++++ > > > 1 file changed, 73 insertions(+) > > > > > > diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c > > > index 0d58fe7d243e..f79bebe7093d 100644 > > > --- a/arch/x86/kvm/vmx/pmu_intel.c > > > +++ b/arch/x86/kvm/vmx/pmu_intel.c > > > @@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu) > > > static void intel_save_pmu_context(struct kvm_vcpu *vcpu) > > I would prefer there be a "guest" in there somewhere, e.g. intel_save_guest_pmu_context(). > Yeah. It looks clearer. > > > > > { > > > + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); > > > + struct kvm_pmc *pmc; > > > + u32 i; > > > + > > > + if (pmu->version != 2) { > > > + pr_warn("only PerfMon v2 is supported for passthrough PMU"); > > > + return; > > > + } > > > + > > > + /* Global ctrl register is already saved at VM-exit. 
*/ > > > + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status); > > > + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */ > > > + if (pmu->global_status) > > > + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status); > > > + > > > + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { > > > + pmc = &pmu->gp_counters[i]; > > > + rdpmcl(i, pmc->counter); > > > + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel); > > > + /* > > > + * Clear hardware PERFMON_EVENTSELx and its counter to avoid > > > + * leakage and also avoid this guest GP counter get accidentally > > > + * enabled during host running when host enable global ctrl. > > > + */ > > > + if (pmc->eventsel) > > > + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0); > > > + if (pmc->counter) > > > + wrmsrl(MSR_IA32_PMC0 + i, 0); > > This doesn't make much sense. The kernel already has full access to the guest, > > I don't see what is gained by zeroing out the MSRs just to hide them from perf. > > It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters. > Considering this case, Guest uses GP counter 2, but Host doesn't use it. So > if the EVENTSEL2 MSR is not cleared here, the GP counter 2 would be enabled > unexpectedly on host later since Host perf always enable all validate bits > in PERF_GLOBAL_CTRL MSR. That would cause issues. > > Yeah, the clearing for PMCx MSR should be unnecessary . > Why is clearing for PMCx MSR unnecessary? Do we want to leaking counter values to the host? NO. Not in cloud usage. Please make changes to this patch with **extreme** caution. According to our past experience, if there is a bug somewhere, there is a catch here (normally). Thanks. -Mingwei > > > > > Similarly, if perf enables a counter if PERF_GLOBAL_CTRL without first restoring > > the event selector, we gots problems. > > > > Same thing for the fixed counters below. Can't this just be? 
> > > > for (i = 0; i < pmu->nr_arch_gp_counters; i++) > > rdpmcl(i, pmu->gp_counters[i].counter); > > > > for (i = 0; i < pmu->nr_arch_fixed_counters; i++) > > rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, > > pmu->fixed_counters[i].counter); > > > > > + } > > > + > > > + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); > > > + /* > > > + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and > > > + * also avoid these guest fixed counters get accidentially enabled > > > + * during host running when host enable global ctrl. > > > + */ > > > + if (pmu->fixed_ctr_ctrl) > > > + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); > > > + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { > > > + pmc = &pmu->fixed_counters[i]; > > > + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter); > > > + if (pmc->counter) > > > + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0); > > > + } > > > } > > > static void intel_restore_pmu_context(struct kvm_vcpu *vcpu) > > > { > > > + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); > > > + struct kvm_pmc *pmc; > > > + u64 global_status; > > > + int i; > > > + > > > + if (pmu->version != 2) { > > > + pr_warn("only PerfMon v2 is supported for passthrough PMU"); > > > + return; > > > + } > > > + > > > + /* Clear host global_ctrl and global_status MSR if non-zero. */ > > > + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); > > Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now? > > As previous comments say, host perf always enable all counters in > PERF_GLOBAL_CTRL by default. The reason to clear PERF_GLOBAL_CTRL here is to > ensure all counters in disabled state and the later counter manipulation > (writing MSR) won't cause any race condition or unexpected behavior on HW. > > > > > > > + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status); > > > + if (global_status) > > > + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status); > > This seems especially silly, isn't the full MSR being written below? 
Or am I > > misunderstanding how these things work? > > I think Jim's comment has already explain why we need to do this. > > > > > > + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status); > > > + > > > + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { > > > + pmc = &pmu->gp_counters[i]; > > > + wrmsrl(MSR_IA32_PMC0 + i, pmc->counter); > > > + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel); > > > + } > > > + > > > + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); > > > + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { > > > + pmc = &pmu->fixed_counters[i]; > > > + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter); > > > + } > > > } > > > struct kvm_pmu_ops intel_pmu_ops __initdata = { > > > -- > > > 2.34.1 > > >
On 4/13/2024 11:34 AM, Mingwei Zhang wrote: > On Sat, Apr 13, 2024, Mi, Dapeng wrote: >> On 4/12/2024 5:44 AM, Sean Christopherson wrote: >>> On Fri, Jan 26, 2024, Xiong Zhang wrote: >>>> From: Dapeng Mi <dapeng1.mi@linux.intel.com> >>>> >>>> Implement the save/restore of PMU state for pasthrough PMU in Intel. In >>>> passthrough mode, KVM owns exclusively the PMU HW when control flow goes to >>>> the scope of passthrough PMU. Thus, KVM needs to save the host PMU state >>>> and gains the full HW PMU ownership. On the contrary, host regains the >>>> ownership of PMU HW from KVM when control flow leaves the scope of >>>> passthrough PMU. >>>> >>>> Implement PMU context switches for Intel CPUs and opptunistically use >>>> rdpmcl() instead of rdmsrl() when reading counters since the former has >>>> lower latency in Intel CPUs. >>>> >>>> Co-developed-by: Mingwei Zhang <mizhang@google.com> >>>> Signed-off-by: Mingwei Zhang <mizhang@google.com> >>>> Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com> >>>> --- >>>> arch/x86/kvm/vmx/pmu_intel.c | 73 ++++++++++++++++++++++++++++++++++++ >>>> 1 file changed, 73 insertions(+) >>>> >>>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c >>>> index 0d58fe7d243e..f79bebe7093d 100644 >>>> --- a/arch/x86/kvm/vmx/pmu_intel.c >>>> +++ b/arch/x86/kvm/vmx/pmu_intel.c >>>> @@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu) >>>> static void intel_save_pmu_context(struct kvm_vcpu *vcpu) >>> I would prefer there be a "guest" in there somewhere, e.g. intel_save_guest_pmu_context(). >> Yeah. It looks clearer. >>>> { >>>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); >>>> + struct kvm_pmc *pmc; >>>> + u32 i; >>>> + >>>> + if (pmu->version != 2) { >>>> + pr_warn("only PerfMon v2 is supported for passthrough PMU"); >>>> + return; >>>> + } >>>> + >>>> + /* Global ctrl register is already saved at VM-exit. 
*/ >>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status); >>>> + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */ >>>> + if (pmu->global_status) >>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status); >>>> + >>>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { >>>> + pmc = &pmu->gp_counters[i]; >>>> + rdpmcl(i, pmc->counter); >>>> + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel); >>>> + /* >>>> + * Clear hardware PERFMON_EVENTSELx and its counter to avoid >>>> + * leakage and also avoid this guest GP counter get accidentally >>>> + * enabled during host running when host enable global ctrl. >>>> + */ >>>> + if (pmc->eventsel) >>>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0); >>>> + if (pmc->counter) >>>> + wrmsrl(MSR_IA32_PMC0 + i, 0); >>> This doesn't make much sense. The kernel already has full access to the guest, >>> I don't see what is gained by zeroing out the MSRs just to hide them from perf. >> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters. >> Considering this case, Guest uses GP counter 2, but Host doesn't use it. So >> if the EVENTSEL2 MSR is not cleared here, the GP counter 2 would be enabled >> unexpectedly on host later since Host perf always enable all validate bits >> in PERF_GLOBAL_CTRL MSR. That would cause issues. >> >> Yeah, the clearing for PMCx MSR should be unnecessary . >> > Why is clearing for PMCx MSR unnecessary? Do we want to leaking counter > values to the host? NO. Not in cloud usage. No, this place is clearing the guest counter value instead of host counter value. Host always has method to see guest value in a normal VM if he want. I don't see its necessity, it's just a overkill and introduce extra overhead to write MSRs. > > Please make changes to this patch with **extreme** caution. > > According to our past experience, if there is a bug somewhere, > there is a catch here (normally). > > Thanks. 
> -Mingwei >>> Similarly, if perf enables a counter if PERF_GLOBAL_CTRL without first restoring >>> the event selector, we gots problems. >>> >>> Same thing for the fixed counters below. Can't this just be? >>> >>> for (i = 0; i < pmu->nr_arch_gp_counters; i++) >>> rdpmcl(i, pmu->gp_counters[i].counter); >>> >>> for (i = 0; i < pmu->nr_arch_fixed_counters; i++) >>> rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, >>> pmu->fixed_counters[i].counter); >>> >>>> + } >>>> + >>>> + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); >>>> + /* >>>> + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and >>>> + * also avoid these guest fixed counters get accidentially enabled >>>> + * during host running when host enable global ctrl. >>>> + */ >>>> + if (pmu->fixed_ctr_ctrl) >>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); >>>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { >>>> + pmc = &pmu->fixed_counters[i]; >>>> + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter); >>>> + if (pmc->counter) >>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0); >>>> + } >>>> } >>>> static void intel_restore_pmu_context(struct kvm_vcpu *vcpu) >>>> { >>>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); >>>> + struct kvm_pmc *pmc; >>>> + u64 global_status; >>>> + int i; >>>> + >>>> + if (pmu->version != 2) { >>>> + pr_warn("only PerfMon v2 is supported for passthrough PMU"); >>>> + return; >>>> + } >>>> + >>>> + /* Clear host global_ctrl and global_status MSR if non-zero. */ >>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); >>> Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now? >> As previous comments say, host perf always enable all counters in >> PERF_GLOBAL_CTRL by default. The reason to clear PERF_GLOBAL_CTRL here is to >> ensure all counters in disabled state and the later counter manipulation >> (writing MSR) won't cause any race condition or unexpected behavior on HW. 
>> >> >>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status); >>>> + if (global_status) >>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status); >>> This seems especially silly, isn't the full MSR being written below? Or am I >>> misunderstanding how these things work? >> I think Jim's comment has already explain why we need to do this. >> >>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status); >>>> + >>>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { >>>> + pmc = &pmu->gp_counters[i]; >>>> + wrmsrl(MSR_IA32_PMC0 + i, pmc->counter); >>>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel); >>>> + } >>>> + >>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); >>>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { >>>> + pmc = &pmu->fixed_counters[i]; >>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter); >>>> + } >>>> } >>>> struct kvm_pmu_ops intel_pmu_ops __initdata = { >>>> -- >>>> 2.34.1 >>>>
On Fri, Apr 12, 2024 at 9:25 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > > > On 4/13/2024 11:34 AM, Mingwei Zhang wrote: > > On Sat, Apr 13, 2024, Mi, Dapeng wrote: > >> On 4/12/2024 5:44 AM, Sean Christopherson wrote: > >>> On Fri, Jan 26, 2024, Xiong Zhang wrote: > >>>> From: Dapeng Mi <dapeng1.mi@linux.intel.com> > >>>> > >>>> Implement the save/restore of PMU state for pasthrough PMU in Intel. In > >>>> passthrough mode, KVM owns exclusively the PMU HW when control flow goes to > >>>> the scope of passthrough PMU. Thus, KVM needs to save the host PMU state > >>>> and gains the full HW PMU ownership. On the contrary, host regains the > >>>> ownership of PMU HW from KVM when control flow leaves the scope of > >>>> passthrough PMU. > >>>> > >>>> Implement PMU context switches for Intel CPUs and opptunistically use > >>>> rdpmcl() instead of rdmsrl() when reading counters since the former has > >>>> lower latency in Intel CPUs. > >>>> > >>>> Co-developed-by: Mingwei Zhang <mizhang@google.com> > >>>> Signed-off-by: Mingwei Zhang <mizhang@google.com> > >>>> Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com> > >>>> --- > >>>> arch/x86/kvm/vmx/pmu_intel.c | 73 ++++++++++++++++++++++++++++++++++++ > >>>> 1 file changed, 73 insertions(+) > >>>> > >>>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c > >>>> index 0d58fe7d243e..f79bebe7093d 100644 > >>>> --- a/arch/x86/kvm/vmx/pmu_intel.c > >>>> +++ b/arch/x86/kvm/vmx/pmu_intel.c > >>>> @@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu) > >>>> static void intel_save_pmu_context(struct kvm_vcpu *vcpu) > >>> I would prefer there be a "guest" in there somewhere, e.g. intel_save_guest_pmu_context(). > >> Yeah. It looks clearer. 
> >>>> { > >>>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); > >>>> + struct kvm_pmc *pmc; > >>>> + u32 i; > >>>> + > >>>> + if (pmu->version != 2) { > >>>> + pr_warn("only PerfMon v2 is supported for passthrough PMU"); > >>>> + return; > >>>> + } > >>>> + > >>>> + /* Global ctrl register is already saved at VM-exit. */ > >>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status); > >>>> + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */ > >>>> + if (pmu->global_status) > >>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status); > >>>> + > >>>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { > >>>> + pmc = &pmu->gp_counters[i]; > >>>> + rdpmcl(i, pmc->counter); > >>>> + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel); > >>>> + /* > >>>> + * Clear hardware PERFMON_EVENTSELx and its counter to avoid > >>>> + * leakage and also avoid this guest GP counter get accidentally > >>>> + * enabled during host running when host enable global ctrl. > >>>> + */ > >>>> + if (pmc->eventsel) > >>>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0); > >>>> + if (pmc->counter) > >>>> + wrmsrl(MSR_IA32_PMC0 + i, 0); > >>> This doesn't make much sense. The kernel already has full access to the guest, > >>> I don't see what is gained by zeroing out the MSRs just to hide them from perf. > >> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters. > >> Considering this case, Guest uses GP counter 2, but Host doesn't use it. So > >> if the EVENTSEL2 MSR is not cleared here, the GP counter 2 would be enabled > >> unexpectedly on host later since Host perf always enable all validate bits > >> in PERF_GLOBAL_CTRL MSR. That would cause issues. > >> > >> Yeah, the clearing for PMCx MSR should be unnecessary . > >> > > Why is clearing for PMCx MSR unnecessary? Do we want to leaking counter > > values to the host? NO. Not in cloud usage. > > No, this place is clearing the guest counter value instead of host > counter value. 
Host always has method to see guest value in a normal VM > if he want. I don't see its necessity, it's just a overkill and > introduce extra overhead to write MSRs. > I am curious how the perf subsystem solves the problem? Does perf subsystem in the host only scrubbing the selector but not the counter value when doing the context switch? > > > > > Please make changes to this patch with **extreme** caution. > > > > According to our past experience, if there is a bug somewhere, > > there is a catch here (normally). > > > > Thanks. > > -Mingwei > >>> Similarly, if perf enables a counter if PERF_GLOBAL_CTRL without first restoring > >>> the event selector, we gots problems. > >>> > >>> Same thing for the fixed counters below. Can't this just be? > >>> > >>> for (i = 0; i < pmu->nr_arch_gp_counters; i++) > >>> rdpmcl(i, pmu->gp_counters[i].counter); > >>> > >>> for (i = 0; i < pmu->nr_arch_fixed_counters; i++) > >>> rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, > >>> pmu->fixed_counters[i].counter); > >>> > >>>> + } > >>>> + > >>>> + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); > >>>> + /* > >>>> + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and > >>>> + * also avoid these guest fixed counters get accidentially enabled > >>>> + * during host running when host enable global ctrl. 
> >>>> + */ > >>>> + if (pmu->fixed_ctr_ctrl) > >>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); > >>>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { > >>>> + pmc = &pmu->fixed_counters[i]; > >>>> + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter); > >>>> + if (pmc->counter) > >>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0); > >>>> + } > >>>> } > >>>> static void intel_restore_pmu_context(struct kvm_vcpu *vcpu) > >>>> { > >>>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); > >>>> + struct kvm_pmc *pmc; > >>>> + u64 global_status; > >>>> + int i; > >>>> + > >>>> + if (pmu->version != 2) { > >>>> + pr_warn("only PerfMon v2 is supported for passthrough PMU"); > >>>> + return; > >>>> + } > >>>> + > >>>> + /* Clear host global_ctrl and global_status MSR if non-zero. */ > >>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); > >>> Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now? > >> As previous comments say, host perf always enable all counters in > >> PERF_GLOBAL_CTRL by default. The reason to clear PERF_GLOBAL_CTRL here is to > >> ensure all counters in disabled state and the later counter manipulation > >> (writing MSR) won't cause any race condition or unexpected behavior on HW. > >> > >> > >>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status); > >>>> + if (global_status) > >>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status); > >>> This seems especially silly, isn't the full MSR being written below? Or am I > >>> misunderstanding how these things work? > >> I think Jim's comment has already explain why we need to do this. 
> >> > >>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status); > >>>> + > >>>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { > >>>> + pmc = &pmu->gp_counters[i]; > >>>> + wrmsrl(MSR_IA32_PMC0 + i, pmc->counter); > >>>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel); > >>>> + } > >>>> + > >>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); > >>>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { > >>>> + pmc = &pmu->fixed_counters[i]; > >>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter); > >>>> + } > >>>> } > >>>> struct kvm_pmu_ops intel_pmu_ops __initdata = { > >>>> -- > >>>> 2.34.1 > >>>>
On 4/15/2024 2:06 PM, Mingwei Zhang wrote: > On Fri, Apr 12, 2024 at 9:25 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: >> >> On 4/13/2024 11:34 AM, Mingwei Zhang wrote: >>> On Sat, Apr 13, 2024, Mi, Dapeng wrote: >>>> On 4/12/2024 5:44 AM, Sean Christopherson wrote: >>>>> On Fri, Jan 26, 2024, Xiong Zhang wrote: >>>>>> From: Dapeng Mi <dapeng1.mi@linux.intel.com> >>>>>> >>>>>> Implement the save/restore of PMU state for pasthrough PMU in Intel. In >>>>>> passthrough mode, KVM owns exclusively the PMU HW when control flow goes to >>>>>> the scope of passthrough PMU. Thus, KVM needs to save the host PMU state >>>>>> and gains the full HW PMU ownership. On the contrary, host regains the >>>>>> ownership of PMU HW from KVM when control flow leaves the scope of >>>>>> passthrough PMU. >>>>>> >>>>>> Implement PMU context switches for Intel CPUs and opptunistically use >>>>>> rdpmcl() instead of rdmsrl() when reading counters since the former has >>>>>> lower latency in Intel CPUs. >>>>>> >>>>>> Co-developed-by: Mingwei Zhang <mizhang@google.com> >>>>>> Signed-off-by: Mingwei Zhang <mizhang@google.com> >>>>>> Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com> >>>>>> --- >>>>>> arch/x86/kvm/vmx/pmu_intel.c | 73 ++++++++++++++++++++++++++++++++++++ >>>>>> 1 file changed, 73 insertions(+) >>>>>> >>>>>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c >>>>>> index 0d58fe7d243e..f79bebe7093d 100644 >>>>>> --- a/arch/x86/kvm/vmx/pmu_intel.c >>>>>> +++ b/arch/x86/kvm/vmx/pmu_intel.c >>>>>> @@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu) >>>>>> static void intel_save_pmu_context(struct kvm_vcpu *vcpu) >>>>> I would prefer there be a "guest" in there somewhere, e.g. intel_save_guest_pmu_context(). >>>> Yeah. It looks clearer. 
>>>>>> { >>>>>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); >>>>>> + struct kvm_pmc *pmc; >>>>>> + u32 i; >>>>>> + >>>>>> + if (pmu->version != 2) { >>>>>> + pr_warn("only PerfMon v2 is supported for passthrough PMU"); >>>>>> + return; >>>>>> + } >>>>>> + >>>>>> + /* Global ctrl register is already saved at VM-exit. */ >>>>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status); >>>>>> + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */ >>>>>> + if (pmu->global_status) >>>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status); >>>>>> + >>>>>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { >>>>>> + pmc = &pmu->gp_counters[i]; >>>>>> + rdpmcl(i, pmc->counter); >>>>>> + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel); >>>>>> + /* >>>>>> + * Clear hardware PERFMON_EVENTSELx and its counter to avoid >>>>>> + * leakage and also avoid this guest GP counter get accidentally >>>>>> + * enabled during host running when host enable global ctrl. >>>>>> + */ >>>>>> + if (pmc->eventsel) >>>>>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0); >>>>>> + if (pmc->counter) >>>>>> + wrmsrl(MSR_IA32_PMC0 + i, 0); >>>>> This doesn't make much sense. The kernel already has full access to the guest, >>>>> I don't see what is gained by zeroing out the MSRs just to hide them from perf. >>>> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters. >>>> Considering this case, Guest uses GP counter 2, but Host doesn't use it. So >>>> if the EVENTSEL2 MSR is not cleared here, the GP counter 2 would be enabled >>>> unexpectedly on host later since Host perf always enable all validate bits >>>> in PERF_GLOBAL_CTRL MSR. That would cause issues. >>>> >>>> Yeah, the clearing for PMCx MSR should be unnecessary . >>>> >>> Why is clearing for PMCx MSR unnecessary? Do we want to leaking counter >>> values to the host? NO. Not in cloud usage. >> No, this place is clearing the guest counter value instead of host >> counter value. 
Host always has method to see guest value in a normal VM >> if he want. I don't see its necessity, it's just a overkill and >> introduce extra overhead to write MSRs. >> > I am curious how the perf > subsystem solves the problem? Does perf > subsystem in the host only scrubbing the selector but not the counter > value when doing the context switch? When context switch happens, perf code would schedule out the old events and schedule in the new events. When scheduling out, the ENABLE bit of EVENTSELx MSR would be cleared, and when scheduling in, the EVENTSELx and PMCx MSRs would be overwritten with new event's attr.config and sample_period separately. Of course, this is only for the case when there are new events to be programmed on the PMC. If no new events, the PMCx MSR would keep the stale value and won't be cleared. Anyway, I don't see any reason that PMCx MSR must be cleared. > >>> Please make changes to this patch with **extreme** caution. >>> >>> According to our past experience, if there is a bug somewhere, >>> there is a catch here (normally). >>> >>> Thanks. >>> -Mingwei >>>>> Similarly, if perf enables a counter if PERF_GLOBAL_CTRL without first restoring >>>>> the event selector, we gots problems. >>>>> >>>>> Same thing for the fixed counters below. Can't this just be? >>>>> >>>>> for (i = 0; i < pmu->nr_arch_gp_counters; i++) >>>>> rdpmcl(i, pmu->gp_counters[i].counter); >>>>> >>>>> for (i = 0; i < pmu->nr_arch_fixed_counters; i++) >>>>> rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, >>>>> pmu->fixed_counters[i].counter); >>>>> >>>>>> + } >>>>>> + >>>>>> + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); >>>>>> + /* >>>>>> + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and >>>>>> + * also avoid these guest fixed counters get accidentially enabled >>>>>> + * during host running when host enable global ctrl. 
>>>>>> + */ >>>>>> + if (pmu->fixed_ctr_ctrl) >>>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); >>>>>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { >>>>>> + pmc = &pmu->fixed_counters[i]; >>>>>> + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter); >>>>>> + if (pmc->counter) >>>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0); >>>>>> + } >>>>>> } >>>>>> static void intel_restore_pmu_context(struct kvm_vcpu *vcpu) >>>>>> { >>>>>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); >>>>>> + struct kvm_pmc *pmc; >>>>>> + u64 global_status; >>>>>> + int i; >>>>>> + >>>>>> + if (pmu->version != 2) { >>>>>> + pr_warn("only PerfMon v2 is supported for passthrough PMU"); >>>>>> + return; >>>>>> + } >>>>>> + >>>>>> + /* Clear host global_ctrl and global_status MSR if non-zero. */ >>>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); >>>>> Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now? >>>> As previous comments say, host perf always enable all counters in >>>> PERF_GLOBAL_CTRL by default. The reason to clear PERF_GLOBAL_CTRL here is to >>>> ensure all counters in disabled state and the later counter manipulation >>>> (writing MSR) won't cause any race condition or unexpected behavior on HW. >>>> >>>> >>>>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status); >>>>>> + if (global_status) >>>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status); >>>>> This seems especially silly, isn't the full MSR being written below? Or am I >>>>> misunderstanding how these things work? >>>> I think Jim's comment has already explain why we need to do this. 
>>>> >>>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status); >>>>>> + >>>>>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { >>>>>> + pmc = &pmu->gp_counters[i]; >>>>>> + wrmsrl(MSR_IA32_PMC0 + i, pmc->counter); >>>>>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel); >>>>>> + } >>>>>> + >>>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); >>>>>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { >>>>>> + pmc = &pmu->fixed_counters[i]; >>>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter); >>>>>> + } >>>>>> } >>>>>> struct kvm_pmu_ops intel_pmu_ops __initdata = { >>>>>> -- >>>>>> 2.34.1 >>>>>>
On Mon, Apr 15, 2024 at 3:04 AM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > > > On 4/15/2024 2:06 PM, Mingwei Zhang wrote: > > On Fri, Apr 12, 2024 at 9:25 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > >> > >> On 4/13/2024 11:34 AM, Mingwei Zhang wrote: > >>> On Sat, Apr 13, 2024, Mi, Dapeng wrote: > >>>> On 4/12/2024 5:44 AM, Sean Christopherson wrote: > >>>>> On Fri, Jan 26, 2024, Xiong Zhang wrote: > >>>>>> From: Dapeng Mi <dapeng1.mi@linux.intel.com> > >>>>>> > >>>>>> Implement the save/restore of PMU state for pasthrough PMU in Intel. In > >>>>>> passthrough mode, KVM owns exclusively the PMU HW when control flow goes to > >>>>>> the scope of passthrough PMU. Thus, KVM needs to save the host PMU state > >>>>>> and gains the full HW PMU ownership. On the contrary, host regains the > >>>>>> ownership of PMU HW from KVM when control flow leaves the scope of > >>>>>> passthrough PMU. > >>>>>> > >>>>>> Implement PMU context switches for Intel CPUs and opptunistically use > >>>>>> rdpmcl() instead of rdmsrl() when reading counters since the former has > >>>>>> lower latency in Intel CPUs. > >>>>>> > >>>>>> Co-developed-by: Mingwei Zhang <mizhang@google.com> > >>>>>> Signed-off-by: Mingwei Zhang <mizhang@google.com> > >>>>>> Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com> > >>>>>> --- > >>>>>> arch/x86/kvm/vmx/pmu_intel.c | 73 ++++++++++++++++++++++++++++++++++++ > >>>>>> 1 file changed, 73 insertions(+) > >>>>>> > >>>>>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c > >>>>>> index 0d58fe7d243e..f79bebe7093d 100644 > >>>>>> --- a/arch/x86/kvm/vmx/pmu_intel.c > >>>>>> +++ b/arch/x86/kvm/vmx/pmu_intel.c > >>>>>> @@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu) > >>>>>> static void intel_save_pmu_context(struct kvm_vcpu *vcpu) > >>>>> I would prefer there be a "guest" in there somewhere, e.g. intel_save_guest_pmu_context(). > >>>> Yeah. It looks clearer. 
> >>>>>> { > >>>>>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); > >>>>>> + struct kvm_pmc *pmc; > >>>>>> + u32 i; > >>>>>> + > >>>>>> + if (pmu->version != 2) { > >>>>>> + pr_warn("only PerfMon v2 is supported for passthrough PMU"); > >>>>>> + return; > >>>>>> + } > >>>>>> + > >>>>>> + /* Global ctrl register is already saved at VM-exit. */ > >>>>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status); > >>>>>> + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */ > >>>>>> + if (pmu->global_status) > >>>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status); > >>>>>> + > >>>>>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { > >>>>>> + pmc = &pmu->gp_counters[i]; > >>>>>> + rdpmcl(i, pmc->counter); > >>>>>> + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel); > >>>>>> + /* > >>>>>> + * Clear hardware PERFMON_EVENTSELx and its counter to avoid > >>>>>> + * leakage and also avoid this guest GP counter get accidentally > >>>>>> + * enabled during host running when host enable global ctrl. > >>>>>> + */ > >>>>>> + if (pmc->eventsel) > >>>>>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0); > >>>>>> + if (pmc->counter) > >>>>>> + wrmsrl(MSR_IA32_PMC0 + i, 0); > >>>>> This doesn't make much sense. The kernel already has full access to the guest, > >>>>> I don't see what is gained by zeroing out the MSRs just to hide them from perf. > >>>> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters. > >>>> Considering this case, Guest uses GP counter 2, but Host doesn't use it. So > >>>> if the EVENTSEL2 MSR is not cleared here, the GP counter 2 would be enabled > >>>> unexpectedly on host later since Host perf always enable all validate bits > >>>> in PERF_GLOBAL_CTRL MSR. That would cause issues. > >>>> > >>>> Yeah, the clearing for PMCx MSR should be unnecessary . > >>>> > >>> Why is clearing for PMCx MSR unnecessary? Do we want to leaking counter > >>> values to the host? NO. Not in cloud usage. 
> >> No, this place is clearing the guest counter value instead of host > >> counter value. Host always has method to see guest value in a normal VM > >> if he want. I don't see its necessity, it's just a overkill and > >> introduce extra overhead to write MSRs. > >> > > I am curious how the perf > > subsystem solves the problem? Does perf > > subsystem in the host only scrubbing the selector but not the counter > > value when doing the context switch? > > When context switch happens, perf code would schedule out the old events > and schedule in the new events. When scheduling out, the ENABLE bit of > EVENTSELx MSR would be cleared, and when scheduling in, the EVENTSELx > and PMCx MSRs would be overwritten with new event's attr.config and > sample_period separately. Of course, these is only for the case when > there are new events to be programmed on the PMC. If no new events, the > PMCx MSR would keep stall value and won't be cleared. > > Anyway, I don't see any reason that PMCx MSR must be cleared. > I don't have a strong opinion on the upstream version. But since both the mediated vPMU and perf are clients of PMU HW, leaving PMC values uncleared when transitioning out of the vPMU boundary is leaking info technically. Alternatively, doing the clearing at vcpu loop boundary should be sufficient if considering performance overhead. Thanks -Mingwei > > > > > >>> Please make changes to this patch with **extreme** caution. > >>> > >>> According to our past experience, if there is a bug somewhere, > >>> there is a catch here (normally). > >>> > >>> Thanks. > >>> -Mingwei > >>>>> Similarly, if perf enables a counter if PERF_GLOBAL_CTRL without first restoring > >>>>> the event selector, we gots problems. > >>>>> > >>>>> Same thing for the fixed counters below. Can't this just be? 
> >>>>> > >>>>> for (i = 0; i < pmu->nr_arch_gp_counters; i++) > >>>>> rdpmcl(i, pmu->gp_counters[i].counter); > >>>>> > >>>>> for (i = 0; i < pmu->nr_arch_fixed_counters; i++) > >>>>> rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, > >>>>> pmu->fixed_counters[i].counter); > >>>>> > >>>>>> + } > >>>>>> + > >>>>>> + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); > >>>>>> + /* > >>>>>> + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and > >>>>>> + * also avoid these guest fixed counters get accidentially enabled > >>>>>> + * during host running when host enable global ctrl. > >>>>>> + */ > >>>>>> + if (pmu->fixed_ctr_ctrl) > >>>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); > >>>>>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { > >>>>>> + pmc = &pmu->fixed_counters[i]; > >>>>>> + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter); > >>>>>> + if (pmc->counter) > >>>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0); > >>>>>> + } > >>>>>> } > >>>>>> static void intel_restore_pmu_context(struct kvm_vcpu *vcpu) > >>>>>> { > >>>>>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); > >>>>>> + struct kvm_pmc *pmc; > >>>>>> + u64 global_status; > >>>>>> + int i; > >>>>>> + > >>>>>> + if (pmu->version != 2) { > >>>>>> + pr_warn("only PerfMon v2 is supported for passthrough PMU"); > >>>>>> + return; > >>>>>> + } > >>>>>> + > >>>>>> + /* Clear host global_ctrl and global_status MSR if non-zero. */ > >>>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); > >>>>> Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now? > >>>> As previous comments say, host perf always enable all counters in > >>>> PERF_GLOBAL_CTRL by default. The reason to clear PERF_GLOBAL_CTRL here is to > >>>> ensure all counters in disabled state and the later counter manipulation > >>>> (writing MSR) won't cause any race condition or unexpected behavior on HW. 
> >>>> > >>>> > >>>>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status); > >>>>>> + if (global_status) > >>>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status); > >>>>> This seems especially silly, isn't the full MSR being written below? Or am I > >>>>> misunderstanding how these things work? > >>>> I think Jim's comment has already explain why we need to do this. > >>>> > >>>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status); > >>>>>> + > >>>>>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { > >>>>>> + pmc = &pmu->gp_counters[i]; > >>>>>> + wrmsrl(MSR_IA32_PMC0 + i, pmc->counter); > >>>>>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel); > >>>>>> + } > >>>>>> + > >>>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); > >>>>>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { > >>>>>> + pmc = &pmu->fixed_counters[i]; > >>>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter); > >>>>>> + } > >>>>>> } > >>>>>> struct kvm_pmu_ops intel_pmu_ops __initdata = { > >>>>>> -- > >>>>>> 2.34.1 > >>>>>>
On Mon, Apr 15, 2024, Mingwei Zhang wrote: > On Mon, Apr 15, 2024 at 3:04 AM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > > On 4/15/2024 2:06 PM, Mingwei Zhang wrote: > > > On Fri, Apr 12, 2024 at 9:25 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > > >>>> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters. > > >>>> Considering this case, Guest uses GP counter 2, but Host doesn't use it. So > > >>>> if the EVENTSEL2 MSR is not cleared here, the GP counter 2 would be enabled > > >>>> unexpectedly on host later since Host perf always enable all validate bits > > >>>> in PERF_GLOBAL_CTRL MSR. That would cause issues. > > >>>> > > >>>> Yeah, the clearing for PMCx MSR should be unnecessary . > > >>>> > > >>> Why is clearing for PMCx MSR unnecessary? Do we want to leaking counter > > >>> values to the host? NO. Not in cloud usage. > > >> No, this place is clearing the guest counter value instead of host > > >> counter value. Host always has method to see guest value in a normal VM > > >> if he want. I don't see its necessity, it's just a overkill and > > >> introduce extra overhead to write MSRs. > > >> > > > I am curious how the perf subsystem solves the problem? Does perf > > > subsystem in the host only scrubbing the selector but not the counter > > > value when doing the context switch? > > > > When context switch happens, perf code would schedule out the old events > > and schedule in the new events. When scheduling out, the ENABLE bit of > > EVENTSELx MSR would be cleared, and when scheduling in, the EVENTSELx > > and PMCx MSRs would be overwritten with new event's attr.config and > > sample_period separately. Of course, these is only for the case when > > there are new events to be programmed on the PMC. If no new events, the > > PMCx MSR would keep stall value and won't be cleared. > > > > Anyway, I don't see any reason that PMCx MSR must be cleared. > > > > I don't have a strong opinion on the upstream version. 
But since both > the mediated vPMU and perf are clients of PMU HW, leaving PMC values > uncleared when transition out of the vPMU boundary is leaking info > technically. I'm not objecting to ensuring guest PMCs can't be read by any entity that's not in the guest's TCB, which is what I would consider a true leak. I'm objecting to blindly clearing all PMCs, and more specifically objecting to *KVM* clearing PMCs when saving guest state without coordinating with perf in any way. I am ok if we start with (or default to) a "safe" implementation that zeroes all PMCs when switching to host context, but I want KVM and perf to work together to do the context switches, e.g. so that we don't end up with code where KVM writes to all PMC MSRs and that perf also immediately writes to all PMC MSRs. One of my biggest complaints with the current vPMU code is that the roles and responsibilities between KVM and perf are poorly defined, which leads to suboptimal and hard to maintain code. Case in point, I'm pretty sure leaving guest values in PMCs _would_ leak guest state to userspace processes that have RDPMC permissions, as the PMCs might not be dirty from perf's perspective (see perf_clear_dirty_counters()). Blindly clearing PMCs in KVM "solves" that problem, but in doing so makes the overall code brittle because it's not clear whether KVM _needs_ to clear PMCs, or if KVM is just being paranoid.
On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson <seanjc@google.com> wrote: > > On Mon, Apr 15, 2024, Mingwei Zhang wrote: > > On Mon, Apr 15, 2024 at 3:04 AM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > > > On 4/15/2024 2:06 PM, Mingwei Zhang wrote: > > > > On Fri, Apr 12, 2024 at 9:25 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > > > >>>> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters. > > > >>>> Considering this case, Guest uses GP counter 2, but Host doesn't use it. So > > > >>>> if the EVENTSEL2 MSR is not cleared here, the GP counter 2 would be enabled > > > >>>> unexpectedly on host later since Host perf always enable all validate bits > > > >>>> in PERF_GLOBAL_CTRL MSR. That would cause issues. > > > >>>> > > > >>>> Yeah, the clearing for PMCx MSR should be unnecessary . > > > >>>> > > > >>> Why is clearing for PMCx MSR unnecessary? Do we want to leaking counter > > > >>> values to the host? NO. Not in cloud usage. > > > >> No, this place is clearing the guest counter value instead of host > > > >> counter value. Host always has method to see guest value in a normal VM > > > >> if he want. I don't see its necessity, it's just a overkill and > > > >> introduce extra overhead to write MSRs. > > > >> > > > > I am curious how the perf subsystem solves the problem? Does perf > > > > subsystem in the host only scrubbing the selector but not the counter > > > > value when doing the context switch? > > > > > > When context switch happens, perf code would schedule out the old events > > > and schedule in the new events. When scheduling out, the ENABLE bit of > > > EVENTSELx MSR would be cleared, and when scheduling in, the EVENTSELx > > > and PMCx MSRs would be overwritten with new event's attr.config and > > > sample_period separately. Of course, these is only for the case when > > > there are new events to be programmed on the PMC. If no new events, the > > > PMCx MSR would keep stall value and won't be cleared. 
> > > > > > Anyway, I don't see any reason that PMCx MSR must be cleared. > > > > > > > I don't have a strong opinion on the upstream version. But since both > > the mediated vPMU and perf are clients of PMU HW, leaving PMC values > > uncleared when transition out of the vPMU boundary is leaking info > > technically. > > I'm not objecting to ensuring guest PMCs can't be read by any entity that's not > in the guest's TCB, which is what I would consider a true leak. I'm objecting > to blindly clearing all PMCs, and more specifically objecting to *KVM* clearing > PMCs when saving guest state without coordinating with perf in any way. > > I am ok if we start with (or default to) a "safe" implementation that zeroes all > PMCs when switching to host context, but I want KVM and perf to work together to > do the context switches, e.g. so that we don't end up with code where KVM writes > to all PMC MSRs and that perf also immediately writes to all PMC MSRs. I am fully aligned with you on this. > > One my biggest complaints with the current vPMU code is that the roles and > responsibilities between KVM and perf are poorly defined, which leads to suboptimal > and hard to maintain code. > > Case in point, I'm pretty sure leaving guest values in PMCs _would_ leak guest > state to userspace processes that have RDPMC permissions, as the PMCs might not > be dirty from perf's perspective (see perf_clear_dirty_counters()). > > Blindly clearing PMCs in KVM "solves" that problem, but in doing so makes the > overall code brittle because it's not clear whether KVM _needs_ to clear PMCs, > or if KVM is just being paranoid. So once this rolls out, perf and vPMU are clients directly to PMU HW. Faithful cleaning (blind cleaning) has to be the baseline implementation, until both clients agree to a "deal" between them. Currently, there is no such deal, but I believe we could have one via future discussion. Thanks. -Mingwei
On Mon, Apr 15, 2024, Mingwei Zhang wrote: > On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson <seanjc@google.com> wrote: > > One my biggest complaints with the current vPMU code is that the roles and > > responsibilities between KVM and perf are poorly defined, which leads to suboptimal > > and hard to maintain code. > > > > Case in point, I'm pretty sure leaving guest values in PMCs _would_ leak guest > > state to userspace processes that have RDPMC permissions, as the PMCs might not > > be dirty from perf's perspective (see perf_clear_dirty_counters()). > > > > Blindly clearing PMCs in KVM "solves" that problem, but in doing so makes the > > overall code brittle because it's not clear whether KVM _needs_ to clear PMCs, > > or if KVM is just being paranoid. > > So once this rolls out, perf and vPMU are clients directly to PMU HW. I don't think this is a statement we want to make, as it opens a discussion that we won't win. Nor do I think it's one we *need* to make. KVM doesn't need to be on equal footing with perf in terms of owning/managing PMU hardware, KVM just needs a few APIs to allow faithfully and accurately virtualizing a guest PMU. > Faithful cleaning (blind cleaning) has to be the baseline > implementation, until both clients agree to a "deal" between them. > Currently, there is no such deal, but I believe we could have one via > future discussion. What I am saying is that there needs to be a "deal" in place before this code is merged. It doesn't need to be anything fancy, e.g. perf can still pave over PMCs it doesn't immediately load, as opposed to using cpu_hw_events.dirty to lazily do the clearing. But perf and KVM need to work together from the get go, i.e. I don't want KVM doing something without regard to what perf does, and vice versa.
On Mon, Apr 15, 2024, Sean Christopherson wrote: > On Mon, Apr 15, 2024, Mingwei Zhang wrote: > > On Mon, Apr 15, 2024 at 3:04 AM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > > > On 4/15/2024 2:06 PM, Mingwei Zhang wrote: > > > > On Fri, Apr 12, 2024 at 9:25 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > > > >>>> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters. > > > >>>> Considering this case, Guest uses GP counter 2, but Host doesn't use it. So > > > >>>> if the EVENTSEL2 MSR is not cleared here, the GP counter 2 would be enabled > > > >>>> unexpectedly on host later since Host perf always enable all validate bits > > > >>>> in PERF_GLOBAL_CTRL MSR. That would cause issues. > > > >>>> > > > >>>> Yeah, the clearing for PMCx MSR should be unnecessary . > > > >>>> > > > >>> Why is clearing for PMCx MSR unnecessary? Do we want to leaking counter > > > >>> values to the host? NO. Not in cloud usage. > > > >> No, this place is clearing the guest counter value instead of host > > > >> counter value. Host always has method to see guest value in a normal VM > > > >> if he want. I don't see its necessity, it's just a overkill and > > > >> introduce extra overhead to write MSRs. > > > >> > > > > I am curious how the perf subsystem solves the problem? Does perf > > > > subsystem in the host only scrubbing the selector but not the counter > > > > value when doing the context switch? > > > > > > When context switch happens, perf code would schedule out the old events > > > and schedule in the new events. When scheduling out, the ENABLE bit of > > > EVENTSELx MSR would be cleared, and when scheduling in, the EVENTSELx > > > and PMCx MSRs would be overwritten with new event's attr.config and > > > sample_period separately. Of course, these is only for the case when > > > there are new events to be programmed on the PMC. If no new events, the > > > PMCx MSR would keep stall value and won't be cleared. 
> > > > > > Anyway, I don't see any reason that PMCx MSR must be cleared. > > > > > > > I don't have a strong opinion on the upstream version. But since both > > the mediated vPMU and perf are clients of PMU HW, leaving PMC values > > uncleared when transition out of the vPMU boundary is leaking info > > technically. > > I'm not objecting to ensuring guest PMCs can't be read by any entity that's not > in the guest's TCB, which is what I would consider a true leak. I'm objecting > to blindly clearing all PMCs, and more specifically objecting to *KVM* clearing > PMCs when saving guest state without coordinating with perf in any way. Agree. blindly clearing PMCs is the basic implementation. I am thinking about what coordination between perf and KVM as well. > > I am ok if we start with (or default to) a "safe" implementation that zeroes all > PMCs when switching to host context, but I want KVM and perf to work together to > do the context switches, e.g. so that we don't end up with code where KVM writes > to all PMC MSRs and that perf also immediately writes to all PMC MSRs. Sure. Point taken. > > One my biggest complaints with the current vPMU code is that the roles and > responsibilities between KVM and perf are poorly defined, which leads to suboptimal > and hard to maintain code. Right. > > Case in point, I'm pretty sure leaving guest values in PMCs _would_ leak guest > state to userspace processes that have RDPMC permissions, as the PMCs might not > be dirty from perf's perspective (see perf_clear_dirty_counters()). > ah. This is a good point. switch_mm_irqs_off() => cr4_update_pce_mm() => /* * Clear the existing dirty counters to * prevent the leak for an RDPMC task. */ perf_clear_dirty_counters() So perf does clear dirty counter values on process context switch. This is nice to know. perf_clear_dirty_counters() clear the counter values according to cpuc->dirty except for those assigned counters. 
> Blindly clearing PMCs in KVM "solves" that problem, but in doing so makes the > overall code brittle because it's not clear whether KVM _needs_ to clear PMCs, > or if KVM is just being paranoid. There is a difference between KVM and perf subsystem on PMU context switch. The latter has the notion of "perf_events", while the former currently does not. It is quite hard for KVM to know which counters are really "in use". Another point I want to raise up to you is that, KVM PMU context switch and Perf PMU context switch happen at different times: - The former is a context switch between guest/host state of the same process, happening at VM-enter/exit boundary. - The latter is a context switch between two host-level processes. - The former happens before the latter. - Current design has no PMC partitioning between host/guest due to arch limitation. From the above, I feel that it might be impossible to combine them or to add coordination? Unless we do the KVM PMU context switch at vcpu loop boundary... Thanks. -Mingwei
On Thu, Apr 18, 2024, Mingwei Zhang wrote: > On Mon, Apr 15, 2024, Sean Christopherson wrote: > > On Mon, Apr 15, 2024, Mingwei Zhang wrote: > > > On Mon, Apr 15, 2024 at 3:04 AM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > > > > On 4/15/2024 2:06 PM, Mingwei Zhang wrote: > > > > > On Fri, Apr 12, 2024 at 9:25 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > > > > >>>> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters. > > > > >>>> Considering this case, Guest uses GP counter 2, but Host doesn't use it. So > > > > >>>> if the EVENTSEL2 MSR is not cleared here, the GP counter 2 would be enabled > > > > >>>> unexpectedly on host later since Host perf always enable all validate bits > > > > >>>> in PERF_GLOBAL_CTRL MSR. That would cause issues. > > > > >>>> > > > > >>>> Yeah, the clearing for PMCx MSR should be unnecessary . > > > > >>>> > > > > >>> Why is clearing for PMCx MSR unnecessary? Do we want to leaking counter > > > > >>> values to the host? NO. Not in cloud usage. > > > > >> No, this place is clearing the guest counter value instead of host > > > > >> counter value. Host always has method to see guest value in a normal VM > > > > >> if he want. I don't see its necessity, it's just a overkill and > > > > >> introduce extra overhead to write MSRs. > > > > >> > > > > > I am curious how the perf subsystem solves the problem? Does perf > > > > > subsystem in the host only scrubbing the selector but not the counter > > > > > value when doing the context switch? > > > > > > > > When context switch happens, perf code would schedule out the old events > > > > and schedule in the new events. When scheduling out, the ENABLE bit of > > > > EVENTSELx MSR would be cleared, and when scheduling in, the EVENTSELx > > > > and PMCx MSRs would be overwritten with new event's attr.config and > > > > sample_period separately. Of course, these is only for the case when > > > > there are new events to be programmed on the PMC. 
If no new events, the > > > > PMCx MSR would keep stall value and won't be cleared. > > > > > > > > Anyway, I don't see any reason that PMCx MSR must be cleared. > > > > > > > > > > I don't have a strong opinion on the upstream version. But since both > > > the mediated vPMU and perf are clients of PMU HW, leaving PMC values > > > uncleared when transition out of the vPMU boundary is leaking info > > > technically. > > > > I'm not objecting to ensuring guest PMCs can't be read by any entity that's not > > in the guest's TCB, which is what I would consider a true leak. I'm objecting > > to blindly clearing all PMCs, and more specifically objecting to *KVM* clearing > > PMCs when saving guest state without coordinating with perf in any way. > > Agree. blindly clearing PMCs is the basic implementation. I am thinking > about what coordination between perf and KVM as well. > > > > > I am ok if we start with (or default to) a "safe" implementation that zeroes all > > PMCs when switching to host context, but I want KVM and perf to work together to > > do the context switches, e.g. so that we don't end up with code where KVM writes > > to all PMC MSRs and that perf also immediately writes to all PMC MSRs. > > Sure. Point taken. > > > > One my biggest complaints with the current vPMU code is that the roles and > > responsibilities between KVM and perf are poorly defined, which leads to suboptimal > > and hard to maintain code. > > Right. > > > > Case in point, I'm pretty sure leaving guest values in PMCs _would_ leak guest > > state to userspace processes that have RDPMC permissions, as the PMCs might not > > be dirty from perf's perspective (see perf_clear_dirty_counters()). > > > > ah. This is a good point. > > switch_mm_irqs_off() => > cr4_update_pce_mm() => > /* > * Clear the existing dirty counters to > * prevent the leak for an RDPMC task. > */ FYI, for the elaboration of "an RDPMC task". when echo 2> /sys/devices/cpu/rdpmc, kernel set CR4.PCE to 1. 
Once that is done, rdpmc instruction is no longer a privileged instruction. It is allowed for all tasks to execute in userspace. Thanks. -Mingwei > perf_clear_dirty_counters() > > So perf does clear dirty counter values on process context switch. This > is nice to know. > > perf_clear_dirty_counters() clear the counter values according to > cpuc->dirty except for those assigned counters. > > > Blindly clearing PMCs in KVM "solves" that problem, but in doing so makes the > > overall code brittle because it's not clear whether KVM _needs_ to clear PMCs, > > or if KVM is just being paranoid. > > There is a difference between KVM and perf subsystem on PMU context > switch. The latter has the notion of "perf_events", while the former > currently does not. It is quite hard for KVM to know which counters are > really "in use". > > Another point I want to raise up to you is that, KVM PMU context switch > and Perf PMU context switch happens at different timing: > > - The former is a context switch between guest/host state of the same > process, happening at VM-enter/exit boundary. > - The latter is a context switch beteen two host-level processes. > - The former happens before the latter. > - Current design has no PMC partitioning between host/guest due to > arch limitation. > > From the above, I feel that it might be impossible to combine them or to > add coordination? Unless we do the KVM PMU context switch at vcpu loop > boundary... > > Thanks. > -Mingwei
On 4/19/2024 5:21 AM, Mingwei Zhang wrote: > On Mon, Apr 15, 2024, Sean Christopherson wrote: >> On Mon, Apr 15, 2024, Mingwei Zhang wrote: >>> On Mon, Apr 15, 2024 at 3:04 AM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: >>>> On 4/15/2024 2:06 PM, Mingwei Zhang wrote: >>>>> On Fri, Apr 12, 2024 at 9:25 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: >>>>>>>> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters. >>>>>>>> Considering this case, Guest uses GP counter 2, but Host doesn't use it. So >>>>>>>> if the EVENTSEL2 MSR is not cleared here, the GP counter 2 would be enabled >>>>>>>> unexpectedly on host later since Host perf always enable all validate bits >>>>>>>> in PERF_GLOBAL_CTRL MSR. That would cause issues. >>>>>>>> >>>>>>>> Yeah, the clearing for PMCx MSR should be unnecessary . >>>>>>>> >>>>>>> Why is clearing for PMCx MSR unnecessary? Do we want to leaking counter >>>>>>> values to the host? NO. Not in cloud usage. >>>>>> No, this place is clearing the guest counter value instead of host >>>>>> counter value. Host always has method to see guest value in a normal VM >>>>>> if he want. I don't see its necessity, it's just a overkill and >>>>>> introduce extra overhead to write MSRs. >>>>>> >>>>> I am curious how the perf subsystem solves the problem? Does perf >>>>> subsystem in the host only scrubbing the selector but not the counter >>>>> value when doing the context switch? >>>> When context switch happens, perf code would schedule out the old events >>>> and schedule in the new events. When scheduling out, the ENABLE bit of >>>> EVENTSELx MSR would be cleared, and when scheduling in, the EVENTSELx >>>> and PMCx MSRs would be overwritten with new event's attr.config and >>>> sample_period separately. Of course, these is only for the case when >>>> there are new events to be programmed on the PMC. If no new events, the >>>> PMCx MSR would keep stall value and won't be cleared. 
>>>> >>>> Anyway, I don't see any reason that PMCx MSR must be cleared. >>>> >>> I don't have a strong opinion on the upstream version. But since both >>> the mediated vPMU and perf are clients of PMU HW, leaving PMC values >>> uncleared when transition out of the vPMU boundary is leaking info >>> technically. >> I'm not objecting to ensuring guest PMCs can't be read by any entity that's not >> in the guest's TCB, which is what I would consider a true leak. I'm objecting >> to blindly clearing all PMCs, and more specifically objecting to *KVM* clearing >> PMCs when saving guest state without coordinating with perf in any way. > Agree. blindly clearing PMCs is the basic implementation. I am thinking > about what coordination between perf and KVM as well. > >> I am ok if we start with (or default to) a "safe" implementation that zeroes all >> PMCs when switching to host context, but I want KVM and perf to work together to >> do the context switches, e.g. so that we don't end up with code where KVM writes >> to all PMC MSRs and that perf also immediately writes to all PMC MSRs. > Sure. Point taken. >> One my biggest complaints with the current vPMU code is that the roles and >> responsibilities between KVM and perf are poorly defined, which leads to suboptimal >> and hard to maintain code. > Right. >> Case in point, I'm pretty sure leaving guest values in PMCs _would_ leak guest >> state to userspace processes that have RDPMC permissions, as the PMCs might not >> be dirty from perf's perspective (see perf_clear_dirty_counters()). >> > ah. This is a good point. > > switch_mm_irqs_off() => > cr4_update_pce_mm() => > /* > * Clear the existing dirty counters to > * prevent the leak for an RDPMC task. > */ > perf_clear_dirty_counters() > > So perf does clear dirty counter values on process context switch. This > is nice to know. > > perf_clear_dirty_counters() clear the counter values according to > cpuc->dirty except for those assigned counters. 
> >> Blindly clearing PMCs in KVM "solves" that problem, but in doing so makes the >> overall code brittle because it's not clear whether KVM _needs_ to clear PMCs, >> or if KVM is just being paranoid. > There is a difference between KVM and perf subsystem on PMU context > switch. The latter has the notion of "perf_events", while the former > currently does not. It is quite hard for KVM to know which counters are > really "in use". > > Another point I want to raise up to you is that, KVM PMU context switch > and Perf PMU context switch happens at different timing: > > - The former is a context switch between guest/host state of the same > process, happening at VM-enter/exit boundary. > - The latter is a context switch beteen two host-level processes. > - The former happens before the latter. > - Current design has no PMC partitioning between host/guest due to > arch limitation. > > From the above, I feel that it might be impossible to combine them or to > add coordination? Unless we do the KVM PMU context switch at vcpu loop > boundary... It seems there are two ways to clear the PMCx MSRs. a) KVM clears these guest only used counters' PMCx MSRs when VM exits and saving the guest MSRs. This can be seen as a portion of the next step optimization that only guest used MSRs are saved and restored. It would avoid to save/restore unnecessary MSR and decrease the performance impact. b) Perf subsystem clears these guest used MSRs, but perf system doesn't know which MSRs are touched by guest, KVM has to provide a API to tell perf subsystem the information. Another issue is that perf does the clearing in task context switch. It may be too late, user can get the guest counter value via rdpmc instruction as long as the vCPU process is allowed to use rdpmc from user space. We had an internal rough talk on this, It seems the option 1 is the better one which looks simpler and has clearer boundary. We would implement it together with the optimization mentioned in option 1. 
> > Thanks. > -Mingwei >
On 2024/4/16 上午6:45, Sean Christopherson wrote: > On Mon, Apr 15, 2024, Mingwei Zhang wrote: >> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson <seanjc@google.com> wrote: >>> One my biggest complaints with the current vPMU code is that the roles and >>> responsibilities between KVM and perf are poorly defined, which leads to suboptimal >>> and hard to maintain code. >>> >>> Case in point, I'm pretty sure leaving guest values in PMCs _would_ leak guest >>> state to userspace processes that have RDPMC permissions, as the PMCs might not >>> be dirty from perf's perspective (see perf_clear_dirty_counters()). >>> >>> Blindly clearing PMCs in KVM "solves" that problem, but in doing so makes the >>> overall code brittle because it's not clear whether KVM _needs_ to clear PMCs, >>> or if KVM is just being paranoid. >> >> So once this rolls out, perf and vPMU are clients directly to PMU HW. > > I don't think this is a statement we want to make, as it opens a discussion > that we won't win. Nor do I think it's one we *need* to make. KVM doesn't need > to be on equal footing with perf in terms of owning/managing PMU hardware, KVM > just needs a few APIs to allow faithfully and accurately virtualizing a guest PMU. > >> Faithful cleaning (blind cleaning) has to be the baseline >> implementation, until both clients agree to a "deal" between them. >> Currently, there is no such deal, but I believe we could have one via >> future discussion. > > What I am saying is that there needs to be a "deal" in place before this code > is merged. It doesn't need to be anything fancy, e.g. perf can still pave over > PMCs it doesn't immediately load, as opposed to using cpu_hw_events.dirty to lazily > do the clearing. But perf and KVM need to work together from the get go, ie. I > don't want KVM doing something without regard to what perf does, and vice versa. > There is similar issue on LoongArch vPMU where vm can directly pmu hardware and pmu hw is shard with guest and host. 
Besides context switch there are other places where perf core will access pmu hw, such as tick timer/hrtimer/ipi function call, and KVM can only intercept context switch. Can we add callback handler in structure kvm_guest_cbs? just like this: @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { .state = kvm_guest_state, .get_ip = kvm_guest_get_ip, .handle_intel_pt_intr = NULL, + .lose_pmu = kvm_guest_lose_pmu, }; By the way, I do not know whether the callback handler should be triggered in perf core or detailed pmu hw driver. From ARM pmu hw driver, it is triggered in pmu hw driver such as function kvm_vcpu_pmu_resync_el0, but I think it will be better if it is done in perf core. Regards Bibo Mao
On Mon, Apr 22, 2024, maobibo wrote: > On 2024/4/16 上午6:45, Sean Christopherson wrote: > > On Mon, Apr 15, 2024, Mingwei Zhang wrote: > > > On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson <seanjc@google.com> wrote: > > > > One my biggest complaints with the current vPMU code is that the roles and > > > > responsibilities between KVM and perf are poorly defined, which leads to suboptimal > > > > and hard to maintain code. > > > > > > > > Case in point, I'm pretty sure leaving guest values in PMCs _would_ leak guest > > > > state to userspace processes that have RDPMC permissions, as the PMCs might not > > > > be dirty from perf's perspective (see perf_clear_dirty_counters()). > > > > > > > > Blindly clearing PMCs in KVM "solves" that problem, but in doing so makes the > > > > overall code brittle because it's not clear whether KVM _needs_ to clear PMCs, > > > > or if KVM is just being paranoid. > > > > > > So once this rolls out, perf and vPMU are clients directly to PMU HW. > > > > I don't think this is a statement we want to make, as it opens a discussion > > that we won't win. Nor do I think it's one we *need* to make. KVM doesn't need > > to be on equal footing with perf in terms of owning/managing PMU hardware, KVM > > just needs a few APIs to allow faithfully and accurately virtualizing a guest PMU. > > > > > Faithful cleaning (blind cleaning) has to be the baseline > > > implementation, until both clients agree to a "deal" between them. > > > Currently, there is no such deal, but I believe we could have one via > > > future discussion. > > > > What I am saying is that there needs to be a "deal" in place before this code > > is merged. It doesn't need to be anything fancy, e.g. perf can still pave over > > PMCs it doesn't immediately load, as opposed to using cpu_hw_events.dirty to lazily > > do the clearing. But perf and KVM need to work together from the get go, ie. I > > don't want KVM doing something without regard to what perf does, and vice versa. 
> > > There is similar issue on LoongArch vPMU where vm can directly pmu hardware > and pmu hw is shard with guest and host. Besides context switch there are > other places where perf core will access pmu hw, such as tick > timer/hrtimer/ipi function call, and KVM can only intercept context switch. Two questions: 1) Can KVM prevent the guest from accessing the PMU? 2) If so, can KVM grant partial access to the PMU, or is it all or nothing? If the answer to both questions is "yes", then it sounds like LoongArch *requires* mediated/passthrough support in order to virtualize its PMU. > Can we add callback handler in structure kvm_guest_cbs? just like this: > @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs > = { > .state = kvm_guest_state, > .get_ip = kvm_guest_get_ip, > .handle_intel_pt_intr = NULL, > + .lose_pmu = kvm_guest_lose_pmu, > }; > > By the way, I do not know should the callback handler be triggered in perf > core or detailed pmu hw driver. From ARM pmu hw driver, it is triggered in > pmu hw driver such as function kvm_vcpu_pmu_resync_el0, > but I think it will be better if it is done in perf core. I don't think we want to take the approach of perf and KVM guests "fighting" over the PMU. That's effectively what we have today, and it's a mess for KVM because it's impossible to provide consistent, deterministic behavior for the guest. And it's just as messy for perf, which ends up having weird, cumbersome flows that exist purely to try to play nice with KVM.
On 2024/4/23 上午1:01, Sean Christopherson wrote: > On Mon, Apr 22, 2024, maobibo wrote: >> On 2024/4/16 上午6:45, Sean Christopherson wrote: >>> On Mon, Apr 15, 2024, Mingwei Zhang wrote: >>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson <seanjc@google.com> wrote: >>>>> One my biggest complaints with the current vPMU code is that the roles and >>>>> responsibilities between KVM and perf are poorly defined, which leads to suboptimal >>>>> and hard to maintain code. >>>>> >>>>> Case in point, I'm pretty sure leaving guest values in PMCs _would_ leak guest >>>>> state to userspace processes that have RDPMC permissions, as the PMCs might not >>>>> be dirty from perf's perspective (see perf_clear_dirty_counters()). >>>>> >>>>> Blindly clearing PMCs in KVM "solves" that problem, but in doing so makes the >>>>> overall code brittle because it's not clear whether KVM _needs_ to clear PMCs, >>>>> or if KVM is just being paranoid. >>>> >>>> So once this rolls out, perf and vPMU are clients directly to PMU HW. >>> >>> I don't think this is a statement we want to make, as it opens a discussion >>> that we won't win. Nor do I think it's one we *need* to make. KVM doesn't need >>> to be on equal footing with perf in terms of owning/managing PMU hardware, KVM >>> just needs a few APIs to allow faithfully and accurately virtualizing a guest PMU. >>> >>>> Faithful cleaning (blind cleaning) has to be the baseline >>>> implementation, until both clients agree to a "deal" between them. >>>> Currently, there is no such deal, but I believe we could have one via >>>> future discussion. >>> >>> What I am saying is that there needs to be a "deal" in place before this code >>> is merged. It doesn't need to be anything fancy, e.g. perf can still pave over >>> PMCs it doesn't immediately load, as opposed to using cpu_hw_events.dirty to lazily >>> do the clearing. But perf and KVM need to work together from the get go, ie. 
I >>> don't want KVM doing something without regard to what perf does, and vice versa. >>> >> There is similar issue on LoongArch vPMU where vm can directly pmu hardware >> and pmu hw is shard with guest and host. Besides context switch there are >> other places where perf core will access pmu hw, such as tick >> timer/hrtimer/ipi function call, and KVM can only intercept context switch. > > Two questions: > > 1) Can KVM prevent the guest from accessing the PMU? > > 2) If so, KVM can grant partial access to the PMU, or is it all or nothing? > > If the answer to both questions is "yes", then it sounds like LoongArch *requires* > mediated/passthrough support in order to virtualize its PMU. Hi Sean, Thank for your quick response. yes, kvm can prevent guest from accessing the PMU and grant partial or all to access to the PMU. Only that if one pmu event is granted to VM, host can not access this pmu event again. There must be pmu event switch if host want to. > >> Can we add callback handler in structure kvm_guest_cbs? just like this: >> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs >> = { >> .state = kvm_guest_state, >> .get_ip = kvm_guest_get_ip, >> .handle_intel_pt_intr = NULL, >> + .lose_pmu = kvm_guest_lose_pmu, >> }; >> >> By the way, I do not know should the callback handler be triggered in perf >> core or detailed pmu hw driver. From ARM pmu hw driver, it is triggered in >> pmu hw driver such as function kvm_vcpu_pmu_resync_el0, >> but I think it will be better if it is done in perf core. > > I don't think we want to take the approach of perf and KVM guests "fighting" over > the PMU. That's effectively what we have today, and it's a mess for KVM because > it's impossible to provide consistent, deterministic behavior for the guest. And > it's just as messy for perf, which ends up having wierd, cumbersome flows that > exists purely to try to play nice with KVM. 
With the existing pmu core code, in a tick timer interrupt or IPI function call interrupt, pmu hw may be accessed by the host while the VM is running and the pmu is already granted to the guest. KVM cannot intercept the host IPI/timer interrupt, so there is no pmu context switch and there will be a problem. Regards Bibo Mao
On 4/23/2024 9:01 AM, maobibo wrote: > > > On 2024/4/23 上午1:01, Sean Christopherson wrote: >> On Mon, Apr 22, 2024, maobibo wrote: >>> On 2024/4/16 上午6:45, Sean Christopherson wrote: >>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote: >>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson >>>>> <seanjc@google.com> wrote: >>>>>> One my biggest complaints with the current vPMU code is that the >>>>>> roles and >>>>>> responsibilities between KVM and perf are poorly defined, which >>>>>> leads to suboptimal >>>>>> and hard to maintain code. >>>>>> >>>>>> Case in point, I'm pretty sure leaving guest values in PMCs >>>>>> _would_ leak guest >>>>>> state to userspace processes that have RDPMC permissions, as the >>>>>> PMCs might not >>>>>> be dirty from perf's perspective (see perf_clear_dirty_counters()). >>>>>> >>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in doing >>>>>> so makes the >>>>>> overall code brittle because it's not clear whether KVM _needs_ >>>>>> to clear PMCs, >>>>>> or if KVM is just being paranoid. >>>>> >>>>> So once this rolls out, perf and vPMU are clients directly to PMU HW. >>>> >>>> I don't think this is a statement we want to make, as it opens a >>>> discussion >>>> that we won't win. Nor do I think it's one we *need* to make. KVM >>>> doesn't need >>>> to be on equal footing with perf in terms of owning/managing PMU >>>> hardware, KVM >>>> just needs a few APIs to allow faithfully and accurately >>>> virtualizing a guest PMU. >>>> >>>>> Faithful cleaning (blind cleaning) has to be the baseline >>>>> implementation, until both clients agree to a "deal" between them. >>>>> Currently, there is no such deal, but I believe we could have one via >>>>> future discussion. >>>> >>>> What I am saying is that there needs to be a "deal" in place before >>>> this code >>>> is merged. It doesn't need to be anything fancy, e.g. 
perf can >>>> still pave over >>>> PMCs it doesn't immediately load, as opposed to using >>>> cpu_hw_events.dirty to lazily >>>> do the clearing. But perf and KVM need to work together from the >>>> get go, ie. I >>>> don't want KVM doing something without regard to what perf does, >>>> and vice versa. >>>> >>> There is similar issue on LoongArch vPMU where vm can directly pmu >>> hardware >>> and pmu hw is shard with guest and host. Besides context switch >>> there are >>> other places where perf core will access pmu hw, such as tick >>> timer/hrtimer/ipi function call, and KVM can only intercept context >>> switch. >> >> Two questions: >> >> 1) Can KVM prevent the guest from accessing the PMU? >> >> 2) If so, KVM can grant partial access to the PMU, or is it all or >> nothing? >> >> If the answer to both questions is "yes", then it sounds like >> LoongArch *requires* >> mediated/passthrough support in order to virtualize its PMU. > > Hi Sean, > > Thank for your quick response. > > yes, kvm can prevent guest from accessing the PMU and grant partial or > all to access to the PMU. Only that if one pmu event is granted to VM, > host can not access this pmu event again. There must be pmu event > switch if host want to. PMU event is a software entity which won't be shared. did you mean if a PMU HW counter is granted to VM, then Host can't access the PMU HW counter, right? > >> >>> Can we add callback handler in structure kvm_guest_cbs? just like >>> this: >>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks >>> kvm_guest_cbs >>> = { >>> .state = kvm_guest_state, >>> .get_ip = kvm_guest_get_ip, >>> .handle_intel_pt_intr = NULL, >>> + .lose_pmu = kvm_guest_lose_pmu, >>> }; >>> >>> By the way, I do not know should the callback handler be triggered >>> in perf >>> core or detailed pmu hw driver. 
From ARM pmu hw driver, it is >>> triggered in >>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0, >>> but I think it will be better if it is done in perf core. >> >> I don't think we want to take the approach of perf and KVM guests >> "fighting" over >> the PMU. That's effectively what we have today, and it's a mess for >> KVM because >> it's impossible to provide consistent, deterministic behavior for the >> guest. And >> it's just as messy for perf, which ends up having wierd, cumbersome >> flows that >> exists purely to try to play nice with KVM. > With existing pmu core code, in tick timer interrupt or IPI function > call interrupt pmu hw may be accessed by host when VM is running and > pmu is already granted to guest. KVM can not intercept host IPI/timer > interrupt, there is no pmu context switch, there will be problem. > > Regards > Bibo Mao >
On 2024/4/23 上午10:44, Mi, Dapeng wrote: > > On 4/23/2024 9:01 AM, maobibo wrote: >> >> >> On 2024/4/23 上午1:01, Sean Christopherson wrote: >>> On Mon, Apr 22, 2024, maobibo wrote: >>>> On 2024/4/16 上午6:45, Sean Christopherson wrote: >>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote: >>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson >>>>>> <seanjc@google.com> wrote: >>>>>>> One my biggest complaints with the current vPMU code is that the >>>>>>> roles and >>>>>>> responsibilities between KVM and perf are poorly defined, which >>>>>>> leads to suboptimal >>>>>>> and hard to maintain code. >>>>>>> >>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs >>>>>>> _would_ leak guest >>>>>>> state to userspace processes that have RDPMC permissions, as the >>>>>>> PMCs might not >>>>>>> be dirty from perf's perspective (see perf_clear_dirty_counters()). >>>>>>> >>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in doing >>>>>>> so makes the >>>>>>> overall code brittle because it's not clear whether KVM _needs_ >>>>>>> to clear PMCs, >>>>>>> or if KVM is just being paranoid. >>>>>> >>>>>> So once this rolls out, perf and vPMU are clients directly to PMU HW. >>>>> >>>>> I don't think this is a statement we want to make, as it opens a >>>>> discussion >>>>> that we won't win. Nor do I think it's one we *need* to make. KVM >>>>> doesn't need >>>>> to be on equal footing with perf in terms of owning/managing PMU >>>>> hardware, KVM >>>>> just needs a few APIs to allow faithfully and accurately >>>>> virtualizing a guest PMU. >>>>> >>>>>> Faithful cleaning (blind cleaning) has to be the baseline >>>>>> implementation, until both clients agree to a "deal" between them. >>>>>> Currently, there is no such deal, but I believe we could have one via >>>>>> future discussion. >>>>> >>>>> What I am saying is that there needs to be a "deal" in place before >>>>> this code >>>>> is merged. It doesn't need to be anything fancy, e.g. 
perf can >>>>> still pave over >>>>> PMCs it doesn't immediately load, as opposed to using >>>>> cpu_hw_events.dirty to lazily >>>>> do the clearing. But perf and KVM need to work together from the >>>>> get go, ie. I >>>>> don't want KVM doing something without regard to what perf does, >>>>> and vice versa. >>>>> >>>> There is similar issue on LoongArch vPMU where vm can directly pmu >>>> hardware >>>> and pmu hw is shard with guest and host. Besides context switch >>>> there are >>>> other places where perf core will access pmu hw, such as tick >>>> timer/hrtimer/ipi function call, and KVM can only intercept context >>>> switch. >>> >>> Two questions: >>> >>> 1) Can KVM prevent the guest from accessing the PMU? >>> >>> 2) If so, KVM can grant partial access to the PMU, or is it all or >>> nothing? >>> >>> If the answer to both questions is "yes", then it sounds like >>> LoongArch *requires* >>> mediated/passthrough support in order to virtualize its PMU. >> >> Hi Sean, >> >> Thank for your quick response. >> >> yes, kvm can prevent guest from accessing the PMU and grant partial or >> all to access to the PMU. Only that if one pmu event is granted to VM, >> host can not access this pmu event again. There must be pmu event >> switch if host want to. > > PMU event is a software entity which won't be shared. did you mean if a > PMU HW counter is granted to VM, then Host can't access the PMU HW > counter, right? yes, if PMU HW counter/control is granted to VM. The value comes from guest, and is not meaningful for host. Host pmu core does not know that it is granted to VM, host still think that it owns pmu. Just like FPU register, it is shared by VM and host during different time and it is lately switched. But if IPI or timer interrupt uses FPU register on host, there will be the same issue. Regards Bibo Mao > > >> >>> >>>> Can we add callback handler in structure kvm_guest_cbs? 
just like >>>> this: >>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks >>>> kvm_guest_cbs >>>> = { >>>> .state = kvm_guest_state, >>>> .get_ip = kvm_guest_get_ip, >>>> .handle_intel_pt_intr = NULL, >>>> + .lose_pmu = kvm_guest_lose_pmu, >>>> }; >>>> >>>> By the way, I do not know should the callback handler be triggered >>>> in perf >>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is >>>> triggered in >>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0, >>>> but I think it will be better if it is done in perf core. >>> >>> I don't think we want to take the approach of perf and KVM guests >>> "fighting" over >>> the PMU. That's effectively what we have today, and it's a mess for >>> KVM because >>> it's impossible to provide consistent, deterministic behavior for the >>> guest. And >>> it's just as messy for perf, which ends up having wierd, cumbersome >>> flows that >>> exists purely to try to play nice with KVM. >> With existing pmu core code, in tick timer interrupt or IPI function >> call interrupt pmu hw may be accessed by host when VM is running and >> pmu is already granted to guest. KVM can not intercept host IPI/timer >> interrupt, there is no pmu context switch, there will be problem. >> >> Regards >> Bibo Mao >>
On 4/23/2024 10:53 AM, maobibo wrote: > > > On 2024/4/23 上午10:44, Mi, Dapeng wrote: >> >> On 4/23/2024 9:01 AM, maobibo wrote: >>> >>> >>> On 2024/4/23 上午1:01, Sean Christopherson wrote: >>>> On Mon, Apr 22, 2024, maobibo wrote: >>>>> On 2024/4/16 上午6:45, Sean Christopherson wrote: >>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote: >>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson >>>>>>> <seanjc@google.com> wrote: >>>>>>>> One my biggest complaints with the current vPMU code is that >>>>>>>> the roles and >>>>>>>> responsibilities between KVM and perf are poorly defined, which >>>>>>>> leads to suboptimal >>>>>>>> and hard to maintain code. >>>>>>>> >>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs >>>>>>>> _would_ leak guest >>>>>>>> state to userspace processes that have RDPMC permissions, as >>>>>>>> the PMCs might not >>>>>>>> be dirty from perf's perspective (see >>>>>>>> perf_clear_dirty_counters()). >>>>>>>> >>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in >>>>>>>> doing so makes the >>>>>>>> overall code brittle because it's not clear whether KVM _needs_ >>>>>>>> to clear PMCs, >>>>>>>> or if KVM is just being paranoid. >>>>>>> >>>>>>> So once this rolls out, perf and vPMU are clients directly to >>>>>>> PMU HW. >>>>>> >>>>>> I don't think this is a statement we want to make, as it opens a >>>>>> discussion >>>>>> that we won't win. Nor do I think it's one we *need* to make. >>>>>> KVM doesn't need >>>>>> to be on equal footing with perf in terms of owning/managing PMU >>>>>> hardware, KVM >>>>>> just needs a few APIs to allow faithfully and accurately >>>>>> virtualizing a guest PMU. >>>>>> >>>>>>> Faithful cleaning (blind cleaning) has to be the baseline >>>>>>> implementation, until both clients agree to a "deal" between them. >>>>>>> Currently, there is no such deal, but I believe we could have >>>>>>> one via >>>>>>> future discussion. 
>>>>>> >>>>>> What I am saying is that there needs to be a "deal" in place >>>>>> before this code >>>>>> is merged. It doesn't need to be anything fancy, e.g. perf can >>>>>> still pave over >>>>>> PMCs it doesn't immediately load, as opposed to using >>>>>> cpu_hw_events.dirty to lazily >>>>>> do the clearing. But perf and KVM need to work together from the >>>>>> get go, ie. I >>>>>> don't want KVM doing something without regard to what perf does, >>>>>> and vice versa. >>>>>> >>>>> There is similar issue on LoongArch vPMU where vm can directly pmu >>>>> hardware >>>>> and pmu hw is shard with guest and host. Besides context switch >>>>> there are >>>>> other places where perf core will access pmu hw, such as tick >>>>> timer/hrtimer/ipi function call, and KVM can only intercept >>>>> context switch. >>>> >>>> Two questions: >>>> >>>> 1) Can KVM prevent the guest from accessing the PMU? >>>> >>>> 2) If so, KVM can grant partial access to the PMU, or is it all >>>> or nothing? >>>> >>>> If the answer to both questions is "yes", then it sounds like >>>> LoongArch *requires* >>>> mediated/passthrough support in order to virtualize its PMU. >>> >>> Hi Sean, >>> >>> Thank for your quick response. >>> >>> yes, kvm can prevent guest from accessing the PMU and grant partial >>> or all to access to the PMU. Only that if one pmu event is granted >>> to VM, host can not access this pmu event again. There must be pmu >>> event switch if host want to. >> >> PMU event is a software entity which won't be shared. did you mean if >> a PMU HW counter is granted to VM, then Host can't access the PMU HW >> counter, right? > yes, if PMU HW counter/control is granted to VM. The value comes from > guest, and is not meaningful for host. Host pmu core does not know > that it is granted to VM, host still think that it owns pmu. That's one issue this patchset tries to solve. Current new mediated x86 vPMU framework doesn't allow Host or Guest own the PMU HW resource simultaneously. 
Only when there is no !exclude_guest event on host, guest is allowed to exclusively own the PMU HW resource. > > Just like FPU register, it is shared by VM and host during different > time and it is lately switched. But if IPI or timer interrupt uses FPU > register on host, there will be the same issue. I didn't fully get your point. When IPI or timer interrupt reach, a VM-exit is triggered to make CPU traps into host first and then the host interrupt handler is called. Or are you complaining the executing sequence of switching guest PMU MSRs and these interrupt handler? > > Regards > Bibo Mao >> >> >>> >>>> >>>>> Can we add callback handler in structure kvm_guest_cbs? just like >>>>> this: >>>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks >>>>> kvm_guest_cbs >>>>> = { >>>>> .state = kvm_guest_state, >>>>> .get_ip = kvm_guest_get_ip, >>>>> .handle_intel_pt_intr = NULL, >>>>> + .lose_pmu = kvm_guest_lose_pmu, >>>>> }; >>>>> >>>>> By the way, I do not know should the callback handler be triggered >>>>> in perf >>>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is >>>>> triggered in >>>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0, >>>>> but I think it will be better if it is done in perf core. >>>> >>>> I don't think we want to take the approach of perf and KVM guests >>>> "fighting" over >>>> the PMU. That's effectively what we have today, and it's a mess >>>> for KVM because >>>> it's impossible to provide consistent, deterministic behavior for >>>> the guest. And >>>> it's just as messy for perf, which ends up having wierd, cumbersome >>>> flows that >>>> exists purely to try to play nice with KVM. >>> With existing pmu core code, in tick timer interrupt or IPI function >>> call interrupt pmu hw may be accessed by host when VM is running and >>> pmu is already granted to guest. KVM can not intercept host >>> IPI/timer interrupt, there is no pmu context switch, there will be >>> problem. >>> >>> Regards >>> Bibo Mao >>> >
On 2024/4/23 上午11:13, Mi, Dapeng wrote: > > On 4/23/2024 10:53 AM, maobibo wrote: >> >> >> On 2024/4/23 上午10:44, Mi, Dapeng wrote: >>> >>> On 4/23/2024 9:01 AM, maobibo wrote: >>>> >>>> >>>> On 2024/4/23 上午1:01, Sean Christopherson wrote: >>>>> On Mon, Apr 22, 2024, maobibo wrote: >>>>>> On 2024/4/16 上午6:45, Sean Christopherson wrote: >>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote: >>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson >>>>>>>> <seanjc@google.com> wrote: >>>>>>>>> One my biggest complaints with the current vPMU code is that >>>>>>>>> the roles and >>>>>>>>> responsibilities between KVM and perf are poorly defined, which >>>>>>>>> leads to suboptimal >>>>>>>>> and hard to maintain code. >>>>>>>>> >>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs >>>>>>>>> _would_ leak guest >>>>>>>>> state to userspace processes that have RDPMC permissions, as >>>>>>>>> the PMCs might not >>>>>>>>> be dirty from perf's perspective (see >>>>>>>>> perf_clear_dirty_counters()). >>>>>>>>> >>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in >>>>>>>>> doing so makes the >>>>>>>>> overall code brittle because it's not clear whether KVM _needs_ >>>>>>>>> to clear PMCs, >>>>>>>>> or if KVM is just being paranoid. >>>>>>>> >>>>>>>> So once this rolls out, perf and vPMU are clients directly to >>>>>>>> PMU HW. >>>>>>> >>>>>>> I don't think this is a statement we want to make, as it opens a >>>>>>> discussion >>>>>>> that we won't win. Nor do I think it's one we *need* to make. >>>>>>> KVM doesn't need >>>>>>> to be on equal footing with perf in terms of owning/managing PMU >>>>>>> hardware, KVM >>>>>>> just needs a few APIs to allow faithfully and accurately >>>>>>> virtualizing a guest PMU. >>>>>>> >>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline >>>>>>>> implementation, until both clients agree to a "deal" between them. 
>>>>>>>> Currently, there is no such deal, but I believe we could have >>>>>>>> one via >>>>>>>> future discussion. >>>>>>> >>>>>>> What I am saying is that there needs to be a "deal" in place >>>>>>> before this code >>>>>>> is merged. It doesn't need to be anything fancy, e.g. perf can >>>>>>> still pave over >>>>>>> PMCs it doesn't immediately load, as opposed to using >>>>>>> cpu_hw_events.dirty to lazily >>>>>>> do the clearing. But perf and KVM need to work together from the >>>>>>> get go, ie. I >>>>>>> don't want KVM doing something without regard to what perf does, >>>>>>> and vice versa. >>>>>>> >>>>>> There is similar issue on LoongArch vPMU where vm can directly pmu >>>>>> hardware >>>>>> and pmu hw is shard with guest and host. Besides context switch >>>>>> there are >>>>>> other places where perf core will access pmu hw, such as tick >>>>>> timer/hrtimer/ipi function call, and KVM can only intercept >>>>>> context switch. >>>>> >>>>> Two questions: >>>>> >>>>> 1) Can KVM prevent the guest from accessing the PMU? >>>>> >>>>> 2) If so, KVM can grant partial access to the PMU, or is it all >>>>> or nothing? >>>>> >>>>> If the answer to both questions is "yes", then it sounds like >>>>> LoongArch *requires* >>>>> mediated/passthrough support in order to virtualize its PMU. >>>> >>>> Hi Sean, >>>> >>>> Thank for your quick response. >>>> >>>> yes, kvm can prevent guest from accessing the PMU and grant partial >>>> or all to access to the PMU. Only that if one pmu event is granted >>>> to VM, host can not access this pmu event again. There must be pmu >>>> event switch if host want to. >>> >>> PMU event is a software entity which won't be shared. did you mean if >>> a PMU HW counter is granted to VM, then Host can't access the PMU HW >>> counter, right? >> yes, if PMU HW counter/control is granted to VM. The value comes from >> guest, and is not meaningful for host. Host pmu core does not know >> that it is granted to VM, host still think that it owns pmu. 
> > That's one issue this patchset tries to solve. Current new mediated x86 > vPMU framework doesn't allow Host or Guest own the PMU HW resource > simultaneously. Only when there is no !exclude_guest event on host, > guest is allowed to exclusively own the PMU HW resource. > > >> >> Just like FPU register, it is shared by VM and host during different >> time and it is lately switched. But if IPI or timer interrupt uses FPU >> register on host, there will be the same issue. > > I didn't fully get your point. When IPI or timer interrupt reach, a > VM-exit is triggered to make CPU traps into host first and then the host > interrupt handler is called. Or are you complaining the executing > sequence of switching guest PMU MSRs and these interrupt handler? It is not necessary to save/restore PMU HW at every vm exit, it had better be lately saved/restored, such as only when vcpu thread is sched-out/sched-in, else the cost will be a little expensive. I know little about perf core. However there is PMU HW access in interrupt mode. That means PMU HW access should be irq disabled in general mode, else there may be nested PMU HW access. Is that true? > > >> >> Regards >> Bibo Mao >>> >>> >>>> >>>>> >>>>>> Can we add callback handler in structure kvm_guest_cbs? just like >>>>>> this: >>>>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks >>>>>> kvm_guest_cbs >>>>>> = { >>>>>> .state = kvm_guest_state, >>>>>> .get_ip = kvm_guest_get_ip, >>>>>> .handle_intel_pt_intr = NULL, >>>>>> + .lose_pmu = kvm_guest_lose_pmu, >>>>>> }; >>>>>> >>>>>> By the way, I do not know should the callback handler be triggered >>>>>> in perf >>>>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is >>>>>> triggered in >>>>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0, >>>>>> but I think it will be better if it is done in perf core. >>>>> >>>>> I don't think we want to take the approach of perf and KVM guests >>>>> "fighting" over >>>>> the PMU. 
That's effectively what we have today, and it's a mess >>>>> for KVM because >>>>> it's impossible to provide consistent, deterministic behavior for >>>>> the guest. And >>>>> it's just as messy for perf, which ends up having wierd, cumbersome >>>>> flows that >>>>> exists purely to try to play nice with KVM. >>>> With existing pmu core code, in tick timer interrupt or IPI function >>>> call interrupt pmu hw may be accessed by host when VM is running and >>>> pmu is already granted to guest. KVM can not intercept host >>>> IPI/timer interrupt, there is no pmu context switch, there will be >>>> problem. >>>> >>>> Regards >>>> Bibo Mao >>>> >>
On 2024/4/23 上午11:13, Mi, Dapeng wrote: > > On 4/23/2024 10:53 AM, maobibo wrote: >> >> >> On 2024/4/23 上午10:44, Mi, Dapeng wrote: >>> >>> On 4/23/2024 9:01 AM, maobibo wrote: >>>> >>>> >>>> On 2024/4/23 上午1:01, Sean Christopherson wrote: >>>>> On Mon, Apr 22, 2024, maobibo wrote: >>>>>> On 2024/4/16 上午6:45, Sean Christopherson wrote: >>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote: >>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson >>>>>>>> <seanjc@google.com> wrote: >>>>>>>>> One my biggest complaints with the current vPMU code is that >>>>>>>>> the roles and >>>>>>>>> responsibilities between KVM and perf are poorly defined, which >>>>>>>>> leads to suboptimal >>>>>>>>> and hard to maintain code. >>>>>>>>> >>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs >>>>>>>>> _would_ leak guest >>>>>>>>> state to userspace processes that have RDPMC permissions, as >>>>>>>>> the PMCs might not >>>>>>>>> be dirty from perf's perspective (see >>>>>>>>> perf_clear_dirty_counters()). >>>>>>>>> >>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in >>>>>>>>> doing so makes the >>>>>>>>> overall code brittle because it's not clear whether KVM _needs_ >>>>>>>>> to clear PMCs, >>>>>>>>> or if KVM is just being paranoid. >>>>>>>> >>>>>>>> So once this rolls out, perf and vPMU are clients directly to >>>>>>>> PMU HW. >>>>>>> >>>>>>> I don't think this is a statement we want to make, as it opens a >>>>>>> discussion >>>>>>> that we won't win. Nor do I think it's one we *need* to make. >>>>>>> KVM doesn't need >>>>>>> to be on equal footing with perf in terms of owning/managing PMU >>>>>>> hardware, KVM >>>>>>> just needs a few APIs to allow faithfully and accurately >>>>>>> virtualizing a guest PMU. >>>>>>> >>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline >>>>>>>> implementation, until both clients agree to a "deal" between them. 
>>>>>>>> Currently, there is no such deal, but I believe we could have >>>>>>>> one via >>>>>>>> future discussion. >>>>>>> >>>>>>> What I am saying is that there needs to be a "deal" in place >>>>>>> before this code >>>>>>> is merged. It doesn't need to be anything fancy, e.g. perf can >>>>>>> still pave over >>>>>>> PMCs it doesn't immediately load, as opposed to using >>>>>>> cpu_hw_events.dirty to lazily >>>>>>> do the clearing. But perf and KVM need to work together from the >>>>>>> get go, ie. I >>>>>>> don't want KVM doing something without regard to what perf does, >>>>>>> and vice versa. >>>>>>> >>>>>> There is similar issue on LoongArch vPMU where vm can directly pmu >>>>>> hardware >>>>>> and pmu hw is shard with guest and host. Besides context switch >>>>>> there are >>>>>> other places where perf core will access pmu hw, such as tick >>>>>> timer/hrtimer/ipi function call, and KVM can only intercept >>>>>> context switch. >>>>> >>>>> Two questions: >>>>> >>>>> 1) Can KVM prevent the guest from accessing the PMU? >>>>> >>>>> 2) If so, KVM can grant partial access to the PMU, or is it all >>>>> or nothing? >>>>> >>>>> If the answer to both questions is "yes", then it sounds like >>>>> LoongArch *requires* >>>>> mediated/passthrough support in order to virtualize its PMU. >>>> >>>> Hi Sean, >>>> >>>> Thank for your quick response. >>>> >>>> yes, kvm can prevent guest from accessing the PMU and grant partial >>>> or all to access to the PMU. Only that if one pmu event is granted >>>> to VM, host can not access this pmu event again. There must be pmu >>>> event switch if host want to. >>> >>> PMU event is a software entity which won't be shared. did you mean if >>> a PMU HW counter is granted to VM, then Host can't access the PMU HW >>> counter, right? >> yes, if PMU HW counter/control is granted to VM. The value comes from >> guest, and is not meaningful for host. Host pmu core does not know >> that it is granted to VM, host still think that it owns pmu. 
> > That's one issue this patchset tries to solve. Current new mediated x86 > vPMU framework doesn't allow Host or Guest own the PMU HW resource > simultaneously. Only when there is no !exclude_guest event on host, > guest is allowed to exclusively own the PMU HW resource. > > >> >> Just like FPU register, it is shared by VM and host during different >> time and it is lately switched. But if IPI or timer interrupt uses FPU >> register on host, there will be the same issue. > > I didn't fully get your point. When IPI or timer interrupt reach, a > VM-exit is triggered to make CPU traps into host first and then the host yes, it is. > interrupt handler is called. Or are you complaining the executing > sequence of switching guest PMU MSRs and these interrupt handler? In our vPMU implementation, it is ok if the vPMU is switched in the vm exit path; however there is a problem if the vPMU is switched during the vcpu thread sched-out/sched-in path, since IPI/timer irq interrupts access pmu registers in host mode. In general it will be better if the switch is done at vcpu thread sched-out/sched-in, unless there is a requirement to profile the kvm hypervisor. Even if there is such a requirement, it is only one option. In most conditions, it will be better if the time of VM context exit is small. > > >> >> Regards >> Bibo Mao >>> >>> >>>> >>>>> >>>>>> Can we add callback handler in structure kvm_guest_cbs? just like >>>>>> this: >>>>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks >>>>>> kvm_guest_cbs >>>>>> = { >>>>>> .state = kvm_guest_state, >>>>>> .get_ip = kvm_guest_get_ip, >>>>>> .handle_intel_pt_intr = NULL, >>>>>> + .lose_pmu = kvm_guest_lose_pmu, >>>>>> }; >>>>>> >>>>>> By the way, I do not know should the callback handler be triggered >>>>>> in perf >>>>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is >>>>>> triggered in >>>>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0, >>>>>> but I think it will be better if it is done in perf core. 
>>>>> >>>>> I don't think we want to take the approach of perf and KVM guests >>>>> "fighting" over >>>>> the PMU. That's effectively what we have today, and it's a mess >>>>> for KVM because >>>>> it's impossible to provide consistent, deterministic behavior for >>>>> the guest. And >>>>> it's just as messy for perf, which ends up having wierd, cumbersome >>>>> flows that >>>>> exists purely to try to play nice with KVM. >>>> With existing pmu core code, in tick timer interrupt or IPI function >>>> call interrupt pmu hw may be accessed by host when VM is running and >>>> pmu is already granted to guest. KVM can not intercept host >>>> IPI/timer interrupt, there is no pmu context switch, there will be >>>> problem. >>>> >>>> Regards >>>> Bibo Mao >>>> >>
On 4/23/2024 11:26 AM, maobibo wrote: > > > On 2024/4/23 上午11:13, Mi, Dapeng wrote: >> >> On 4/23/2024 10:53 AM, maobibo wrote: >>> >>> >>> On 2024/4/23 上午10:44, Mi, Dapeng wrote: >>>> >>>> On 4/23/2024 9:01 AM, maobibo wrote: >>>>> >>>>> >>>>> On 2024/4/23 上午1:01, Sean Christopherson wrote: >>>>>> On Mon, Apr 22, 2024, maobibo wrote: >>>>>>> On 2024/4/16 上午6:45, Sean Christopherson wrote: >>>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote: >>>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson >>>>>>>>> <seanjc@google.com> wrote: >>>>>>>>>> One my biggest complaints with the current vPMU code is that >>>>>>>>>> the roles and >>>>>>>>>> responsibilities between KVM and perf are poorly defined, >>>>>>>>>> which leads to suboptimal >>>>>>>>>> and hard to maintain code. >>>>>>>>>> >>>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs >>>>>>>>>> _would_ leak guest >>>>>>>>>> state to userspace processes that have RDPMC permissions, as >>>>>>>>>> the PMCs might not >>>>>>>>>> be dirty from perf's perspective (see >>>>>>>>>> perf_clear_dirty_counters()). >>>>>>>>>> >>>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in >>>>>>>>>> doing so makes the >>>>>>>>>> overall code brittle because it's not clear whether KVM >>>>>>>>>> _needs_ to clear PMCs, >>>>>>>>>> or if KVM is just being paranoid. >>>>>>>>> >>>>>>>>> So once this rolls out, perf and vPMU are clients directly to >>>>>>>>> PMU HW. >>>>>>>> >>>>>>>> I don't think this is a statement we want to make, as it opens >>>>>>>> a discussion >>>>>>>> that we won't win. Nor do I think it's one we *need* to make. >>>>>>>> KVM doesn't need >>>>>>>> to be on equal footing with perf in terms of owning/managing >>>>>>>> PMU hardware, KVM >>>>>>>> just needs a few APIs to allow faithfully and accurately >>>>>>>> virtualizing a guest PMU. 
>>>>>>>> >>>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline >>>>>>>>> implementation, until both clients agree to a "deal" between >>>>>>>>> them. >>>>>>>>> Currently, there is no such deal, but I believe we could have >>>>>>>>> one via >>>>>>>>> future discussion. >>>>>>>> >>>>>>>> What I am saying is that there needs to be a "deal" in place >>>>>>>> before this code >>>>>>>> is merged. It doesn't need to be anything fancy, e.g. perf can >>>>>>>> still pave over >>>>>>>> PMCs it doesn't immediately load, as opposed to using >>>>>>>> cpu_hw_events.dirty to lazily >>>>>>>> do the clearing. But perf and KVM need to work together from >>>>>>>> the get go, ie. I >>>>>>>> don't want KVM doing something without regard to what perf >>>>>>>> does, and vice versa. >>>>>>>> >>>>>>> There is similar issue on LoongArch vPMU where vm can directly >>>>>>> pmu hardware >>>>>>> and pmu hw is shard with guest and host. Besides context switch >>>>>>> there are >>>>>>> other places where perf core will access pmu hw, such as tick >>>>>>> timer/hrtimer/ipi function call, and KVM can only intercept >>>>>>> context switch. >>>>>> >>>>>> Two questions: >>>>>> >>>>>> 1) Can KVM prevent the guest from accessing the PMU? >>>>>> >>>>>> 2) If so, KVM can grant partial access to the PMU, or is it all >>>>>> or nothing? >>>>>> >>>>>> If the answer to both questions is "yes", then it sounds like >>>>>> LoongArch *requires* >>>>>> mediated/passthrough support in order to virtualize its PMU. >>>>> >>>>> Hi Sean, >>>>> >>>>> Thank for your quick response. >>>>> >>>>> yes, kvm can prevent guest from accessing the PMU and grant >>>>> partial or all to access to the PMU. Only that if one pmu event is >>>>> granted to VM, host can not access this pmu event again. There >>>>> must be pmu event switch if host want to. >>>> >>>> PMU event is a software entity which won't be shared. 
did you mean >>>> if a PMU HW counter is granted to VM, then Host can't access the >>>> PMU HW counter, right? >>> yes, if PMU HW counter/control is granted to VM. The value comes >>> from guest, and is not meaningful for host. Host pmu core does not >>> know that it is granted to VM, host still think that it owns pmu. >> >> That's one issue this patchset tries to solve. Current new mediated >> x86 vPMU framework doesn't allow Host or Guest own the PMU HW >> resource simultaneously. Only when there is no !exclude_guest event >> on host, guest is allowed to exclusively own the PMU HW resource. >> >> >>> >>> Just like FPU register, it is shared by VM and host during different >>> time and it is lately switched. But if IPI or timer interrupt uses >>> FPU register on host, there will be the same issue. >> >> I didn't fully get your point. When IPI or timer interrupt reach, a >> VM-exit is triggered to make CPU traps into host first and then the >> host interrupt handler is called. Or are you complaining the >> executing sequence of switching guest PMU MSRs and these interrupt >> handler? > It is not necessary to save/restore PMU HW at every vm exit, it had > better be lately saved/restored, such as only when vcpu thread is > sched-out/sched-in, else the cost will be a little expensive. I suspect this optimization deferring guest PMU state save/restore to vCPU task switching boundary would be really landed into KVM since it would make host lose the capability to profile KVM, and it seems Sean objects to this. > > I know little about perf core. However there is PMU HW access in > interrupt mode. That means PMU HW access should be irq disabled in > general mode, else there may be nested PMU HW access. Is that true? I had no idea that timer irq handler would access PMU MSRs before. Could you please show me the code and I would look at it first. Thanks. > >> >> >>> >>> Regards >>> Bibo Mao >>>> >>>> >>>>> >>>>>> >>>>>>> Can we add callback handler in structure kvm_guest_cbs? 
just >>>>>>> like this: >>>>>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks >>>>>>> kvm_guest_cbs >>>>>>> = { >>>>>>> .state = kvm_guest_state, >>>>>>> .get_ip = kvm_guest_get_ip, >>>>>>> .handle_intel_pt_intr = NULL, >>>>>>> + .lose_pmu = kvm_guest_lose_pmu, >>>>>>> }; >>>>>>> >>>>>>> By the way, I do not know should the callback handler be >>>>>>> triggered in perf >>>>>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is >>>>>>> triggered in >>>>>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0, >>>>>>> but I think it will be better if it is done in perf core. >>>>>> >>>>>> I don't think we want to take the approach of perf and KVM guests >>>>>> "fighting" over >>>>>> the PMU. That's effectively what we have today, and it's a mess >>>>>> for KVM because >>>>>> it's impossible to provide consistent, deterministic behavior for >>>>>> the guest. And >>>>>> it's just as messy for perf, which ends up having wierd, >>>>>> cumbersome flows that >>>>>> exists purely to try to play nice with KVM. >>>>> With existing pmu core code, in tick timer interrupt or IPI >>>>> function call interrupt pmu hw may be accessed by host when VM is >>>>> running and pmu is already granted to guest. KVM can not intercept >>>>> host IPI/timer interrupt, there is no pmu context switch, there >>>>> will be problem. >>>>> >>>>> Regards >>>>> Bibo Mao >>>>> >>> >
On Mon, Apr 22, 2024 at 8:55 PM maobibo <maobibo@loongson.cn> wrote: > > > > On 2024/4/23 上午11:13, Mi, Dapeng wrote: > > > > On 4/23/2024 10:53 AM, maobibo wrote: > >> > >> > >> On 2024/4/23 上午10:44, Mi, Dapeng wrote: > >>> > >>> On 4/23/2024 9:01 AM, maobibo wrote: > >>>> > >>>> > >>>> On 2024/4/23 上午1:01, Sean Christopherson wrote: > >>>>> On Mon, Apr 22, 2024, maobibo wrote: > >>>>>> On 2024/4/16 上午6:45, Sean Christopherson wrote: > >>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote: > >>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson > >>>>>>>> <seanjc@google.com> wrote: > >>>>>>>>> One my biggest complaints with the current vPMU code is that > >>>>>>>>> the roles and > >>>>>>>>> responsibilities between KVM and perf are poorly defined, which > >>>>>>>>> leads to suboptimal > >>>>>>>>> and hard to maintain code. > >>>>>>>>> > >>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs > >>>>>>>>> _would_ leak guest > >>>>>>>>> state to userspace processes that have RDPMC permissions, as > >>>>>>>>> the PMCs might not > >>>>>>>>> be dirty from perf's perspective (see > >>>>>>>>> perf_clear_dirty_counters()). > >>>>>>>>> > >>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in > >>>>>>>>> doing so makes the > >>>>>>>>> overall code brittle because it's not clear whether KVM _needs_ > >>>>>>>>> to clear PMCs, > >>>>>>>>> or if KVM is just being paranoid. > >>>>>>>> > >>>>>>>> So once this rolls out, perf and vPMU are clients directly to > >>>>>>>> PMU HW. > >>>>>>> > >>>>>>> I don't think this is a statement we want to make, as it opens a > >>>>>>> discussion > >>>>>>> that we won't win. Nor do I think it's one we *need* to make. > >>>>>>> KVM doesn't need > >>>>>>> to be on equal footing with perf in terms of owning/managing PMU > >>>>>>> hardware, KVM > >>>>>>> just needs a few APIs to allow faithfully and accurately > >>>>>>> virtualizing a guest PMU. 
> >>>>>>> > >>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline > >>>>>>>> implementation, until both clients agree to a "deal" between them. > >>>>>>>> Currently, there is no such deal, but I believe we could have > >>>>>>>> one via > >>>>>>>> future discussion. > >>>>>>> > >>>>>>> What I am saying is that there needs to be a "deal" in place > >>>>>>> before this code > >>>>>>> is merged. It doesn't need to be anything fancy, e.g. perf can > >>>>>>> still pave over > >>>>>>> PMCs it doesn't immediately load, as opposed to using > >>>>>>> cpu_hw_events.dirty to lazily > >>>>>>> do the clearing. But perf and KVM need to work together from the > >>>>>>> get go, ie. I > >>>>>>> don't want KVM doing something without regard to what perf does, > >>>>>>> and vice versa. > >>>>>>> > >>>>>> There is similar issue on LoongArch vPMU where vm can directly pmu > >>>>>> hardware > >>>>>> and pmu hw is shard with guest and host. Besides context switch > >>>>>> there are > >>>>>> other places where perf core will access pmu hw, such as tick > >>>>>> timer/hrtimer/ipi function call, and KVM can only intercept > >>>>>> context switch. > >>>>> > >>>>> Two questions: > >>>>> > >>>>> 1) Can KVM prevent the guest from accessing the PMU? > >>>>> > >>>>> 2) If so, KVM can grant partial access to the PMU, or is it all > >>>>> or nothing? > >>>>> > >>>>> If the answer to both questions is "yes", then it sounds like > >>>>> LoongArch *requires* > >>>>> mediated/passthrough support in order to virtualize its PMU. > >>>> > >>>> Hi Sean, > >>>> > >>>> Thank for your quick response. > >>>> > >>>> yes, kvm can prevent guest from accessing the PMU and grant partial > >>>> or all to access to the PMU. Only that if one pmu event is granted > >>>> to VM, host can not access this pmu event again. There must be pmu > >>>> event switch if host want to. > >>> > >>> PMU event is a software entity which won't be shared. 
did you mean if > >>> a PMU HW counter is granted to VM, then Host can't access the PMU HW > >>> counter, right? > >> yes, if PMU HW counter/control is granted to VM. The value comes from > >> guest, and is not meaningful for host. Host pmu core does not know > >> that it is granted to VM, host still think that it owns pmu. > > > > That's one issue this patchset tries to solve. Current new mediated x86 > > vPMU framework doesn't allow Host or Guest own the PMU HW resource > > simultaneously. Only when there is no !exclude_guest event on host, > > guest is allowed to exclusively own the PMU HW resource. > > > > > >> > >> Just like FPU register, it is shared by VM and host during different > >> time and it is lately switched. But if IPI or timer interrupt uses FPU > >> register on host, there will be the same issue. > > > > I didn't fully get your point. When IPI or timer interrupt reach, a > > VM-exit is triggered to make CPU traps into host first and then the host > yes, it is. This is correct. And this is one of the points that we had debated internally whether we should do PMU context switch at vcpu loop boundary or VM Enter/exit boundary. (host-level) timer interrupt can force VM Exit, which I think happens every 4ms or 1ms, depending on configuration. One of the key reasons we currently propose this is because it is the same boundary as the legacy PMU, i.e., it would be simple to propose from the perf subsystem perspective. Performance wise, doing PMU context switch at vcpu boundary would be way better in general. But the downside is that perf sub-system lose the capability to profile majority of the KVM code (functions) when guest PMU is enabled. > > > interrupt handler is called. Or are you complaining the executing > > sequence of switching guest PMU MSRs and these interrupt handler? 
> In our vPMU implementation, it is ok if vPMU is switched in vm exit > path, however there is problem if vPMU is switched during vcpu thread > sched-out/sched-in path since IPI/timer irq interrupt access pmu > register in host mode. Oh, the IPI/timer irq handler will access PMU registers? I thought only the host-level NMI handler will access the PMU MSRs since PMI is registered under NMI. In that case, you should disable IRQ during vcpu context switch. For NMI, we prevent its handler from accessing the PMU registers. In particular, we use a per-cpu variable to guard that. So, the host-level PMI handler for perf sub-system will check the variable before proceeding. > > In general it will be better if the switch is done in vcpu thread > sched-out/sched-in, else there is requirement to profile kvm > hypervisor.Even there is such requirement, it is only one option. In > most conditions, it will better if time of VM context exit is small. > Performance wise, agree, but there will be debate on perf functionality loss at the host level. Maybe, (just maybe), it is possible to do PMU context switch at vcpu boundary normally, but doing it at VM Enter/Exit boundary when host is profiling KVM kernel module. So, dynamically adjusting PMU context switch location could be an option. > > > > > >> > >> Regards > >> Bibo Mao > >>> > >>> > >>>> > >>>>> > >>>>>> Can we add callback handler in structure kvm_guest_cbs? just like > >>>>>> this: > >>>>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks > >>>>>> kvm_guest_cbs > >>>>>> = { > >>>>>> .state = kvm_guest_state, > >>>>>> .get_ip = kvm_guest_get_ip, > >>>>>> .handle_intel_pt_intr = NULL, > >>>>>> + .lose_pmu = kvm_guest_lose_pmu, > >>>>>> }; > >>>>>> > >>>>>> By the way, I do not know should the callback handler be triggered > >>>>>> in perf > >>>>>> core or detailed pmu hw driver. 
From ARM pmu hw driver, it is > >>>>>> triggered in > >>>>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0, > >>>>>> but I think it will be better if it is done in perf core. > >>>>> > >>>>> I don't think we want to take the approach of perf and KVM guests > >>>>> "fighting" over > >>>>> the PMU. That's effectively what we have today, and it's a mess > >>>>> for KVM because > >>>>> it's impossible to provide consistent, deterministic behavior for > >>>>> the guest. And > >>>>> it's just as messy for perf, which ends up having wierd, cumbersome > >>>>> flows that > >>>>> exists purely to try to play nice with KVM. > >>>> With existing pmu core code, in tick timer interrupt or IPI function > >>>> call interrupt pmu hw may be accessed by host when VM is running and > >>>> pmu is already granted to guest. KVM can not intercept host > >>>> IPI/timer interrupt, there is no pmu context switch, there will be > >>>> problem. > >>>> > >>>> Regards > >>>> Bibo Mao > >>>> > >> >
On 2024/4/23 下午12:23, Mingwei Zhang wrote: > On Mon, Apr 22, 2024 at 8:55 PM maobibo <maobibo@loongson.cn> wrote: >> >> >> >> On 2024/4/23 上午11:13, Mi, Dapeng wrote: >>> >>> On 4/23/2024 10:53 AM, maobibo wrote: >>>> >>>> >>>> On 2024/4/23 上午10:44, Mi, Dapeng wrote: >>>>> >>>>> On 4/23/2024 9:01 AM, maobibo wrote: >>>>>> >>>>>> >>>>>> On 2024/4/23 上午1:01, Sean Christopherson wrote: >>>>>>> On Mon, Apr 22, 2024, maobibo wrote: >>>>>>>> On 2024/4/16 上午6:45, Sean Christopherson wrote: >>>>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote: >>>>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson >>>>>>>>>> <seanjc@google.com> wrote: >>>>>>>>>>> One my biggest complaints with the current vPMU code is that >>>>>>>>>>> the roles and >>>>>>>>>>> responsibilities between KVM and perf are poorly defined, which >>>>>>>>>>> leads to suboptimal >>>>>>>>>>> and hard to maintain code. >>>>>>>>>>> >>>>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs >>>>>>>>>>> _would_ leak guest >>>>>>>>>>> state to userspace processes that have RDPMC permissions, as >>>>>>>>>>> the PMCs might not >>>>>>>>>>> be dirty from perf's perspective (see >>>>>>>>>>> perf_clear_dirty_counters()). >>>>>>>>>>> >>>>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in >>>>>>>>>>> doing so makes the >>>>>>>>>>> overall code brittle because it's not clear whether KVM _needs_ >>>>>>>>>>> to clear PMCs, >>>>>>>>>>> or if KVM is just being paranoid. >>>>>>>>>> >>>>>>>>>> So once this rolls out, perf and vPMU are clients directly to >>>>>>>>>> PMU HW. >>>>>>>>> >>>>>>>>> I don't think this is a statement we want to make, as it opens a >>>>>>>>> discussion >>>>>>>>> that we won't win. Nor do I think it's one we *need* to make. >>>>>>>>> KVM doesn't need >>>>>>>>> to be on equal footing with perf in terms of owning/managing PMU >>>>>>>>> hardware, KVM >>>>>>>>> just needs a few APIs to allow faithfully and accurately >>>>>>>>> virtualizing a guest PMU. 
>>>>>>>>> >>>>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline >>>>>>>>>> implementation, until both clients agree to a "deal" between them. >>>>>>>>>> Currently, there is no such deal, but I believe we could have >>>>>>>>>> one via >>>>>>>>>> future discussion. >>>>>>>>> >>>>>>>>> What I am saying is that there needs to be a "deal" in place >>>>>>>>> before this code >>>>>>>>> is merged. It doesn't need to be anything fancy, e.g. perf can >>>>>>>>> still pave over >>>>>>>>> PMCs it doesn't immediately load, as opposed to using >>>>>>>>> cpu_hw_events.dirty to lazily >>>>>>>>> do the clearing. But perf and KVM need to work together from the >>>>>>>>> get go, ie. I >>>>>>>>> don't want KVM doing something without regard to what perf does, >>>>>>>>> and vice versa. >>>>>>>>> >>>>>>>> There is similar issue on LoongArch vPMU where vm can directly pmu >>>>>>>> hardware >>>>>>>> and pmu hw is shard with guest and host. Besides context switch >>>>>>>> there are >>>>>>>> other places where perf core will access pmu hw, such as tick >>>>>>>> timer/hrtimer/ipi function call, and KVM can only intercept >>>>>>>> context switch. >>>>>>> >>>>>>> Two questions: >>>>>>> >>>>>>> 1) Can KVM prevent the guest from accessing the PMU? >>>>>>> >>>>>>> 2) If so, KVM can grant partial access to the PMU, or is it all >>>>>>> or nothing? >>>>>>> >>>>>>> If the answer to both questions is "yes", then it sounds like >>>>>>> LoongArch *requires* >>>>>>> mediated/passthrough support in order to virtualize its PMU. >>>>>> >>>>>> Hi Sean, >>>>>> >>>>>> Thank for your quick response. >>>>>> >>>>>> yes, kvm can prevent guest from accessing the PMU and grant partial >>>>>> or all to access to the PMU. Only that if one pmu event is granted >>>>>> to VM, host can not access this pmu event again. There must be pmu >>>>>> event switch if host want to. >>>>> >>>>> PMU event is a software entity which won't be shared. 
did you mean if >>>>> a PMU HW counter is granted to VM, then Host can't access the PMU HW >>>>> counter, right? >>>> yes, if PMU HW counter/control is granted to VM. The value comes from >>>> guest, and is not meaningful for host. Host pmu core does not know >>>> that it is granted to VM, host still think that it owns pmu. >>> >>> That's one issue this patchset tries to solve. Current new mediated x86 >>> vPMU framework doesn't allow Host or Guest own the PMU HW resource >>> simultaneously. Only when there is no !exclude_guest event on host, >>> guest is allowed to exclusively own the PMU HW resource. >>> >>> >>>> >>>> Just like FPU register, it is shared by VM and host during different >>>> time and it is lately switched. But if IPI or timer interrupt uses FPU >>>> register on host, there will be the same issue. >>> >>> I didn't fully get your point. When IPI or timer interrupt reach, a >>> VM-exit is triggered to make CPU traps into host first and then the host >> yes, it is. > > This is correct. And this is one of the points that we had debated > internally whether we should do PMU context switch at vcpu loop > boundary or VM Enter/exit boundary. (host-level) timer interrupt can > force VM Exit, which I think happens every 4ms or 1ms, depending on > configuration. > > One of the key reasons we currently propose this is because it is the > same boundary as the legacy PMU, i.e., it would be simple to propose > from the perf subsystem perspective. > > Performance wise, doing PMU context switch at vcpu boundary would be > way better in general. But the downside is that perf sub-system lose > the capability to profile majority of the KVM code (functions) when > guest PMU is enabled. > >> >>> interrupt handler is called. Or are you complaining the executing >>> sequence of switching guest PMU MSRs and these interrupt handler? 
>> In our vPMU implementation, it is ok if vPMU is switched in vm exit >> path, however there is problem if vPMU is switched during vcpu thread >> sched-out/sched-in path since IPI/timer irq interrupt access pmu >> register in host mode. > > Oh, the IPI/timer irq handler will access PMU registers? I thought > only the host-level NMI handler will access the PMU MSRs since PMI is > registered under NMI. > > In that case, you should disable IRQ during vcpu context switch. For > NMI, we prevent its handler from accessing the PMU registers. In > particular, we use a per-cpu variable to guard that. So, the > host-level PMI handler for perf sub-system will check the variable > before proceeding. perf core will access pmu hw in tick timer/hrtimer/ipi function calls; for example, function perf_event_task_tick() is called in the tick timer, and there are event_function_call(event, __perf_event_xxx, &value) calls in file kernel/events/core.c. https://lore.kernel.org/lkml/20240417065236.500011-1-gaosong@loongson.cn/T/#m15aeb79fdc9ce72dd5b374edd6acdcf7a9dafcf4 > >> >> In general it will be better if the switch is done in vcpu thread >> sched-out/sched-in, else there is requirement to profile kvm >> hypervisor.Even there is such requirement, it is only one option. In >> most conditions, it will better if time of VM context exit is small. >> > Performance wise, agree, but there will be debate on perf > functionality loss at the host level. > > Maybe, (just maybe), it is possible to do PMU context switch at vcpu > boundary normally, but doing it at VM Enter/Exit boundary when host is > profiling KVM kernel module. So, dynamically adjusting PMU context > switch location could be an option. > >>> >>> >>>> >>>> Regards >>>> Bibo Mao >>>>> >>>>> >>>>>> >>>>>>> >>>>>>>> Can we add callback handler in structure kvm_guest_cbs? 
just like >>>>>>>> this: >>>>>>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks >>>>>>>> kvm_guest_cbs >>>>>>>> = { >>>>>>>> .state = kvm_guest_state, >>>>>>>> .get_ip = kvm_guest_get_ip, >>>>>>>> .handle_intel_pt_intr = NULL, >>>>>>>> + .lose_pmu = kvm_guest_lose_pmu, >>>>>>>> }; >>>>>>>> >>>>>>>> By the way, I do not know should the callback handler be triggered >>>>>>>> in perf >>>>>>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is >>>>>>>> triggered in >>>>>>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0, >>>>>>>> but I think it will be better if it is done in perf core. >>>>>>> >>>>>>> I don't think we want to take the approach of perf and KVM guests >>>>>>> "fighting" over >>>>>>> the PMU. That's effectively what we have today, and it's a mess >>>>>>> for KVM because >>>>>>> it's impossible to provide consistent, deterministic behavior for >>>>>>> the guest. And >>>>>>> it's just as messy for perf, which ends up having wierd, cumbersome >>>>>>> flows that >>>>>>> exists purely to try to play nice with KVM. >>>>>> With existing pmu core code, in tick timer interrupt or IPI function >>>>>> call interrupt pmu hw may be accessed by host when VM is running and >>>>>> pmu is already granted to guest. KVM can not intercept host >>>>>> IPI/timer interrupt, there is no pmu context switch, there will be >>>>>> problem. >>>>>> >>>>>> Regards >>>>>> Bibo Mao >>>>>> >>>> >>
On 4/23/2024 2:08 PM, maobibo wrote: > > > On 2024/4/23 下午12:23, Mingwei Zhang wrote: >> On Mon, Apr 22, 2024 at 8:55 PM maobibo <maobibo@loongson.cn> wrote: >>> >>> >>> >>> On 2024/4/23 上午11:13, Mi, Dapeng wrote: >>>> >>>> On 4/23/2024 10:53 AM, maobibo wrote: >>>>> >>>>> >>>>> On 2024/4/23 上午10:44, Mi, Dapeng wrote: >>>>>> >>>>>> On 4/23/2024 9:01 AM, maobibo wrote: >>>>>>> >>>>>>> >>>>>>> On 2024/4/23 上午1:01, Sean Christopherson wrote: >>>>>>>> On Mon, Apr 22, 2024, maobibo wrote: >>>>>>>>> On 2024/4/16 上午6:45, Sean Christopherson wrote: >>>>>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote: >>>>>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson >>>>>>>>>>> <seanjc@google.com> wrote: >>>>>>>>>>>> One my biggest complaints with the current vPMU code is that >>>>>>>>>>>> the roles and >>>>>>>>>>>> responsibilities between KVM and perf are poorly defined, >>>>>>>>>>>> which >>>>>>>>>>>> leads to suboptimal >>>>>>>>>>>> and hard to maintain code. >>>>>>>>>>>> >>>>>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs >>>>>>>>>>>> _would_ leak guest >>>>>>>>>>>> state to userspace processes that have RDPMC permissions, as >>>>>>>>>>>> the PMCs might not >>>>>>>>>>>> be dirty from perf's perspective (see >>>>>>>>>>>> perf_clear_dirty_counters()). >>>>>>>>>>>> >>>>>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in >>>>>>>>>>>> doing so makes the >>>>>>>>>>>> overall code brittle because it's not clear whether KVM >>>>>>>>>>>> _needs_ >>>>>>>>>>>> to clear PMCs, >>>>>>>>>>>> or if KVM is just being paranoid. >>>>>>>>>>> >>>>>>>>>>> So once this rolls out, perf and vPMU are clients directly to >>>>>>>>>>> PMU HW. >>>>>>>>>> >>>>>>>>>> I don't think this is a statement we want to make, as it opens a >>>>>>>>>> discussion >>>>>>>>>> that we won't win. Nor do I think it's one we *need* to make. 
>>>>>>>>>> KVM doesn't need >>>>>>>>>> to be on equal footing with perf in terms of owning/managing PMU >>>>>>>>>> hardware, KVM >>>>>>>>>> just needs a few APIs to allow faithfully and accurately >>>>>>>>>> virtualizing a guest PMU. >>>>>>>>>> >>>>>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline >>>>>>>>>>> implementation, until both clients agree to a "deal" between >>>>>>>>>>> them. >>>>>>>>>>> Currently, there is no such deal, but I believe we could have >>>>>>>>>>> one via >>>>>>>>>>> future discussion. >>>>>>>>>> >>>>>>>>>> What I am saying is that there needs to be a "deal" in place >>>>>>>>>> before this code >>>>>>>>>> is merged. It doesn't need to be anything fancy, e.g. perf can >>>>>>>>>> still pave over >>>>>>>>>> PMCs it doesn't immediately load, as opposed to using >>>>>>>>>> cpu_hw_events.dirty to lazily >>>>>>>>>> do the clearing. But perf and KVM need to work together from >>>>>>>>>> the >>>>>>>>>> get go, ie. I >>>>>>>>>> don't want KVM doing something without regard to what perf does, >>>>>>>>>> and vice versa. >>>>>>>>>> >>>>>>>>> There is similar issue on LoongArch vPMU where vm can directly >>>>>>>>> pmu >>>>>>>>> hardware >>>>>>>>> and pmu hw is shard with guest and host. Besides context switch >>>>>>>>> there are >>>>>>>>> other places where perf core will access pmu hw, such as tick >>>>>>>>> timer/hrtimer/ipi function call, and KVM can only intercept >>>>>>>>> context switch. >>>>>>>> >>>>>>>> Two questions: >>>>>>>> >>>>>>>> 1) Can KVM prevent the guest from accessing the PMU? >>>>>>>> >>>>>>>> 2) If so, KVM can grant partial access to the PMU, or is it all >>>>>>>> or nothing? >>>>>>>> >>>>>>>> If the answer to both questions is "yes", then it sounds like >>>>>>>> LoongArch *requires* >>>>>>>> mediated/passthrough support in order to virtualize its PMU. >>>>>>> >>>>>>> Hi Sean, >>>>>>> >>>>>>> Thank for your quick response. 
>>>>>>> >>>>>>> yes, kvm can prevent guest from accessing the PMU and grant partial >>>>>>> or all to access to the PMU. Only that if one pmu event is granted >>>>>>> to VM, host can not access this pmu event again. There must be pmu >>>>>>> event switch if host want to. >>>>>> >>>>>> PMU event is a software entity which won't be shared. did you >>>>>> mean if >>>>>> a PMU HW counter is granted to VM, then Host can't access the PMU HW >>>>>> counter, right? >>>>> yes, if PMU HW counter/control is granted to VM. The value comes from >>>>> guest, and is not meaningful for host. Host pmu core does not know >>>>> that it is granted to VM, host still think that it owns pmu. >>>> >>>> That's one issue this patchset tries to solve. Current new mediated >>>> x86 >>>> vPMU framework doesn't allow Host or Guest own the PMU HW resource >>>> simultaneously. Only when there is no !exclude_guest event on host, >>>> guest is allowed to exclusively own the PMU HW resource. >>>> >>>> >>>>> >>>>> Just like FPU register, it is shared by VM and host during different >>>>> time and it is lately switched. But if IPI or timer interrupt uses >>>>> FPU >>>>> register on host, there will be the same issue. >>>> >>>> I didn't fully get your point. When IPI or timer interrupt reach, a >>>> VM-exit is triggered to make CPU traps into host first and then the >>>> host >>> yes, it is. >> >> This is correct. And this is one of the points that we had debated >> internally whether we should do PMU context switch at vcpu loop >> boundary or VM Enter/exit boundary. (host-level) timer interrupt can >> force VM Exit, which I think happens every 4ms or 1ms, depending on >> configuration. >> >> One of the key reasons we currently propose this is because it is the >> same boundary as the legacy PMU, i.e., it would be simple to propose >> from the perf subsystem perspective. >> >> Performance wise, doing PMU context switch at vcpu boundary would be >> way better in general. 
But the downside is that perf sub-system lose >> the capability to profile majority of the KVM code (functions) when >> guest PMU is enabled. >> >>> >>>> interrupt handler is called. Or are you complaining the executing >>>> sequence of switching guest PMU MSRs and these interrupt handler? >>> In our vPMU implementation, it is ok if vPMU is switched in vm exit >>> path, however there is problem if vPMU is switched during vcpu thread >>> sched-out/sched-in path since IPI/timer irq interrupt access pmu >>> register in host mode. >> >> Oh, the IPI/timer irq handler will access PMU registers? I thought >> only the host-level NMI handler will access the PMU MSRs since PMI is >> registered under NMI. >> >> In that case, you should disable IRQ during vcpu context switch. For >> NMI, we prevent its handler from accessing the PMU registers. In >> particular, we use a per-cpu variable to guard that. So, the >> host-level PMI handler for perf sub-system will check the variable >> before proceeding. > > perf core will access pmu hw in tick timer/hrtimer/ipi function call, > such as function perf_event_task_tick() is called in tick timer, there > are event_function_call(event, __perf_event_xxx, &value) in file > kernel/events/core.c. > > https://lore.kernel.org/lkml/20240417065236.500011-1-gaosong@loongson.cn/T/#m15aeb79fdc9ce72dd5b374edd6acdcf7a9dafcf4 > I just went through the functions (not sure if all); whether perf_event_task_tick() or the callbacks of event_function_call(), they would check the event->state first, and if the event is in PERF_EVENT_STATE_INACTIVE, the PMU HW MSRs would not really be touched. In this new proposal, all host events with the exclude_guest attribute would be put into the PERF_EVENT_STATE_INACTIVE state if the guest owns the PMU HW resource. So I think it's fine. > > >> >>> >>> In general it will be better if the switch is done in vcpu thread >>> sched-out/sched-in, else there is requirement to profile kvm >>> hypervisor.Even there is such requirement, it is only one option. 
In >>> most conditions, it will better if time of VM context exit is small. >>> >> Performance wise, agree, but there will be debate on perf >> functionality loss at the host level. >> >> Maybe, (just maybe), it is possible to do PMU context switch at vcpu >> boundary normally, but doing it at VM Enter/Exit boundary when host is >> profiling KVM kernel module. So, dynamically adjusting PMU context >> switch location could be an option. >> >>>> >>>> >>>>> >>>>> Regards >>>>> Bibo Mao >>>>>> >>>>>> >>>>>>> >>>>>>>> >>>>>>>>> Can we add callback handler in structure kvm_guest_cbs? just >>>>>>>>> like >>>>>>>>> this: >>>>>>>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks >>>>>>>>> kvm_guest_cbs >>>>>>>>> = { >>>>>>>>> .state = kvm_guest_state, >>>>>>>>> .get_ip = kvm_guest_get_ip, >>>>>>>>> .handle_intel_pt_intr = NULL, >>>>>>>>> + .lose_pmu = kvm_guest_lose_pmu, >>>>>>>>> }; >>>>>>>>> >>>>>>>>> By the way, I do not know should the callback handler be >>>>>>>>> triggered >>>>>>>>> in perf >>>>>>>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is >>>>>>>>> triggered in >>>>>>>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0, >>>>>>>>> but I think it will be better if it is done in perf core. >>>>>>>> >>>>>>>> I don't think we want to take the approach of perf and KVM guests >>>>>>>> "fighting" over >>>>>>>> the PMU. That's effectively what we have today, and it's a mess >>>>>>>> for KVM because >>>>>>>> it's impossible to provide consistent, deterministic behavior for >>>>>>>> the guest. And >>>>>>>> it's just as messy for perf, which ends up having wierd, >>>>>>>> cumbersome >>>>>>>> flows that >>>>>>>> exists purely to try to play nice with KVM. >>>>>>> With existing pmu core code, in tick timer interrupt or IPI >>>>>>> function >>>>>>> call interrupt pmu hw may be accessed by host when VM is running >>>>>>> and >>>>>>> pmu is already granted to guest. 
KVM can not intercept host >>>>>>> IPI/timer interrupt, there is no pmu context switch, there will be >>>>>>> problem. >>>>>>> >>>>>>> Regards >>>>>>> Bibo Mao >>>>>>> >>>>> >>> > >
On Mon, Apr 22, 2024 at 11:45 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > > > On 4/23/2024 2:08 PM, maobibo wrote: > > > > > > On 2024/4/23 下午12:23, Mingwei Zhang wrote: > >> On Mon, Apr 22, 2024 at 8:55 PM maobibo <maobibo@loongson.cn> wrote: > >>> > >>> > >>> > >>> On 2024/4/23 上午11:13, Mi, Dapeng wrote: > >>>> > >>>> On 4/23/2024 10:53 AM, maobibo wrote: > >>>>> > >>>>> > >>>>> On 2024/4/23 上午10:44, Mi, Dapeng wrote: > >>>>>> > >>>>>> On 4/23/2024 9:01 AM, maobibo wrote: > >>>>>>> > >>>>>>> > >>>>>>> On 2024/4/23 上午1:01, Sean Christopherson wrote: > >>>>>>>> On Mon, Apr 22, 2024, maobibo wrote: > >>>>>>>>> On 2024/4/16 上午6:45, Sean Christopherson wrote: > >>>>>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote: > >>>>>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson > >>>>>>>>>>> <seanjc@google.com> wrote: > >>>>>>>>>>>> One my biggest complaints with the current vPMU code is that > >>>>>>>>>>>> the roles and > >>>>>>>>>>>> responsibilities between KVM and perf are poorly defined, > >>>>>>>>>>>> which > >>>>>>>>>>>> leads to suboptimal > >>>>>>>>>>>> and hard to maintain code. > >>>>>>>>>>>> > >>>>>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs > >>>>>>>>>>>> _would_ leak guest > >>>>>>>>>>>> state to userspace processes that have RDPMC permissions, as > >>>>>>>>>>>> the PMCs might not > >>>>>>>>>>>> be dirty from perf's perspective (see > >>>>>>>>>>>> perf_clear_dirty_counters()). > >>>>>>>>>>>> > >>>>>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in > >>>>>>>>>>>> doing so makes the > >>>>>>>>>>>> overall code brittle because it's not clear whether KVM > >>>>>>>>>>>> _needs_ > >>>>>>>>>>>> to clear PMCs, > >>>>>>>>>>>> or if KVM is just being paranoid. > >>>>>>>>>>> > >>>>>>>>>>> So once this rolls out, perf and vPMU are clients directly to > >>>>>>>>>>> PMU HW. > >>>>>>>>>> > >>>>>>>>>> I don't think this is a statement we want to make, as it opens a > >>>>>>>>>> discussion > >>>>>>>>>> that we won't win. 
Nor do I think it's one we *need* to make. > >>>>>>>>>> KVM doesn't need > >>>>>>>>>> to be on equal footing with perf in terms of owning/managing PMU > >>>>>>>>>> hardware, KVM > >>>>>>>>>> just needs a few APIs to allow faithfully and accurately > >>>>>>>>>> virtualizing a guest PMU. > >>>>>>>>>> > >>>>>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline > >>>>>>>>>>> implementation, until both clients agree to a "deal" between > >>>>>>>>>>> them. > >>>>>>>>>>> Currently, there is no such deal, but I believe we could have > >>>>>>>>>>> one via > >>>>>>>>>>> future discussion. > >>>>>>>>>> > >>>>>>>>>> What I am saying is that there needs to be a "deal" in place > >>>>>>>>>> before this code > >>>>>>>>>> is merged. It doesn't need to be anything fancy, e.g. perf can > >>>>>>>>>> still pave over > >>>>>>>>>> PMCs it doesn't immediately load, as opposed to using > >>>>>>>>>> cpu_hw_events.dirty to lazily > >>>>>>>>>> do the clearing. But perf and KVM need to work together from > >>>>>>>>>> the > >>>>>>>>>> get go, ie. I > >>>>>>>>>> don't want KVM doing something without regard to what perf does, > >>>>>>>>>> and vice versa. > >>>>>>>>>> > >>>>>>>>> There is similar issue on LoongArch vPMU where vm can directly > >>>>>>>>> pmu > >>>>>>>>> hardware > >>>>>>>>> and pmu hw is shard with guest and host. Besides context switch > >>>>>>>>> there are > >>>>>>>>> other places where perf core will access pmu hw, such as tick > >>>>>>>>> timer/hrtimer/ipi function call, and KVM can only intercept > >>>>>>>>> context switch. > >>>>>>>> > >>>>>>>> Two questions: > >>>>>>>> > >>>>>>>> 1) Can KVM prevent the guest from accessing the PMU? > >>>>>>>> > >>>>>>>> 2) If so, KVM can grant partial access to the PMU, or is it all > >>>>>>>> or nothing? > >>>>>>>> > >>>>>>>> If the answer to both questions is "yes", then it sounds like > >>>>>>>> LoongArch *requires* > >>>>>>>> mediated/passthrough support in order to virtualize its PMU. 
> >>>>>>> > >>>>>>> Hi Sean, > >>>>>>> > >>>>>>> Thank for your quick response. > >>>>>>> > >>>>>>> yes, kvm can prevent guest from accessing the PMU and grant partial > >>>>>>> or all to access to the PMU. Only that if one pmu event is granted > >>>>>>> to VM, host can not access this pmu event again. There must be pmu > >>>>>>> event switch if host want to. > >>>>>> > >>>>>> PMU event is a software entity which won't be shared. did you > >>>>>> mean if > >>>>>> a PMU HW counter is granted to VM, then Host can't access the PMU HW > >>>>>> counter, right? > >>>>> yes, if PMU HW counter/control is granted to VM. The value comes from > >>>>> guest, and is not meaningful for host. Host pmu core does not know > >>>>> that it is granted to VM, host still think that it owns pmu. > >>>> > >>>> That's one issue this patchset tries to solve. Current new mediated > >>>> x86 > >>>> vPMU framework doesn't allow Host or Guest own the PMU HW resource > >>>> simultaneously. Only when there is no !exclude_guest event on host, > >>>> guest is allowed to exclusively own the PMU HW resource. > >>>> > >>>> > >>>>> > >>>>> Just like FPU register, it is shared by VM and host during different > >>>>> time and it is lately switched. But if IPI or timer interrupt uses > >>>>> FPU > >>>>> register on host, there will be the same issue. > >>>> > >>>> I didn't fully get your point. When IPI or timer interrupt reach, a > >>>> VM-exit is triggered to make CPU traps into host first and then the > >>>> host > >>> yes, it is. > >> > >> This is correct. And this is one of the points that we had debated > >> internally whether we should do PMU context switch at vcpu loop > >> boundary or VM Enter/exit boundary. (host-level) timer interrupt can > >> force VM Exit, which I think happens every 4ms or 1ms, depending on > >> configuration. 
> >> > >> One of the key reasons we currently propose this is because it is the > >> same boundary as the legacy PMU, i.e., it would be simple to propose > >> from the perf subsystem perspective. > >> > >> Performance wise, doing PMU context switch at vcpu boundary would be > >> way better in general. But the downside is that perf sub-system lose > >> the capability to profile majority of the KVM code (functions) when > >> guest PMU is enabled. > >> > >>> > >>>> interrupt handler is called. Or are you complaining the executing > >>>> sequence of switching guest PMU MSRs and these interrupt handler? > >>> In our vPMU implementation, it is ok if vPMU is switched in vm exit > >>> path, however there is problem if vPMU is switched during vcpu thread > >>> sched-out/sched-in path since IPI/timer irq interrupt access pmu > >>> register in host mode. > >> > >> Oh, the IPI/timer irq handler will access PMU registers? I thought > >> only the host-level NMI handler will access the PMU MSRs since PMI is > >> registered under NMI. > >> > >> In that case, you should disable IRQ during vcpu context switch. For > >> NMI, we prevent its handler from accessing the PMU registers. In > >> particular, we use a per-cpu variable to guard that. So, the > >> host-level PMI handler for perf sub-system will check the variable > >> before proceeding. > > > > perf core will access pmu hw in tick timer/hrtimer/ipi function call, > > such as function perf_event_task_tick() is called in tick timer, there > > are event_function_call(event, __perf_event_xxx, &value) in file > > kernel/events/core.c. > > > > https://lore.kernel.org/lkml/20240417065236.500011-1-gaosong@loongson.cn/T/#m15aeb79fdc9ce72dd5b374edd6acdcf7a9dafcf4 > > > > Just go through functions (not sure if all), whether > perf_event_task_tick() or the callbacks of event_function_call() would > check the event->state first, if the event is in > PERF_EVENT_STATE_INACTIVE, the PMU HW MSRs would not be touched really. 
> In this new proposal, all host events with exclude_guest attribute would > be put on PERF_EVENT_STATE_INACTIVE state if guest owns the PMU HW > resource. So I think it's fine. > Is there any event in the host still having PERF_EVENT_STATE_ACTIVE? If so, hmm, it will reach perf_pmu_disable(event->pmu), which will access the global ctrl MSR.
On 4/23/2024 3:10 PM, Mingwei Zhang wrote: > On Mon, Apr 22, 2024 at 11:45 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: >> >> On 4/23/2024 2:08 PM, maobibo wrote: >>> >>> On 2024/4/23 下午12:23, Mingwei Zhang wrote: >>>> On Mon, Apr 22, 2024 at 8:55 PM maobibo <maobibo@loongson.cn> wrote: >>>>> >>>>> >>>>> On 2024/4/23 上午11:13, Mi, Dapeng wrote: >>>>>> On 4/23/2024 10:53 AM, maobibo wrote: >>>>>>> >>>>>>> On 2024/4/23 上午10:44, Mi, Dapeng wrote: >>>>>>>> On 4/23/2024 9:01 AM, maobibo wrote: >>>>>>>>> >>>>>>>>> On 2024/4/23 上午1:01, Sean Christopherson wrote: >>>>>>>>>> On Mon, Apr 22, 2024, maobibo wrote: >>>>>>>>>>> On 2024/4/16 上午6:45, Sean Christopherson wrote: >>>>>>>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote: >>>>>>>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson >>>>>>>>>>>>> <seanjc@google.com> wrote: >>>>>>>>>>>>>> One my biggest complaints with the current vPMU code is that >>>>>>>>>>>>>> the roles and >>>>>>>>>>>>>> responsibilities between KVM and perf are poorly defined, >>>>>>>>>>>>>> which >>>>>>>>>>>>>> leads to suboptimal >>>>>>>>>>>>>> and hard to maintain code. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs >>>>>>>>>>>>>> _would_ leak guest >>>>>>>>>>>>>> state to userspace processes that have RDPMC permissions, as >>>>>>>>>>>>>> the PMCs might not >>>>>>>>>>>>>> be dirty from perf's perspective (see >>>>>>>>>>>>>> perf_clear_dirty_counters()). >>>>>>>>>>>>>> >>>>>>>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in >>>>>>>>>>>>>> doing so makes the >>>>>>>>>>>>>> overall code brittle because it's not clear whether KVM >>>>>>>>>>>>>> _needs_ >>>>>>>>>>>>>> to clear PMCs, >>>>>>>>>>>>>> or if KVM is just being paranoid. >>>>>>>>>>>>> So once this rolls out, perf and vPMU are clients directly to >>>>>>>>>>>>> PMU HW. >>>>>>>>>>>> I don't think this is a statement we want to make, as it opens a >>>>>>>>>>>> discussion >>>>>>>>>>>> that we won't win. 
Nor do I think it's one we *need* to make. >>>>>>>>>>>> KVM doesn't need >>>>>>>>>>>> to be on equal footing with perf in terms of owning/managing PMU >>>>>>>>>>>> hardware, KVM >>>>>>>>>>>> just needs a few APIs to allow faithfully and accurately >>>>>>>>>>>> virtualizing a guest PMU. >>>>>>>>>>>> >>>>>>>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline >>>>>>>>>>>>> implementation, until both clients agree to a "deal" between >>>>>>>>>>>>> them. >>>>>>>>>>>>> Currently, there is no such deal, but I believe we could have >>>>>>>>>>>>> one via >>>>>>>>>>>>> future discussion. >>>>>>>>>>>> What I am saying is that there needs to be a "deal" in place >>>>>>>>>>>> before this code >>>>>>>>>>>> is merged. It doesn't need to be anything fancy, e.g. perf can >>>>>>>>>>>> still pave over >>>>>>>>>>>> PMCs it doesn't immediately load, as opposed to using >>>>>>>>>>>> cpu_hw_events.dirty to lazily >>>>>>>>>>>> do the clearing. But perf and KVM need to work together from >>>>>>>>>>>> the >>>>>>>>>>>> get go, ie. I >>>>>>>>>>>> don't want KVM doing something without regard to what perf does, >>>>>>>>>>>> and vice versa. >>>>>>>>>>>> >>>>>>>>>>> There is similar issue on LoongArch vPMU where vm can directly >>>>>>>>>>> pmu >>>>>>>>>>> hardware >>>>>>>>>>> and pmu hw is shard with guest and host. Besides context switch >>>>>>>>>>> there are >>>>>>>>>>> other places where perf core will access pmu hw, such as tick >>>>>>>>>>> timer/hrtimer/ipi function call, and KVM can only intercept >>>>>>>>>>> context switch. >>>>>>>>>> Two questions: >>>>>>>>>> >>>>>>>>>> 1) Can KVM prevent the guest from accessing the PMU? >>>>>>>>>> >>>>>>>>>> 2) If so, KVM can grant partial access to the PMU, or is it all >>>>>>>>>> or nothing? >>>>>>>>>> >>>>>>>>>> If the answer to both questions is "yes", then it sounds like >>>>>>>>>> LoongArch *requires* >>>>>>>>>> mediated/passthrough support in order to virtualize its PMU. 
>>>>>>>>> Hi Sean, >>>>>>>>> >>>>>>>>> Thank for your quick response. >>>>>>>>> >>>>>>>>> yes, kvm can prevent guest from accessing the PMU and grant partial >>>>>>>>> or all to access to the PMU. Only that if one pmu event is granted >>>>>>>>> to VM, host can not access this pmu event again. There must be pmu >>>>>>>>> event switch if host want to. >>>>>>>> PMU event is a software entity which won't be shared. did you >>>>>>>> mean if >>>>>>>> a PMU HW counter is granted to VM, then Host can't access the PMU HW >>>>>>>> counter, right? >>>>>>> yes, if PMU HW counter/control is granted to VM. The value comes from >>>>>>> guest, and is not meaningful for host. Host pmu core does not know >>>>>>> that it is granted to VM, host still think that it owns pmu. >>>>>> That's one issue this patchset tries to solve. Current new mediated >>>>>> x86 >>>>>> vPMU framework doesn't allow Host or Guest own the PMU HW resource >>>>>> simultaneously. Only when there is no !exclude_guest event on host, >>>>>> guest is allowed to exclusively own the PMU HW resource. >>>>>> >>>>>> >>>>>>> Just like FPU register, it is shared by VM and host during different >>>>>>> time and it is lately switched. But if IPI or timer interrupt uses >>>>>>> FPU >>>>>>> register on host, there will be the same issue. >>>>>> I didn't fully get your point. When IPI or timer interrupt reach, a >>>>>> VM-exit is triggered to make CPU traps into host first and then the >>>>>> host >>>>> yes, it is. >>>> This is correct. And this is one of the points that we had debated >>>> internally whether we should do PMU context switch at vcpu loop >>>> boundary or VM Enter/exit boundary. (host-level) timer interrupt can >>>> force VM Exit, which I think happens every 4ms or 1ms, depending on >>>> configuration. >>>> >>>> One of the key reasons we currently propose this is because it is the >>>> same boundary as the legacy PMU, i.e., it would be simple to propose >>>> from the perf subsystem perspective. 
>>>> >>>> Performance wise, doing PMU context switch at vcpu boundary would be >>>> way better in general. But the downside is that perf sub-system lose >>>> the capability to profile majority of the KVM code (functions) when >>>> guest PMU is enabled. >>>> >>>>>> interrupt handler is called. Or are you complaining the executing >>>>>> sequence of switching guest PMU MSRs and these interrupt handler? >>>>> In our vPMU implementation, it is ok if vPMU is switched in vm exit >>>>> path, however there is problem if vPMU is switched during vcpu thread >>>>> sched-out/sched-in path since IPI/timer irq interrupt access pmu >>>>> register in host mode. >>>> Oh, the IPI/timer irq handler will access PMU registers? I thought >>>> only the host-level NMI handler will access the PMU MSRs since PMI is >>>> registered under NMI. >>>> >>>> In that case, you should disable IRQ during vcpu context switch. For >>>> NMI, we prevent its handler from accessing the PMU registers. In >>>> particular, we use a per-cpu variable to guard that. So, the >>>> host-level PMI handler for perf sub-system will check the variable >>>> before proceeding. >>> perf core will access pmu hw in tick timer/hrtimer/ipi function call, >>> such as function perf_event_task_tick() is called in tick timer, there >>> are event_function_call(event, __perf_event_xxx, &value) in file >>> kernel/events/core.c. >>> >>> https://lore.kernel.org/lkml/20240417065236.500011-1-gaosong@loongson.cn/T/#m15aeb79fdc9ce72dd5b374edd6acdcf7a9dafcf4 >>> >> Just go through functions (not sure if all), whether >> perf_event_task_tick() or the callbacks of event_function_call() would >> check the event->state first, if the event is in >> PERF_EVENT_STATE_INACTIVE, the PMU HW MSRs would not be touched really. >> In this new proposal, all host events with exclude_guest attribute would >> be put on PERF_EVENT_STATE_INACTIVE sate if guest own the PMU HW >> resource. So I think it's fine. 
>> > Is there any event in the host still having PERF_EVENT_STATE_ACTIVE? > If so, hmm, it will reach perf_pmu_disable(event->pmu), which will > access the global ctrl MSR. I don't think there is any event with PERF_EVENT_STATE_ACTIVE state on host when guest owns the PMU HW resource. In current solution, VM would fail to create if there is any system-wide event without exclude_guest attribute. If VM is created successfully and when vm-entry happens, the helper perf_guest_enter() would put all host events with exclude_guest attribute into PERF_EVENT_STATE_INACTIVE state and block host to create system-wide events without exclude_guest attribute.
On 2024/4/23 下午4:24, Mi, Dapeng wrote: > > On 4/23/2024 3:10 PM, Mingwei Zhang wrote: >> On Mon, Apr 22, 2024 at 11:45 PM Mi, Dapeng >> <dapeng1.mi@linux.intel.com> wrote: >>> >>> On 4/23/2024 2:08 PM, maobibo wrote: >>>> >>>> On 2024/4/23 下午12:23, Mingwei Zhang wrote: >>>>> On Mon, Apr 22, 2024 at 8:55 PM maobibo <maobibo@loongson.cn> wrote: >>>>>> >>>>>> >>>>>> On 2024/4/23 上午11:13, Mi, Dapeng wrote: >>>>>>> On 4/23/2024 10:53 AM, maobibo wrote: >>>>>>>> >>>>>>>> On 2024/4/23 上午10:44, Mi, Dapeng wrote: >>>>>>>>> On 4/23/2024 9:01 AM, maobibo wrote: >>>>>>>>>> >>>>>>>>>> On 2024/4/23 上午1:01, Sean Christopherson wrote: >>>>>>>>>>> On Mon, Apr 22, 2024, maobibo wrote: >>>>>>>>>>>> On 2024/4/16 上午6:45, Sean Christopherson wrote: >>>>>>>>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote: >>>>>>>>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson >>>>>>>>>>>>>> <seanjc@google.com> wrote: >>>>>>>>>>>>>>> One my biggest complaints with the current vPMU code is that >>>>>>>>>>>>>>> the roles and >>>>>>>>>>>>>>> responsibilities between KVM and perf are poorly defined, >>>>>>>>>>>>>>> which >>>>>>>>>>>>>>> leads to suboptimal >>>>>>>>>>>>>>> and hard to maintain code. >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs >>>>>>>>>>>>>>> _would_ leak guest >>>>>>>>>>>>>>> state to userspace processes that have RDPMC permissions, as >>>>>>>>>>>>>>> the PMCs might not >>>>>>>>>>>>>>> be dirty from perf's perspective (see >>>>>>>>>>>>>>> perf_clear_dirty_counters()). >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in >>>>>>>>>>>>>>> doing so makes the >>>>>>>>>>>>>>> overall code brittle because it's not clear whether KVM >>>>>>>>>>>>>>> _needs_ >>>>>>>>>>>>>>> to clear PMCs, >>>>>>>>>>>>>>> or if KVM is just being paranoid. >>>>>>>>>>>>>> So once this rolls out, perf and vPMU are clients directly to >>>>>>>>>>>>>> PMU HW. 
>>>>>>>>>>>>> I don't think this is a statement we want to make, as it >>>>>>>>>>>>> opens a >>>>>>>>>>>>> discussion >>>>>>>>>>>>> that we won't win. Nor do I think it's one we *need* to make. >>>>>>>>>>>>> KVM doesn't need >>>>>>>>>>>>> to be on equal footing with perf in terms of >>>>>>>>>>>>> owning/managing PMU >>>>>>>>>>>>> hardware, KVM >>>>>>>>>>>>> just needs a few APIs to allow faithfully and accurately >>>>>>>>>>>>> virtualizing a guest PMU. >>>>>>>>>>>>> >>>>>>>>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline >>>>>>>>>>>>>> implementation, until both clients agree to a "deal" between >>>>>>>>>>>>>> them. >>>>>>>>>>>>>> Currently, there is no such deal, but I believe we could have >>>>>>>>>>>>>> one via >>>>>>>>>>>>>> future discussion. >>>>>>>>>>>>> What I am saying is that there needs to be a "deal" in place >>>>>>>>>>>>> before this code >>>>>>>>>>>>> is merged. It doesn't need to be anything fancy, e.g. perf >>>>>>>>>>>>> can >>>>>>>>>>>>> still pave over >>>>>>>>>>>>> PMCs it doesn't immediately load, as opposed to using >>>>>>>>>>>>> cpu_hw_events.dirty to lazily >>>>>>>>>>>>> do the clearing. But perf and KVM need to work together from >>>>>>>>>>>>> the >>>>>>>>>>>>> get go, ie. I >>>>>>>>>>>>> don't want KVM doing something without regard to what perf >>>>>>>>>>>>> does, >>>>>>>>>>>>> and vice versa. >>>>>>>>>>>>> >>>>>>>>>>>> There is similar issue on LoongArch vPMU where vm can directly >>>>>>>>>>>> pmu >>>>>>>>>>>> hardware >>>>>>>>>>>> and pmu hw is shard with guest and host. Besides context switch >>>>>>>>>>>> there are >>>>>>>>>>>> other places where perf core will access pmu hw, such as tick >>>>>>>>>>>> timer/hrtimer/ipi function call, and KVM can only intercept >>>>>>>>>>>> context switch. >>>>>>>>>>> Two questions: >>>>>>>>>>> >>>>>>>>>>> 1) Can KVM prevent the guest from accessing the PMU? >>>>>>>>>>> >>>>>>>>>>> 2) If so, KVM can grant partial access to the PMU, or is >>>>>>>>>>> it all >>>>>>>>>>> or nothing? 
>>>>>>>>>>> >>>>>>>>>>> If the answer to both questions is "yes", then it sounds like >>>>>>>>>>> LoongArch *requires* >>>>>>>>>>> mediated/passthrough support in order to virtualize its PMU. >>>>>>>>>> Hi Sean, >>>>>>>>>> >>>>>>>>>> Thank for your quick response. >>>>>>>>>> >>>>>>>>>> yes, kvm can prevent guest from accessing the PMU and grant >>>>>>>>>> partial >>>>>>>>>> or all to access to the PMU. Only that if one pmu event is >>>>>>>>>> granted >>>>>>>>>> to VM, host can not access this pmu event again. There must be >>>>>>>>>> pmu >>>>>>>>>> event switch if host want to. >>>>>>>>> PMU event is a software entity which won't be shared. did you >>>>>>>>> mean if >>>>>>>>> a PMU HW counter is granted to VM, then Host can't access the >>>>>>>>> PMU HW >>>>>>>>> counter, right? >>>>>>>> yes, if PMU HW counter/control is granted to VM. The value comes >>>>>>>> from >>>>>>>> guest, and is not meaningful for host. Host pmu core does not know >>>>>>>> that it is granted to VM, host still think that it owns pmu. >>>>>>> That's one issue this patchset tries to solve. Current new mediated >>>>>>> x86 >>>>>>> vPMU framework doesn't allow Host or Guest own the PMU HW resource >>>>>>> simultaneously. Only when there is no !exclude_guest event on host, >>>>>>> guest is allowed to exclusively own the PMU HW resource. >>>>>>> >>>>>>> >>>>>>>> Just like FPU register, it is shared by VM and host during >>>>>>>> different >>>>>>>> time and it is lately switched. But if IPI or timer interrupt uses >>>>>>>> FPU >>>>>>>> register on host, there will be the same issue. >>>>>>> I didn't fully get your point. When IPI or timer interrupt reach, a >>>>>>> VM-exit is triggered to make CPU traps into host first and then the >>>>>>> host >>>>>> yes, it is. >>>>> This is correct. And this is one of the points that we had debated >>>>> internally whether we should do PMU context switch at vcpu loop >>>>> boundary or VM Enter/exit boundary. 
(host-level) timer interrupt can >>>>> force VM Exit, which I think happens every 4ms or 1ms, depending on >>>>> configuration. >>>>> >>>>> One of the key reasons we currently propose this is because it is the >>>>> same boundary as the legacy PMU, i.e., it would be simple to propose >>>>> from the perf subsystem perspective. >>>>> >>>>> Performance wise, doing PMU context switch at vcpu boundary would be >>>>> way better in general. But the downside is that perf sub-system lose >>>>> the capability to profile majority of the KVM code (functions) when >>>>> guest PMU is enabled. >>>>> >>>>>>> interrupt handler is called. Or are you complaining the executing >>>>>>> sequence of switching guest PMU MSRs and these interrupt handler? >>>>>> In our vPMU implementation, it is ok if vPMU is switched in vm exit >>>>>> path, however there is problem if vPMU is switched during vcpu thread >>>>>> sched-out/sched-in path since IPI/timer irq interrupt access pmu >>>>>> register in host mode. >>>>> Oh, the IPI/timer irq handler will access PMU registers? I thought >>>>> only the host-level NMI handler will access the PMU MSRs since PMI is >>>>> registered under NMI. >>>>> >>>>> In that case, you should disable IRQ during vcpu context switch. For >>>>> NMI, we prevent its handler from accessing the PMU registers. In >>>>> particular, we use a per-cpu variable to guard that. So, the >>>>> host-level PMI handler for perf sub-system will check the variable >>>>> before proceeding. >>>> perf core will access pmu hw in tick timer/hrtimer/ipi function call, >>>> such as function perf_event_task_tick() is called in tick timer, there >>>> are event_function_call(event, __perf_event_xxx, &value) in file >>>> kernel/events/core.c. 
>>>> >>>> https://lore.kernel.org/lkml/20240417065236.500011-1-gaosong@loongson.cn/T/#m15aeb79fdc9ce72dd5b374edd6acdcf7a9dafcf4 >>>> >>>> >>> Just go through functions (not sure if all), whether >>> perf_event_task_tick() or the callbacks of event_function_call() would >>> check the event->state first, if the event is in >>> PERF_EVENT_STATE_INACTIVE, the PMU HW MSRs would not be touched really. >>> In this new proposal, all host events with exclude_guest attribute would >>> be put on PERF_EVENT_STATE_INACTIVE sate if guest own the PMU HW >>> resource. So I think it's fine. >>> >> Is there any event in the host still having PERF_EVENT_STATE_ACTIVE? >> If so, hmm, it will reach perf_pmu_disable(event->pmu), which will >> access the global ctrl MSR. > > I don't think there is any event with PERF_EVENT_STATE_ACTIVE state on > host when guest owns the PMU HW resource. > > In current solution, VM would fail to create if there is any system-wide > event without exclude_guest attribute. If VM is created successfully and > when vm-entry happens, the helper perf_guest_enter() would put all host > events with exclude_guest attribute into PERF_EVENT_STATE_INACTIVE state > and block host to create system-wide events without exclude_guest > attribute. I do not know perf subsytem, Can the perf event state kept unchanged? After VM enters, hw perf counter is allocated to VM. HW counter function for host should be stopped already. It seems that host perf core needs not perceive VM enter/exit.
On 2024/4/23 下午12:23, Mingwei Zhang wrote: > On Mon, Apr 22, 2024 at 8:55 PM maobibo <maobibo@loongson.cn> wrote: >> >> >> >> On 2024/4/23 上午11:13, Mi, Dapeng wrote: >>> >>> On 4/23/2024 10:53 AM, maobibo wrote: >>>> >>>> >>>> On 2024/4/23 上午10:44, Mi, Dapeng wrote: >>>>> >>>>> On 4/23/2024 9:01 AM, maobibo wrote: >>>>>> >>>>>> >>>>>> On 2024/4/23 上午1:01, Sean Christopherson wrote: >>>>>>> On Mon, Apr 22, 2024, maobibo wrote: >>>>>>>> On 2024/4/16 上午6:45, Sean Christopherson wrote: >>>>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote: >>>>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson >>>>>>>>>> <seanjc@google.com> wrote: >>>>>>>>>>> One my biggest complaints with the current vPMU code is that >>>>>>>>>>> the roles and >>>>>>>>>>> responsibilities between KVM and perf are poorly defined, which >>>>>>>>>>> leads to suboptimal >>>>>>>>>>> and hard to maintain code. >>>>>>>>>>> >>>>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs >>>>>>>>>>> _would_ leak guest >>>>>>>>>>> state to userspace processes that have RDPMC permissions, as >>>>>>>>>>> the PMCs might not >>>>>>>>>>> be dirty from perf's perspective (see >>>>>>>>>>> perf_clear_dirty_counters()). >>>>>>>>>>> >>>>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in >>>>>>>>>>> doing so makes the >>>>>>>>>>> overall code brittle because it's not clear whether KVM _needs_ >>>>>>>>>>> to clear PMCs, >>>>>>>>>>> or if KVM is just being paranoid. >>>>>>>>>> >>>>>>>>>> So once this rolls out, perf and vPMU are clients directly to >>>>>>>>>> PMU HW. >>>>>>>>> >>>>>>>>> I don't think this is a statement we want to make, as it opens a >>>>>>>>> discussion >>>>>>>>> that we won't win. Nor do I think it's one we *need* to make. >>>>>>>>> KVM doesn't need >>>>>>>>> to be on equal footing with perf in terms of owning/managing PMU >>>>>>>>> hardware, KVM >>>>>>>>> just needs a few APIs to allow faithfully and accurately >>>>>>>>> virtualizing a guest PMU. 
>>>>>>>>> >>>>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline >>>>>>>>>> implementation, until both clients agree to a "deal" between them. >>>>>>>>>> Currently, there is no such deal, but I believe we could have >>>>>>>>>> one via >>>>>>>>>> future discussion. >>>>>>>>> >>>>>>>>> What I am saying is that there needs to be a "deal" in place >>>>>>>>> before this code >>>>>>>>> is merged. It doesn't need to be anything fancy, e.g. perf can >>>>>>>>> still pave over >>>>>>>>> PMCs it doesn't immediately load, as opposed to using >>>>>>>>> cpu_hw_events.dirty to lazily >>>>>>>>> do the clearing. But perf and KVM need to work together from the >>>>>>>>> get go, ie. I >>>>>>>>> don't want KVM doing something without regard to what perf does, >>>>>>>>> and vice versa. >>>>>>>>> >>>>>>>> There is similar issue on LoongArch vPMU where vm can directly pmu >>>>>>>> hardware >>>>>>>> and pmu hw is shard with guest and host. Besides context switch >>>>>>>> there are >>>>>>>> other places where perf core will access pmu hw, such as tick >>>>>>>> timer/hrtimer/ipi function call, and KVM can only intercept >>>>>>>> context switch. >>>>>>> >>>>>>> Two questions: >>>>>>> >>>>>>> 1) Can KVM prevent the guest from accessing the PMU? >>>>>>> >>>>>>> 2) If so, KVM can grant partial access to the PMU, or is it all >>>>>>> or nothing? >>>>>>> >>>>>>> If the answer to both questions is "yes", then it sounds like >>>>>>> LoongArch *requires* >>>>>>> mediated/passthrough support in order to virtualize its PMU. >>>>>> >>>>>> Hi Sean, >>>>>> >>>>>> Thank for your quick response. >>>>>> >>>>>> yes, kvm can prevent guest from accessing the PMU and grant partial >>>>>> or all to access to the PMU. Only that if one pmu event is granted >>>>>> to VM, host can not access this pmu event again. There must be pmu >>>>>> event switch if host want to. >>>>> >>>>> PMU event is a software entity which won't be shared. 
did you mean if >>>>> a PMU HW counter is granted to VM, then Host can't access the PMU HW >>>>> counter, right? >>>> yes, if PMU HW counter/control is granted to VM. The value comes from >>>> guest, and is not meaningful for host. Host pmu core does not know >>>> that it is granted to VM, host still think that it owns pmu. >>> >>> That's one issue this patchset tries to solve. Current new mediated x86 >>> vPMU framework doesn't allow Host or Guest own the PMU HW resource >>> simultaneously. Only when there is no !exclude_guest event on host, >>> guest is allowed to exclusively own the PMU HW resource. >>> >>> >>>> >>>> Just like FPU register, it is shared by VM and host during different >>>> time and it is lately switched. But if IPI or timer interrupt uses FPU >>>> register on host, there will be the same issue. >>> >>> I didn't fully get your point. When IPI or timer interrupt reach, a >>> VM-exit is triggered to make CPU traps into host first and then the host >> yes, it is. > > This is correct. And this is one of the points that we had debated > internally whether we should do PMU context switch at vcpu loop > boundary or VM Enter/exit boundary. (host-level) timer interrupt can > force VM Exit, which I think happens every 4ms or 1ms, depending on > configuration. > > One of the key reasons we currently propose this is because it is the > same boundary as the legacy PMU, i.e., it would be simple to propose > from the perf subsystem perspective. > > Performance wise, doing PMU context switch at vcpu boundary would be > way better in general. But the downside is that perf sub-system lose > the capability to profile majority of the KVM code (functions) when > guest PMU is enabled. > >> >>> interrupt handler is called. Or are you complaining the executing >>> sequence of switching guest PMU MSRs and these interrupt handler? 
>> In our vPMU implementation, it is ok if vPMU is switched in vm exit >> path, however there is problem if vPMU is switched during vcpu thread >> sched-out/sched-in path since IPI/timer irq interrupt access pmu >> register in host mode. > > Oh, the IPI/timer irq handler will access PMU registers? I thought > only the host-level NMI handler will access the PMU MSRs since PMI is > registered under NMI. > > In that case, you should disable IRQ during vcpu context switch. For > NMI, we prevent its handler from accessing the PMU registers. In > particular, we use a per-cpu variable to guard that. So, the > host-level PMI handler for perf sub-system will check the variable > before proceeding. > >> >> In general it will be better if the switch is done in vcpu thread >> sched-out/sched-in, else there is requirement to profile kvm >> hypervisor.Even there is such requirement, it is only one option. In >> most conditions, it will better if time of VM context exit is small. >> > Performance wise, agree, but there will be debate on perf > functionality loss at the host level. > > Maybe, (just maybe), it is possible to do PMU context switch at vcpu > boundary normally, but doing it at VM Enter/Exit boundary when host is > profiling KVM kernel module. So, dynamically adjusting PMU context > switch location could be an option. If there are two VMs with pmu enabled both, however host PMU is not enabled. PMU context switch should be done in vcpu thread sched-out path. If host pmu is used also, we can choose whether PMU switch should be done in vm exit path or vcpu thread sched-out path. > >>> >>> >>>> >>>> Regards >>>> Bibo Mao >>>>> >>>>> >>>>>> >>>>>>> >>>>>>>> Can we add callback handler in structure kvm_guest_cbs? 
just like >>>>>>>> this: >>>>>>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks >>>>>>>> kvm_guest_cbs >>>>>>>> = { >>>>>>>> .state = kvm_guest_state, >>>>>>>> .get_ip = kvm_guest_get_ip, >>>>>>>> .handle_intel_pt_intr = NULL, >>>>>>>> + .lose_pmu = kvm_guest_lose_pmu, >>>>>>>> }; >>>>>>>> >>>>>>>> By the way, I do not know should the callback handler be triggered >>>>>>>> in perf >>>>>>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is >>>>>>>> triggered in >>>>>>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0, >>>>>>>> but I think it will be better if it is done in perf core. >>>>>>> >>>>>>> I don't think we want to take the approach of perf and KVM guests >>>>>>> "fighting" over >>>>>>> the PMU. That's effectively what we have today, and it's a mess >>>>>>> for KVM because >>>>>>> it's impossible to provide consistent, deterministic behavior for >>>>>>> the guest. And >>>>>>> it's just as messy for perf, which ends up having wierd, cumbersome >>>>>>> flows that >>>>>>> exists purely to try to play nice with KVM. >>>>>> With existing pmu core code, in tick timer interrupt or IPI function >>>>>> call interrupt pmu hw may be accessed by host when VM is running and >>>>>> pmu is already granted to guest. KVM can not intercept host >>>>>> IPI/timer interrupt, there is no pmu context switch, there will be >>>>>> problem. >>>>>> >>>>>> Regards >>>>>> Bibo Mao >>>>>> >>>> >>
> > Is there any event in the host still having PERF_EVENT_STATE_ACTIVE? > > If so, hmm, it will reach perf_pmu_disable(event->pmu), which will > > access the global ctrl MSR. > > I don't think there is any event with PERF_EVENT_STATE_ACTIVE state on > host when guest owns the PMU HW resource. > > In current solution, VM would fail to create if there is any system-wide > event without exclude_guest attribute. If VM is created successfully and > when vm-entry happens, the helper perf_guest_enter() would put all host > events with exclude_guest attribute into PERF_EVENT_STATE_INACTIVE state > and block host to create system-wide events without exclude_guest attribute. > Yeah, that's perfect. Thanks. -Mingwei
> > > > Maybe, (just maybe), it is possible to do PMU context switch at vcpu > > boundary normally, but doing it at VM Enter/Exit boundary when host is > > profiling KVM kernel module. So, dynamically adjusting PMU context > > switch location could be an option. > If there are two VMs with pmu enabled both, however host PMU is not > enabled. PMU context switch should be done in vcpu thread sched-out path. > > If host pmu is used also, we can choose whether PMU switch should be > done in vm exit path or vcpu thread sched-out path. > host PMU is always enabled, ie., Linux currently does not support KVM PMU running standalone. I guess what you mean is there are no active perf_events on the host side. Allowing a PMU context switch drifting from vm-enter/exit boundary to vcpu loop boundary by checking host side events might be a good option. We can keep the discussion, but I won't propose that in v2. I guess we are off topic. Sean's suggestion is that we should put "perf" and "kvm" together while doing the context switch. I think this is quite reasonable regardless of the PMU context switch location. To execute this, I am thinking about adding a parameter or return value to perf_guest_enter() so that once it returns back to KVM, KVM gets to know which counters are active/inactive/cleared from the host side. Knowing that, KVM can do the context switch more efficiently. Thanks. -Mingwei
On 2024/4/24 上午1:02, Mingwei Zhang wrote: >>> >>> Maybe, (just maybe), it is possible to do PMU context switch at vcpu >>> boundary normally, but doing it at VM Enter/Exit boundary when host is >>> profiling KVM kernel module. So, dynamically adjusting PMU context >>> switch location could be an option. >> If there are two VMs with pmu enabled both, however host PMU is not >> enabled. PMU context switch should be done in vcpu thread sched-out path. >> >> If host pmu is used also, we can choose whether PMU switch should be >> done in vm exit path or vcpu thread sched-out path. >> > > host PMU is always enabled, ie., Linux currently does not support KVM > PMU running standalone. I guess what you mean is there are no active > perf_events on the host side. Allowing a PMU context switch drifting > from vm-enter/exit boundary to vcpu loop boundary by checking host > side events might be a good option. We can keep the discussion, but I > won't propose that in v2. > > I guess we are off topic. Sean's suggestion is that we should put > "perf" and "kvm" together while doing the context switch. I think this > is quite reasonable regardless of the PMU context switch location. > > To execute this, I am thinking about adding a parameter or return > value to perf_guest_enter() so that once it returns back to KVM, KVM > gets to know which counters are active/inactive/cleared from the host > side. Knowing that, KVM can do the context switch more efficiently. yeap, that sounds great. Regards Bibo Mao > > Thanks. > -Mingwei >
On 4/24/2024 1:02 AM, Mingwei Zhang wrote: >>> Maybe, (just maybe), it is possible to do PMU context switch at vcpu >>> boundary normally, but doing it at VM Enter/Exit boundary when host is >>> profiling KVM kernel module. So, dynamically adjusting PMU context >>> switch location could be an option. >> If there are two VMs with pmu enabled both, however host PMU is not >> enabled. PMU context switch should be done in vcpu thread sched-out path. >> >> If host pmu is used also, we can choose whether PMU switch should be >> done in vm exit path or vcpu thread sched-out path. >> > host PMU is always enabled, ie., Linux currently does not support KVM > PMU running standalone. I guess what you mean is there are no active > perf_events on the host side. Allowing a PMU context switch drifting > from vm-enter/exit boundary to vcpu loop boundary by checking host > side events might be a good option. We can keep the discussion, but I > won't propose that in v2. I suspect if it's really doable to do this deferring. This still makes host lose the most of capability to profile KVM. Per my understanding, most of KVM overhead happens in the vcpu loop, exactly speaking in VM-exit handling. We have no idea when host want to create perf event to profile KVM, it could be at any time. > > I guess we are off topic. Sean's suggestion is that we should put > "perf" and "kvm" together while doing the context switch. I think this > is quite reasonable regardless of the PMU context switch location. > > To execute this, I am thinking about adding a parameter or return > value to perf_guest_enter() so that once it returns back to KVM, KVM > gets to know which counters are active/inactive/cleared from the host > side. Knowing that, KVM can do the context switch more efficiently. > > Thanks. > -Mingwei >
On Wed, Apr 24, 2024, Dapeng Mi wrote: > > On 4/24/2024 1:02 AM, Mingwei Zhang wrote: > > > > Maybe, (just maybe), it is possible to do PMU context switch at vcpu > > > > boundary normally, but doing it at VM Enter/Exit boundary when host is > > > > profiling KVM kernel module. So, dynamically adjusting PMU context > > > > switch location could be an option. > > > If there are two VMs with pmu enabled both, however host PMU is not > > > enabled. PMU context switch should be done in vcpu thread sched-out path. > > > > > > If host pmu is used also, we can choose whether PMU switch should be > > > done in vm exit path or vcpu thread sched-out path. > > > > > host PMU is always enabled, ie., Linux currently does not support KVM > > PMU running standalone. I guess what you mean is there are no active > > perf_events on the host side. Allowing a PMU context switch drifting > > from vm-enter/exit boundary to vcpu loop boundary by checking host > > side events might be a good option. We can keep the discussion, but I > > won't propose that in v2. > > I suspect if it's really doable to do this deferring. This still makes host > lose the most of capability to profile KVM. Per my understanding, most of > KVM overhead happens in the vcpu loop, exactly speaking in VM-exit handling. > We have no idea when host want to create perf event to profile KVM, it could > be at any time. No, the idea is that KVM will load host PMU state asap, but only when host PMU state actually needs to be loaded, i.e. only when there are relevant host events. If there are no host perf events, KVM keeps guest PMU state loaded for the entire KVM_RUN loop, i.e. provides optimal behavior for the guest. But if a host perf events exists (or comes along), the KVM context switches PMU at VM-Enter/VM-Exit, i.e. lets the host profile almost all of KVM, at the cost of a degraded experience for the guest while host perf events are active. My original sketch: https://lore.kernel.org/all/ZR3eNtP5IVAHeFNC@google.com
On 4/24/2024 11:00 PM, Sean Christopherson wrote: > On Wed, Apr 24, 2024, Dapeng Mi wrote: >> On 4/24/2024 1:02 AM, Mingwei Zhang wrote: >>>>> Maybe, (just maybe), it is possible to do PMU context switch at vcpu >>>>> boundary normally, but doing it at VM Enter/Exit boundary when host is >>>>> profiling KVM kernel module. So, dynamically adjusting PMU context >>>>> switch location could be an option. >>>> If there are two VMs with pmu enabled both, however host PMU is not >>>> enabled. PMU context switch should be done in vcpu thread sched-out path. >>>> >>>> If host pmu is used also, we can choose whether PMU switch should be >>>> done in vm exit path or vcpu thread sched-out path. >>>> >>> host PMU is always enabled, ie., Linux currently does not support KVM >>> PMU running standalone. I guess what you mean is there are no active >>> perf_events on the host side. Allowing a PMU context switch drifting >>> from vm-enter/exit boundary to vcpu loop boundary by checking host >>> side events might be a good option. We can keep the discussion, but I >>> won't propose that in v2. >> I suspect if it's really doable to do this deferring. This still makes host >> lose the most of capability to profile KVM. Per my understanding, most of >> KVM overhead happens in the vcpu loop, exactly speaking in VM-exit handling. >> We have no idea when host want to create perf event to profile KVM, it could >> be at any time. > No, the idea is that KVM will load host PMU state asap, but only when host PMU > state actually needs to be loaded, i.e. only when there are relevant host events. > > If there are no host perf events, KVM keeps guest PMU state loaded for the entire > KVM_RUN loop, i.e. provides optimal behavior for the guest. But if a host perf > events exists (or comes along), the KVM context switches PMU at VM-Enter/VM-Exit, > i.e. lets the host profile almost all of KVM, at the cost of a degraded experience > for the guest while host perf events are active. I see. 
So KVM needs to provide a callback which needs to be called in the IPI handler. The KVM callback needs to be called to switch PMU state before perf really enabling host event and touching PMU MSRs. And only the perf event with exclude_guest attribute is allowed to create on host. Thanks. > > My original sketch: https://lore.kernel.org/all/ZR3eNtP5IVAHeFNC@google.com
On Wed, Apr 24, 2024 at 8:56 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > > > On 4/24/2024 11:00 PM, Sean Christopherson wrote: > > On Wed, Apr 24, 2024, Dapeng Mi wrote: > >> On 4/24/2024 1:02 AM, Mingwei Zhang wrote: > >>>>> Maybe, (just maybe), it is possible to do PMU context switch at vcpu > >>>>> boundary normally, but doing it at VM Enter/Exit boundary when host is > >>>>> profiling KVM kernel module. So, dynamically adjusting PMU context > >>>>> switch location could be an option. > >>>> If there are two VMs with pmu enabled both, however host PMU is not > >>>> enabled. PMU context switch should be done in vcpu thread sched-out path. > >>>> > >>>> If host pmu is used also, we can choose whether PMU switch should be > >>>> done in vm exit path or vcpu thread sched-out path. > >>>> > >>> host PMU is always enabled, ie., Linux currently does not support KVM > >>> PMU running standalone. I guess what you mean is there are no active > >>> perf_events on the host side. Allowing a PMU context switch drifting > >>> from vm-enter/exit boundary to vcpu loop boundary by checking host > >>> side events might be a good option. We can keep the discussion, but I > >>> won't propose that in v2. > >> I suspect if it's really doable to do this deferring. This still makes host > >> lose the most of capability to profile KVM. Per my understanding, most of > >> KVM overhead happens in the vcpu loop, exactly speaking in VM-exit handling. > >> We have no idea when host want to create perf event to profile KVM, it could > >> be at any time. > > No, the idea is that KVM will load host PMU state asap, but only when host PMU > > state actually needs to be loaded, i.e. only when there are relevant host events. > > > > If there are no host perf events, KVM keeps guest PMU state loaded for the entire > > KVM_RUN loop, i.e. provides optimal behavior for the guest. But if a host perf > > events exists (or comes along), the KVM context switches PMU at VM-Enter/VM-Exit, > > i.e. 
lets the host profile almost all of KVM, at the cost of a degraded experience > > for the guest while host perf events are active. > > I see. So KVM needs to provide a callback which needs to be called in > the IPI handler. The KVM callback needs to be called to switch PMU state > before perf really enabling host event and touching PMU MSRs. And only > the perf event with exclude_guest attribute is allowed to create on > host. Thanks. Do we really need a KVM callback? I think that is one option. Immediately after VMEXIT, KVM will check whether there are "host perf events". If so, do the PMU context switch immediately. Otherwise, keep deferring the context switch to the end of vPMU loop. Detecting if there are "host perf events" would be interesting. The "host perf events" refer to the perf_events on the host that are active and assigned with HW counters and that are saved when context switching to the guest PMU. I think getting those events could be done by fetching the bitmaps in cpuc. I have to look into the details. But at the time of VMEXIT, kvm should already have that information, so it can immediately decide whether to do the PMU context switch or not. oh, but when the control is executing within the run loop, a host-level profiling starts, say 'perf record -a ...', it will generate an IPI to all CPUs. Maybe that's when we need a callback so the KVM guest PMU context gets preempted for the host-level profiling. Gah.. hmm, not a fan of that. That means the host can poke the guest PMU context at any time and cause higher overhead. But I admit it is much better than the current approach. The only thing is that: any command like 'perf record/stat -a' shot in dark corners of the host can preempt guest PMUs of _all_ running VMs. So, to alleviate that, maybe a module parameter that disables this "preemption" is possible? This should fit scenarios where we don't want guest PMU to be preempted outside of the vCPU loop? Thanks. 
Regards -Mingwei -Mingwei > > > > > > My original sketch: https://lore.kernel.org/all/ZR3eNtP5IVAHeFNC@google.com
On 2024-04-25 12:24 a.m., Mingwei Zhang wrote: > On Wed, Apr 24, 2024 at 8:56 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: >> >> >> On 4/24/2024 11:00 PM, Sean Christopherson wrote: >>> On Wed, Apr 24, 2024, Dapeng Mi wrote: >>>> On 4/24/2024 1:02 AM, Mingwei Zhang wrote: >>>>>>> Maybe, (just maybe), it is possible to do PMU context switch at vcpu >>>>>>> boundary normally, but doing it at VM Enter/Exit boundary when host is >>>>>>> profiling KVM kernel module. So, dynamically adjusting PMU context >>>>>>> switch location could be an option. >>>>>> If there are two VMs with pmu enabled both, however host PMU is not >>>>>> enabled. PMU context switch should be done in vcpu thread sched-out path. >>>>>> >>>>>> If host pmu is used also, we can choose whether PMU switch should be >>>>>> done in vm exit path or vcpu thread sched-out path. >>>>>> >>>>> host PMU is always enabled, ie., Linux currently does not support KVM >>>>> PMU running standalone. I guess what you mean is there are no active >>>>> perf_events on the host side. Allowing a PMU context switch drifting >>>>> from vm-enter/exit boundary to vcpu loop boundary by checking host >>>>> side events might be a good option. We can keep the discussion, but I >>>>> won't propose that in v2. >>>> I suspect if it's really doable to do this deferring. This still makes host >>>> lose the most of capability to profile KVM. Per my understanding, most of >>>> KVM overhead happens in the vcpu loop, exactly speaking in VM-exit handling. >>>> We have no idea when host want to create perf event to profile KVM, it could >>>> be at any time. >>> No, the idea is that KVM will load host PMU state asap, but only when host PMU >>> state actually needs to be loaded, i.e. only when there are relevant host events. >>> >>> If there are no host perf events, KVM keeps guest PMU state loaded for the entire >>> KVM_RUN loop, i.e. provides optimal behavior for the guest. 
But if a host perf >>> events exists (or comes along), the KVM context switches PMU at VM-Enter/VM-Exit, >>> i.e. lets the host profile almost all of KVM, at the cost of a degraded experience >>> for the guest while host perf events are active. >> >> I see. So KVM needs to provide a callback which needs to be called in >> the IPI handler. The KVM callback needs to be called to switch PMU state >> before perf really enabling host event and touching PMU MSRs. And only >> the perf event with exclude_guest attribute is allowed to create on >> host. Thanks. > > Do we really need a KVM callback? I think that is one option. > > Immediately after VMEXIT, KVM will check whether there are "host perf > events". If so, do the PMU context switch immediately. Otherwise, keep > deferring the context switch to the end of vPMU loop. > > Detecting if there are "host perf events" would be interesting. The > "host perf events" refer to the perf_events on the host that are > active and assigned with HW counters and that are saved when context > switching to the guest PMU. I think getting those events could be done > by fetching the bitmaps in cpuc. The cpuc is ARCH specific structure. I don't think it can be get in the generic code. You probably have to implement ARCH specific functions to fetch the bitmaps. It probably won't worth it. You may check the pinned_groups and flexible_groups to understand if there are host perf events which may be scheduled when VM-exit. But it will not tell the idx of the counters which can only be got when the host event is really scheduled. > I have to look into the details. But > at the time of VMEXIT, kvm should already have that information, so it > can immediately decide whether to do the PMU context switch or not. > > oh, but when the control is executing within the run loop, a > host-level profiling starts, say 'perf record -a ...', it will > generate an IPI to all CPUs. 
Maybe that's when we need a callback so > the KVM guest PMU context gets preempted for the host-level profiling. > Gah.. > > hmm, not a fan of that. That means the host can poke the guest PMU > context at any time and cause higher overhead. But I admit it is much > better than the current approach. > > The only thing is that: any command like 'perf record/stat -a' shot in > dark corners of the host can preempt guest PMUs of _all_ running VMs. > So, to alleviate that, maybe a module parameter that disables this > "preemption" is possible? This should fit scenarios where we don't > want guest PMU to be preempted outside of the vCPU loop? > It should not happen. For the current implementation, perf rejects all the !exclude_guest system-wide event creation if a guest with the vPMU is running. However, it's possible to create an exclude_guest system-wide event at any time. KVM cannot use the information from the VM-entry to decide if there will be active perf events in the VM-exit. The perf_guest_exit() will reload the host state. It's impossible to save the guest state after that. We may need a KVM callback. So perf can tell KVM whether to save the guest state before perf reloads the host state. Thanks, Kan >> >> >>> >>> My original sketch: https://lore.kernel.org/all/ZR3eNtP5IVAHeFNC@google.com >
On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <kan.liang@linux.intel.com> wrote: > > > > On 2024-04-25 12:24 a.m., Mingwei Zhang wrote: > > On Wed, Apr 24, 2024 at 8:56 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > >> > >> > >> On 4/24/2024 11:00 PM, Sean Christopherson wrote: > >>> On Wed, Apr 24, 2024, Dapeng Mi wrote: > >>>> On 4/24/2024 1:02 AM, Mingwei Zhang wrote: > >>>>>>> Maybe, (just maybe), it is possible to do PMU context switch at vcpu > >>>>>>> boundary normally, but doing it at VM Enter/Exit boundary when host is > >>>>>>> profiling KVM kernel module. So, dynamically adjusting PMU context > >>>>>>> switch location could be an option. > >>>>>> If there are two VMs with pmu enabled both, however host PMU is not > >>>>>> enabled. PMU context switch should be done in vcpu thread sched-out path. > >>>>>> > >>>>>> If host pmu is used also, we can choose whether PMU switch should be > >>>>>> done in vm exit path or vcpu thread sched-out path. > >>>>>> > >>>>> host PMU is always enabled, ie., Linux currently does not support KVM > >>>>> PMU running standalone. I guess what you mean is there are no active > >>>>> perf_events on the host side. Allowing a PMU context switch drifting > >>>>> from vm-enter/exit boundary to vcpu loop boundary by checking host > >>>>> side events might be a good option. We can keep the discussion, but I > >>>>> won't propose that in v2. > >>>> I suspect if it's really doable to do this deferring. This still makes host > >>>> lose the most of capability to profile KVM. Per my understanding, most of > >>>> KVM overhead happens in the vcpu loop, exactly speaking in VM-exit handling. > >>>> We have no idea when host want to create perf event to profile KVM, it could > >>>> be at any time. > >>> No, the idea is that KVM will load host PMU state asap, but only when host PMU > >>> state actually needs to be loaded, i.e. only when there are relevant host events. 
> >>> > >>> If there are no host perf events, KVM keeps guest PMU state loaded for the entire > >>> KVM_RUN loop, i.e. provides optimal behavior for the guest. But if a host perf > >>> events exists (or comes along), the KVM context switches PMU at VM-Enter/VM-Exit, > >>> i.e. lets the host profile almost all of KVM, at the cost of a degraded experience > >>> for the guest while host perf events are active. > >> > >> I see. So KVM needs to provide a callback which needs to be called in > >> the IPI handler. The KVM callback needs to be called to switch PMU state > >> before perf really enabling host event and touching PMU MSRs. And only > >> the perf event with exclude_guest attribute is allowed to create on > >> host. Thanks. > > > > Do we really need a KVM callback? I think that is one option. > > > > Immediately after VMEXIT, KVM will check whether there are "host perf > > events". If so, do the PMU context switch immediately. Otherwise, keep > > deferring the context switch to the end of vPMU loop. > > > > Detecting if there are "host perf events" would be interesting. The > > "host perf events" refer to the perf_events on the host that are > > active and assigned with HW counters and that are saved when context > > switching to the guest PMU. I think getting those events could be done > > by fetching the bitmaps in cpuc. > > The cpuc is ARCH specific structure. I don't think it can be get in the > generic code. You probably have to implement ARCH specific functions to > fetch the bitmaps. It probably won't worth it. > > You may check the pinned_groups and flexible_groups to understand if > there are host perf events which may be scheduled when VM-exit. But it > will not tell the idx of the counters which can only be got when the > host event is really scheduled. > > > I have to look into the details. But > > at the time of VMEXIT, kvm should already have that information, so it > > can immediately decide whether to do the PMU context switch or not. 
> > > > oh, but when the control is executing within the run loop, a > > host-level profiling starts, say 'perf record -a ...', it will > > generate an IPI to all CPUs. Maybe that's when we need a callback so > > the KVM guest PMU context gets preempted for the host-level profiling. > > Gah.. > > > > hmm, not a fan of that. That means the host can poke the guest PMU > > context at any time and cause higher overhead. But I admit it is much > > better than the current approach. > > > > The only thing is that: any command like 'perf record/stat -a' shot in > > dark corners of the host can preempt guest PMUs of _all_ running VMs. > > So, to alleviate that, maybe a module parameter that disables this > > "preemption" is possible? This should fit scenarios where we don't > > want guest PMU to be preempted outside of the vCPU loop? > > > > It should not happen. For the current implementation, perf rejects all > the !exclude_guest system-wide event creation if a guest with the vPMU > is running. > However, it's possible to create an exclude_guest system-wide event at > any time. KVM cannot use the information from the VM-entry to decide if > there will be active perf events in the VM-exit. Hmm, why not? If there is any exclude_guest system-wide event, perf_guest_enter() can return something to tell KVM "hey, some active host events are swapped out. they are originally in counter #2 and #3". If so, at the time when perf_guest_enter() returns, KVM will ack that and keep it in its pmu data structure. Now, when doing context switching back to host at just VMEXIT, KVM will check this data and see if host perf context has something active (of course, they are all exclude_guest events). If not, deferring the context switch to vcpu boundary. Otherwise, do the proper PMU context switching by respecting the occupied counter positions on the host side, i.e., avoid doubling the work on the KVM side. Kan, any suggestion on the above approach? 
Totally understand that there might be some difficulty, since perf subsystem works in several layers and obviously fetching low-level mapping is arch specific work. If that is difficult, we can split the work in two phases: 1) phase #1, just ask perf to tell kvm if there are active exclude_guest events swapped out; 2) phase #2, ask perf to tell their (low-level) counter indices. Thanks. -Mingwei > > The perf_guest_exit() will reload the host state. It's impossible to > save the guest state after that. We may need a KVM callback. So perf can > tell KVM whether to save the guest state before perf reloads the host state. > > Thanks, > Kan > >> > >> > >>> > >>> My original sketch: https://lore.kernel.org/all/ZR3eNtP5IVAHeFNC@google.com > >
On 2024-04-25 4:16 p.m., Mingwei Zhang wrote: > On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <kan.liang@linux.intel.com> wrote: >> >> >> >> On 2024-04-25 12:24 a.m., Mingwei Zhang wrote: >>> On Wed, Apr 24, 2024 at 8:56 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: >>>> >>>> >>>> On 4/24/2024 11:00 PM, Sean Christopherson wrote: >>>>> On Wed, Apr 24, 2024, Dapeng Mi wrote: >>>>>> On 4/24/2024 1:02 AM, Mingwei Zhang wrote: >>>>>>>>> Maybe, (just maybe), it is possible to do PMU context switch at vcpu >>>>>>>>> boundary normally, but doing it at VM Enter/Exit boundary when host is >>>>>>>>> profiling KVM kernel module. So, dynamically adjusting PMU context >>>>>>>>> switch location could be an option. >>>>>>>> If there are two VMs with pmu enabled both, however host PMU is not >>>>>>>> enabled. PMU context switch should be done in vcpu thread sched-out path. >>>>>>>> >>>>>>>> If host pmu is used also, we can choose whether PMU switch should be >>>>>>>> done in vm exit path or vcpu thread sched-out path. >>>>>>>> >>>>>>> host PMU is always enabled, ie., Linux currently does not support KVM >>>>>>> PMU running standalone. I guess what you mean is there are no active >>>>>>> perf_events on the host side. Allowing a PMU context switch drifting >>>>>>> from vm-enter/exit boundary to vcpu loop boundary by checking host >>>>>>> side events might be a good option. We can keep the discussion, but I >>>>>>> won't propose that in v2. >>>>>> I suspect if it's really doable to do this deferring. This still makes host >>>>>> lose the most of capability to profile KVM. Per my understanding, most of >>>>>> KVM overhead happens in the vcpu loop, exactly speaking in VM-exit handling. >>>>>> We have no idea when host want to create perf event to profile KVM, it could >>>>>> be at any time. >>>>> No, the idea is that KVM will load host PMU state asap, but only when host PMU >>>>> state actually needs to be loaded, i.e. only when there are relevant host events. 
>>>>> >>>>> If there are no host perf events, KVM keeps guest PMU state loaded for the entire >>>>> KVM_RUN loop, i.e. provides optimal behavior for the guest. But if a host perf >>>>> events exists (or comes along), the KVM context switches PMU at VM-Enter/VM-Exit, >>>>> i.e. lets the host profile almost all of KVM, at the cost of a degraded experience >>>>> for the guest while host perf events are active. >>>> >>>> I see. So KVM needs to provide a callback which needs to be called in >>>> the IPI handler. The KVM callback needs to be called to switch PMU state >>>> before perf really enabling host event and touching PMU MSRs. And only >>>> the perf event with exclude_guest attribute is allowed to create on >>>> host. Thanks. >>> >>> Do we really need a KVM callback? I think that is one option. >>> >>> Immediately after VMEXIT, KVM will check whether there are "host perf >>> events". If so, do the PMU context switch immediately. Otherwise, keep >>> deferring the context switch to the end of vPMU loop. >>> >>> Detecting if there are "host perf events" would be interesting. The >>> "host perf events" refer to the perf_events on the host that are >>> active and assigned with HW counters and that are saved when context >>> switching to the guest PMU. I think getting those events could be done >>> by fetching the bitmaps in cpuc. >> >> The cpuc is ARCH specific structure. I don't think it can be get in the >> generic code. You probably have to implement ARCH specific functions to >> fetch the bitmaps. It probably won't worth it. >> >> You may check the pinned_groups and flexible_groups to understand if >> there are host perf events which may be scheduled when VM-exit. But it >> will not tell the idx of the counters which can only be got when the >> host event is really scheduled. >> >>> I have to look into the details. But >>> at the time of VMEXIT, kvm should already have that information, so it >>> can immediately decide whether to do the PMU context switch or not. 
>>> >>> oh, but when the control is executing within the run loop, a >>> host-level profiling starts, say 'perf record -a ...', it will >>> generate an IPI to all CPUs. Maybe that's when we need a callback so >>> the KVM guest PMU context gets preempted for the host-level profiling. >>> Gah.. >>> >>> hmm, not a fan of that. That means the host can poke the guest PMU >>> context at any time and cause higher overhead. But I admit it is much >>> better than the current approach. >>> >>> The only thing is that: any command like 'perf record/stat -a' shot in >>> dark corners of the host can preempt guest PMUs of _all_ running VMs. >>> So, to alleviate that, maybe a module parameter that disables this >>> "preemption" is possible? This should fit scenarios where we don't >>> want guest PMU to be preempted outside of the vCPU loop? >>> >> >> It should not happen. For the current implementation, perf rejects all >> the !exclude_guest system-wide event creation if a guest with the vPMU >> is running. >> However, it's possible to create an exclude_guest system-wide event at >> any time. KVM cannot use the information from the VM-entry to decide if >> there will be active perf events in the VM-exit. > > Hmm, why not? If there is any exclude_guest system-wide event, > perf_guest_enter() can return something to tell KVM "hey, some active > host events are swapped out. they are originally in counter #2 and > #3". If so, at the time when perf_guest_enter() returns, KVM will ack > that and keep it in its pmu data structure. I think it's possible that someone creates !exclude_guest event after the perf_guest_enter(). The stale information is saved in the KVM. Perf will schedule the event in the next perf_guest_exit(). KVM will not know it. > > Now, when doing context switching back to host at just VMEXIT, KVM > will check this data and see if host perf context has something active > (of course, they are all exclude_guest events). 
If not, deferring the > context switch to vcpu boundary. Otherwise, do the proper PMU context > switching by respecting the occupied counter positions on the host > side, i.e., avoid doubling the work on the KVM side. > > Kan, any suggestion on the above approach? I think we can only know the accurate event list at perf_guest_exit(). You may check the pinned_groups and flexible_groups, which tell if there are candidate events. > Totally understand that > there might be some difficulty, since perf subsystem works in several > layers and obviously fetching low-level mapping is arch specific work. > If that is difficult, we can split the work in two phases: 1) phase > #1, just ask perf to tell kvm if there are active exclude_guest events > swapped out; 2) phase #2, ask perf to tell their (low-level) counter > indices. > If you want an accurate counter mask, the changes in the arch specific code are required. Two phases sound good to me. Besides perf changes, I think the KVM should also track which counters need to be saved/restored. The information can be got from the EventSel interception. Thanks, Kan >> >> The perf_guest_exit() will reload the host state. It's impossible to >> save the guest state after that. We may need a KVM callback. So perf can >> tell KVM whether to save the guest state before perf reloads the host state. >> >> Thanks, >> Kan >>>> >>>> >>>>> >>>>> My original sketch: https://lore.kernel.org/all/ZR3eNtP5IVAHeFNC@google.com >>> >
On Thu, Apr 25, 2024, Kan Liang wrote: > On 2024-04-25 4:16 p.m., Mingwei Zhang wrote: > > On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <kan.liang@linux.intel.com> wrote: > >> It should not happen. For the current implementation, perf rejects all > >> the !exclude_guest system-wide event creation if a guest with the vPMU > >> is running. > >> However, it's possible to create an exclude_guest system-wide event at > >> any time. KVM cannot use the information from the VM-entry to decide if > >> there will be active perf events in the VM-exit. > > > > Hmm, why not? If there is any exclude_guest system-wide event, > > perf_guest_enter() can return something to tell KVM "hey, some active > > host events are swapped out. they are originally in counter #2 and > > #3". If so, at the time when perf_guest_enter() returns, KVM will ack > > that and keep it in its pmu data structure. > > I think it's possible that someone creates !exclude_guest event after I assume you mean an exclude_guest=1 event? Because perf should be in a state where it rejects exclude_guest=0 events. > the perf_guest_enter(). The stale information is saved in the KVM. Perf > will schedule the event in the next perf_guest_exit(). KVM will not know it. Ya, the creation of an event on a CPU that currently has guest PMU state loaded is what I had in mind when I suggested a callback in my sketch: : D. Add a perf callback that is invoked from IRQ context when perf wants to : configure a new PMU-based events, *before* actually programming the MSRs, : and have KVM's callback put the guest PMU state It's a similar idea to TIF_NEED_FPU_LOAD, just that instead of a common chunk of kernel code swapping out the guest state (kernel_fpu_begin()), it's a callback into KVM.
On 4/26/2024 5:46 AM, Sean Christopherson wrote: > On Thu, Apr 25, 2024, Kan Liang wrote: >> On 2024-04-25 4:16 p.m., Mingwei Zhang wrote: >>> On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <kan.liang@linux.intel.com> wrote: >>>> It should not happen. For the current implementation, perf rejects all >>>> the !exclude_guest system-wide event creation if a guest with the vPMU >>>> is running. >>>> However, it's possible to create an exclude_guest system-wide event at >>>> any time. KVM cannot use the information from the VM-entry to decide if >>>> there will be active perf events in the VM-exit. >>> Hmm, why not? If there is any exclude_guest system-wide event, >>> perf_guest_enter() can return something to tell KVM "hey, some active >>> host events are swapped out. they are originally in counter #2 and >>> #3". If so, at the time when perf_guest_enter() returns, KVM will ack >>> that and keep it in its pmu data structure. >> I think it's possible that someone creates !exclude_guest event after > I assume you mean an exclude_guest=1 event? Because perf should be in a state > where it rejects exclude_guest=0 events. Suppose should be exclude_guest=1 event, the perf event without exclude_guest attribute would be blocked to create in the v2 patches which we are working on. > >> the perf_guest_enter(). The stale information is saved in the KVM. Perf >> will schedule the event in the next perf_guest_exit(). KVM will not know it. > Ya, the creation of an event on a CPU that currently has guest PMU state loaded > is what I had in mind when I suggested a callback in my sketch: > > : D. Add a perf callback that is invoked from IRQ context when perf wants to > : configure a new PMU-based events, *before* actually programming the MSRs, > : and have KVM's callback put the guest PMU state when host creates a perf event with exclude_guest attribute which is used to profile KVM/VMM user space, the vCPU process could work at three places. 1. in guest state (non-root mode) 2. 
inside vcpu-loop 3. outside vcpu-loop Since the PMU state has already been switched to host state, we don't need to consider the case 3 and only care about cases 1 and 2. when host creates a perf event with exclude_guest attribute to profile KVM/VMM user space, an IPI is triggered to enable the perf event eventually like the following code shows. event_function_call(event, __perf_event_enable, NULL); For case 1, a vm-exit is triggered and KVM starts to process the vm-exit and then run IPI irq handler, exactly speaking __perf_event_enable() to enable the perf event. For case 2, the IPI irq handler would preempt the vcpu-loop and call __perf_event_enable() to enable the perf event. So IMO KVM just needs to provide a callback to switch guest/host PMU state, and __perf_event_enable() calls this callback before really touching PMU MSRs. > > It's a similar idea to TIF_NEED_FPU_LOAD, just that instead of a common chunk of > kernel code swapping out the guest state (kernel_fpu_begin()), it's a callback > into KVM.
On 4/26/2024 4:43 AM, Liang, Kan wrote: > > On 2024-04-25 4:16 p.m., Mingwei Zhang wrote: >> On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <kan.liang@linux.intel.com> wrote: >>> >>> >>> On 2024-04-25 12:24 a.m., Mingwei Zhang wrote: >>>> On Wed, Apr 24, 2024 at 8:56 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: >>>>> >>>>> On 4/24/2024 11:00 PM, Sean Christopherson wrote: >>>>>> On Wed, Apr 24, 2024, Dapeng Mi wrote: >>>>>>> On 4/24/2024 1:02 AM, Mingwei Zhang wrote: >>>>>>>>>> Maybe, (just maybe), it is possible to do PMU context switch at vcpu >>>>>>>>>> boundary normally, but doing it at VM Enter/Exit boundary when host is >>>>>>>>>> profiling KVM kernel module. So, dynamically adjusting PMU context >>>>>>>>>> switch location could be an option. >>>>>>>>> If there are two VMs with pmu enabled both, however host PMU is not >>>>>>>>> enabled. PMU context switch should be done in vcpu thread sched-out path. >>>>>>>>> >>>>>>>>> If host pmu is used also, we can choose whether PMU switch should be >>>>>>>>> done in vm exit path or vcpu thread sched-out path. >>>>>>>>> >>>>>>>> host PMU is always enabled, ie., Linux currently does not support KVM >>>>>>>> PMU running standalone. I guess what you mean is there are no active >>>>>>>> perf_events on the host side. Allowing a PMU context switch drifting >>>>>>>> from vm-enter/exit boundary to vcpu loop boundary by checking host >>>>>>>> side events might be a good option. We can keep the discussion, but I >>>>>>>> won't propose that in v2. >>>>>>> I suspect if it's really doable to do this deferring. This still makes host >>>>>>> lose the most of capability to profile KVM. Per my understanding, most of >>>>>>> KVM overhead happens in the vcpu loop, exactly speaking in VM-exit handling. >>>>>>> We have no idea when host want to create perf event to profile KVM, it could >>>>>>> be at any time. 
>>>>>> No, the idea is that KVM will load host PMU state asap, but only when host PMU >>>>>> state actually needs to be loaded, i.e. only when there are relevant host events. >>>>>> >>>>>> If there are no host perf events, KVM keeps guest PMU state loaded for the entire >>>>>> KVM_RUN loop, i.e. provides optimal behavior for the guest. But if a host perf >>>>>> events exists (or comes along), the KVM context switches PMU at VM-Enter/VM-Exit, >>>>>> i.e. lets the host profile almost all of KVM, at the cost of a degraded experience >>>>>> for the guest while host perf events are active. >>>>> I see. So KVM needs to provide a callback which needs to be called in >>>>> the IPI handler. The KVM callback needs to be called to switch PMU state >>>>> before perf really enabling host event and touching PMU MSRs. And only >>>>> the perf event with exclude_guest attribute is allowed to create on >>>>> host. Thanks. >>>> Do we really need a KVM callback? I think that is one option. >>>> >>>> Immediately after VMEXIT, KVM will check whether there are "host perf >>>> events". If so, do the PMU context switch immediately. Otherwise, keep >>>> deferring the context switch to the end of vPMU loop. >>>> >>>> Detecting if there are "host perf events" would be interesting. The >>>> "host perf events" refer to the perf_events on the host that are >>>> active and assigned with HW counters and that are saved when context >>>> switching to the guest PMU. I think getting those events could be done >>>> by fetching the bitmaps in cpuc. >>> The cpuc is ARCH specific structure. I don't think it can be get in the >>> generic code. You probably have to implement ARCH specific functions to >>> fetch the bitmaps. It probably won't worth it. >>> >>> You may check the pinned_groups and flexible_groups to understand if >>> there are host perf events which may be scheduled when VM-exit. But it >>> will not tell the idx of the counters which can only be got when the >>> host event is really scheduled. 
>>> >>>> I have to look into the details. But >>>> at the time of VMEXIT, kvm should already have that information, so it >>>> can immediately decide whether to do the PMU context switch or not. >>>> >>>> oh, but when the control is executing within the run loop, a >>>> host-level profiling starts, say 'perf record -a ...', it will >>>> generate an IPI to all CPUs. Maybe that's when we need a callback so >>>> the KVM guest PMU context gets preempted for the host-level profiling. >>>> Gah.. >>>> >>>> hmm, not a fan of that. That means the host can poke the guest PMU >>>> context at any time and cause higher overhead. But I admit it is much >>>> better than the current approach. >>>> >>>> The only thing is that: any command like 'perf record/stat -a' shot in >>>> dark corners of the host can preempt guest PMUs of _all_ running VMs. >>>> So, to alleviate that, maybe a module parameter that disables this >>>> "preemption" is possible? This should fit scenarios where we don't >>>> want guest PMU to be preempted outside of the vCPU loop? >>>> >>> It should not happen. For the current implementation, perf rejects all >>> the !exclude_guest system-wide event creation if a guest with the vPMU >>> is running. >>> However, it's possible to create an exclude_guest system-wide event at >>> any time. KVM cannot use the information from the VM-entry to decide if >>> there will be active perf events in the VM-exit. >> Hmm, why not? If there is any exclude_guest system-wide event, >> perf_guest_enter() can return something to tell KVM "hey, some active >> host events are swapped out. they are originally in counter #2 and >> #3". If so, at the time when perf_guest_enter() returns, KVM will ack >> that and keep it in its pmu data structure. > I think it's possible that someone creates !exclude_guest event after > the perf_guest_enter(). The stale information is saved in the KVM. Perf > will schedule the event in the next perf_guest_exit(). KVM will not know it. 
> >> Now, when doing context switching back to host at just VMEXIT, KVM >> will check this data and see if host perf context has something active >> (of course, they are all exclude_guest events). If not, deferring the >> context switch to vcpu boundary. Otherwise, do the proper PMU context >> switching by respecting the occupied counter positions on the host >> side, i.e., avoid doubling the work on the KVM side. >> >> Kan, any suggestion on the above approach? > I think we can only know the accurate event list at perf_guest_exit(). > You may check the pinned_groups and flexible_groups, which tell if there > are candidate events. > >> Totally understand that >> there might be some difficulty, since perf subsystem works in several >> layers and obviously fetching low-level mapping is arch specific work. >> If that is difficult, we can split the work in two phases: 1) phase >> #1, just ask perf to tell kvm if there are active exclude_guest events >> swapped out; 2) phase #2, ask perf to tell their (low-level) counter >> indices. >> > If you want an accurate counter mask, the changes in the arch specific > code is required. Two phases sound good to me. > > Besides perf changes, I think the KVM should also track which counters > need to be saved/restored. The information can be get from the EventSel > interception. Yes, that's another optimization from guest point view. It's in our to-do list. > > Thanks, > Kan >>> The perf_guest_exit() will reload the host state. It's impossible to >>> save the guest state after that. We may need a KVM callback. So perf can >>> tell KVM whether to save the guest state before perf reloads the host state. >>> >>> Thanks, >>> Kan >>>>> >>>>>> My original sketch: https://lore.kernel.org/all/ZR3eNtP5IVAHeFNC@googlecom
On Thu, Apr 25, 2024 at 6:46 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > > > On 4/26/2024 5:46 AM, Sean Christopherson wrote: > > On Thu, Apr 25, 2024, Kan Liang wrote: > >> On 2024-04-25 4:16 p.m., Mingwei Zhang wrote: > >>> On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <kan.liang@linux.intel.com> wrote: > >>>> It should not happen. For the current implementation, perf rejects all > >>>> the !exclude_guest system-wide event creation if a guest with the vPMU > >>>> is running. > >>>> However, it's possible to create an exclude_guest system-wide event at > >>>> any time. KVM cannot use the information from the VM-entry to decide if > >>>> there will be active perf events in the VM-exit. > >>> Hmm, why not? If there is any exclude_guest system-wide event, > >>> perf_guest_enter() can return something to tell KVM "hey, some active > >>> host events are swapped out. they are originally in counter #2 and > >>> #3". If so, at the time when perf_guest_enter() returns, KVM will ack > >>> that and keep it in its pmu data structure. > >> I think it's possible that someone creates !exclude_guest event after > > I assume you mean an exclude_guest=1 event? Because perf should be in a state > > where it rejects exclude_guest=0 events. > > Suppose should be exclude_guest=1 event, the perf event without > exclude_guest attribute would be blocked to create in the v2 patches > which we are working on. > > > > > >> the perf_guest_enter(). The stale information is saved in the KVM. Perf > >> will schedule the event in the next perf_guest_exit(). KVM will not know it. > > Ya, the creation of an event on a CPU that currently has guest PMU state loaded > > is what I had in mind when I suggested a callback in my sketch: > > > > : D. 
Add a perf callback that is invoked from IRQ context when perf wants to > > : configure a new PMU-based events, *before* actually programming the MSRs, > > : and have KVM's callback put the guest PMU state > > > when host creates a perf event with exclude_guest attribute which is > used to profile KVM/VMM user space, the vCPU process could work at three > places. > > 1. in guest state (non-root mode) > > 2. inside vcpu-loop > > 3. outside vcpu-loop > > Since the PMU state has already been switched to host state, we don't > need to consider the case 3 and only care about cases 1 and 2. > > when host creates a perf event with exclude_guest attribute to profile > KVM/VMM user space, an IPI is triggered to enable the perf event > eventually like the following code shows. > > event_function_call(event, __perf_event_enable, NULL); > > For case 1, a vm-exit is triggered and KVM starts to process the > vm-exit and then run IPI irq handler, exactly speaking > __perf_event_enable() to enable the perf event. > > For case 2, the IPI irq handler would preempt the vcpu-loop and call > __perf_event_enable() to enable the perf event. > > So IMO KVM just needs to provide a callback to switch guest/host PMU > state, and __perf_event_enable() calls this callback before really > touching PMU MSRs. ok, in this case, do we still need KVM to query perf if there are active exclude_guest events? yes? Because there is an ordering issue. The above suggests that the host-level perf profiling comes when a VM is already running, there is an IPI that can invoke the callback and trigger preemption. In this case, KVM should switch the context from guest to host. What if it is the other way around, ie., host-level profiling runs first and then VM runs? In this case, just before entering the vcpu loop, kvm should check whether there is an active host event and save that into a pmu data structure. 
If none, do the context switch early (so that KVM saves a huge amount of unnecessary PMU context switches in the future). Otherwise, keep the host PMU context until vm-enter. At the time of vm-exit, do the check again using the data stored in pmu structure. If there is an active event do the context switch to the host PMU, otherwise defer that until exiting the vcpu loop. Of course, in the meantime, if there is any perf profiling started causing the IPI, the irq handler calls the callback, preempting the guest PMU context. If that happens, at the time of exiting the vcpu boundary, PMU context switch is skipped since it is already done. Of course, note that the irq could come at any time, so the PMU context switch in all 4 locations need to check the state flag (and skip the context switch if needed). So this requires vcpu->pmu has two pieces of state information: 1) the flag similar to TIF_NEED_FPU_LOAD; 2) host perf context info (phase #1 just a boolean; phase #2, bitmap of occupied counters). This is a non-trivial optimization on the PMU context switch. I am thinking about splitting them into the following phases: 1) lazy PMU context switch, i.e., wait until the guest touches PMU MSR for the 1st time. 2) fast PMU context switch on KVM side, i.e., KVM checking event selector value (enable/disable) and selectively switch PMU state (reducing rd/wr msrs) 3) dynamic PMU context boundary, ie., KVM can dynamically choose PMU context switch boundary depending on existing active host-level events. 3.1) more accurate dynamic PMU context switch, ie., KVM checking host-level counter position and further reduces the number of msr accesses. 4) guest PMU context preemption, i.e., any new host-level perf profiling can immediately preempt the guest PMU in the vcpu loop (instead of waiting for the next PMU context switch in KVM). Thanks. 
-Mingwei > > > > > It's a similar idea to TIF_NEED_FPU_LOAD, just that instead of a common chunk of > > kernel code swapping out the guest state (kernel_fpu_begin()), it's a callback > > into KVM.
On 4/26/2024 11:12 AM, Mingwei Zhang wrote: > On Thu, Apr 25, 2024 at 6:46 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: >> >> On 4/26/2024 5:46 AM, Sean Christopherson wrote: >>> On Thu, Apr 25, 2024, Kan Liang wrote: >>>> On 2024-04-25 4:16 p.m., Mingwei Zhang wrote: >>>>> On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <kan.liang@linux.intel.com> wrote: >>>>>> It should not happen. For the current implementation, perf rejects all >>>>>> the !exclude_guest system-wide event creation if a guest with the vPMU >>>>>> is running. >>>>>> However, it's possible to create an exclude_guest system-wide event at >>>>>> any time. KVM cannot use the information from the VM-entry to decide if >>>>>> there will be active perf events in the VM-exit. >>>>> Hmm, why not? If there is any exclude_guest system-wide event, >>>>> perf_guest_enter() can return something to tell KVM "hey, some active >>>>> host events are swapped out. they are originally in counter #2 and >>>>> #3". If so, at the time when perf_guest_enter() returns, KVM will ack >>>>> that and keep it in its pmu data structure. >>>> I think it's possible that someone creates !exclude_guest event after >>> I assume you mean an exclude_guest=1 event? Because perf should be in a state >>> where it rejects exclude_guest=0 events. >> Suppose should be exclude_guest=1 event, the perf event without >> exclude_guest attribute would be blocked to create in the v2 patches >> which we are working on. >> >> >>>> the perf_guest_enter(). The stale information is saved in the KVM. Perf >>>> will schedule the event in the next perf_guest_exit(). KVM will not know it. >>> Ya, the creation of an event on a CPU that currently has guest PMU state loaded >>> is what I had in mind when I suggested a callback in my sketch: >>> >>> : D. 
Add a perf callback that is invoked from IRQ context when perf wants to >>> : configure a new PMU-based events, *before* actually programming the MSRs, >>> : and have KVM's callback put the guest PMU state >> >> when host creates a perf event with exclude_guest attribute which is >> used to profile KVM/VMM user space, the vCPU process could work at three >> places. >> >> 1. in guest state (non-root mode) >> >> 2. inside vcpu-loop >> >> 3. outside vcpu-loop >> >> Since the PMU state has already been switched to host state, we don't >> need to consider the case 3 and only care about cases 1 and 2. >> >> when host creates a perf event with exclude_guest attribute to profile >> KVM/VMM user space, an IPI is triggered to enable the perf event >> eventually like the following code shows. >> >> event_function_call(event, __perf_event_enable, NULL); >> >> For case 1, a vm-exit is triggered and KVM starts to process the >> vm-exit and then run IPI irq handler, exactly speaking >> __perf_event_enable() to enable the perf event. >> >> For case 2, the IPI irq handler would preempt the vcpu-loop and call >> __perf_event_enable() to enable the perf event. >> >> So IMO KVM just needs to provide a callback to switch guest/host PMU >> state, and __perf_event_enable() calls this callback before really >> touching PMU MSRs. > ok, in this case, do we still need KVM to query perf if there are > active exclude_guest events? yes? Because there is an ordering issue. > The above suggests that the host-level perf profiling comes when a VM > is already running, there is an IPI that can invoke the callback and > trigger preemption. In this case, KVM should switch the context from > guest to host. What if it is the other way around, ie., host-level > profiling runs first and then VM runs? > > In this case, just before entering the vcpu loop, kvm should check > whether there is an active host event and save that into a pmu data > structure. 
If none, do the context switch early (so that KVM saves a > huge amount of unnecessary PMU context switches in the future). > Otherwise, keep the host PMU context until vm-enter. At the time of > vm-exit, do the check again using the data stored in pmu structure. If > there is an active event do the context switch to the host PMU, > otherwise defer that until exiting the vcpu loop. Of course, in the > meantime, if there is any perf profiling started causing the IPI, the > irq handler calls the callback, preempting the guest PMU context. If > that happens, at the time of exiting the vcpu boundary, PMU context > switch is skipped since it is already done. Of course, note that the > irq could come at any time, so the PMU context switch in all 4 > locations need to check the state flag (and skip the context switch if > needed). > > So this requires vcpu->pmu has two pieces of state information: 1) the > flag similar to TIF_NEED_FPU_LOAD; 2) host perf context info (phase #1 > just a boolean; phase #2, bitmap of occupied counters). I still had no chance to look at the details about vFPU implementation, currently I have no idea what we need exactly on vPMU side, a flag or a callback. Anyway, that's just implementation details, we can look at it when starting to implement it. > > This is a non-trivial optimization on the PMU context switch. I am > thinking about splitting them into the following phases: > > 1) lazy PMU context switch, i.e., wait until the guest touches PMU MSR > for the 1st time. > 2) fast PMU context switch on KVM side, i.e., KVM checking event > selector value (enable/disable) and selectively switch PMU state > (reducing rd/wr msrs) > 3) dynamic PMU context boundary, ie., KVM can dynamically choose PMU > context switch boundary depending on existing active host-level > events. > 3.1) more accurate dynamic PMU context switch, ie., KVM checking > host-level counter position and further reduces the number of msr > accesses. 
> 4) guest PMU context preemption, i.e., any new host-level perf > profiling can immediately preempt the guest PMU in the vcpu loop > (instead of waiting for the next PMU context switch in KVM). Great! we have a whole clear picture about the optimization right now. BTW, the optimization 1 and 2 are already on our original to-do list. We plan to do it after RFC v2 is ready. > > Thanks. > -Mingwei >>> It's a similar idea to TIF_NEED_FPU_LOAD, just that instead of a common chunk of >>> kernel code swapping out the guest state (kernel_fpu_begin()), it's a callback >>> into KVM.
On Thu, Apr 25, 2024 at 9:03 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > > > On 4/26/2024 11:12 AM, Mingwei Zhang wrote: > > On Thu, Apr 25, 2024 at 6:46 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > >> > >> On 4/26/2024 5:46 AM, Sean Christopherson wrote: > >>> On Thu, Apr 25, 2024, Kan Liang wrote: > >>>> On 2024-04-25 4:16 p.m., Mingwei Zhang wrote: > >>>>> On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <kan.liang@linux.intel.com> wrote: > >>>>>> It should not happen. For the current implementation, perf rejects all > >>>>>> the !exclude_guest system-wide event creation if a guest with the vPMU > >>>>>> is running. > >>>>>> However, it's possible to create an exclude_guest system-wide event at > >>>>>> any time. KVM cannot use the information from the VM-entry to decide if > >>>>>> there will be active perf events in the VM-exit. > >>>>> Hmm, why not? If there is any exclude_guest system-wide event, > >>>>> perf_guest_enter() can return something to tell KVM "hey, some active > >>>>> host events are swapped out. they are originally in counter #2 and > >>>>> #3". If so, at the time when perf_guest_enter() returns, KVM will ack > >>>>> that and keep it in its pmu data structure. > >>>> I think it's possible that someone creates !exclude_guest event after > >>> I assume you mean an exclude_guest=1 event? Because perf should be in a state > >>> where it rejects exclude_guest=0 events. > >> Suppose should be exclude_guest=1 event, the perf event without > >> exclude_guest attribute would be blocked to create in the v2 patches > >> which we are working on. > >> > >> > >>>> the perf_guest_enter(). The stale information is saved in the KVM. Perf > >>>> will schedule the event in the next perf_guest_exit(). KVM will not know it. > >>> Ya, the creation of an event on a CPU that currently has guest PMU state loaded > >>> is what I had in mind when I suggested a callback in my sketch: > >>> > >>> : D. 
Add a perf callback that is invoked from IRQ context when perf wants to > >>> : configure a new PMU-based events, *before* actually programming the MSRs, > >>> : and have KVM's callback put the guest PMU state > >> > >> when host creates a perf event with exclude_guest attribute which is > >> used to profile KVM/VMM user space, the vCPU process could work at three > >> places. > >> > >> 1. in guest state (non-root mode) > >> > >> 2. inside vcpu-loop > >> > >> 3. outside vcpu-loop > >> > >> Since the PMU state has already been switched to host state, we don't > >> need to consider the case 3 and only care about cases 1 and 2. > >> > >> when host creates a perf event with exclude_guest attribute to profile > >> KVM/VMM user space, an IPI is triggered to enable the perf event > >> eventually like the following code shows. > >> > >> event_function_call(event, __perf_event_enable, NULL); > >> > >> For case 1, a vm-exit is triggered and KVM starts to process the > >> vm-exit and then run IPI irq handler, exactly speaking > >> __perf_event_enable() to enable the perf event. > >> > >> For case 2, the IPI irq handler would preempt the vcpu-loop and call > >> __perf_event_enable() to enable the perf event. > >> > >> So IMO KVM just needs to provide a callback to switch guest/host PMU > >> state, and __perf_event_enable() calls this callback before really > >> touching PMU MSRs. > > ok, in this case, do we still need KVM to query perf if there are > > active exclude_guest events? yes? Because there is an ordering issue. > > The above suggests that the host-level perf profiling comes when a VM > > is already running, there is an IPI that can invoke the callback and > > trigger preemption. In this case, KVM should switch the context from > > guest to host. What if it is the other way around, ie., host-level > > profiling runs first and then VM runs? 
> > > > In this case, just before entering the vcpu loop, kvm should check > > whether there is an active host event and save that into a pmu data > > structure. If none, do the context switch early (so that KVM saves a > > huge amount of unnecessary PMU context switches in the future). > > Otherwise, keep the host PMU context until vm-enter. At the time of > > vm-exit, do the check again using the data stored in pmu structure. If > > there is an active event do the context switch to the host PMU, > > otherwise defer that until exiting the vcpu loop. Of course, in the > > meantime, if there is any perf profiling started causing the IPI, the > > irq handler calls the callback, preempting the guest PMU context. If > > that happens, at the time of exiting the vcpu boundary, PMU context > > switch is skipped since it is already done. Of course, note that the > > irq could come at any time, so the PMU context switch in all 4 > > locations need to check the state flag (and skip the context switch if > > needed). > > > > So this requires vcpu->pmu has two pieces of state information: 1) the > > flag similar to TIF_NEED_FPU_LOAD; 2) host perf context info (phase #1 > > just a boolean; phase #2, bitmap of occupied counters). > > I still had no chance to look at the details about vFPU implementation, > currently I have no idea what we need exactly on vPMU side, a flag or a > callback. Anyway, that's just implementation details, we can look at it > when starting to implement it. I think both. The flag helps to decide whether the context switch has already been done. The callback will always trigger the context switch, but the context switch code should always check if the switch has already been done. FPU context switch is similar but slightly different. That is done at the host-level context switch boundary and even crossing that boundary as long as the next process/thread is not using FPU and/or not going back to userspace. I don't think we want to defer it that far. 
Instead, the PMU context switch should still happen within the range of KVM. > > > > > This is a non-trivial optimization on the PMU context switch. I am > > thinking about splitting them into the following phases: > > > > 1) lazy PMU context switch, i.e., wait until the guest touches PMU MSR > > for the 1st time. > > 2) fast PMU context switch on KVM side, i.e., KVM checking event > > selector value (enable/disable) and selectively switch PMU state > > (reducing rd/wr msrs) > > 3) dynamic PMU context boundary, ie., KVM can dynamically choose PMU > > context switch boundary depending on existing active host-level > > events. > > 3.1) more accurate dynamic PMU context switch, ie., KVM checking > > host-level counter position and further reduces the number of msr > > accesses. > > 4) guest PMU context preemption, i.e., any new host-level perf > > profiling can immediately preempt the guest PMU in the vcpu loop > > (instead of waiting for the next PMU context switch in KVM). > > Great! we have a whole clear picture about the optimization right now. > BTW, the optimization 1 and 2 are already on our original to-do list. We > plan to do it after RFC v2 is ready. > I am going to summarize that into a design doc. It has been 50 emails in this thread. I am sure no one has the patience to read our garbage unless they are involved at the very beginning :) Any of the implementations are very welcome. 1) and 2) are low hanging fruit and we can finish that quickly after v2. 3-4 is error prone and needs further discussions so let's not rush for that. On the other hand, how do we test this is the question we need to think about. Thanks. -Mingwei
On 2024-04-25 5:46 p.m., Sean Christopherson wrote: > On Thu, Apr 25, 2024, Kan Liang wrote: >> On 2024-04-25 4:16 p.m., Mingwei Zhang wrote: >>> On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <kan.liang@linux.intel.com> wrote: >>>> It should not happen. For the current implementation, perf rejects all >>>> the !exclude_guest system-wide event creation if a guest with the vPMU >>>> is running. >>>> However, it's possible to create an exclude_guest system-wide event at >>>> any time. KVM cannot use the information from the VM-entry to decide if >>>> there will be active perf events in the VM-exit. >>> >>> Hmm, why not? If there is any exclude_guest system-wide event, >>> perf_guest_enter() can return something to tell KVM "hey, some active >>> host events are swapped out. they are originally in counter #2 and >>> #3". If so, at the time when perf_guest_enter() returns, KVM will ack >>> that and keep it in its pmu data structure. >> >> I think it's possible that someone creates !exclude_guest event after > > I assume you mean an exclude_guest=1 event? Because perf should be in a state > where it rejects exclude_guest=0 events. > Right. >> the perf_guest_enter(). The stale information is saved in the KVM. Perf >> will schedule the event in the next perf_guest_exit(). KVM will not know it. > > Ya, the creation of an event on a CPU that currently has guest PMU state loaded > is what I had in mind when I suggested a callback in my sketch: > > : D. Add a perf callback that is invoked from IRQ context when perf wants to > : configure a new PMU-based events, *before* actually programming the MSRs, > : and have KVM's callback put the guest PMU state > > It's a similar idea to TIF_NEED_FPU_LOAD, just that instead of a common chunk of > kernel code swapping out the guest state (kernel_fpu_begin()), it's a callback > into KVM. Yes, a callback should be required. 
I think it should be done right before switching back to the host perf events, so there are an accurate active event list. Thanks, Kan
On 2024-04-25 11:12 p.m., Mingwei Zhang wrote: >>>> the perf_guest_enter(). The stale information is saved in the KVM. Perf >>>> will schedule the event in the next perf_guest_exit(). KVM will not know it. >>> Ya, the creation of an event on a CPU that currently has guest PMU state loaded >>> is what I had in mind when I suggested a callback in my sketch: >>> >>> : D. Add a perf callback that is invoked from IRQ context when perf wants to >>> : configure a new PMU-based events, *before* actually programming the MSRs, >>> : and have KVM's callback put the guest PMU state >> >> when host creates a perf event with exclude_guest attribute which is >> used to profile KVM/VMM user space, the vCPU process could work at three >> places. >> >> 1. in guest state (non-root mode) >> >> 2. inside vcpu-loop >> >> 3. outside vcpu-loop >> >> Since the PMU state has already been switched to host state, we don't >> need to consider the case 3 and only care about cases 1 and 2. >> >> when host creates a perf event with exclude_guest attribute to profile >> KVM/VMM user space, an IPI is triggered to enable the perf event >> eventually like the following code shows. >> >> event_function_call(event, __perf_event_enable, NULL); >> >> For case 1, a vm-exit is triggered and KVM starts to process the >> vm-exit and then run IPI irq handler, exactly speaking >> __perf_event_enable() to enable the perf event. >> >> For case 2, the IPI irq handler would preempt the vcpu-loop and call >> __perf_event_enable() to enable the perf event. >> >> So IMO KVM just needs to provide a callback to switch guest/host PMU >> state, and __perf_event_enable() calls this callback before really >> touching PMU MSRs. > ok, in this case, do we still need KVM to query perf if there are > active exclude_guest events? yes? Because there is an ordering issue. 
> The above suggests that the host-level perf profiling comes when a VM > is already running, there is an IPI that can invoke the callback and > trigger preemption. In this case, KVM should switch the context from > guest to host. What if it is the other way around, ie., host-level > profiling runs first and then VM runs? > > In this case, just before entering the vcpu loop, kvm should check > whether there is an active host event and save that into a pmu data > structure. KVM doesn't need to save/restore the host state. Host perf has the information and will reload the values whenever the host events are rescheduled. But I think KVM should clear the registers used by the host to prevent the value leaks to the guest. > If none, do the context switch early (so that KVM saves a > huge amount of unnecessary PMU context switches in the future). > Otherwise, keep the host PMU context until vm-enter. At the time of > vm-exit, do the check again using the data stored in pmu structure. If > there is an active event do the context switch to the host PMU, > otherwise defer that until exiting the vcpu loop. Of course, in the > meantime, if there is any perf profiling started causing the IPI, the > irq handler calls the callback, preempting the guest PMU context. If > that happens, at the time of exiting the vcpu boundary, PMU context > switch is skipped since it is already done. Of course, note that the > irq could come at any time, so the PMU context switch in all 4 > locations need to check the state flag (and skip the context switch if > needed). > > So this requires vcpu->pmu has two pieces of state information: 1) the > flag similar to TIF_NEED_FPU_LOAD; 2) host perf context info (phase #1 > just a boolean; phase #2, bitmap of occupied counters). > > This is a non-trivial optimization on the PMU context switch. I am > thinking about splitting them into the following phases: > > 1) lazy PMU context switch, i.e., wait until the guest touches PMU MSR > for the 1st time. 
> 2) fast PMU context switch on KVM side, i.e., KVM checking event > selector value (enable/disable) and selectively switch PMU state > (reducing rd/wr msrs) > 3) dynamic PMU context boundary, ie., KVM can dynamically choose PMU > context switch boundary depending on existing active host-level > events. > 3.1) more accurate dynamic PMU context switch, ie., KVM checking > host-level counter position and further reduces the number of msr > accesses. > 4) guest PMU context preemption, i.e., any new host-level perf > profiling can immediately preempt the guest PMU in the vcpu loop > (instead of waiting for the next PMU context switch in KVM). I'm not quite sure about the 4. The new host-level perf event must be an exclude_guest event. It should not be scheduled when a guest is using the PMU. Why do we want to preempt the guest PMU? The current implementation in perf doesn't schedule any exclude_guest events when a guest is running. Thanks, Kan
On Fri, Apr 26, 2024 at 7:10 AM Liang, Kan <kan.liang@linux.intel.com> wrote: > > > > On 2024-04-25 11:12 p.m., Mingwei Zhang wrote: > >>>> the perf_guest_enter(). The stale information is saved in the KVM. Perf > >>>> will schedule the event in the next perf_guest_exit(). KVM will not know it. > >>> Ya, the creation of an event on a CPU that currently has guest PMU state loaded > >>> is what I had in mind when I suggested a callback in my sketch: > >>> > >>> : D. Add a perf callback that is invoked from IRQ context when perf wants to > >>> : configure a new PMU-based events, *before* actually programming the MSRs, > >>> : and have KVM's callback put the guest PMU state > >> > >> when host creates a perf event with exclude_guest attribute which is > >> used to profile KVM/VMM user space, the vCPU process could work at three > >> places. > >> > >> 1. in guest state (non-root mode) > >> > >> 2. inside vcpu-loop > >> > >> 3. outside vcpu-loop > >> > >> Since the PMU state has already been switched to host state, we don't > >> need to consider the case 3 and only care about cases 1 and 2. > >> > >> when host creates a perf event with exclude_guest attribute to profile > >> KVM/VMM user space, an IPI is triggered to enable the perf event > >> eventually like the following code shows. > >> > >> event_function_call(event, __perf_event_enable, NULL); > >> > >> For case 1, a vm-exit is triggered and KVM starts to process the > >> vm-exit and then run IPI irq handler, exactly speaking > >> __perf_event_enable() to enable the perf event. > >> > >> For case 2, the IPI irq handler would preempt the vcpu-loop and call > >> __perf_event_enable() to enable the perf event. > >> > >> So IMO KVM just needs to provide a callback to switch guest/host PMU > >> state, and __perf_event_enable() calls this callback before really > >> touching PMU MSRs. > > ok, in this case, do we still need KVM to query perf if there are > > active exclude_guest events? yes? 
Because there is an ordering issue. > > The above suggests that the host-level perf profiling comes when a VM > > is already running, there is an IPI that can invoke the callback and > > trigger preemption. In this case, KVM should switch the context from > > guest to host. What if it is the other way around, ie., host-level > > profiling runs first and then VM runs? > > > > In this case, just before entering the vcpu loop, kvm should check > > whether there is an active host event and save that into a pmu data > > structure. > > KVM doesn't need to save/restore the host state. Host perf has the > information and will reload the values whenever the host events are > rescheduled. But I think KVM should clear the registers used by the host > to prevent the value leaks to the guest. Right, KVM needs to know about the host state to optimize its own PMU context switch. If the host is using a counter of index, say 1, then KVM may not need to zap the value of counter #1, since perf side will override it. > > > If none, do the context switch early (so that KVM saves a > > huge amount of unnecessary PMU context switches in the future). > > Otherwise, keep the host PMU context until vm-enter. At the time of > > vm-exit, do the check again using the data stored in pmu structure. If > > there is an active event do the context switch to the host PMU, > > otherwise defer that until exiting the vcpu loop. Of course, in the > > meantime, if there is any perf profiling started causing the IPI, the > > irq handler calls the callback, preempting the guest PMU context. If > > that happens, at the time of exiting the vcpu boundary, PMU context > > switch is skipped since it is already done. Of course, note that the > > irq could come at any time, so the PMU context switch in all 4 > > locations need to check the state flag (and skip the context switch if > > needed). 
> > > > So this requires vcpu->pmu has two pieces of state information: 1) the > > flag similar to TIF_NEED_FPU_LOAD; 2) host perf context info (phase #1 > > just a boolean; phase #2, bitmap of occupied counters). > > > > This is a non-trivial optimization on the PMU context switch. I am > > thinking about splitting them into the following phases: > > > > 1) lazy PMU context switch, i.e., wait until the guest touches PMU MSR > > for the 1st time. > > 2) fast PMU context switch on KVM side, i.e., KVM checking event > > selector value (enable/disable) and selectively switch PMU state > > (reducing rd/wr msrs) > > 3) dynamic PMU context boundary, ie., KVM can dynamically choose PMU > > context switch boundary depending on existing active host-level > > events. > > 3.1) more accurate dynamic PMU context switch, ie., KVM checking > > host-level counter position and further reduces the number of msr > > accesses. > > 4) guest PMU context preemption, i.e., any new host-level perf > > profiling can immediately preempt the guest PMU in the vcpu loop > > (instead of waiting for the next PMU context switch in KVM). > > I'm not quit sure about the 4. > The new host-level perf must be an exclude_guest event. It should not be > scheduled when a guest is using the PMU. Why do we want to preempt the > guest PMU? The current implementation in perf doesn't schedule any > exclude_guest events when a guest is running. right. The grey area is the code within the KVM_RUN loop, but _outside_ of the guest. This part of the code is on the "host" side. However, for efficiency reasons, KVM defers the PMU context switch by retaining the guest PMU MSR values within the loop. Optimization 4 allows the host side to immediately profiling this part instead of waiting for vcpu to reach to PMU context switch locations. Doing so will generate more accurate results. Do we want to preempt that? I think it depends. For regular cloud usage, we don't. 
But for any other usages where we want to prioritize KVM/VMM profiling over guest vPMU, it is useful. My current opinion is that optimization 4 is something nice to have. But we should allow people to turn it off just like we could choose to disable preempt kernel. Thanks. -Mingwei > > Thanks, > Kan
On 2024-04-26 2:41 p.m., Mingwei Zhang wrote: >>> So this requires vcpu->pmu has two pieces of state information: 1) the >>> flag similar to TIF_NEED_FPU_LOAD; 2) host perf context info (phase #1 >>> just a boolean; phase #2, bitmap of occupied counters). >>> >>> This is a non-trivial optimization on the PMU context switch. I am >>> thinking about splitting them into the following phases: >>> >>> 1) lazy PMU context switch, i.e., wait until the guest touches PMU MSR >>> for the 1st time. >>> 2) fast PMU context switch on KVM side, i.e., KVM checking event >>> selector value (enable/disable) and selectively switch PMU state >>> (reducing rd/wr msrs) >>> 3) dynamic PMU context boundary, ie., KVM can dynamically choose PMU >>> context switch boundary depending on existing active host-level >>> events. >>> 3.1) more accurate dynamic PMU context switch, ie., KVM checking >>> host-level counter position and further reduces the number of msr >>> accesses. >>> 4) guest PMU context preemption, i.e., any new host-level perf >>> profiling can immediately preempt the guest PMU in the vcpu loop >>> (instead of waiting for the next PMU context switch in KVM). >> I'm not quit sure about the 4. >> The new host-level perf must be an exclude_guest event. It should not be >> scheduled when a guest is using the PMU. Why do we want to preempt the >> guest PMU? The current implementation in perf doesn't schedule any >> exclude_guest events when a guest is running. > right. The grey area is the code within the KVM_RUN loop, but > _outside_ of the guest. This part of the code is on the "host" side. > However, for efficiency reasons, KVM defers the PMU context switch by > retaining the guest PMU MSR values within the loop. I assume you mean the optimization of moving the context switch from VM-exit/entry boundary to the vCPU boundary. > Optimization 4 > allows the host side to immediately profiling this part instead of > waiting for vcpu to reach to PMU context switch locations. 
Doing so > will generate more accurate results. If so, I think the 4 is a must to have. Otherwise, it wouldn't honor the definition of the exclude_guest. Without 4, it brings some random blind spots, right? > > Do we want to preempt that? I think it depends. For regular cloud > usage, we don't. But for any other usages where we want to prioritize > KVM/VMM profiling over guest vPMU, it is useful. > > My current opinion is that optimization 4 is something nice to have. > But we should allow people to turn it off just like we could choose to > disable preempt kernel. The exclude_guest means everything but the guest. I don't see a reason why people want to turn it off and get some random blind spots. Thanks, Kan
On Fri, Apr 26, 2024, Kan Liang wrote: > > Optimization 4 > > allows the host side to immediately profiling this part instead of > > waiting for vcpu to reach to PMU context switch locations. Doing so > > will generate more accurate results. > > If so, I think the 4 is a must to have. Otherwise, it wouldn't honor the > definition of the exclude_guest. Without 4, it brings some random blind > spots, right? +1, I view it as a hard requirement. It's not an optimization, it's about accuracy and functional correctness. What _is_ an optimization is keeping guest state loaded while KVM is in its run loop, i.e. initial mediated/passthrough PMU support could land upstream with unconditional switches at entry/exit. The performance of KVM would likely be unacceptable for any production use cases, but that would give us motivation to finish the job, and it doesn't result in random, hard to diagnose issues for userspace. > > Do we want to preempt that? I think it depends. For regular cloud > usage, we don't. But for any other usages where we want to prioritize > KVM/VMM profiling over guest vPMU, it is useful. > > My current opinion is that optimization 4 is something nice to have. > But we should allow people to turn it off just like we could choose to > disable preempt kernel. > > The exclude_guest means everything but the guest. I don't see a reason > why people want to turn it off and get some random blind spots.
On Fri, Apr 26, 2024 at 12:46 PM Sean Christopherson <seanjc@google.com> wrote: > > On Fri, Apr 26, 2024, Kan Liang wrote: > > > Optimization 4 > > > allows the host side to immediately profiling this part instead of > > > waiting for vcpu to reach to PMU context switch locations. Doing so > > > will generate more accurate results. > > > > If so, I think the 4 is a must to have. Otherwise, it wouldn't honer the > > definition of the exclude_guest. Without 4, it brings some random blind > > spots, right? > > +1, I view it as a hard requirement. It's not an optimization, it's about > accuracy and functional correctness. Well. Does it have to be a _hard_ requirement? no? The irq handler triggered by "perf record -a" could just inject a "state". Instead of immediately preempting the guest PMU context, perf subsystem could allow KVM defer the context switch when it reaches the next PMU context switch location. This is the same as the preemption kernel logic. Do you want me to stop the work immediately? Yes (if you enable preemption), or No, let me finish my job and get to the scheduling point. Implementing this might be more difficult to debug. That's my real concern. If we do not enable preemption, the PMU context switch will only happen at the 2 pairs of locations. If we enable preemption, it could happen at any time. > > What _is_ an optimization is keeping guest state loaded while KVM is in its > run loop, i.e. initial mediated/passthrough PMU support could land upstream with > unconditional switches at entry/exit. The performance of KVM would likely be > unacceptable for any production use cases, but that would give us motivation to > finish the job, and it doesn't result in random, hard to diagnose issues for > userspace. That's true. I agree with that. > > > > Do we want to preempt that? I think it depends. For regular cloud > > > usage, we don't. But for any other usages where we want to prioritize > > > KVM/VMM profiling over guest vPMU, it is useful. 
> > > > > > My current opinion is that optimization 4 is something nice to have. > > > But we should allow people to turn it off just like we could choose to > > > disable preempt kernel. > > > > The exclude_guest means everything but the guest. I don't see a reason > > why people want to turn it off and get some random blind spots.
On 4/27/2024 11:04 AM, Mingwei Zhang wrote: > On Fri, Apr 26, 2024 at 12:46 PM Sean Christopherson <seanjc@google.com> wrote: >> On Fri, Apr 26, 2024, Kan Liang wrote: >>>> Optimization 4 >>>> allows the host side to immediately profiling this part instead of >>>> waiting for vcpu to reach to PMU context switch locations. Doing so >>>> will generate more accurate results. >>> If so, I think the 4 is a must to have. Otherwise, it wouldn't honor the >>> definition of the exclude_guest. Without 4, it brings some random blind >>> spots, right? >> +1, I view it as a hard requirement. It's not an optimization, it's about >> accuracy and functional correctness. > Well. Does it have to be a _hard_ requirement? no? The irq handler > triggered by "perf record -a" could just inject a "state". Instead of > immediately preempting the guest PMU context, perf subsystem could > allow KVM defer the context switch when it reaches the next PMU > context switch location. > > This is the same as the preemption kernel logic. Do you want me to > stop the work immediately? Yes (if you enable preemption), or No, let > me finish my job and get to the scheduling point. > > Implementing this might be more difficult to debug. That's my real > concern. If we do not enable preemption, the PMU context switch will > only happen at the 2 pairs of locations. If we enable preemption, it > could happen at any time. IMO I don't prefer to add a switch to enable/disable the preemption. I think current implementation is already complicated enough and unnecessary to introduce a new parameter to confuse users. Furthermore, the switch could introduce an uncertainty and may mislead the perf user to read the perf stats incorrectly. As for debug, it won't bring any difference as long as no host event is created. > >> What _is_ an optimization is keeping guest state loaded while KVM is in its >> run loop, i.e. 
initial mediated/passthrough PMU support could land upstream with >> unconditional switches at entry/exit. The performance of KVM would likely be >> unacceptable for any production use cases, but that would give us motivation to >> finish the job, and it doesn't result in random, hard to diagnose issues for >> userspace. > That's true. I agree with that. > >>>> Do we want to preempt that? I think it depends. For regular cloud >>>> usage, we don't. But for any other usages where we want to prioritize >>>> KVM/VMM profiling over guest vPMU, it is useful. >>>> >>>> My current opinion is that optimization 4 is something nice to have. >>>> But we should allow people to turn it off just like we could choose to >>>> disable preempt kernel. >>> The exclude_guest means everything but the guest. I don't see a reason >>> why people want to turn it off and get some random blind spots.
On Sat, Apr 27, 2024 at 5:59 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > > > On 4/27/2024 11:04 AM, Mingwei Zhang wrote: > > On Fri, Apr 26, 2024 at 12:46 PM Sean Christopherson <seanjc@google.com> wrote: > >> On Fri, Apr 26, 2024, Kan Liang wrote: > >>>> Optimization 4 > >>>> allows the host side to immediately profiling this part instead of > >>>> waiting for vcpu to reach to PMU context switch locations. Doing so > >>>> will generate more accurate results. > >>> If so, I think the 4 is a must to have. Otherwise, it wouldn't honer the > >>> definition of the exclude_guest. Without 4, it brings some random blind > >>> spots, right? > >> +1, I view it as a hard requirement. It's not an optimization, it's about > >> accuracy and functional correctness. > > Well. Does it have to be a _hard_ requirement? no? The irq handler > > triggered by "perf record -a" could just inject a "state". Instead of > > immediately preempting the guest PMU context, perf subsystem could > > allow KVM defer the context switch when it reaches the next PMU > > context switch location. > > > > This is the same as the preemption kernel logic. Do you want me to > > stop the work immediately? Yes (if you enable preemption), or No, let > > me finish my job and get to the scheduling point. > > > > Implementing this might be more difficult to debug. That's my real > > concern. If we do not enable preemption, the PMU context switch will > > only happen at the 2 pairs of locations. If we enable preemption, it > > could happen at any time. > > IMO I don't prefer to add a switch to enable/disable the preemption. I > think current implementation is already complicated enough and > unnecessary to introduce an new parameter to confuse users. Furthermore, > the switch could introduce an uncertainty and may mislead the perf user > to read the perf stats incorrectly. As for debug, it won't bring any > difference as long as no host event is created. > That's ok. It is about opinions and brainstorming. 
Adding a parameter to disable preemption is from the cloud usage perspective. The conflict of opinions is which one you prioritize: guest PMU or the host PMU? If you stand on the guest vPMU usage perspective, do you want anyone on the host to shoot a profiling command and generate turbulence? no. If you stand on the host PMU perspective and you want to profile VMM/KVM, you definitely want accuracy and no delay at all. Thanks. -Mingwei > > > > >> What _is_ an optimization is keeping guest state loaded while KVM is in its > >> run loop, i.e. initial mediated/passthrough PMU support could land upstream with > >> unconditional switches at entry/exit. The performance of KVM would likely be > >> unacceptable for any production use cases, but that would give us motivation to > >> finish the job, and it doesn't result in random, hard to diagnose issues for > >> userspace. > > That's true. I agree with that. > > > >>>> Do we want to preempt that? I think it depends. For regular cloud > >>>> usage, we don't. But for any other usages where we want to prioritize > >>>> KVM/VMM profiling over guest vPMU, it is useful. > >>>> > >>>> My current opinion is that optimization 4 is something nice to have. > >>>> But we should allow people to turn it off just like we could choose to > >>>> disable preempt kernel. > >>> The exclude_guest means everything but the guest. I don't see a reason > >>> why people want to turn it off and get some random blind spots.
On 2024-04-26 11:04 p.m., Mingwei Zhang wrote: > On Fri, Apr 26, 2024 at 12:46 PM Sean Christopherson <seanjc@google.com> wrote: >> >> On Fri, Apr 26, 2024, Kan Liang wrote: >>>> Optimization 4 >>>> allows the host side to immediately profiling this part instead of >>>> waiting for vcpu to reach to PMU context switch locations. Doing so >>>> will generate more accurate results. >>> >>> If so, I think the 4 is a must to have. Otherwise, it wouldn't honor the >>> definition of the exclude_guest. Without 4, it brings some random blind >>> spots, right? >> >> +1, I view it as a hard requirement. It's not an optimization, it's about >> accuracy and functional correctness. > > Well. Does it have to be a _hard_ requirement? no? The irq handler > triggered by "perf record -a" could just inject a "state". Instead of > immediately preempting the guest PMU context, perf subsystem could > allow KVM defer the context switch when it reaches the next PMU > context switch location. It depends on what the upcoming PMU context switch location is. If it's the upcoming VM-exit/entry, the defer should be fine. Because it's an exclude_guest event, nothing should be counted when a VM is running. If it's the upcoming vCPU boundary, no. I think there may be several VM-exit/entry before the upcoming vCPU switch. We may lose some results. > > This is the same as the preemption kernel logic. Do you want me to > stop the work immediately? Yes (if you enable preemption), or No, let > me finish my job and get to the scheduling point. I don't think it's necessary. Just making sure that the counters are scheduled at the upcoming VM-exit/entry boundary should be fine. Thanks, Kan > > Implementing this might be more difficult to debug. That's my real > concern. If we do not enable preemption, the PMU context switch will > only happen at the 2 pairs of locations. If we enable preemption, it > could happen at any time. 
> >> >> What _is_ an optimization is keeping guest state loaded while KVM is in its >> run loop, i.e. initial mediated/passthrough PMU support could land upstream with >> unconditional switches at entry/exit. The performance of KVM would likely be >> unacceptable for any production use cases, but that would give us motivation to >> finish the job, and it doesn't result in random, hard to diagnose issues for >> userspace. > > That's true. I agree with that. > >> >>>> Do we want to preempt that? I think it depends. For regular cloud >>>> usage, we don't. But for any other usages where we want to prioritize >>>> KVM/VMM profiling over guest vPMU, it is useful. >>>> >>>> My current opinion is that optimization 4 is something nice to have. >>>> But we should allow people to turn it off just like we could choose to >>>> disable preempt kernel. >>> >>> The exclude_guest means everything but the guest. I don't see a reason >>> why people want to turn it off and get some random blind spots. >
On Sat, Apr 27, 2024, Mingwei Zhang wrote: > On Sat, Apr 27, 2024 at 5:59 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > > > > > > On 4/27/2024 11:04 AM, Mingwei Zhang wrote: > > > On Fri, Apr 26, 2024 at 12:46 PM Sean Christopherson <seanjc@google.com> wrote: > > >> On Fri, Apr 26, 2024, Kan Liang wrote: > > >>>> Optimization 4 > > >>>> allows the host side to immediately profiling this part instead of > > >>>> waiting for vcpu to reach to PMU context switch locations. Doing so > > >>>> will generate more accurate results. > > >>> If so, I think the 4 is a must to have. Otherwise, it wouldn't honer the > > >>> definition of the exclude_guest. Without 4, it brings some random blind > > >>> spots, right? > > >> +1, I view it as a hard requirement. It's not an optimization, it's about > > >> accuracy and functional correctness. > > > Well. Does it have to be a _hard_ requirement? no? Assuming I understand how perf_event_open() works, which may be a fairly big assumption, for me, yes, this is a hard requirement. > > > The irq handler triggered by "perf record -a" could just inject a > > > "state". Instead of immediately preempting the guest PMU context, perf > > > subsystem could allow KVM defer the context switch when it reaches the > > > next PMU context switch location. FWIW, forcefully interrupting the guest isn't a hard requirement, but practically speaking I think that will yield the simplest, most robust implementation. > > > This is the same as the preemption kernel logic. Do you want me to > > > stop the work immediately? Yes (if you enable preemption), or No, let > > > me finish my job and get to the scheduling point. Not really. Task scheduling is by its nature completely exclusive, i.e. it's not possible to concurrently run multiple tasks on a single logical CPU. Given a single CPU, to run task B, task A _must_ be scheduled out. That is not the case here. Profiling the host with exclude_guest=1 isn't mutually exclusive with the guest using the PMU. 
There's obviously the additional overhead of switching PMU context, but the two uses themselves are not mutually exclusive. And more importantly, perf_event_open() already has well-established ABI where it can install events across CPUs. And when perf_event_open() returns, userspace can rely on the event being active and counting (assuming it wasn't disabled by default). > > > Implementing this might be more difficult to debug. That's my real > > > concern. If we do not enable preemption, the PMU context switch will > > > only happen at the 2 pairs of locations. If we enable preemption, it > > > could happen at any time. Yes and no. I agree that swapping guest/host state from IRQ context could lead to hard to debug issues, but NOT doing so could also lead to hard to debug issues. And even worse, those issues would likely be unique to specific kernel and/or system configurations. E.g. userspace creates an event, but sometimes it randomly doesn't count correctly. Is the problem simply that it took a while for KVM to get to a scheduling point, or is there a race lurking? And what happens if the vCPU is the only runnable task on its pCPU, i.e. never gets scheduled out? Mix in all of the possible preemption and scheduler models, and other sources of forced rescheduling, e.g. RCU, and the number of factors to account for becomes quite terrifying. > > IMO I don't prefer to add a switch to enable/disable the preemption. I > > think current implementation is already complicated enough and > > unnecessary to introduce an new parameter to confuse users. Furthermore, > > the switch could introduce an uncertainty and may mislead the perf user > > to read the perf stats incorrectly. +1000. > > As for debug, it won't bring any difference as long as no host event is created. > > > That's ok. It is about opinions and brainstorming. Adding a parameter > to disable preemption is from the cloud usage perspective. 
The > conflict of opinions is which one you prioritize: guest PMU or the > host PMU? If you stand on the guest vPMU usage perspective, do you > want anyone on the host to shoot a profiling command and generate > turbulence? no. If you stand on the host PMU perspective and you want > to profile VMM/KVM, you definitely want accuracy and no delay at all. Hard no from me. Attempting to support two fundamentally different models means twice the maintenance burden. The *best* case scenario is that usage is roughly a 50/50 split. The worst case scenario is that the majority of users favor one model over the other, thus resulting in extremely limited testing of the minority model. KVM already has this problem with scheduler preemption models, and it's painful. The overwhelming majority of KVM users run non-preemptible kernels, and so our test coverage for preemptible kernels is abysmal. E.g. the TDP MMU effectively had a fatal flaw with preemptible kernels that went unnoticed for many kernel releases[*], until _another_ bug introduced with dynamic preemption models resulted in users running code that was supposed to be specific to preemptible kernels. [* https://lore.kernel.org/kvm/ef81ff36-64bb-4cfe-ae9b-e3acf47bff24@proxmox.com
On Mon, Apr 29, 2024 at 10:44 AM Sean Christopherson <seanjc@google.com> wrote: > > On Sat, Apr 27, 2024, Mingwei Zhang wrote: > > On Sat, Apr 27, 2024 at 5:59 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: > > > > > > > > > On 4/27/2024 11:04 AM, Mingwei Zhang wrote: > > > > On Fri, Apr 26, 2024 at 12:46 PM Sean Christopherson <seanjc@google.com> wrote: > > > >> On Fri, Apr 26, 2024, Kan Liang wrote: > > > >>>> Optimization 4 > > > >>>> allows the host side to immediately profiling this part instead of > > > >>>> waiting for vcpu to reach to PMU context switch locations. Doing so > > > >>>> will generate more accurate results. > > > >>> If so, I think the 4 is a must to have. Otherwise, it wouldn't honer the > > > >>> definition of the exclude_guest. Without 4, it brings some random blind > > > >>> spots, right? > > > >> +1, I view it as a hard requirement. It's not an optimization, it's about > > > >> accuracy and functional correctness. > > > > Well. Does it have to be a _hard_ requirement? no? > > Assuming I understand how perf_event_open() works, which may be a fairly big > assumption, for me, yes, this is a hard requirement. > > > > > The irq handler triggered by "perf record -a" could just inject a > > > > "state". Instead of immediately preempting the guest PMU context, perf > > > > subsystem could allow KVM defer the context switch when it reaches the > > > > next PMU context switch location. > > FWIW, forcefully interrupting the guest isn't a hard requirement, but practically > speaking I think that will yield the simplest, most robust implementation. > > > > > This is the same as the preemption kernel logic. Do you want me to > > > > stop the work immediately? Yes (if you enable preemption), or No, let > > > > me finish my job and get to the scheduling point. > > Not really. Task scheduling is by its nature completely exclusive, i.e. it's > not possible to concurrently run multiple tasks on a single logical CPU. 
Given > a single CPU, to run task B, task A _must_ be scheduled out. > > That is not the case here. Profiling the host with exclude_guest=1 isn't mutually > exclusive with the guest using the PMU. There's obviously the additional overhead > of switching PMU context, but the two uses themselves are not mutually exclusive. > > And more importantly, perf_event_open() already has well-established ABI where it > can install events across CPUs. And when perf_event_open() returns, userspace can > rely on the event being active and counting (assuming it wasn't disabled by default). > > > > > Implementing this might be more difficult to debug. That's my real > > > > concern. If we do not enable preemption, the PMU context switch will > > > > only happen at the 2 pairs of locations. If we enable preemption, it > > > > could happen at any time. > > Yes and no. I agree that swapping guest/host state from IRQ context could lead > to hard to debug issues, but NOT doing so could also lead to hard to debug issues. > And even worse, those issues would likely be unique to specific kernel and/or > system configurations. > > E.g. userspace creates an event, but sometimes it randomly doesn't count correctly. > Is the problem simply that it took a while for KVM to get to a scheduling point, > or is there a race lurking? And what happens if the vCPU is the only runnable task > on its pCPU, i.e. never gets scheduled out? > > Mix in all of the possible preemption and scheduler models, and other sources of > forced rescheduling, e.g. RCU, and the number of factors to account for becomes > quite terrifying. > > > > IMO I don't prefer to add a switch to enable/disable the preemption. I > > > think current implementation is already complicated enough and > > > unnecessary to introduce an new parameter to confuse users. Furthermore, > > > the switch could introduce an uncertainty and may mislead the perf user > > > to read the perf stats incorrectly. > > +1000. 
> > > > As for debug, it won't bring any difference as long as no host event is created. > > > > > That's ok. It is about opinions and brainstorming. Adding a parameter > > to disable preemption is from the cloud usage perspective. The > > conflict of opinions is which one you prioritize: guest PMU or the > > host PMU? If you stand on the guest vPMU usage perspective, do you > > want anyone on the host to shoot a profiling command and generate > > turbulence? no. If you stand on the host PMU perspective and you want > > to profile VMM/KVM, you definitely want accuracy and no delay at all. > > Hard no from me. Attempting to support two fundamentally different models means > twice the maintenance burden. The *best* case scenario is that usage is roughly > a 50/50 spit. The worst case scenario is that the majority of users favor one > model over the other, thus resulting in extremely limited tested of the minority > model. > > KVM already has this problem with scheduler preemption models, and it's painful. > The overwhelming majority of KVM users run non-preemptible kernels, and so our > test coverage for preemtible kernels is abysmal. > > E.g. the TDP MMU effectively had a fatal flaw with preemptible kernels that went > unnoticed for many kernel releases[*], until _another_ bug introduced with dynamic > preemption models resulted in users running code that was supposed to be specific > to preemtible kernels. > > [* https://lore.kernel.org/kvm/ef81ff36-64bb-4cfe-ae9b-e3acf47bff24@proxmox.com > I hear your voice, Sean. In our cloud, we have a host-level profiling going on for all cores periodically. It will be profiling X seconds every Y minute. Having the host-level profiling using exclude_guest is fine, but stopping the host-level profiling is a no no. Tweaking the X and Y is theoretically possible, but highly likely out of the scope of virtualization. Now, some of the VMs might be actively using vPMU at the same time. 
How can we properly ensure the guest vPMU has consistent performance? Instead of letting the VM suffer from the high overhead of PMU for X seconds of every Y minute? Any thought/help is appreciated. I see the logic of having preemption there for correctness of the profiling on the host level. Doing this, however, negatively impacts the above business usage. One of the things top of mind is that: there seems to be no way for the perf subsystem to express this: "no, your host-level profiling is not interested in profiling the KVM_RUN loop when our guest vPMU is actively running". Thanks. -Mingwei
On 2024-05-01 1:43 p.m., Mingwei Zhang wrote: > One of the things on top of the mind is that: there seems to be no way > for the perf subsystem to express this: "no, your host-level profiling > is not interested in profiling the KVM_RUN loop when our guest vPMU is > actively running". exclude_hv? Although it seems the option is not well supported on X86 for now. Thanks, Kan
On Wed, May 01, 2024, Mingwei Zhang wrote: > On Mon, Apr 29, 2024 at 10:44 AM Sean Christopherson <seanjc@google.com> wrote: > > > > On Sat, Apr 27, 2024, Mingwei Zhang wrote: > > > That's ok. It is about opinions and brainstorming. Adding a parameter > > > to disable preemption is from the cloud usage perspective. The > > > conflict of opinions is which one you prioritize: guest PMU or the > > > host PMU? If you stand on the guest vPMU usage perspective, do you > > > want anyone on the host to shoot a profiling command and generate > > > turbulence? no. If you stand on the host PMU perspective and you want > > > to profile VMM/KVM, you definitely want accuracy and no delay at all. > > > > Hard no from me. Attempting to support two fundamentally different models means > > twice the maintenance burden. The *best* case scenario is that usage is roughly > > a 50/50 spit. The worst case scenario is that the majority of users favor one > > model over the other, thus resulting in extremely limited tested of the minority > > model. > > > > KVM already has this problem with scheduler preemption models, and it's painful. > > The overwhelming majority of KVM users run non-preemptible kernels, and so our > > test coverage for preemtible kernels is abysmal. > > > > E.g. the TDP MMU effectively had a fatal flaw with preemptible kernels that went > > unnoticed for many kernel releases[*], until _another_ bug introduced with dynamic > > preemption models resulted in users running code that was supposed to be specific > > to preemtible kernels. > > > > [* https://lore.kernel.org/kvm/ef81ff36-64bb-4cfe-ae9b-e3acf47bff24@proxmox.com > > > > I hear your voice, Sean. > > In our cloud, we have a host-level profiling going on for all cores > periodically. It will be profiling X seconds every Y minute. Having > the host-level profiling using exclude_guest is fine, but stopping the > host-level profiling is a no no. 
Tweaking the X and Y is theoretically > possible, but highly likely out of the scope of virtualization. Now, > some of the VMs might be actively using vPMU at the same time. How can > we properly ensure the guest vPMU has consistent performance? Instead > of letting the VM suffer from the high overhead of PMU for X seconds > of every Y minute? > > Any thought/help is appreciated. I see the logic of having preemption > there for correctness of the profiling on the host level. Doing this, > however, negatively impacts the above business usage. > > One of the things on top of the mind is that: there seems to be no way > for the perf subsystem to express this: "no, your host-level profiling > is not interested in profiling the KVM_RUN loop when our guest vPMU is > actively running". For good reason, IMO. The KVM_RUN loop can reach _far_ outside of KVM, especially when IRQs and NMIs are involved. I don't think anyone can reasonably say that profiling is never interested in what happens while a task is in KVM_RUN. E.g. if there's a bottleneck in some memory allocation flow that happens to be triggered in the greater KVM_RUN loop, that's something we'd want to show up in our profiling data. And if our systems are properly configured, for VMs with a mediated/passthrough PMU, 99.99999% of their associated pCPU's time should be spent in KVM_RUN. If that's our reality, what's the point of profiling if KVM_RUN is out of scope? We could make the context switching logic more sophisticated, e.g. trigger a context switch when control leaves KVM, a la the ASI concepts, but that's all but guaranteed to be overkill, and would have a very high maintenance cost. But we can likely get what we want (low observed overhead from the guest) while still context switching PMU state in vcpu_enter_guest(). KVM already handles the hottest VM-Exit reasons in its fastpath, i.e. without triggering a PMU context switch. 
For a variety of reasons, I think we should be more aggressive and handle more VM-Exits in the fastpath, e.g. I can't think of any reason KVM can't handle fast page faults in the fastpath. If we handle the overwhelming majority of VM-Exits in the fastpath when the guest is already booted, e.g. when vCPUs aren't taking a high number of "slow" VM-Exits, then the fact that slow VM-Exits trigger a PMU context switch should be a non-issue, because taking a slow exit would be a rare operation. I.e. rather than solving the overhead problem by moving around the context switch logic, solve the problem by moving KVM code inside the "guest PMU" section. It's essentially a different way of doing the same thing, with the critical difference being that only hand-selected flows are excluded from profiling, i.e. only the flows that need to be blazing fast and should be uninteresting from a profiling perspective are excluded.
On 4/26/2024 11:12 AM, Mingwei Zhang wrote: > On Thu, Apr 25, 2024 at 6:46 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote: >> >> On 4/26/2024 5:46 AM, Sean Christopherson wrote: >>> On Thu, Apr 25, 2024, Kan Liang wrote: >>>> On 2024-04-25 4:16 p.m., Mingwei Zhang wrote: >>>>> On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <kan.liang@linux.intel.com> wrote: >>>>>> It should not happen. For the current implementation, perf rejects all >>>>>> the !exclude_guest system-wide event creation if a guest with the vPMU >>>>>> is running. >>>>>> However, it's possible to create an exclude_guest system-wide event at >>>>>> any time. KVM cannot use the information from the VM-entry to decide if >>>>>> there will be active perf events in the VM-exit. >>>>> Hmm, why not? If there is any exclude_guest system-wide event, >>>>> perf_guest_enter() can return something to tell KVM "hey, some active >>>>> host events are swapped out. they are originally in counter #2 and >>>>> #3". If so, at the time when perf_guest_enter() returns, KVM will ack >>>>> that and keep it in its pmu data structure. >>>> I think it's possible that someone creates !exclude_guest event after >>> I assume you mean an exclude_guest=1 event? Because perf should be in a state >>> where it rejects exclude_guest=0 events. >> Suppose should be exclude_guest=1 event, the perf event without >> exclude_guest attribute would be blocked to create in the v2 patches >> which we are working on. >> >> >>>> the perf_guest_enter(). The stale information is saved in the KVM. Perf >>>> will schedule the event in the next perf_guest_exit(). KVM will not know it. >>> Ya, the creation of an event on a CPU that currently has guest PMU state loaded >>> is what I had in mind when I suggested a callback in my sketch: >>> >>> : D. 
Add a perf callback that is invoked from IRQ context when perf wants to >>> : configure a new PMU-based events, *before* actually programming the MSRs, >>> : and have KVM's callback put the guest PMU state >> >> when host creates a perf event with exclude_guest attribute which is >> used to profile KVM/VMM user space, the vCPU process could work at three >> places. >> >> 1. in guest state (non-root mode) >> >> 2. inside vcpu-loop >> >> 3. outside vcpu-loop >> >> Since the PMU state has already been switched to host state, we don't >> need to consider the case 3 and only care about cases 1 and 2. >> >> when host creates a perf event with exclude_guest attribute to profile >> KVM/VMM user space, an IPI is triggered to enable the perf event >> eventually like the following code shows. >> >> event_function_call(event, __perf_event_enable, NULL); >> >> For case 1, a vm-exit is triggered and KVM starts to process the >> vm-exit and then run IPI irq handler, exactly speaking >> __perf_event_enable() to enable the perf event. >> >> For case 2, the IPI irq handler would preempt the vcpu-loop and call >> __perf_event_enable() to enable the perf event. >> >> So IMO KVM just needs to provide a callback to switch guest/host PMU >> state, and __perf_event_enable() calls this callback before really >> touching PMU MSRs. > ok, in this case, do we still need KVM to query perf if there are > active exclude_guest events? yes? Because there is an ordering issue. > The above suggests that the host-level perf profiling comes when a VM > is already running, there is an IPI that can invoke the callback and > trigger preemption. In this case, KVM should switch the context from > guest to host. What if it is the other way around, ie., host-level > profiling runs first and then VM runs? > > In this case, just before entering the vcpu loop, kvm should check > whether there is an active host event and save that into a pmu data > structure. 
If none, do the context switch early (so that KVM saves a > huge amount of unnecessary PMU context switches in the future). > Otherwise, keep the host PMU context until vm-enter. At the time of > vm-exit, do the check again using the data stored in pmu structure. If > there is an active event do the context switch to the host PMU, > otherwise defer that until exiting the vcpu loop. Of course, in the > meantime, if there is any perf profiling started causing the IPI, the > irq handler calls the callback, preempting the guest PMU context. If > that happens, at the time of exiting the vcpu boundary, PMU context > switch is skipped since it is already done. Of course, note that the > irq could come at any time, so the PMU context switch in all 4 > locations need to check the state flag (and skip the context switch if > needed). > > So this requires vcpu->pmu has two pieces of state information: 1) the > flag similar to TIF_NEED_FPU_LOAD; 2) host perf context info (phase #1 > just a boolean; phase #2, bitmap of occupied counters). > > This is a non-trivial optimization on the PMU context switch. I am > thinking about splitting them into the following phases: > > 1) lazy PMU context switch, i.e., wait until the guest touches PMU MSR > for the 1st time. > 2) fast PMU context switch on KVM side, i.e., KVM checking event > selector value (enable/disable) and selectively switch PMU state > (reducing rd/wr msrs) > 3) dynamic PMU context boundary, ie., KVM can dynamically choose PMU > context switch boundary depending on existing active host-level > events. > 3.1) more accurate dynamic PMU context switch, ie., KVM checking > host-level counter position and further reduces the number of msr > accesses. > 4) guest PMU context preemption, i.e., any new host-level perf > profiling can immediately preempt the guest PMU in the vcpu loop > (instead of waiting for the next PMU context switch in KVM). Sorry to recall this discussion again. I want to update some latest progress on it. 
I have implemented draft code for the optimization steps 1 & 2 and done some basic tests, but found there could be some security risks in the optimization. The initial proposal is simple: a counter bitmap 'pmc_bitmap64' is introduced to dynamically record the guest using PMU counters. Only guest using counters would be saved/cleared and restored at VM-Exit and VM-Entry. This would reduce the overhead from MSR reading/writing. But this introduces potential security risks. Since only partial MSRs are saved/restored to guest values. Malicious guest could get some host sensitive information or attack host by reading or writing these counters which guest doesn't use but host uses. To mitigate these security risks, our initial thought is to change the static interception configuration of PMU at vCPU creation to dynamic interception configuration for per PMU counter before VM-entry. This can mitigate the security risks from rdmsr/wrmsr instructions. But it still can't address the security issue from rdpmc instruction as long as rdpmc is set to passthrough mode. Malicious guest still can read the host sensitive information by rdpmc instruction. Possible solution could be a. disable rdpmc passthrough This can solve the security hole, but rdpmc interception would bring heavy performance overhead. b. partially save/clear PMU MSRs at VM-Exit, but fully restore PMU MSRs at VM-Entry This can fix the security issue as well, but suppose the performance overhead won't be reduced too much since wrmsr brings more overhead. c. Fall back to only optimization 1, no any PMU MSR save/restore and rdpmc is set to interception by default if guest doesn't use any PMU counters. Suspect if it's a real scenario, at least for Linux, nmi_watchdog is enabled by default and it would manipulate PMU counters. > > Thanks. 
> -Mingwei >>> It's a similar idea to TIF_NEED_FPU_LOAD, just that instead of a common chunk of >>> kernel code swapping out the guest state (kernel_fpu_begin()), it's a callback >>> into KVM.
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 0d58fe7d243e..f79bebe7093d 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu) static void intel_save_pmu_context(struct kvm_vcpu *vcpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 i; + + if (pmu->version != 2) { + pr_warn("only PerfMon v2 is supported for passthrough PMU"); + return; + } + + /* Global ctrl register is already saved at VM-exit. */ + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status); + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */ + if (pmu->global_status) + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status); + + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { + pmc = &pmu->gp_counters[i]; + rdpmcl(i, pmc->counter); + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel); + /* + * Clear hardware PERFMON_EVENTSELx and its counter to avoid + * leakage and also avoid this guest GP counter getting accidentally + * enabled during host running when the host enables global ctrl. + */ + if (pmc->eventsel) + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0); + if (pmc->counter) + wrmsrl(MSR_IA32_PMC0 + i, 0); + } + + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); + /* + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and + * also avoid these guest fixed counters getting accidentally enabled + * during host running when the host enables global ctrl. 
+ */ + if (pmu->fixed_ctr_ctrl) + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmc = &pmu->fixed_counters[i]; + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter); + if (pmc->counter) + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0); + } } static void intel_restore_pmu_context(struct kvm_vcpu *vcpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u64 global_status; + int i; + + if (pmu->version != 2) { + pr_warn("only PerfMon v2 is supported for passthrough PMU"); + return; + } + + /* Clear host global_ctrl and global_status MSR if non-zero. */ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status); + if (global_status) + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status); + + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status); + + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { + pmc = &pmu->gp_counters[i]; + wrmsrl(MSR_IA32_PMC0 + i, pmc->counter); + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel); + } + + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl); + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmc = &pmu->fixed_counters[i]; + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter); + } } struct kvm_pmu_ops intel_pmu_ops __initdata = {