Message ID | 20220831085328.45489-6-likexu@tencent.com |
---|---|
State | New, archived |
Series | x86/pmu: Corner cases fixes and optimization |
On Wed, Aug 31, 2022, Like Xu wrote:
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index 7f391750ebd3..3c42df3a55ff 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -349,6 +349,10 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
>  		}
> 
>  		reprogram_counter(pmc);
> +
> +		if (pmc->counter < pmc->prev_counter)
> +			__kvm_perf_overflow(pmc, false);

I would prefer to stick this in reprogram_counter(), after pausing the counter
and checking that the event is enabled, but before the actual
programming/resume.  I don't think false positives are actually possible,
especially without my fixes for clearing reprogram_pmi bits (incoming), but I
don't like the twisty logic that's required to suss out that prev_counter can
be non-zero if and only if the PMC is enabled.

The bigger issue is that calling __kvm_perf_overflow() here can get false
negatives.  If reprogramming fails due to contention, the reprogram_pmi bit
will be left set and so this check in __kvm_perf_overflow() will suppress the
PMI.

	if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
		return;

And the related issue is that because __kvm_perf_overflow() sets the bit and
makes another KVM_REQ_PMU, overflow will cause KVM to reprogram the counter a
second time.  That's especially inefficient since KVM will get quite far into
the VM-Enter flow before detecting the new event.

So, I think this?  (goes on top of patches I'm about to post)

static void reprogram_counter(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	u64 eventsel = pmc->eventsel;
	u64 new_config = eventsel;
	u8 fixed_ctr_ctrl;

	pmc_pause_counter(pmc);

	if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc))
		goto reprogram_complete;

	if (!check_pmu_event_filter(pmc))
		goto reprogram_complete;

	if (pmc->counter < pmc->prev_counter)
		__kvm_perf_overflow(pmc, false);

	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
		printk_once("kvm pmu: pin control bit is ignored\n");

	if (pmc_is_fixed(pmc)) {
		fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
						  pmc->idx - INTEL_PMC_IDX_FIXED);
		if (fixed_ctr_ctrl & 0x1)
			eventsel |= ARCH_PERFMON_EVENTSEL_OS;
		if (fixed_ctr_ctrl & 0x2)
			eventsel |= ARCH_PERFMON_EVENTSEL_USR;
		if (fixed_ctr_ctrl & 0x8)
			eventsel |= ARCH_PERFMON_EVENTSEL_INT;
		new_config = (u64)fixed_ctr_ctrl;
	}

	if (pmc->current_config == new_config && pmc_resume_counter(pmc))
		goto reprogram_complete;

	pmc_release_perf_event(pmc);

	pmc->current_config = new_config;

	/*
	 * If reprogramming fails, e.g. due to contention, leave the counter's
	 * reprogram bit set, i.e. opportunistically try again on the next PMU
	 * refresh.  Don't make a new request as doing so can stall the guest
	 * if reprogramming repeatedly fails.
	 */
	if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
				  (eventsel & pmu->raw_event_mask),
				  !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
				  !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
				  eventsel & ARCH_PERFMON_EVENTSEL_INT))
		return;

reprogram_complete:
	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
	pmc->prev_counter = 0;
}

static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	bool skip_pmi = false;

	if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
		if (!in_pmi) {
			/*
			 * TODO: KVM is currently _choosing_ to not generate
			 * records for emulated instructions, avoiding
			 * BUFFER_OVF PMI when there are no records.  Strictly
			 * speaking, it should be done as well in the right
			 * context to improve sampling accuracy.
			 */
			skip_pmi = true;
		} else {
			/* Indicate PEBS overflow PMI to guest. */
			skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
						      (unsigned long *)&pmu->global_status);
		}
	} else {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
	}

	if (!pmc->intr || skip_pmi)
		return;

	/*
	 * Inject PMI. If vcpu was in a guest mode during NMI PMI
	 * can be ejected on a guest mode re-entry. Otherwise we can't
	 * be sure that vcpu wasn't executing hlt instruction at the
	 * time of vmexit and is not going to re-enter guest mode until
	 * woken up. So we should wake it, but this is impossible from
	 * NMI context. Do it from irq work instead.
	 */
	if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
		irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
	else
		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}

static void kvm_perf_overflow(struct perf_event *perf_event,
			      struct perf_sample_data *data,
			      struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;

	/*
	 * Ignore overflow events for counters that are scheduled to be
	 * reprogrammed, e.g. if a PMI for the previous event races with KVM's
	 * handling of a related guest WRMSR.
	 */
	if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
		return;

	__kvm_perf_overflow(pmc, true);

	kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}
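To make the false negative above concrete, here is a small standalone userspace
C mock; it is not KVM code, and the helpers below are simplified, non-atomic
stand-ins for the kernel bitops and for __kvm_perf_overflow().  Once a failed
reprogram leaves a counter's reprogram_pmi bit set, the test_and_set_bit()
guard in the emulated-overflow path sees the bit already set and drops the PMI.

#include <stdbool.h>
#include <stdio.h>

static unsigned long reprogram_pmi;	/* one bit per counter, as in struct kvm_pmu */

/* Non-atomic stand-in for test_and_set_bit(): return the old bit, then set it. */
static bool fake_test_and_set_bit(int idx, unsigned long *addr)
{
	bool old = *addr & (1UL << idx);

	*addr |= 1UL << idx;
	return old;
}

/* Simplified emulated-overflow path: bail if the counter is pending reprogram. */
static void emulated_overflow(int idx)
{
	if (fake_test_and_set_bit(idx, &reprogram_pmi)) {
		printf("counter %d: PMI suppressed (reprogram bit already set)\n", idx);
		return;
	}
	printf("counter %d: PMI injected\n", idx);
}

int main(void)
{
	/* Reprogramming counter 0 failed, so its bit was left set ... */
	fake_test_and_set_bit(0, &reprogram_pmi);

	/* ... and the guest's emulated overflow is now silently dropped. */
	emulated_overflow(0);

	/* Counter 1 was programmed cleanly, so its PMI goes through. */
	emulated_overflow(1);
	return 0;
}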
On 23/9/2022 7:59 am, Sean Christopherson wrote:
> The bigger issue is that calling __kvm_perf_overflow() here can get false negatives.
> If reprogramming fails due to contention, the reprogram_pmi bit will be left set
> and so this check in __kvm_perf_overflow() will suppress the PMI.

I understand your motivation, but before we rework the handling of a left-set
reprogram_pmi bit, we may need reproducible cases that actually cover the
"reprogramming fails" path.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4e568a7ef464..08c3f90b4ac3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -488,7 +488,10 @@ enum pmc_type {
 struct kvm_pmc {
 	enum pmc_type type;
 	u8 idx;
+	bool is_paused;
+	bool intr;
 	u64 counter;
+	u64 prev_counter;
 	u64 eventsel;
 	struct perf_event *perf_event;
 	struct kvm_vcpu *vcpu;
@@ -498,8 +501,6 @@ struct kvm_pmc {
 	 * ctrl value for fixed counters.
 	 */
 	u64 current_config;
-	bool is_paused;
-	bool intr;
 };
 
 #define KVM_PMC_MAX_FIXED	3
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 7f391750ebd3..3c42df3a55ff 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -349,6 +349,10 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
 		}
 
 		reprogram_counter(pmc);
+
+		if (pmc->counter < pmc->prev_counter)
+			__kvm_perf_overflow(pmc, false);
+		pmc->prev_counter = 0;
 	}
 
 	/*
@@ -521,14 +525,9 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
 
 static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
 {
-	u64 prev_count;
-
-	prev_count = pmc->counter;
+	pmc->prev_counter = pmc->counter;
 	pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
-
-	reprogram_counter(pmc);
-	if (pmc->counter < prev_count)
-		__kvm_perf_overflow(pmc, false);
+	kvm_pmu_request_counter_reprogam(pmc);
 }
 
 static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 70219c19b872..0166f3bc6447 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -290,7 +290,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
 		struct kvm_pmc *pmc = &pmu->gp_counters[i];
 
 		pmc_stop_counter(pmc);
-		pmc->counter = pmc->eventsel = 0;
+		pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
 	}
 }
 
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 863a6ff9e214..1d3d0bd3e0e7 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -647,14 +647,14 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
 		pmc = &pmu->gp_counters[i];
 
 		pmc_stop_counter(pmc);
-		pmc->counter = pmc->eventsel = 0;
+		pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
 	}
 
 	for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
 		pmc = &pmu->fixed_counters[i];
 
 		pmc_stop_counter(pmc);
-		pmc->counter = 0;
+		pmc->counter = pmc->prev_counter = 0;
 	}
 
 	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
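For reference, a minimal userspace sketch of the wrap detection this diff
relies on; fake_pmc and the 48-bit PMC_BITMASK are assumptions made purely for
illustration, standing in for struct kvm_pmc and pmc_bitmask().  The reworked
kvm_pmu_incr_counter() only records prev_counter and does the masked increment,
and the deferred "counter < prev_counter" compare in kvm_pmu_handle_event() is
what flags the wrap.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PMC_BITMASK	((1ULL << 48) - 1)	/* stand-in for pmc_bitmask(pmc) */

struct fake_pmc {
	uint64_t counter;
	uint64_t prev_counter;
};

/* Mirrors the reworked kvm_pmu_incr_counter(): remember, then bump with mask. */
static void incr_counter(struct fake_pmc *pmc)
{
	pmc->prev_counter = pmc->counter;
	pmc->counter = (pmc->counter + 1) & PMC_BITMASK;
}

/* Mirrors the deferred check in kvm_pmu_handle_event(). */
static bool overflowed(const struct fake_pmc *pmc)
{
	return pmc->counter < pmc->prev_counter;
}

int main(void)
{
	struct fake_pmc pmc = { .counter = PMC_BITMASK };	/* one tick from wrapping */

	incr_counter(&pmc);	/* wraps to 0, so overflow is detected */
	printf("counter=%#llx overflow=%d\n",
	       (unsigned long long)pmc.counter, overflowed(&pmc));

	incr_counter(&pmc);	/* normal increment, no overflow */
	printf("counter=%#llx overflow=%d\n",
	       (unsigned long long)pmc.counter, overflowed(&pmc));
	return 0;
}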