@@ -522,6 +522,7 @@ struct kvm_pmc {
struct kvm_pmu {
unsigned nr_arch_gp_counters;
unsigned nr_arch_fixed_counters;
+ unsigned int nr_perf_metrics;
unsigned available_event_types;
u64 fixed_ctr_ctrl;
u64 fixed_ctr_ctrl_mask;
@@ -564,6 +565,11 @@ struct kvm_pmu {
*/
u64 host_cross_mapped_mask;
+ /* Intel Topdown Performance Metrics */
+ u64 perf_metrics;
+ u64 perf_metrics_mask;
+ bool metric_event_ovf;
+
/*
* The gate to release perf_events not marked in
* pmc_in_use only once in a vcpu time slice.
@@ -246,6 +246,7 @@
#define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE)
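+/* IA32_PERF_CAPABILITIES bit 15 enumerates PERF_METRICS support. */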
+#define MSR_CAP_PERF_METRICS BIT_ULL(15)
#define MSR_IA32_RTIT_CTL 0x00000570
#define RTIT_CTL_TRACEEN BIT(0)
@@ -96,6 +96,11 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
kvm_pmu_deliver_pmi(vcpu);
}
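+/* PERF_METRICS is driven only by fixed counter 3 (TOPDOWN.SLOTS). */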
+static inline bool pmc_support_perf_metrics(struct kvm_pmc *pmc)
+{
+ return pmc->idx == (INTEL_PMC_IDX_FIXED + 3);
+}
+
static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@ -115,6 +120,10 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
(unsigned long *)&pmu->global_status);
}
+ } else if (pmu->metric_event_ovf && pmc_support_perf_metrics(pmc)) {
+ /* At least one PERF_METRICS sub-counter has overflowed. */
+ __set_bit(INTEL_PERF_METRICS_IDX, (unsigned long *)&pmu->global_status);
+ pmu->metric_event_ovf = false;
} else {
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
}
@@ -155,6 +164,16 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}
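+/*
+ * Topdown emulation starts only once the guest has zeroed both PERF_METRICS
+ * and the SLOTS fixed counter, enabled that counter, and set EN_PERF_METRICS
+ * in GLOBAL_CTRL.
+ */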
+static inline bool perf_metrics_is_enabled(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ return !pmu->perf_metrics && !pmc->counter &&
+ pmc_support_perf_metrics(pmc) && pmc_speculative_in_use(pmc) &&
+ pmc_is_enabled(pmc) && test_bit(GLOBAL_CTRL_EN_PERF_METRICS,
+ (unsigned long *)&pmu->global_ctrl);
+}
+
static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
bool exclude_user, bool exclude_kernel,
bool intr)
@@ -172,6 +191,7 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
.config = config,
};
bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);
+ bool topdown = perf_metrics_is_enabled(pmc);
unsigned int i;
attr.sample_period = get_sample_period(pmc, pmc->counter);
@@ -205,6 +225,10 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
attr.precise_ip = 3;
}
+ /* A slots-led event group is needed to emulate the PERF_METRICS sub-counters. */
+ if (topdown)
+ pmc->max_nr_events = pmu->nr_perf_metrics + 1;
+
/*
* To create grouped events, the first created perf_event doesn't
* know it will be the group_leader and may move to an unexpected
@@ -215,6 +239,17 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
attr.disabled = 1;
for (i = 0; i < pmc->max_nr_events; i++) {
+ if (topdown) {
+ /*
+ * Per the perf core, the slots group_leader event must not be a
+ * sampling event when it's used for topdown metrics, and the
+ * topdown metric events themselves don't support sampling.
+ */
+ attr.sample_period = 0;
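+ /* Metric pseudo-events are spaced 0x100 apart from INTEL_TD_METRIC_RETIRING. */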
+ if (i)
+ attr.config = INTEL_TD_METRIC_RETIRING + 0x100 * (i - 1);
+ }
+
group_leader = i ? pmc->perf_event : NULL;
event = perf_event_create_kernel_counter(&attr, -1, current, group_leader,
kvm_perf_overflow, pmc);
@@ -229,7 +264,7 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
}
pmc->is_paused = false;
- pmc->intr = intr || pebs;
+ pmc->intr = intr || pebs || topdown;
if (!attr.disabled)
return 0;
@@ -263,6 +298,9 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
if (!pmc->perf_event)
return false;
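+ /* Reprogram if the topdown state no longer matches the event group size. */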
+ if (perf_metrics_is_enabled(pmc) == (pmc->max_nr_events == 1))
+ return false;
+
/* recalibrate sample period and check if it's accepted by perf core */
if (is_sampling_event(pmc->perf_event) &&
perf_event_period(pmc->perf_event,
@@ -447,6 +485,60 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
return 0;
}
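+/*
+ * PERF_METRICS is a 64-bit MSR of up to eight 8-bit metric fields. Fold each
+ * metric event's delta back into its byte, rescaled against the SLOTS delta,
+ * and flag any per-byte wraparound as a metric overflow.
+ */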
+void pmu_read_perf_metrics(struct kvm_pmu *pmu, u64 slots)
+{
+ struct kvm_pmc *pmc = &pmu->fixed_counters[3];
+ u64 old_counter, enabled, running, delta;
+ union {
+ u8 field[KVM_PERF_METRICS_NUM_MAX];
+ u64 data;
+ } perf_metrics;
+ unsigned int i;
+
+ perf_metrics.data = pmu->perf_metrics;
+ for (i = 1; i < pmc->max_nr_events; i++) {
+ if (!pmc->perf_events[i])
+ continue;
+
+ old_counter = perf_metrics.field[i - 1];
+ delta = perf_event_read_value(pmc->perf_events[i], &enabled, &running);
+
+ /*
+ * Reconstruct the raw metric value by reversing the scaling
+ * applied in icl_get_metrics_event_value().
+ */
+ delta = mul_u64_u64_div_u64(delta, 0xff, slots) + 1;
+ perf_metrics.field[i - 1] = 0xff & (old_counter + delta);
+
+ /* Check whether any metric counter has overflowed. */
+ if (perf_metrics.field[i - 1] < old_counter)
+ pmu->metric_event_ovf = true;
+ }
+
+ if (pmu->metric_event_ovf)
+ __kvm_perf_overflow(pmc, false);
+
+ pmu->perf_metrics = perf_metrics.data & (~pmu->perf_metrics_mask);
+ __set_bit(INTEL_PERF_METRICS_IDX, pmu->pmc_in_use);
+}
+
+u64 pmc_read_counter(struct kvm_pmc *pmc)
+{
+ u64 counter, enabled, running, delta;
+
+ counter = pmc->counter;
+ if (pmc->perf_event && !pmc->is_paused) {
+ delta = perf_event_read_value(pmc->perf_event, &enabled, &running);
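+ /* A non-zero SLOTS delta means the metric events may have moved as well. */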
+ if (delta && pmc_support_perf_metrics(pmc))
+ pmu_read_perf_metrics(pmc_to_pmu(pmc), delta);
+ counter += delta;
+ }
+
+ /* FIXME: Scaling needed? */
+ return counter & pmc_bitmask(pmc);
+}
+EXPORT_SYMBOL_GPL(pmc_read_counter);
+
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
bool fast_mode = idx & (1u << 31);
@@ -469,7 +561,16 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
(kvm_read_cr0(vcpu) & X86_CR0_PE))
return 1;
- *data = pmc_read_counter(pmc) & mask;
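+ /* RDPMC of PERF_METRICS returns the refreshed MSR image, not a counter value. */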
+ if (idx & INTEL_PMC_FIXED_RDPMC_METRICS) {
+ if (!(vcpu->arch.perf_capabilities & MSR_CAP_PERF_METRICS))
+ return 1;
+
+ pmc_read_counter(&pmu->fixed_counters[3]);
+ *data = pmu->perf_metrics;
+ } else {
+ *data = pmc_read_counter(pmc) & mask;
+ }
+
return 0;
}
@@ -10,6 +10,8 @@
#define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \
MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
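+/* PERF_METRICS shares the EN_PERF_METRICS bit in GLOBAL_CTRL and GLOBAL_STATUS. */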
+#define INTEL_PERF_METRICS_IDX GLOBAL_CTRL_EN_PERF_METRICS
+#define KVM_PERF_METRICS_NUM_MAX 8
/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
@@ -42,6 +44,8 @@ struct kvm_pmu_ops {
};
void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops);
+void pmu_read_perf_metrics(struct kvm_pmu *pmu, u64 slots);
+u64 pmc_read_counter(struct kvm_pmc *pmc);
static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
{
@@ -50,18 +54,6 @@ static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
return pmu->counter_bitmask[pmc->type];
}
-static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
-{
- u64 counter, enabled, running;
-
- counter = pmc->counter;
- if (pmc->perf_event && !pmc->is_paused)
- counter += perf_event_read_value(pmc->perf_event,
- &enabled, &running);
- /* FIXME: Scaling needed? */
- return counter & pmc_bitmask(pmc);
-}
-
static inline void pmc_release_perf_event(struct kvm_pmc *pmc)
{
unsigned int i;
@@ -81,6 +81,8 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
if (pmc_idx < INTEL_PMC_IDX_FIXED) {
return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx,
MSR_P6_EVNTSEL0);
+ } else if (pmc_idx == INTEL_PERF_METRICS_IDX) {
+ return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR3);
} else {
u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
@@ -160,6 +162,12 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
counters = pmu->gp_counters;
num_counters = pmu->nr_arch_gp_counters;
}
+
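+ /* RDPMC with the METRICS type bit set is backed by fixed counter 3 (SLOTS). */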
+ if (idx & INTEL_PMC_FIXED_RDPMC_METRICS) {
+ fixed = true;
+ idx = 3;
+ }
+
if (idx >= num_counters)
return NULL;
*mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP];
@@ -229,6 +237,9 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) &&
((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3);
break;
+ case MSR_PERF_METRICS:
+ ret = vcpu->arch.perf_capabilities & MSR_CAP_PERF_METRICS;
+ break;
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
@@ -385,6 +396,10 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_PEBS_DATA_CFG:
msr_info->data = pmu->pebs_data_cfg;
return 0;
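+ /* Refresh the emulated value from the slots event group before reading. */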
+ case MSR_PERF_METRICS:
+ pmc_read_counter(&pmu->fixed_counters[3]);
+ msr_info->data = pmu->perf_metrics;
+ return 0;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
(pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
@@ -472,6 +487,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
break;
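+ /* Only the bytes backing supported metrics are writable; the rest are reserved. */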
+ case MSR_PERF_METRICS:
+ if (pmu->perf_metrics == data)
+ return 0;
+ if (!(data & pmu->perf_metrics_mask)) {
+ pmu->perf_metrics = data;
+ return 0;
+ }
+ break;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
(pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
@@ -545,6 +568,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->fixed_ctr_ctrl_mask = ~0ull;
pmu->pebs_enable_mask = ~0ull;
pmu->pebs_data_cfg_mask = ~0ull;
+ pmu->perf_metrics_mask = ~0ull;
entry = kvm_find_cpuid_entry(vcpu, 0xa);
if (!entry || !vcpu->kvm->arch.enable_pmu)
@@ -584,6 +608,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED));
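+ /*
+ * Let the guest set EN_PERF_METRICS in GLOBAL_CTRL and expose one
+ * writable PERF_METRICS byte per supported topdown metric.
+ */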
+ if (vcpu->arch.perf_capabilities & MSR_CAP_PERF_METRICS) {
+ counter_mask &= ~(1ULL << INTEL_PERF_METRICS_IDX);
+ pmu->nr_perf_metrics = min_t(int, KVM_PERF_METRICS_NUM_MAX,
+ kvm_pmu_cap.num_topdown_events);
+ for (i = 0; i < pmu->nr_perf_metrics; i++)
+ pmu->perf_metrics_mask &= ~(0xffull << (i * 8));
+ }
pmu->global_ctrl_mask = counter_mask;
pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask
& ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
@@ -656,6 +687,8 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
lbr_desc->records.nr = 0;
lbr_desc->event = NULL;
lbr_desc->msr_passthrough = false;
+
+ BUILD_BUG_ON(KVM_PERF_METRICS_NUM_MAX > INTEL_TD_METRIC_NUM);
}
static void intel_pmu_reset(struct kvm_vcpu *vcpu)
@@ -7721,6 +7721,9 @@ static u64 vmx_get_perf_capabilities(void)
perf_cap &= ~PERF_CAP_PEBS_BASELINE;
}
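+ /* Advertise PERF_METRICS only when the host PMU exposes topdown events. */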
+ if (kvm_pmu_cap.num_topdown_events)
+ perf_cap |= host_perf_cap & MSR_CAP_PERF_METRICS;
+
return perf_cap;
}
@@ -1439,6 +1439,7 @@ static const u32 msrs_to_save_all[] = {
MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
+ MSR_PERF_METRICS,
/* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */
MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
@@ -3880,6 +3881,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_PEBS_ENABLE:
case MSR_IA32_DS_AREA:
case MSR_PEBS_DATA_CFG:
+ case MSR_PERF_METRICS:
case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
@@ -3983,6 +3985,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_PEBS_ENABLE:
case MSR_IA32_DS_AREA:
case MSR_PEBS_DATA_CFG:
+ case MSR_PERF_METRICS:
case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info);