@@ -175,6 +175,24 @@ static inline bool lbr_is_enabled(struct kvm_vcpu *vcpu)
return lbr->nr && (vcpu->arch.perf_capabilities & PMU_CAP_LBR_FMT);
}
+static bool intel_is_valid_lbr_record_msr(struct kvm_vcpu *vcpu, u32 index)
+{
+ struct x86_pmu_lbr *lbr = &to_vmx(vcpu)->lbr_desc.lbr;
+ bool ret = false;
+
+ if (!lbr_is_enabled(vcpu))
+ return ret;
+
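+ /*
+ * Besides LBR_SELECT and LBR_TOS, an msr is a valid LBR record msr
+ * if it falls in the FROM_IP, TO_IP or (when supported) LBR_INFO
+ * range, each of which spans lbr->nr entries.
+ */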
+ ret = (index == MSR_LBR_SELECT) || (index == MSR_LBR_TOS) ||
+ (index >= lbr->from && index < lbr->from + lbr->nr) ||
+ (index >= lbr->to && index < lbr->to + lbr->nr);
+
+ if (!ret && lbr->info)
+ ret = (index >= lbr->info && index < lbr->info + lbr->nr);
+
+ return ret;
+}
+
static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -194,7 +212,8 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
- get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr);
+ get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr) ||
+ intel_is_valid_lbr_record_msr(vcpu, msr);
break;
}
@@ -213,6 +232,113 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
return pmc;
}
+static int intel_pmu_create_lbr_event(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct lbr_desc *lbr_desc = &to_vmx(vcpu)->lbr_desc;
+ struct perf_event *event;
+
+ /*
+ * The perf_event_attr is constructed to be minimal and efficient:
+ * - set 'pinned = true' to make the event task-pinned, so that if
+ * another cpu-pinned event reclaims the LBR, event->oncpu is set
+ * to -1;
+ * - set '.exclude_host = true' so that only guest branch behavior
+ * is recorded;
+ *
+ * - set '.config = INTEL_FIXED_VLBR_EVENT' to tell host perf to
+ * schedule the event with a fake counter instead of a real HW
+ * counter; see is_guest_lbr_event() and
+ * __intel_get_event_constraints();
+ *
+ * - set 'sample_type = PERF_SAMPLE_BRANCH_STACK' and
+ * 'branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+ * PERF_SAMPLE_BRANCH_USER' to configure it as an LBR callstack
+ * event, which helps KVM save/restore guest LBR records across
+ * host context switches and avoids a great deal of overhead;
+ * see branch_user_callstack() and intel_pmu_lbr_sched_task();
+ */
+ struct perf_event_attr attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(attr),
+ .config = INTEL_FIXED_VLBR_EVENT,
+ .sample_type = PERF_SAMPLE_BRANCH_STACK,
+ .pinned = true,
+ .exclude_host = true,
+ .branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+ PERF_SAMPLE_BRANCH_USER,
+ };
+
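+ /* Nothing to do if a guest LBR event has already been created. */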
+ if (unlikely(lbr_desc->event))
+ return 0;
+
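+ /*
+ * Bind the event to the current (vcpu) task so that it follows the
+ * vcpu thread across cpus.
+ */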
+ event = perf_event_create_kernel_counter(&attr, -1,
+ current, NULL, NULL);
+ if (IS_ERR(event)) {
+ pr_debug_ratelimited("%s: failed %ld\n",
+ __func__, PTR_ERR(event));
+ return -ENOENT;
+ }
+ lbr_desc->event = event;
+ pmu->event_count++;
+ return 0;
+}
+
+static void intel_pmu_free_lbr_event(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct lbr_desc *lbr_desc = &to_vmx(vcpu)->lbr_desc;
+ struct perf_event *event = lbr_desc->event;
+
+ if (!event)
+ return;
+
+ perf_event_release_kernel(event);
+ lbr_desc->event = NULL;
+ pmu->event_count--;
+}
+
+/*
+ * It's safe to access the LBR msrs from the guest while they are not
+ * passed through, since the host restores or resets the LBR msr values
+ * whenever the guest LBR event is scheduled in.
+ */
+static bool access_lbr_record_msr(struct kvm_vcpu *vcpu,
+ struct msr_data *msr_info, bool read)
+{
+ struct lbr_desc *lbr_desc = &to_vmx(vcpu)->lbr_desc;
+ u32 index = msr_info->index;
+
+ if (!intel_is_valid_lbr_record_msr(vcpu, index))
+ return false;
+
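+ /*
+ * Host-initiated accesses never touch the HW msrs: reads return 0
+ * and writes are dropped.
+ */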
+ if (msr_info->host_initiated)
+ goto dummy;
+
+ if (!lbr_desc->event && intel_pmu_create_lbr_event(vcpu) < 0)
+ goto dummy;
+
+ /*
+ * Disable irqs to ensure the LBR feature doesn't get reclaimed by the
+ * host while the msr is being accessed, which also prevents a host LBR
+ * value from leaking to the guest. If the LBR has already been
+ * reclaimed, return 0 on guest reads.
+ */
+ local_irq_disable();
+ if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) {
+ if (read)
+ rdmsrl(index, msr_info->data);
+ else
+ wrmsrl(index, msr_info->data);
+ } else if (read) {
+ msr_info->data = 0;
+ }
+ local_irq_enable();
+
+ return true;
+
+dummy:
+ if (read)
+ msr_info->data = 0;
+ return true;
+}
+
static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -256,7 +382,8 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
msr_info->data = pmc->eventsel;
return 0;
- }
+ } else if (access_lbr_record_msr(vcpu, msr_info, true))
+ return 0;
}
return 1;
@@ -354,6 +481,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & ~vcpu_get_supported_debugctl(vcpu))
return 1;
vmcs_write64(GUEST_IA32_DEBUGCTL, data);
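+ /*
+ * Lazily create the guest LBR event on the first guest write to
+ * DEBUGCTL.
+ */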
+ if (!msr_info->host_initiated && !to_vmx(vcpu)->lbr_desc.event)
+ intel_pmu_create_lbr_event(vcpu);
return 0;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
@@ -382,7 +511,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
reprogram_gp_counter(pmc, data);
return 0;
}
- }
+ } else if (access_lbr_record_msr(vcpu, msr_info, false))
+ return 0;
}
return 1;
@@ -483,6 +613,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
vcpu->arch.perf_capabilities = guest_cpuid_has(vcpu, X86_FEATURE_PDCM) ?
vmx_get_perf_capabilities() : 0;
lbr_desc->lbr.nr = 0;
+ lbr_desc->event = NULL;
}
static void intel_pmu_reset(struct kvm_vcpu *vcpu)
@@ -507,6 +638,7 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
pmu->global_ovf_ctrl = 0;
+ intel_pmu_free_lbr_event(vcpu);
}
struct kvm_pmu_ops intel_pmu_ops = {
@@ -3857,6 +3857,71 @@ void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
}
}
+static void vmx_update_intercept_for_lbr_msrs(struct kvm_vcpu *vcpu, bool set)
+{
+ unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
+ struct x86_pmu_lbr *lbr = &to_vmx(vcpu)->lbr_desc.lbr;
+ int i;
+
+ WARN_ON_ONCE(!lbr->nr);
+
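+ /*
+ * 'set == true' intercepts guest accesses to the LBR msrs (no
+ * passthrough); 'set == false' lets the guest access them directly.
+ */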
+ vmx_set_intercept_for_msr(msr_bitmap, MSR_LBR_SELECT, MSR_TYPE_RW, set);
+ vmx_set_intercept_for_msr(msr_bitmap, MSR_LBR_TOS, MSR_TYPE_RW, set);
+ for (i = 0; i < lbr->nr; i++) {
+ vmx_set_intercept_for_msr(msr_bitmap,
+ lbr->from + i, MSR_TYPE_RW, set);
+ vmx_set_intercept_for_msr(msr_bitmap,
+ lbr->to + i, MSR_TYPE_RW, set);
+ if (lbr->info)
+ vmx_set_intercept_for_msr(msr_bitmap,
+ lbr->info + i, MSR_TYPE_RW, set);
+ }
+}
+
+static inline void vmx_lbr_disable_passthrough(struct kvm_vcpu *vcpu)
+{
+ vmx_update_intercept_for_lbr_msrs(vcpu, true);
+}
+
+static inline void vmx_lbr_enable_passthrough(struct kvm_vcpu *vcpu)
+{
+ vmx_update_intercept_for_lbr_msrs(vcpu, false);
+}
+
+/*
+ * Higher priority host perf events (e.g. cpu pinned) could reclaim the
+ * pmu resources (e.g. LBR) that were assigned to the guest. This is
+ * usually done via ipi calls (more details in perf_install_in_context).
+ *
+ * Before entering VMX non-root mode (with irqs disabled here), double
+ * check that the pmu features exposed to the guest have not been
+ * reclaimed by higher priority host events. Otherwise, disallow the
+ * vcpu's access to the reclaimed features.
+ */
+static void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = &to_vmx(vcpu)->lbr_desc;
+
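+ /*
+ * Without a guest LBR event there is nothing to pass through; warn
+ * if the guest nevertheless expects DEBUGCTLMSR_LBR to take effect.
+ */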
+ if (!lbr_desc->event) {
+ vmx_lbr_disable_passthrough(vcpu);
+ if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
+ goto warn;
+ return;
+ }
+
+ if (lbr_desc->event->state < PERF_EVENT_STATE_ACTIVE) {
+ vmx_lbr_disable_passthrough(vcpu);
+ goto warn;
+ }
+
+ vmx_lbr_enable_passthrough(vcpu);
+
+ return;
+
+warn:
+ pr_warn_ratelimited("kvm: vcpu-%d: failed to pass through LBR msrs.\n",
+ vcpu->vcpu_id);
+}
+
static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6728,8 +6793,11 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
pt_guest_enter(vmx);
- if (vcpu_to_pmu(vcpu)->version)
+ if (vcpu_to_pmu(vcpu)->version) {
atomic_switch_perf_msrs(vmx);
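+ /*
+ * Re-check on every entry whether the LBRs still belong to
+ * the guest before passing them through.
+ */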
+ if (vcpu->arch.perf_capabilities & PMU_CAP_LBR_FMT)
+ vmx_passthrough_lbr_msrs(vcpu);
+ }
atomic_switch_umwait_control_msr(vmx);
if (enable_preemption_timer)
@@ -94,6 +94,14 @@ struct pt_desc {
struct lbr_desc {
/* Basic information about LBR records. */
struct x86_pmu_lbr lbr;
+
+ /*
+ * Emulate the LBR feature by passing the LBR registers through to
+ * the guest while the per-vcpu guest LBR event is scheduled on the
+ * current pcpu.
+ *
+ * The records may be inaccurate if the host reclaims the LBR.
+ */
+ struct perf_event *event;
};
/*