[v11,10/11] KVM: x86/pmu: Check guest LBR availability in case host reclaims them

Message ID 20200514083054.62538-11-like.xu@linux.intel.com
State New, archived
Series Guest Last Branch Recording Enabling

Commit Message

Like Xu May 14, 2020, 8:30 a.m. UTC
When the guest LBR event exists and the LBR stack is available (defined
as 'event->oncpu != -1'), accesses to the LBR record MSRs are not
intercepted but passed through to the vCPU before vm-entry. This
availability check is always performed before vm-entry, but as late as
possible, since a higher priority host event could reclaim the resources
at any time. A negative check result brings the register interception
back, which also prevents real register accesses and potential data
leakage.

KVM emits a pr_warn() on the host when the guest LBR is unavailable.
The administrator is supposed to remind users that the guest results
may be inaccurate if someone is using the LBR to record the hypervisor
on the host side.

Suggested-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Like Xu <like.xu@linux.intel.com>
---
 arch/x86/kvm/pmu.h           |  1 +
 arch/x86/kvm/vmx/pmu_intel.c | 38 ++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/vmx.c       |  4 +++-
 3 files changed, 42 insertions(+), 1 deletion(-)

Comments

Peter Zijlstra May 19, 2020, 11:15 a.m. UTC | #1
On Thu, May 14, 2020 at 04:30:53PM +0800, Like Xu wrote:

> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> index ea4faae56473..db185dca903d 100644
> --- a/arch/x86/kvm/vmx/pmu_intel.c
> +++ b/arch/x86/kvm/vmx/pmu_intel.c
> @@ -646,6 +646,43 @@ static void intel_pmu_lbr_cleanup(struct kvm_vcpu *vcpu)
>  		intel_pmu_free_lbr_event(vcpu);
>  }
>  
> +static bool intel_pmu_lbr_is_availabile(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +
> +	if (!pmu->lbr_event)
> +		return false;
> +
> +	if (event_is_oncpu(pmu->lbr_event)) {
> +		intel_pmu_intercept_lbr_msrs(vcpu, false);
> +	} else {
> +		intel_pmu_intercept_lbr_msrs(vcpu, true);
> +		return false;
> +	}
> +
> +	return true;
> +}

This is unreadable gunk, what?

> +/*
> + * Higher priority host perf events (e.g. cpu pinned) could reclaim the
> + * pmu resources (e.g. LBR) that were assigned to the guest. This is
> + * usually done via ipi calls (more details in perf_install_in_context).
> + *
> + * Before entering the non-root mode (with irq disabled here), double
> + * confirm that the pmu features enabled to the guest are not reclaimed
> + * by higher priority host events. Otherwise, disallow vcpu's access to
> + * the reclaimed features.
> + */
> +static void intel_pmu_availability_check(struct kvm_vcpu *vcpu)
> +{
> +	lockdep_assert_irqs_disabled();
> +
> +	if (lbr_is_enabled(vcpu) && !intel_pmu_lbr_is_availabile(vcpu) &&
> +		(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
> +		pr_warn_ratelimited("kvm: vcpu-%d: LBR is temporarily unavailable.\n",
> +			vcpu->vcpu_id);

More unreadable nonsense; when the events go into ERROR state, it's a
permanent fail, they'll not come back.

> +}
> +
>  struct kvm_pmu_ops intel_pmu_ops = {
>  	.find_arch_event = intel_find_arch_event,
>  	.find_fixed_event = intel_find_fixed_event,
> @@ -662,4 +699,5 @@ struct kvm_pmu_ops intel_pmu_ops = {
>  	.reset = intel_pmu_reset,
>  	.deliver_pmi = intel_pmu_deliver_pmi,
>  	.lbr_cleanup = intel_pmu_lbr_cleanup,
> +	.availability_check = intel_pmu_availability_check,
>  };
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 9969d663826a..80d036c5f64a 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -6696,8 +6696,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  
>  	pt_guest_enter(vmx);
>  
> -	if (vcpu_to_pmu(vcpu)->version)
> +	if (vcpu_to_pmu(vcpu)->version) {
>  		atomic_switch_perf_msrs(vmx);
> +		kvm_x86_ops.pmu_ops->availability_check(vcpu);
> +	}

AFAICT you just did a call out to the kvm_pmu crud in
atomic_switch_perf_msrs(), why do another call?
Xu, Like May 19, 2020, 1:10 p.m. UTC | #2
On 2020/5/19 19:15, Peter Zijlstra wrote:
> On Thu, May 14, 2020 at 04:30:53PM +0800, Like Xu wrote:
>
>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
>> index ea4faae56473..db185dca903d 100644
>> --- a/arch/x86/kvm/vmx/pmu_intel.c
>> +++ b/arch/x86/kvm/vmx/pmu_intel.c
>> @@ -646,6 +646,43 @@ static void intel_pmu_lbr_cleanup(struct kvm_vcpu *vcpu)
>>   		intel_pmu_free_lbr_event(vcpu);
>>   }
>>   
>> +static bool intel_pmu_lbr_is_availabile(struct kvm_vcpu *vcpu)
>> +{
>> +	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>> +
>> +	if (!pmu->lbr_event)
>> +		return false;
>> +
>> +	if (event_is_oncpu(pmu->lbr_event)) {
>> +		intel_pmu_intercept_lbr_msrs(vcpu, false);
>> +	} else {
>> +		intel_pmu_intercept_lbr_msrs(vcpu, true);
>> +		return false;
>> +	}
>> +
>> +	return true;
>> +}
> This is unreadable gunk, what?

Abstractly, it is saying "KVM would pass through the LBR stack MSRs if
event_is_oncpu() is true, and otherwise cancel the passthrough state, if any."

I'm using 'event->oncpu != -1' to indicate that the guest LBR event is
scheduled on a CPU, rather than checking 'event->state == PERF_EVENT_STATE_ERROR'.

For intel_pmu_intercept_lbr_msrs(), false means to pass the LBR stack
MSRs through to the vCPU, and true means to cancel the passthrough state and
make LBR MSR accesses trap to KVM.
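
For completeness, event_is_oncpu() (a helper from earlier in this series;
sketched here from memory, its body is not part of this hunk) amounts to:

static inline bool event_is_oncpu(struct perf_event *event)
{
	/* oncpu holds the CPU the event is scheduled on, or -1 if none */
	return event && event->oncpu != -1;
}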

>
>> +/*
>> + * Higher priority host perf events (e.g. cpu pinned) could reclaim the
>> + * pmu resources (e.g. LBR) that were assigned to the guest. This is
>> + * usually done via ipi calls (more details in perf_install_in_context).
>> + *
>> + * Before entering the non-root mode (with irq disabled here), double
>> + * confirm that the pmu features enabled to the guest are not reclaimed
>> + * by higher priority host events. Otherwise, disallow vcpu's access to
>> + * the reclaimed features.
>> + */
>> +static void intel_pmu_availability_check(struct kvm_vcpu *vcpu)
>> +{
>> +	lockdep_assert_irqs_disabled();
>> +
>> +	if (lbr_is_enabled(vcpu) && !intel_pmu_lbr_is_availabile(vcpu) &&
>> +		(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
>> +		pr_warn_ratelimited("kvm: vcpu-%d: LBR is temporarily unavailable.\n",
>> +			vcpu->vcpu_id);
> More unreadable nonsense; when the events go into ERROR state, it's a
> permanent fail, they'll not come back.
That's not true.  The guest LBR event in the 'ERROR state' or with
'oncpu == -1' would be lazily released and re-created the next time
intel_pmu_create_lbr_event() is called, and it's supposed to be
re-scheduled and to re-do the availability_check() as well.

From the perspective of the guest user, the guest LBR is only temporarily
unavailable, until the host no longer reclaims the LBR.
>
>> +}
>> +
>>   struct kvm_pmu_ops intel_pmu_ops = {
>>   	.find_arch_event = intel_find_arch_event,
>>   	.find_fixed_event = intel_find_fixed_event,
>> @@ -662,4 +699,5 @@ struct kvm_pmu_ops intel_pmu_ops = {
>>   	.reset = intel_pmu_reset,
>>   	.deliver_pmi = intel_pmu_deliver_pmi,
>>   	.lbr_cleanup = intel_pmu_lbr_cleanup,
>> +	.availability_check = intel_pmu_availability_check,
>>   };
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index 9969d663826a..80d036c5f64a 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -6696,8 +6696,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
>>   
>>   	pt_guest_enter(vmx);
>>   
>> -	if (vcpu_to_pmu(vcpu)->version)
>> +	if (vcpu_to_pmu(vcpu)->version) {
>>   		atomic_switch_perf_msrs(vmx);
>> +		kvm_x86_ops.pmu_ops->availability_check(vcpu);
>> +	}
> AFAICT you just did a call out to the kvm_pmu crud in
> atomic_switch_perf_msrs(), why do another call?
In fact, availability_check() is only called here, in this one place.

The callchain looks like:
- vmx_vcpu_run()
     - kvm_x86_ops.pmu_ops->availability_check();
         - intel_pmu_availability_check()
             - intel_pmu_lbr_is_availabile()
                 - event_is_oncpu() ...

Thanks,
Like Xu
Peter Zijlstra May 19, 2020, 2:57 p.m. UTC | #3
On Tue, May 19, 2020 at 09:10:58PM +0800, Xu, Like wrote:
> On 2020/5/19 19:15, Peter Zijlstra wrote:
> > On Thu, May 14, 2020 at 04:30:53PM +0800, Like Xu wrote:
> > 
> > > diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> > > index ea4faae56473..db185dca903d 100644
> > > --- a/arch/x86/kvm/vmx/pmu_intel.c
> > > +++ b/arch/x86/kvm/vmx/pmu_intel.c
> > > @@ -646,6 +646,43 @@ static void intel_pmu_lbr_cleanup(struct kvm_vcpu *vcpu)
> > >   		intel_pmu_free_lbr_event(vcpu);
> > >   }
> > > +static bool intel_pmu_lbr_is_availabile(struct kvm_vcpu *vcpu)
> > > +{
> > > +	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> > > +
> > > +	if (!pmu->lbr_event)
> > > +		return false;
> > > +
> > > +	if (event_is_oncpu(pmu->lbr_event)) {
> > > +		intel_pmu_intercept_lbr_msrs(vcpu, false);
> > > +	} else {
> > > +		intel_pmu_intercept_lbr_msrs(vcpu, true);
> > > +		return false;
> > > +	}
> > > +
> > > +	return true;
> > > +}
> > This is unreadable gunk, what?
> 
> Abstractly, it is saying "KVM would pass through the LBR stack MSRs if
> event_is_oncpu() is true, and otherwise cancel the passthrough state, if any."
>
> I'm using 'event->oncpu != -1' to indicate that the guest LBR event is
> scheduled on a CPU, rather than checking 'event->state == PERF_EVENT_STATE_ERROR'.
>
> For intel_pmu_intercept_lbr_msrs(), false means to pass the LBR stack
> MSRs through to the vCPU, and true means to cancel the passthrough state and
> make LBR MSR accesses trap to KVM.

To me it seems very weird to change state in a function that is supposed
to just query state.

'is_available' seems to suggest a simple: return 'lbr_event->state ==
PERF_EVENT_STATE_ACTIVE' or something.
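
I.e., a pure query, something along these lines (sketch only):

static bool intel_pmu_lbr_is_available(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	/* query the event state; no MSR interception side effects here */
	return pmu->lbr_event &&
	       pmu->lbr_event->state == PERF_EVENT_STATE_ACTIVE;
}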


> > > +static void intel_pmu_availability_check(struct kvm_vcpu *vcpu)
> > > +{
> > > +	lockdep_assert_irqs_disabled();
> > > +
> > > +	if (lbr_is_enabled(vcpu) && !intel_pmu_lbr_is_availabile(vcpu) &&
> > > +		(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
> > > +		pr_warn_ratelimited("kvm: vcpu-%d: LBR is temporarily unavailable.\n",
> > > +			vcpu->vcpu_id);
> > More unreadable nonsense; when the events go into ERROR state, it's a
> > permanent fail, they'll not come back.
> That's not true.  The guest LBR event in the 'ERROR state' or with
> 'oncpu == -1' would be lazily released and re-created the next time
> intel_pmu_create_lbr_event() is called, and it's supposed to be
> re-scheduled and to re-do the availability_check() as well.

Where? Also, wth would you need to destroy and re-create an event for
that?

> > > @@ -6696,8 +6696,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
> > >   	pt_guest_enter(vmx);
> > > -	if (vcpu_to_pmu(vcpu)->version)
> > > +	if (vcpu_to_pmu(vcpu)->version) {
> > >   		atomic_switch_perf_msrs(vmx);
> > > +		kvm_x86_ops.pmu_ops->availability_check(vcpu);
> > > +	}
> > AFAICT you just did a call out to the kvm_pmu crud in
> > atomic_switch_perf_msrs(), why do another call?
> In fact, availability_check() is only called here, in this one place.
> 
> The callchain looks like:
> - vmx_vcpu_run()
>     - kvm_x86_ops.pmu_ops->availability_check();
>         - intel_pmu_availability_check()
>             - intel_pmu_lbr_is_availabile()
>                 - event_is_oncpu() ...
> 

What I'm saying is that you just did a pmu_ops indirect call in
atomic_switch_perf_msrs(), why add another?
Xu, Like May 20, 2020, 2:01 a.m. UTC | #4
On 2020/5/19 22:57, Peter Zijlstra wrote:
> On Tue, May 19, 2020 at 09:10:58PM +0800, Xu, Like wrote:
>> On 2020/5/19 19:15, Peter Zijlstra wrote:
>>> On Thu, May 14, 2020 at 04:30:53PM +0800, Like Xu wrote:
>>>
>>>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
>>>> index ea4faae56473..db185dca903d 100644
>>>> --- a/arch/x86/kvm/vmx/pmu_intel.c
>>>> +++ b/arch/x86/kvm/vmx/pmu_intel.c
>>>> @@ -646,6 +646,43 @@ static void intel_pmu_lbr_cleanup(struct kvm_vcpu *vcpu)
>>>>    		intel_pmu_free_lbr_event(vcpu);
>>>>    }
>>>> +static bool intel_pmu_lbr_is_availabile(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>>>> +
>>>> +	if (!pmu->lbr_event)
>>>> +		return false;
>>>> +
>>>> +	if (event_is_oncpu(pmu->lbr_event)) {
>>>> +		intel_pmu_intercept_lbr_msrs(vcpu, false);
>>>> +	} else {
>>>> +		intel_pmu_intercept_lbr_msrs(vcpu, true);
>>>> +		return false;
>>>> +	}
>>>> +
>>>> +	return true;
>>>> +}
>>> This is unreadable gunk, what?
>> Abstractly, it is saying "KVM would pass through the LBR stack MSRs if
>> event_is_oncpu() is true, and otherwise cancel the passthrough state, if any."
>>
>> I'm using 'event->oncpu != -1' to indicate that the guest LBR event is
>> scheduled on a CPU, rather than checking 'event->state == PERF_EVENT_STATE_ERROR'.
>>
>> For intel_pmu_intercept_lbr_msrs(), false means to pass the LBR stack
>> MSRs through to the vCPU, and true means to cancel the passthrough state and
>> make LBR MSR accesses trap to KVM.
> To me it seems very weird to change state in a function that is supposed
> to just query state.
>
> 'is_available' seems to suggest a simple: return 'lbr_event->state ==
> PERF_EVENT_STATE_ACTIVE' or something.
This clarification led me to reconsider using a more readable name here.

Do you accept checking "event->oncpu != -1" instead of
'event->state == PERF_EVENT_STATE_ERROR' before KVM does the passthrough?
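
(I.e., the choice is between gating the passthrough on

	if (pmu->lbr_event->oncpu != -1)	/* scheduled on a CPU right now */

versus

	if (pmu->lbr_event->state != PERF_EVENT_STATE_ERROR)

where the oncpu test also treats a merely scheduled-out event as unavailable,
not only a permanently failed one.)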
>
>>>> +static void intel_pmu_availability_check(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	lockdep_assert_irqs_disabled();
>>>> +
>>>> +	if (lbr_is_enabled(vcpu) && !intel_pmu_lbr_is_availabile(vcpu) &&
>>>> +		(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
>>>> +		pr_warn_ratelimited("kvm: vcpu-%d: LBR is temporarily unavailable.\n",
>>>> +			vcpu->vcpu_id);
>>> More unreadable nonsense; when the events go into ERROR state, it's a
>>> permanent fail, they'll not come back.
>> That's not true.  The guest LBR event in the 'ERROR state' or with
>> 'oncpu == -1' would be lazily released and re-created the next time
>> intel_pmu_create_lbr_event() is called, and it's supposed to be
>> re-scheduled and to re-do the availability_check() as well.
> Where? Also, wth would you need to destroy and re-create an event for
> that?
If the guest has not set the EN_LBR bit and has not touched any LBR-related
registers in the last time slice, KVM will destroy the guest LBR event in
kvm_pmu_cleanup(), which is called once every time the vCPU thread is
scheduled in.

The re-creation is not invoked directly after the destruction, but is
triggered by the guest's next access to an LBR-related register, if any.

From the time the guest LBR event enters the 'oncpu == -1' state until the
next re-creation, the guest LBR is not available. After the re-creation,
the guest LBR is hopefully available again, and if so, the LBR MSRs will be
passed through and used by the guest normally.

That's the reason for "LBR is temporarily unavailable",
and please let me know if it doesn't make sense to you.
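
To condense the lifecycle into one sketch (a summary of the above, with the
helpers named as in this series):

/*
 * host event reclaims LBR   -> lbr_event->oncpu == -1, MSRs intercepted again
 * guest idle on LBR MSRs    -> kvm_pmu_cleanup() -> intel_pmu_free_lbr_event()
 * guest touches an LBR MSR  -> intel_pmu_create_lbr_event() re-creates it
 * event scheduled back on   -> oncpu != -1, MSRs passed through at vm-entry
 */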

>>>> @@ -6696,8 +6696,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
>>>>    	pt_guest_enter(vmx);
>>>> -	if (vcpu_to_pmu(vcpu)->version)
>>>> +	if (vcpu_to_pmu(vcpu)->version) {
>>>>    		atomic_switch_perf_msrs(vmx);
>>>> +		kvm_x86_ops.pmu_ops->availability_check(vcpu);
>>>> +	}
>>> AFAICT you just did a call out to the kvm_pmu crud in
>>> atomic_switch_perf_msrs(), why do another call?
>> In fact, availability_check() is only called here, in this one place.
>>
>> The callchain looks like:
>> - vmx_vcpu_run()
>>      - kvm_x86_ops.pmu_ops->availability_check();
>>          - intel_pmu_availability_check()
>>              - intel_pmu_lbr_is_availabile()
>>                  - event_is_oncpu() ...
>>
> What I'm saying is that you just did a pmu_ops indirect call in
> atomic_switch_perf_msrs(), why add another?
Do you mean the indirect call:
- atomic_switch_perf_msrs()
     - perf_guest_get_msrs()
         - x86_pmu.guest_get_msrs()
?

The two pmu_ops are quite different:
- the first one, in atomic_switch_perf_msrs(), is defined on the host side;
- the second one, for availability_check(), is defined on the KVM side.

The availability_check() for the guest LBR event and the MSR pass-through
operations are definitely KVM-context specific.
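
Side by side (simplified, as both appear around vmx_vcpu_run()):

	/* host perf indirection, inside atomic_switch_perf_msrs(): */
	msrs = perf_guest_get_msrs(&nr_msrs);	/* -> x86_pmu.guest_get_msrs() */

	/* KVM indirection, added by this patch: */
	kvm_x86_ops.pmu_ops->availability_check(vcpu);	/* -> intel_pmu_availability_check() */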

Thanks,
Like Xu
Like Xu May 27, 2020, 8:17 a.m. UTC | #5
Hi Peter,

On 2020/5/20 10:01, Xu, Like wrote:
> On 2020/5/19 22:57, Peter Zijlstra wrote:
>> On Tue, May 19, 2020 at 09:10:58PM +0800, Xu, Like wrote:
>>> On 2020/5/19 19:15, Peter Zijlstra wrote:
>>>> On Thu, May 14, 2020 at 04:30:53PM +0800, Like Xu wrote:
>>>>
>>>>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
>>>>> index ea4faae56473..db185dca903d 100644
>>>>> --- a/arch/x86/kvm/vmx/pmu_intel.c
>>>>> +++ b/arch/x86/kvm/vmx/pmu_intel.c
>>>>> @@ -646,6 +646,43 @@ static void intel_pmu_lbr_cleanup(struct kvm_vcpu *vcpu)
>>>>>            intel_pmu_free_lbr_event(vcpu);
>>>>>    }
>>>>> +static bool intel_pmu_lbr_is_availabile(struct kvm_vcpu *vcpu)
>>>>> +{
>>>>> +    struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>>>>> +
>>>>> +    if (!pmu->lbr_event)
>>>>> +        return false;
>>>>> +
>>>>> +    if (event_is_oncpu(pmu->lbr_event)) {
>>>>> +        intel_pmu_intercept_lbr_msrs(vcpu, false);
>>>>> +    } else {
>>>>> +        intel_pmu_intercept_lbr_msrs(vcpu, true);
>>>>> +        return false;
>>>>> +    }
>>>>> +
>>>>> +    return true;
>>>>> +}
>>>> This is unreadable gunk, what?
>>> Abstractly, it is saying "KVM would pass through the LBR stack MSRs if
>>> event_is_oncpu() is true, and otherwise cancel the passthrough state, if any."
>>>
>>> I'm using 'event->oncpu != -1' to indicate that the guest LBR event is
>>> scheduled on a CPU, rather than checking 'event->state == PERF_EVENT_STATE_ERROR'.
>>>
>>> For intel_pmu_intercept_lbr_msrs(), false means to pass the LBR stack
>>> MSRs through to the vCPU, and true means to cancel the passthrough state and
>>> make LBR MSR accesses trap to KVM.
>> To me it seems very weird to change state in a function that is supposed
>> to just query state.
>>
>> 'is_available' seems to suggest a simple: return 'lbr_event->state ==
>> PERF_EVENT_STATE_ACTIVE' or something.
> This clarification led me to reconsider using a more readable name here.
>
> Do you accept checking "event->oncpu != -1" instead of
> 'event->state == PERF_EVENT_STATE_ERROR' before KVM does the passthrough?
>>
>>>>> +static void intel_pmu_availability_check(struct kvm_vcpu *vcpu)
>>>>> +{
>>>>> +    lockdep_assert_irqs_disabled();
>>>>> +
>>>>> +    if (lbr_is_enabled(vcpu) && !intel_pmu_lbr_is_availabile(vcpu) &&
>>>>> +        (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
>>>>> +        pr_warn_ratelimited("kvm: vcpu-%d: LBR is temporarily unavailable.\n",
>>>>> +            vcpu->vcpu_id);
>>>> More unreadable nonsense; when the events go into ERROR state, it's a
>>>> permanent fail, they'll not come back.
>>> That's not true.  The guest LBR event in the 'ERROR state' or with
>>> 'oncpu == -1' would be lazily released and re-created the next time
>>> intel_pmu_create_lbr_event() is called, and it's supposed to be
>>> re-scheduled and to re-do the availability_check() as well.
>> Where? Also, wth would you need to destroy and re-create an event for
>> that?
> If the guest has not set the EN_LBR bit and has not touched any LBR-related
> registers in the last time slice, KVM will destroy the guest LBR event in
> kvm_pmu_cleanup(), which is called once every time the vCPU thread is
> scheduled in.
> 
> The re-creation is not invoked directly after the destruction, but is
> triggered by the guest's next access to an LBR-related register, if any.
> 
> From the time the guest LBR event enters the 'oncpu == -1' state until the
> next re-creation, the guest LBR is not available. After the re-creation,
> the guest LBR is hopefully available again, and if so, the LBR MSRs will be
> passed through and used by the guest normally.
> 
> That's the reason for "LBR is temporarily unavailable",

Do you still have any concerns about this issue?

> and please let me know if it doesn't make sense to you.
> 
>>>>> @@ -6696,8 +6696,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
>>>>>        pt_guest_enter(vmx);
>>>>> -    if (vcpu_to_pmu(vcpu)->version)
>>>>> +    if (vcpu_to_pmu(vcpu)->version) {
>>>>>            atomic_switch_perf_msrs(vmx);
>>>>> +        kvm_x86_ops.pmu_ops->availability_check(vcpu);
>>>>> +    }
>>>> AFAICT you just did a call out to the kvm_pmu crud in
>>>> atomic_switch_perf_msrs(), why do another call?
>>> In fact, availability_check() is only called here, in this one place.
>>>
>>> The callchain looks like:
>>> - vmx_vcpu_run()
>>>      - kvm_x86_ops.pmu_ops->availability_check();
>>>          - intel_pmu_availability_check()
>>>              - intel_pmu_lbr_is_availabile()
>>>                  - event_is_oncpu() ...
>>>
>> What I'm saying is that you just did a pmu_ops indirect call in
>> atomic_switch_perf_msrs(), why add another?
> Do you mean the indirect call:
> - atomic_switch_perf_msrs()
>      - perf_guest_get_msrs()
>          - x86_pmu.guest_get_msrs()
> ?
> 
> The two pmu_ops are quite different:
> - the first one, in atomic_switch_perf_msrs(), is defined on the host side;
> - the second one, for availability_check(), is defined on the KVM side.
> 
> The availability_check() for the guest LBR event and the MSR pass-through
> operations are definitely KVM-context specific.

Do you still have any concerns about this issue?

If you have more comments on the patchset, please let me know.

> 
> Thanks,
> Like Xu
>

Patch

diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 78f0cfe1622f..bb3f3ef3386e 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -42,6 +42,7 @@  struct kvm_pmu_ops {
 	void (*reset)(struct kvm_vcpu *vcpu);
 	void (*deliver_pmi)(struct kvm_vcpu *vcpu);
 	void (*lbr_cleanup)(struct kvm_vcpu *vcpu);
+	void (*availability_check)(struct kvm_vcpu *vcpu);
 };
 
 static inline bool event_is_oncpu(struct perf_event *event)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index ea4faae56473..db185dca903d 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -646,6 +646,43 @@  static void intel_pmu_lbr_cleanup(struct kvm_vcpu *vcpu)
 		intel_pmu_free_lbr_event(vcpu);
 }
 
+static bool intel_pmu_lbr_is_availabile(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+	if (!pmu->lbr_event)
+		return false;
+
+	if (event_is_oncpu(pmu->lbr_event)) {
+		intel_pmu_intercept_lbr_msrs(vcpu, false);
+	} else {
+		intel_pmu_intercept_lbr_msrs(vcpu, true);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Higher priority host perf events (e.g. cpu pinned) could reclaim the
+ * pmu resources (e.g. LBR) that were assigned to the guest. This is
+ * usually done via ipi calls (more details in perf_install_in_context).
+ *
+ * Before entering the non-root mode (with irq disabled here), double
+ * confirm that the pmu features enabled to the guest are not reclaimed
+ * by higher priority host events. Otherwise, disallow vcpu's access to
+ * the reclaimed features.
+ */
+static void intel_pmu_availability_check(struct kvm_vcpu *vcpu)
+{
+	lockdep_assert_irqs_disabled();
+
+	if (lbr_is_enabled(vcpu) && !intel_pmu_lbr_is_availabile(vcpu) &&
+		(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
+		pr_warn_ratelimited("kvm: vcpu-%d: LBR is temporarily unavailable.\n",
+			vcpu->vcpu_id);
+}
+
 struct kvm_pmu_ops intel_pmu_ops = {
 	.find_arch_event = intel_find_arch_event,
 	.find_fixed_event = intel_find_fixed_event,
@@ -662,4 +699,5 @@  struct kvm_pmu_ops intel_pmu_ops = {
 	.reset = intel_pmu_reset,
 	.deliver_pmi = intel_pmu_deliver_pmi,
 	.lbr_cleanup = intel_pmu_lbr_cleanup,
+	.availability_check = intel_pmu_availability_check,
 };
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 9969d663826a..80d036c5f64a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6696,8 +6696,10 @@  static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
 	pt_guest_enter(vmx);
 
-	if (vcpu_to_pmu(vcpu)->version)
+	if (vcpu_to_pmu(vcpu)->version) {
 		atomic_switch_perf_msrs(vmx);
+		kvm_x86_ops.pmu_ops->availability_check(vcpu);
+	}
 
 	atomic_switch_umwait_control_msr(vmx);