Message ID | 20200628085341.5107-3-chenyi.qiang@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Add support for bus lock VM exit | expand |
Chenyi Qiang <chenyi.qiang@intel.com> writes: > Virtual Machine can exploit bus locks to degrade the performance of > system. Bus lock can be caused by split locked access to writeback(WB) > memory or by using locks on uncacheable(UC) memory. The bus lock is > typically >1000 cycles slower than an atomic operation within a cache > line. It also disrupts performance on other cores (which must wait for > the bus lock to be released before their memory operations can > complete). > > To address the threat, bus lock VM exit is introduced to notify the VMM > when a bus lock was acquired, allowing it to enforce throttling or other > policy based mitigations. > > A VMM can enable VM exit due to bus locks by setting a new "Bus Lock > Detection" VM-execution control(bit 30 of Secondary Processor-based VM > execution controls). If delivery of this VM exit was pre-empted by a > higher priority VM exit, bit 26 of exit-reason is set to 1. > > In current implementation, it only records every bus lock acquired in > non-root mode in vcpu->stat.bus_locks and exposes the data through > debugfs. It doesn't implement any further handling but leave it to user. > > Document for Bus Lock VM exit is now available at the latest "Intel > Architecture Instruction Set Extensions Programming Reference". 
> > Document Link: > https://software.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html > > Co-developed-by: Xiaoyao Li <xiaoyao.li@intel.com> > Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com> > Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> > --- > arch/x86/include/asm/kvm_host.h | 1 + > arch/x86/include/asm/vmx.h | 1 + > arch/x86/include/asm/vmxfeatures.h | 1 + > arch/x86/include/uapi/asm/vmx.h | 4 +++- > arch/x86/kvm/vmx/vmx.c | 17 ++++++++++++++++- > arch/x86/kvm/vmx/vmx.h | 2 +- > arch/x86/kvm/x86.c | 1 + > 7 files changed, 24 insertions(+), 3 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index f852ee350beb..efb5a117e11a 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -1051,6 +1051,7 @@ struct kvm_vcpu_stat { > u64 req_event; > u64 halt_poll_success_ns; > u64 halt_poll_fail_ns; > + u64 bus_locks; > }; > > struct x86_instruction_info; > diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h > index cd7de4b401fe..93a880bc31a7 100644 > --- a/arch/x86/include/asm/vmx.h > +++ b/arch/x86/include/asm/vmx.h > @@ -73,6 +73,7 @@ > #define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA) > #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) > #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE) > +#define SECONDARY_EXEC_BUS_LOCK_DETECTION VMCS_CONTROL_BIT(BUS_LOCK_DETECTION) > > #define PIN_BASED_EXT_INTR_MASK VMCS_CONTROL_BIT(INTR_EXITING) > #define PIN_BASED_NMI_EXITING VMCS_CONTROL_BIT(NMI_EXITING) > diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h > index 9915990fd8cf..d9a74681a77d 100644 > --- a/arch/x86/include/asm/vmxfeatures.h > +++ b/arch/x86/include/asm/vmxfeatures.h > @@ -83,5 +83,6 @@ > #define VMX_FEATURE_TSC_SCALING ( 2*32+ 25) /* Scale hardware TSC when read in guest */ > #define 
VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */ > #define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */ > +#define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */ > > #endif /* _ASM_X86_VMXFEATURES_H */ > diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h > index b8ff9e8ac0d5..6bddfd7b87be 100644 > --- a/arch/x86/include/uapi/asm/vmx.h > +++ b/arch/x86/include/uapi/asm/vmx.h > @@ -88,6 +88,7 @@ > #define EXIT_REASON_XRSTORS 64 > #define EXIT_REASON_UMWAIT 67 > #define EXIT_REASON_TPAUSE 68 > +#define EXIT_REASON_BUS_LOCK 74 > > #define VMX_EXIT_REASONS \ > { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ > @@ -148,7 +149,8 @@ > { EXIT_REASON_XSAVES, "XSAVES" }, \ > { EXIT_REASON_XRSTORS, "XRSTORS" }, \ > { EXIT_REASON_UMWAIT, "UMWAIT" }, \ > - { EXIT_REASON_TPAUSE, "TPAUSE" } > + { EXIT_REASON_TPAUSE, "TPAUSE" }, \ > + { EXIT_REASON_BUS_LOCK, "BUS_LOCK" } > > #define VMX_EXIT_REASON_FLAGS \ > { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c > index 647ee9a1b4e6..9622d7486f6d 100644 > --- a/arch/x86/kvm/vmx/vmx.c > +++ b/arch/x86/kvm/vmx/vmx.c > @@ -2463,7 +2463,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, > SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | > SECONDARY_EXEC_PT_USE_GPA | > SECONDARY_EXEC_PT_CONCEAL_VMX | > - SECONDARY_EXEC_ENABLE_VMFUNC; > + SECONDARY_EXEC_ENABLE_VMFUNC | > + SECONDARY_EXEC_BUS_LOCK_DETECTION; > if (cpu_has_sgx()) > opt2 |= SECONDARY_EXEC_ENCLS_EXITING; > if (adjust_vmx_controls(min2, opt2, > @@ -5664,6 +5665,12 @@ static int handle_encls(struct kvm_vcpu *vcpu) > return 1; > } > > +static int handle_bus_lock(struct kvm_vcpu *vcpu) > +{ > + vcpu->stat.bus_locks++; > + return 1; > +} > + > /* > * The exit handlers return 1 if the exit was handled fully and guest execution > * may resume. 
Otherwise they set the kvm_run parameter to indicate what needs > @@ -5720,6 +5727,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { > [EXIT_REASON_VMFUNC] = handle_vmx_instruction, > [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, > [EXIT_REASON_ENCLS] = handle_encls, > + [EXIT_REASON_BUS_LOCK] = handle_bus_lock, > }; > > static const int kvm_vmx_max_exit_handlers = > @@ -6830,6 +6838,13 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) > if (unlikely(vmx->exit_reason.failed_vmentry)) > return EXIT_FASTPATH_NONE; > > + /* > + * check the exit_reason to see if there is a bus lock > + * happened in guest. > + */ > + if (vmx->exit_reason.bus_lock_detected) > + handle_bus_lock(vcpu); In case the ultimate goal is to have an exit to userspace on bus lock, the two ways to reach handle_bus_lock() are very different: in case we're handling EXIT_REASON_BUS_LOCK we can easily drop to userspace by returning 0 but what are we going to do in case of exit_reason.bus_lock_detected? The 'higher priority VM exit' may require exit to userspace too. So what's the plan? Maybe we can ignore the case when we're exiting to userspace for some other reason as this is slow already and force the exit otherwise? And should we actually introduce the KVM_EXIT_BUS_LOCK and a capability to enable it here? 
> + > vmx->loaded_vmcs->launched = 1; > vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); > > diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h > index a676db7ac03c..b501a26778fc 100644 > --- a/arch/x86/kvm/vmx/vmx.h > +++ b/arch/x86/kvm/vmx/vmx.h > @@ -104,7 +104,7 @@ union vmx_exit_reason { > u32 reserved23 : 1; > u32 reserved24 : 1; > u32 reserved25 : 1; > - u32 reserved26 : 1; > + u32 bus_lock_detected : 1; > u32 enclave_mode : 1; > u32 smi_pending_mtf : 1; > u32 smi_from_vmx_root : 1; > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 00c88c2f34e4..6258b453b8dd 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -220,6 +220,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { > VCPU_STAT("l1d_flush", l1d_flush), > VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), > VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), > + VCPU_STAT("bus_locks", bus_locks), > VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), > VM_STAT("mmu_pte_write", mmu_pte_write), > VM_STAT("mmu_pte_updated", mmu_pte_updated),
On 7/1/2020 5:04 PM, Vitaly Kuznetsov wrote: > Chenyi Qiang <chenyi.qiang@intel.com> writes: [...] >> static const int kvm_vmx_max_exit_handlers = >> @@ -6830,6 +6838,13 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) >> if (unlikely(vmx->exit_reason.failed_vmentry)) >> return EXIT_FASTPATH_NONE; >> >> + /* >> + * check the exit_reason to see if there is a bus lock >> + * happened in guest. >> + */ >> + if (vmx->exit_reason.bus_lock_detected) >> + handle_bus_lock(vcpu); > > In case the ultimate goal is to have an exit to userspace on bus lock, I don't think we will need an exit to userspace on bus lock. See below. > the two ways to reach handle_bus_lock() are very different: in case > we're handling EXIT_REASON_BUS_LOCK we can easily drop to userspace by > returning 0 but what are we going to do in case of > exit_reason.bus_lock_detected? The 'higher priority VM exit' may require > exit to userspace too. So what's the plan? Maybe we can ignore the case > when we're exiting to userspace for some other reason as this is slow > already and force the exit otherwise? > And should we actually introduce > the KVM_EXIT_BUS_LOCK and a capability to enable it here? > Introducing KVM_EXIT_BUS_LOCK maybe help nothing. No matter EXIT_REASON_BUS_LOCK or exit_reason.bus_lock_detected, the bus lock has already happened. Exit to userspace cannot prevent bus lock, so what userspace can do is recording and counting as what this patch does in vcpu->stat.bus_locks.
Xiaoyao Li <xiaoyao.li@intel.com> writes: > On 7/1/2020 5:04 PM, Vitaly Kuznetsov wrote: >> Chenyi Qiang <chenyi.qiang@intel.com> writes: > [...] >>> static const int kvm_vmx_max_exit_handlers = >>> @@ -6830,6 +6838,13 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) >>> if (unlikely(vmx->exit_reason.failed_vmentry)) >>> return EXIT_FASTPATH_NONE; >>> >>> + /* >>> + * check the exit_reason to see if there is a bus lock >>> + * happened in guest. >>> + */ >>> + if (vmx->exit_reason.bus_lock_detected) >>> + handle_bus_lock(vcpu); >> >> In case the ultimate goal is to have an exit to userspace on bus lock, > > I don't think we will need an exit to userspace on bus lock. See below. > >> the two ways to reach handle_bus_lock() are very different: in case >> we're handling EXIT_REASON_BUS_LOCK we can easily drop to userspace by >> returning 0 but what are we going to do in case of >> exit_reason.bus_lock_detected? The 'higher priority VM exit' may require >> exit to userspace too. So what's the plan? Maybe we can ignore the case >> when we're exiting to userspace for some other reason as this is slow >> already and force the exit otherwise? > >> And should we actually introduce >> the KVM_EXIT_BUS_LOCK and a capability to enable it here? >> > > Introducing KVM_EXIT_BUS_LOCK maybe help nothing. No matter > EXIT_REASON_BUS_LOCK or exit_reason.bus_lock_detected, the bus lock has > already happened. Exit to userspace cannot prevent bus lock, so what > userspace can do is recording and counting as what this patch does in > vcpu->stat.bus_locks. Exiting to userspace would allow to implement custom 'throttling' policies to mitigate the 'noisy neighbour' problem. The simplest would be to just inject some sleep time.
On 7/1/2020 8:44 PM, Vitaly Kuznetsov wrote: > Xiaoyao Li <xiaoyao.li@intel.com> writes: > >> On 7/1/2020 5:04 PM, Vitaly Kuznetsov wrote: >>> Chenyi Qiang <chenyi.qiang@intel.com> writes: >> [...] >>>> static const int kvm_vmx_max_exit_handlers = >>>> @@ -6830,6 +6838,13 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) >>>> if (unlikely(vmx->exit_reason.failed_vmentry)) >>>> return EXIT_FASTPATH_NONE; >>>> >>>> + /* >>>> + * check the exit_reason to see if there is a bus lock >>>> + * happened in guest. >>>> + */ >>>> + if (vmx->exit_reason.bus_lock_detected) >>>> + handle_bus_lock(vcpu); >>> >>> In case the ultimate goal is to have an exit to userspace on bus lock, >> >> I don't think we will need an exit to userspace on bus lock. See below. >> >>> the two ways to reach handle_bus_lock() are very different: in case >>> we're handling EXIT_REASON_BUS_LOCK we can easily drop to userspace by >>> returning 0 but what are we going to do in case of >>> exit_reason.bus_lock_detected? The 'higher priority VM exit' may require >>> exit to userspace too. So what's the plan? Maybe we can ignore the case >>> when we're exiting to userspace for some other reason as this is slow >>> already and force the exit otherwise? >> >>> And should we actually introduce >>> the KVM_EXIT_BUS_LOCK and a capability to enable it here? >>> >> >> Introducing KVM_EXIT_BUS_LOCK maybe help nothing. No matter >> EXIT_REASON_BUS_LOCK or exit_reason.bus_lock_detected, the bus lock has >> already happened. Exit to userspace cannot prevent bus lock, so what >> userspace can do is recording and counting as what this patch does in >> vcpu->stat.bus_locks. > > Exiting to userspace would allow to implement custom 'throttling' > policies to mitigate the 'noisy neighbour' problem. The simplest would > be to just inject some sleep time. > So you want an exit to userspace for every bus lock and leave it all to userspace. Yes, it's doable. 
As you said, the exit_reason.bus_lock_detected case is the tricky one. We cannot do the similar to extend vcpu->run->exit_reason, this breaks ABI. Maybe we can extend the vcpu->run->flags to indicate bus lock detected for the other exit reason?
Xiaoyao Li <xiaoyao.li@intel.com> writes: > On 7/1/2020 8:44 PM, Vitaly Kuznetsov wrote: >> Xiaoyao Li <xiaoyao.li@intel.com> writes: >> >>> On 7/1/2020 5:04 PM, Vitaly Kuznetsov wrote: >>>> Chenyi Qiang <chenyi.qiang@intel.com> writes: >>> [...] >>>>> static const int kvm_vmx_max_exit_handlers = >>>>> @@ -6830,6 +6838,13 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) >>>>> if (unlikely(vmx->exit_reason.failed_vmentry)) >>>>> return EXIT_FASTPATH_NONE; >>>>> >>>>> + /* >>>>> + * check the exit_reason to see if there is a bus lock >>>>> + * happened in guest. >>>>> + */ >>>>> + if (vmx->exit_reason.bus_lock_detected) >>>>> + handle_bus_lock(vcpu); >>>> >>>> In case the ultimate goal is to have an exit to userspace on bus lock, >>> >>> I don't think we will need an exit to userspace on bus lock. See below. >>> >>>> the two ways to reach handle_bus_lock() are very different: in case >>>> we're handling EXIT_REASON_BUS_LOCK we can easily drop to userspace by >>>> returning 0 but what are we going to do in case of >>>> exit_reason.bus_lock_detected? The 'higher priority VM exit' may require >>>> exit to userspace too. So what's the plan? Maybe we can ignore the case >>>> when we're exiting to userspace for some other reason as this is slow >>>> already and force the exit otherwise? >>> >>>> And should we actually introduce >>>> the KVM_EXIT_BUS_LOCK and a capability to enable it here? >>>> >>> >>> Introducing KVM_EXIT_BUS_LOCK maybe help nothing. No matter >>> EXIT_REASON_BUS_LOCK or exit_reason.bus_lock_detected, the bus lock has >>> already happened. Exit to userspace cannot prevent bus lock, so what >>> userspace can do is recording and counting as what this patch does in >>> vcpu->stat.bus_locks. >> >> Exiting to userspace would allow to implement custom 'throttling' >> policies to mitigate the 'noisy neighbour' problem. The simplest would >> be to just inject some sleep time. 
>> > > So you want an exit to userspace for every bus lock and leave it all to > userspace. Yes, it's doable. > In some cases we may not even want to have a VM exit: think e.g. real-time/partitioning case when even in case of bus lock we may not want to add additional latency just to count such events. I'd suggest we make the new capability tri-state: - disabled (no vmexit, default) - stats only (what this patch does) - userspace exit But maybe this is an overkill, I'd like to hear what others think. > As you said, the exit_reason.bus_lock_detected case is the tricky one. > We cannot do the similar to extend vcpu->run->exit_reason, this breaks > ABI. Maybe we can extend the vcpu->run->flags to indicate bus lock > detected for the other exit reason? This is likely the easiest solution.
On 7/1/2020 10:49 PM, Vitaly Kuznetsov wrote: > Xiaoyao Li <xiaoyao.li@intel.com> writes: > >> On 7/1/2020 8:44 PM, Vitaly Kuznetsov wrote: >>> Xiaoyao Li <xiaoyao.li@intel.com> writes: >>> >>>> On 7/1/2020 5:04 PM, Vitaly Kuznetsov wrote: >>>>> Chenyi Qiang <chenyi.qiang@intel.com> writes: >>>> [...] >>>>>> static const int kvm_vmx_max_exit_handlers = >>>>>> @@ -6830,6 +6838,13 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) >>>>>> if (unlikely(vmx->exit_reason.failed_vmentry)) >>>>>> return EXIT_FASTPATH_NONE; >>>>>> >>>>>> + /* >>>>>> + * check the exit_reason to see if there is a bus lock >>>>>> + * happened in guest. >>>>>> + */ >>>>>> + if (vmx->exit_reason.bus_lock_detected) >>>>>> + handle_bus_lock(vcpu); >>>>> >>>>> In case the ultimate goal is to have an exit to userspace on bus lock, >>>> >>>> I don't think we will need an exit to userspace on bus lock. See below. >>>> >>>>> the two ways to reach handle_bus_lock() are very different: in case >>>>> we're handling EXIT_REASON_BUS_LOCK we can easily drop to userspace by >>>>> returning 0 but what are we going to do in case of >>>>> exit_reason.bus_lock_detected? The 'higher priority VM exit' may require >>>>> exit to userspace too. So what's the plan? Maybe we can ignore the case >>>>> when we're exiting to userspace for some other reason as this is slow >>>>> already and force the exit otherwise? >>>> >>>>> And should we actually introduce >>>>> the KVM_EXIT_BUS_LOCK and a capability to enable it here? >>>>> >>>> >>>> Introducing KVM_EXIT_BUS_LOCK maybe help nothing. No matter >>>> EXIT_REASON_BUS_LOCK or exit_reason.bus_lock_detected, the bus lock has >>>> already happened. Exit to userspace cannot prevent bus lock, so what >>>> userspace can do is recording and counting as what this patch does in >>>> vcpu->stat.bus_locks. >>> >>> Exiting to userspace would allow to implement custom 'throttling' >>> policies to mitigate the 'noisy neighbour' problem. 
The simplest would >>> be to just inject some sleep time. >>> >> >> So you want an exit to userspace for every bus lock and leave it all to >> userspace. Yes, it's doable. >> > > In some cases we may not even want to have a VM exit: think > e.g. real-time/partitioning case when even in case of bus lock we may > not want to add additional latency just to count such events. For real-time case, the bus lock needs to be avoided at all, since bus lock takes many cycles and prevents others accessing memory. If no bus lock, then no bus lock vm exit to worry about. If bus lock, the latency requirement maybe cannot be met due to it. > I'd > suggest we make the new capability tri-state: > - disabled (no vmexit, default) > - stats only (what this patch does) > - userspace exit > But maybe this is an overkill, I'd like to hear what others think. Yeah. Others' thought is very welcomed. Besides, for how to throttle, KVM maybe has to take kernel policy into account. Since in the spec, there is another feature for bare metal to raise a #DB for bus lock. Native kernel likely implements some policy to restrict the rate the bus lock can happen. So qemu threads will have to follow that as well. >> As you said, the exit_reason.bus_lock_detected case is the tricky one. >> We cannot do the similar to extend vcpu->run->exit_reason, this breaks >> ABI. Maybe we can extend the vcpu->run->flags to indicate bus lock >> detected for the other exit reason? > > This is likely the easiest solution. >
On Wed, Jul 01, 2020 at 04:49:49PM +0200, Vitaly Kuznetsov wrote: > Xiaoyao Li <xiaoyao.li@intel.com> writes: > > So you want an exit to userspace for every bus lock and leave it all to > > userspace. Yes, it's doable. > > In some cases we may not even want to have a VM exit: think > e.g. real-time/partitioning case when even in case of bus lock we may > not want to add additional latency just to count such events. Hmm, I suspect this isn't all that useful for real-time cases because they'd probably want to prevent the split lock in the first place, e.g. would prefer to use the #AC variant in fatal mode. Of course, the availability of split lock #AC is a whole other can of worms. But anyways, I 100% agree that this needs either an off-by-default module param or an opt-in per-VM capability. > I'd suggest we make the new capability tri-state: > - disabled (no vmexit, default) > - stats only (what this patch does) > - userspace exit > But maybe this is an overkill, I'd like to hear what others think. Userspace exit would also be interesting for debug. Another throttling option would be schedule() or cond_resched(), though that's probably getting into overkill territory.
On 7/23/2020 9:21 AM, Sean Christopherson wrote: > On Wed, Jul 01, 2020 at 04:49:49PM +0200, Vitaly Kuznetsov wrote: >> Xiaoyao Li <xiaoyao.li@intel.com> writes: >>> So you want an exit to userspace for every bus lock and leave it all to >>> userspace. Yes, it's doable. >> >> In some cases we may not even want to have a VM exit: think >> e.g. real-time/partitioning case when even in case of bus lock we may >> not want to add additional latency just to count such events. > > Hmm, I suspect this isn't all that useful for real-time cases because they'd > probably want to prevent the split lock in the first place, e.g. would prefer > to use the #AC variant in fatal mode. Of course, the availability of split > lock #AC is a whole other can of worms. > > But anyways, I 100% agree that this needs either an off-by-default module > param or an opt-in per-VM capability. > Maybe on-by-default or an opt-out per-VM capability? Turning it on introduces no overhead if no bus lock happens in guest but gives KVM the capability to track every potential bus lock. If user doesn't want the extra latency due to bus lock VM exit, it's better try to fix the bus lock, which also incurs high latency. >> I'd suggest we make the new capability tri-state: >> - disabled (no vmexit, default) >> - stats only (what this patch does) >> - userspace exit >> But maybe this is an overkill, I'd like to hear what others think. > > Userspace exit would also be interesting for debug. Another throttling > option would be schedule() or cond_reched(), though that's probably getting > into overkill territory. > We're going to leverage host's policy, i.e., calling handle_user_bus_lock(), for throttling, as proposed in https://lkml.kernel.org/r/1595021700-68460-1-git-send-email-fenghua.yu@intel.com
On Mon, Jul 27, 2020 at 12:38:53PM +0800, Xiaoyao Li wrote: > On 7/23/2020 9:21 AM, Sean Christopherson wrote: > >On Wed, Jul 01, 2020 at 04:49:49PM +0200, Vitaly Kuznetsov wrote: > >>Xiaoyao Li <xiaoyao.li@intel.com> writes: > >>>So you want an exit to userspace for every bus lock and leave it all to > >>>userspace. Yes, it's doable. > >> > >>In some cases we may not even want to have a VM exit: think > >>e.g. real-time/partitioning case when even in case of bus lock we may > >>not want to add additional latency just to count such events. > > > >Hmm, I suspect this isn't all that useful for real-time cases because they'd > >probably want to prevent the split lock in the first place, e.g. would prefer > >to use the #AC variant in fatal mode. Of course, the availability of split > >lock #AC is a whole other can of worms. > > > >But anyways, I 100% agree that this needs either an off-by-default module > >param or an opt-in per-VM capability. > > > > Maybe on-by-default or an opt-out per-VM capability? > Turning it on introduces no overhead if no bus lock happens in guest but > gives KVM the capability to track every potential bus lock. If user doesn't > want the extra latency due to bus lock VM exit, it's better try to fix the > bus lock, which also incurs high latency. Except that I doubt the physical system owner and VM owner are the same entity in the vast majority of KVM use cases. So yeah, in a perfect world the guest application that's causing bus locks would be fixed, but in practice there is likely no sane way for the KVM owner to inform the guest application owner that their application is broken, let alone fix said application. The caveat would be that we may need to enable this by default if the host kernel policy mandates it. > >>I'd suggest we make the new capability tri-state: > >>- disabled (no vmexit, default) > >>- stats only (what this patch does) > >>- userspace exit > >>But maybe this is an overkill, I'd like to hear what others think. 
> > > >Userspace exit would also be interesting for debug. Another throttling > >option would be schedule() or cond_reched(), though that's probably getting > >into overkill territory. > > > > We're going to leverage host's policy, i.e., calling handle_user_bus_lock(), > for throttling, as proposed in https://lkml.kernel.org/r/1595021700-68460-1-git-send-email-fenghua.yu@intel.com >
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f852ee350beb..efb5a117e11a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1051,6 +1051,7 @@ struct kvm_vcpu_stat { u64 req_event; u64 halt_poll_success_ns; u64 halt_poll_fail_ns; + u64 bus_locks; }; struct x86_instruction_info; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index cd7de4b401fe..93a880bc31a7 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -73,6 +73,7 @@ #define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA) #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE) +#define SECONDARY_EXEC_BUS_LOCK_DETECTION VMCS_CONTROL_BIT(BUS_LOCK_DETECTION) #define PIN_BASED_EXT_INTR_MASK VMCS_CONTROL_BIT(INTR_EXITING) #define PIN_BASED_NMI_EXITING VMCS_CONTROL_BIT(NMI_EXITING) diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index 9915990fd8cf..d9a74681a77d 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -83,5 +83,6 @@ #define VMX_FEATURE_TSC_SCALING ( 2*32+ 25) /* Scale hardware TSC when read in guest */ #define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */ #define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */ +#define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */ #endif /* _ASM_X86_VMXFEATURES_H */ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index b8ff9e8ac0d5..6bddfd7b87be 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -88,6 +88,7 @@ #define EXIT_REASON_XRSTORS 64 #define EXIT_REASON_UMWAIT 67 #define EXIT_REASON_TPAUSE 68 +#define EXIT_REASON_BUS_LOCK 74 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ 
-148,7 +149,8 @@ { EXIT_REASON_XSAVES, "XSAVES" }, \ { EXIT_REASON_XRSTORS, "XRSTORS" }, \ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ - { EXIT_REASON_TPAUSE, "TPAUSE" } + { EXIT_REASON_TPAUSE, "TPAUSE" }, \ + { EXIT_REASON_BUS_LOCK, "BUS_LOCK" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 647ee9a1b4e6..9622d7486f6d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2463,7 +2463,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX | - SECONDARY_EXEC_ENABLE_VMFUNC; + SECONDARY_EXEC_ENABLE_VMFUNC | + SECONDARY_EXEC_BUS_LOCK_DETECTION; if (cpu_has_sgx()) opt2 |= SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, @@ -5664,6 +5665,12 @@ static int handle_encls(struct kvm_vcpu *vcpu) return 1; } +static int handle_bus_lock(struct kvm_vcpu *vcpu) +{ + vcpu->stat.bus_locks++; + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -5720,6 +5727,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_VMFUNC] = handle_vmx_instruction, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, + [EXIT_REASON_BUS_LOCK] = handle_bus_lock, }; static const int kvm_vmx_max_exit_handlers = @@ -6830,6 +6838,13 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(vmx->exit_reason.failed_vmentry)) return EXIT_FASTPATH_NONE; + /* + * check the exit_reason to see if there is a bus lock + * happened in guest. 
+ */ + if (vmx->exit_reason.bus_lock_detected) + handle_bus_lock(vcpu); + vmx->loaded_vmcs->launched = 1; vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a676db7ac03c..b501a26778fc 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -104,7 +104,7 @@ union vmx_exit_reason { u32 reserved23 : 1; u32 reserved24 : 1; u32 reserved25 : 1; - u32 reserved26 : 1; + u32 bus_lock_detected : 1; u32 enclave_mode : 1; u32 smi_pending_mtf : 1; u32 smi_from_vmx_root : 1; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00c88c2f34e4..6258b453b8dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -220,6 +220,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("l1d_flush", l1d_flush), VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), + VCPU_STAT("bus_locks", bus_locks), VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), VM_STAT("mmu_pte_write", mmu_pte_write), VM_STAT("mmu_pte_updated", mmu_pte_updated),