
[v2,1/2] KVM: VMX: FIXED+PHYSICAL mode single target IPI fastpath

Message ID 1574145389-12149-1-git-send-email-wanpengli@tencent.com (mailing list archive)
State New, archived
Series [v2,1/2] KVM: VMX: FIXED+PHYSICAL mode single target IPI fastpath

Commit Message

Wanpeng Li Nov. 19, 2019, 6:36 a.m. UTC
From: Wanpeng Li <wanpengli@tencent.com>

In our product observations, ICR and TSCDEADLINE MSR writes cause the bulk
of MSR-write vmexits; multicast IPIs are not as common as unicast IPIs such
as RESCHEDULE_VECTOR and CALL_FUNCTION_SINGLE_VECTOR.

This patch tries to optimize the x2apic physical destination mode, fixed
delivery mode, single-target IPI case by delivering the IPI to the receiver
as soon as possible after the sender's ICR-write vmexit, avoiding various
checks when possible. This helps especially when running the guest w/
--overcommit cpu-pm=on, or when the target vCPU can keep running, in which
case the IPI can be injected into it immediately via posted interrupts.

Testing on a Xeon Skylake server:

The virtual IPI latency, from the sender issuing the IPI to the receiver
receiving it, is reduced by more than 200 CPU cycles.

Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
---
v1 -> v2:
 * add tracepoint
 * Instead of a separate vcpu->fast_vmexit, set vmx->exit_reason
   to -1 if the fast path succeeds.
 * move the "kvm_skip_emulated_instruction(vcpu)" to vmx_handle_exit
 * move the handling into vmx_handle_exit_irqoff()

 arch/x86/include/asm/kvm_host.h |  4 ++--
 arch/x86/include/uapi/asm/vmx.h |  1 +
 arch/x86/kvm/svm.c              |  4 ++--
 arch/x86/kvm/vmx/vmx.c          | 40 +++++++++++++++++++++++++++++++++++++---
 arch/x86/kvm/x86.c              |  5 +++--
 5 files changed, 45 insertions(+), 9 deletions(-)
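
For orientation, the condition the fast path keys on is roughly the following
(a sketch distilled from the patch under review; the helper name is made up
and the MSR-index check is written as a direct comparison, not the exact hunk):

static bool is_fast_single_target_ipi(struct kvm_vcpu *vcpu, u32 index, u64 data)
{
	/* x2APIC ICR write (MSR 0x830), fixed delivery, physical destination */
	return lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic) &&
	       index == APIC_BASE_MSR + (APIC_ICR >> 4) &&
	       (data & KVM_APIC_DEST_MASK) == APIC_DEST_PHYSICAL &&
	       (data & APIC_MODE_MASK) == APIC_DM_FIXED;
}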

Comments

Vitaly Kuznetsov Nov. 19, 2019, 11:54 a.m. UTC | #1
Wanpeng Li <kernellwp@gmail.com> writes:

> From: Wanpeng Li <wanpengli@tencent.com>
>
> ICR and TSCDEADLINE MSRs write cause the main MSRs write vmexits in 
> our product observation, multicast IPIs are not as common as unicast 
> IPI like RESCHEDULE_VECTOR and CALL_FUNCTION_SINGLE_VECTOR etc.
>
> This patch tries to optimize x2apic physical destination mode, fixed 
> delivery mode single target IPI by delivering IPI to receiver as soon 
> as possible after sender writes ICR vmexit to avoid various checks 
> when possible, especially when running guest w/ --overcommit cpu-pm=on
> or guest can keep running, IPI can be injected to target vCPU by 
> posted-interrupt immediately.
>
> Testing on Xeon Skylake server:
>
> The virtual IPI latency from sender send to receiver receive reduces 
> more than 200+ cpu cycles.
>
> Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> ---
> v1 -> v2:
>  * add tracepoint
>  * Instead of a separate vcpu->fast_vmexit, set exit_reason
>    to vmx->exit_reason to -1 if the fast path succeeds.
>  * move the "kvm_skip_emulated_instruction(vcpu)" to vmx_handle_exit
>  * moving the handling into vmx_handle_exit_irqoff()
>
>  arch/x86/include/asm/kvm_host.h |  4 ++--
>  arch/x86/include/uapi/asm/vmx.h |  1 +
>  arch/x86/kvm/svm.c              |  4 ++--
>  arch/x86/kvm/vmx/vmx.c          | 40 +++++++++++++++++++++++++++++++++++++---
>  arch/x86/kvm/x86.c              |  5 +++--
>  5 files changed, 45 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 898ab9e..0daafa9 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1084,7 +1084,7 @@ struct kvm_x86_ops {
>  	void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
>  
>  	void (*run)(struct kvm_vcpu *vcpu);
> -	int (*handle_exit)(struct kvm_vcpu *vcpu);
> +	int (*handle_exit)(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason);
>  	int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
>  	void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
>  	u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
> @@ -1134,7 +1134,7 @@ struct kvm_x86_ops {
>  	int (*check_intercept)(struct kvm_vcpu *vcpu,
>  			       struct x86_instruction_info *info,
>  			       enum x86_intercept_stage stage);
> -	void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
> +	void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason);
>  	bool (*mpx_supported)(void);
>  	bool (*xsaves_supported)(void);
>  	bool (*umip_emulated)(void);
> diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
> index 3eb8411..b33c6e1 100644
> --- a/arch/x86/include/uapi/asm/vmx.h
> +++ b/arch/x86/include/uapi/asm/vmx.h
> @@ -88,6 +88,7 @@
>  #define EXIT_REASON_XRSTORS             64
>  #define EXIT_REASON_UMWAIT              67
>  #define EXIT_REASON_TPAUSE              68
> +#define EXIT_REASON_NEED_SKIP_EMULATED_INSN -1

Maybe just EXIT_REASON_INSN_SKIP ?

>  
>  #define VMX_EXIT_REASONS \
>  	{ EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index d02a73a..c8e063a 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -4929,7 +4929,7 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
>  	*info2 = control->exit_info_2;
>  }
>  
> -static int handle_exit(struct kvm_vcpu *vcpu)
> +static int handle_exit(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
>  	struct kvm_run *kvm_run = vcpu->run;
> @@ -6187,7 +6187,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
>  	return ret;
>  }
>  
> -static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
> +static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason)
>  {
>  
>  }
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 621142e5..b98198d 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -5792,7 +5792,7 @@ void dump_vmcs(void)
>   * The guest has exited.  See if we can fix it or if we need userspace
>   * assistance.
>   */
> -static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> +static int vmx_handle_exit(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	u32 exit_reason = vmx->exit_reason;
> @@ -5878,7 +5878,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>  		}
>  	}
>  
> -	if (exit_reason < kvm_vmx_max_exit_handlers
> +	if (*vcpu_exit_reason == EXIT_REASON_NEED_SKIP_EMULATED_INSN) {
> +		kvm_skip_emulated_instruction(vcpu);
> +		return 1;
> +	} else if (exit_reason < kvm_vmx_max_exit_handlers
>  	    && kvm_vmx_exit_handlers[exit_reason]) {
>  #ifdef CONFIG_RETPOLINE
>  		if (exit_reason == EXIT_REASON_MSR_WRITE)
> @@ -6223,7 +6226,36 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
>  }
>  STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
>  
> -static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
> +static u32 handle_ipi_fastpath(struct kvm_vcpu *vcpu)
> +{
> +	u32 index;
> +	u64 data;
> +	int ret = 0;
> +
> +	if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic)) {
> +		/*
> +		 * fastpath to IPI target, FIXED+PHYSICAL which is popular
> +		 */
> +		index = kvm_rcx_read(vcpu);
> +		data = kvm_read_edx_eax(vcpu);
> +
> +		if (((index - APIC_BASE_MSR) << 4 == APIC_ICR) &&

What if index (RCX) is < APIC_BASE_MSR?

> +			((data & KVM_APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
> +			((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
> +
> +			trace_kvm_msr_write(index, data);
> +			kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
> +			ret = kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data);
> +
> +			if (ret == 0)
> +				return EXIT_REASON_NEED_SKIP_EMULATED_INSN;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu, u32 *exit_reason)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  
> @@ -6231,6 +6263,8 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
>  		handle_external_interrupt_irqoff(vcpu);
>  	else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
>  		handle_exception_nmi_irqoff(vmx);
> +	else if (vmx->exit_reason == EXIT_REASON_MSR_WRITE)
> +		*exit_reason = handle_ipi_fastpath(vcpu);
>  }
>  
>  static bool vmx_has_emulated_msr(int index)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 991dd01..a53bce3 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7981,6 +7981,7 @@ EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
>  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  {
>  	int r;
> +	u32 exit_reason = 0;
>  	bool req_int_win =
>  		dm_request_for_irq_injection(vcpu) &&
>  		kvm_cpu_accept_dm_intr(vcpu);
> @@ -8226,7 +8227,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  	vcpu->mode = OUTSIDE_GUEST_MODE;
>  	smp_wmb();
>  
> -	kvm_x86_ops->handle_exit_irqoff(vcpu);
> +	kvm_x86_ops->handle_exit_irqoff(vcpu, &exit_reason);
>  
>  	/*
>  	 * Consume any pending interrupts, including the possible source of
> @@ -8270,7 +8271,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  		kvm_lapic_sync_from_vapic(vcpu);
>  
>  	vcpu->arch.gpa_available = false;
> -	r = kvm_x86_ops->handle_exit(vcpu);
> +	r = kvm_x86_ops->handle_exit(vcpu, &exit_reason);
>  	return r;
>  
>  cancel_injection:
Wanpeng Li Nov. 19, 2019, 11:59 a.m. UTC | #2
On Tue, 19 Nov 2019 at 19:54, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>
> Wanpeng Li <kernellwp@gmail.com> writes:
>
> > From: Wanpeng Li <wanpengli@tencent.com>
> >
> > ICR and TSCDEADLINE MSRs write cause the main MSRs write vmexits in
> > our product observation, multicast IPIs are not as common as unicast
> > IPI like RESCHEDULE_VECTOR and CALL_FUNCTION_SINGLE_VECTOR etc.
> >
> > This patch tries to optimize x2apic physical destination mode, fixed
> > delivery mode single target IPI by delivering IPI to receiver as soon
> > as possible after sender writes ICR vmexit to avoid various checks
> > when possible, especially when running guest w/ --overcommit cpu-pm=on
> > or guest can keep running, IPI can be injected to target vCPU by
> > posted-interrupt immediately.
> >
> > Testing on Xeon Skylake server:
> >
> > The virtual IPI latency from sender send to receiver receive reduces
> > more than 200+ cpu cycles.
> >
> > Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> > ---
> > v1 -> v2:
> >  * add tracepoint
> >  * Instead of a separate vcpu->fast_vmexit, set exit_reason
> >    to vmx->exit_reason to -1 if the fast path succeeds.
> >  * move the "kvm_skip_emulated_instruction(vcpu)" to vmx_handle_exit
> >  * moving the handling into vmx_handle_exit_irqoff()
> >
> >  arch/x86/include/asm/kvm_host.h |  4 ++--
> >  arch/x86/include/uapi/asm/vmx.h |  1 +
> >  arch/x86/kvm/svm.c              |  4 ++--
> >  arch/x86/kvm/vmx/vmx.c          | 40 +++++++++++++++++++++++++++++++++++++---
> >  arch/x86/kvm/x86.c              |  5 +++--
> >  5 files changed, 45 insertions(+), 9 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 898ab9e..0daafa9 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1084,7 +1084,7 @@ struct kvm_x86_ops {
> >       void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
> >
> >       void (*run)(struct kvm_vcpu *vcpu);
> > -     int (*handle_exit)(struct kvm_vcpu *vcpu);
> > +     int (*handle_exit)(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason);
> >       int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
> >       void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
> >       u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
> > @@ -1134,7 +1134,7 @@ struct kvm_x86_ops {
> >       int (*check_intercept)(struct kvm_vcpu *vcpu,
> >                              struct x86_instruction_info *info,
> >                              enum x86_intercept_stage stage);
> > -     void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
> > +     void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason);
> >       bool (*mpx_supported)(void);
> >       bool (*xsaves_supported)(void);
> >       bool (*umip_emulated)(void);
> > diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
> > index 3eb8411..b33c6e1 100644
> > --- a/arch/x86/include/uapi/asm/vmx.h
> > +++ b/arch/x86/include/uapi/asm/vmx.h
> > @@ -88,6 +88,7 @@
> >  #define EXIT_REASON_XRSTORS             64
> >  #define EXIT_REASON_UMWAIT              67
> >  #define EXIT_REASON_TPAUSE              68
> > +#define EXIT_REASON_NEED_SKIP_EMULATED_INSN -1
>
> Maybe just EXIT_REASON_INSN_SKIP ?
>
> >
> >  #define VMX_EXIT_REASONS \
> >       { EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
> > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > index d02a73a..c8e063a 100644
> > --- a/arch/x86/kvm/svm.c
> > +++ b/arch/x86/kvm/svm.c
> > @@ -4929,7 +4929,7 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
> >       *info2 = control->exit_info_2;
> >  }
> >
> > -static int handle_exit(struct kvm_vcpu *vcpu)
> > +static int handle_exit(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason)
> >  {
> >       struct vcpu_svm *svm = to_svm(vcpu);
> >       struct kvm_run *kvm_run = vcpu->run;
> > @@ -6187,7 +6187,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
> >       return ret;
> >  }
> >
> > -static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
> > +static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason)
> >  {
> >
> >  }
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 621142e5..b98198d 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -5792,7 +5792,7 @@ void dump_vmcs(void)
> >   * The guest has exited.  See if we can fix it or if we need userspace
> >   * assistance.
> >   */
> > -static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> > +static int vmx_handle_exit(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason)
> >  {
> >       struct vcpu_vmx *vmx = to_vmx(vcpu);
> >       u32 exit_reason = vmx->exit_reason;
> > @@ -5878,7 +5878,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> >               }
> >       }
> >
> > -     if (exit_reason < kvm_vmx_max_exit_handlers
> > +     if (*vcpu_exit_reason == EXIT_REASON_NEED_SKIP_EMULATED_INSN) {
> > +             kvm_skip_emulated_instruction(vcpu);
> > +             return 1;
> > +     } else if (exit_reason < kvm_vmx_max_exit_handlers
> >           && kvm_vmx_exit_handlers[exit_reason]) {
> >  #ifdef CONFIG_RETPOLINE
> >               if (exit_reason == EXIT_REASON_MSR_WRITE)
> > @@ -6223,7 +6226,36 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
> >  }
> >  STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
> >
> > -static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
> > +static u32 handle_ipi_fastpath(struct kvm_vcpu *vcpu)
> > +{
> > +     u32 index;
> > +     u64 data;
> > +     int ret = 0;
> > +
> > +     if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic)) {
> > +             /*
> > +              * fastpath to IPI target, FIXED+PHYSICAL which is popular
> > +              */
> > +             index = kvm_rcx_read(vcpu);
> > +             data = kvm_read_edx_eax(vcpu);
> > +
> > +             if (((index - APIC_BASE_MSR) << 4 == APIC_ICR) &&
>
> What if index (RCX) is < APIC_BASE_MSR?

How about if (index == (APIC_BASE_MSR + 0x300) &&

>
> > +                     ((data & KVM_APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
> > +                     ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
> > +
> > +                     trace_kvm_msr_write(index, data);
> > +                     kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
> > +                     ret = kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data);
> > +
> > +                     if (ret == 0)
> > +                             return EXIT_REASON_NEED_SKIP_EMULATED_INSN;
> > +             }
> > +     }
> > +
> > +     return ret;
> > +}
> > +
> > +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu, u32 *exit_reason)
> >  {
> >       struct vcpu_vmx *vmx = to_vmx(vcpu);
> >
> > @@ -6231,6 +6263,8 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
> >               handle_external_interrupt_irqoff(vcpu);
> >       else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
> >               handle_exception_nmi_irqoff(vmx);
> > +     else if (vmx->exit_reason == EXIT_REASON_MSR_WRITE)
> > +             *exit_reason = handle_ipi_fastpath(vcpu);
> >  }
> >
> >  static bool vmx_has_emulated_msr(int index)
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 991dd01..a53bce3 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -7981,6 +7981,7 @@ EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
> >  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >  {
> >       int r;
> > +     u32 exit_reason = 0;
> >       bool req_int_win =
> >               dm_request_for_irq_injection(vcpu) &&
> >               kvm_cpu_accept_dm_intr(vcpu);
> > @@ -8226,7 +8227,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >       vcpu->mode = OUTSIDE_GUEST_MODE;
> >       smp_wmb();
> >
> > -     kvm_x86_ops->handle_exit_irqoff(vcpu);
> > +     kvm_x86_ops->handle_exit_irqoff(vcpu, &exit_reason);
> >
> >       /*
> >        * Consume any pending interrupts, including the possible source of
> > @@ -8270,7 +8271,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >               kvm_lapic_sync_from_vapic(vcpu);
> >
> >       vcpu->arch.gpa_available = false;
> > -     r = kvm_x86_ops->handle_exit(vcpu);
> > +     r = kvm_x86_ops->handle_exit(vcpu, &exit_reason);
> >       return r;
> >
> >  cancel_injection:
>
> --
> Vitaly
>
Liran Alon Nov. 19, 2019, 12:11 p.m. UTC | #3
> On 19 Nov 2019, at 8:36, Wanpeng Li <kernellwp@gmail.com> wrote:
> 
> From: Wanpeng Li <wanpengli@tencent.com>
> 
> ICR and TSCDEADLINE MSRs write cause the main MSRs write vmexits in 
> our product observation, multicast IPIs are not as common as unicast 
> IPI like RESCHEDULE_VECTOR and CALL_FUNCTION_SINGLE_VECTOR etc.
> 
> This patch tries to optimize x2apic physical destination mode, fixed 
> delivery mode single target IPI by delivering IPI to receiver as soon 
> as possible after sender writes ICR vmexit to avoid various checks 
> when possible, especially when running guest w/ --overcommit cpu-pm=on
> or guest can keep running, IPI can be injected to target vCPU by 
> posted-interrupt immediately.
> 
> Testing on Xeon Skylake server:
> 
> The virtual IPI latency from sender send to receiver receive reduces 
> more than 200+ cpu cycles.
> 
> Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> ---
> v1 -> v2:
> * add tracepoint
> * Instead of a separate vcpu->fast_vmexit, set exit_reason
>   to vmx->exit_reason to -1 if the fast path succeeds.
> * move the "kvm_skip_emulated_instruction(vcpu)" to vmx_handle_exit
> * moving the handling into vmx_handle_exit_irqoff()
> 
> arch/x86/include/asm/kvm_host.h |  4 ++--
> arch/x86/include/uapi/asm/vmx.h |  1 +
> arch/x86/kvm/svm.c              |  4 ++--
> arch/x86/kvm/vmx/vmx.c          | 40 +++++++++++++++++++++++++++++++++++++---
> arch/x86/kvm/x86.c              |  5 +++--
> 5 files changed, 45 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 898ab9e..0daafa9 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1084,7 +1084,7 @@ struct kvm_x86_ops {
> 	void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
> 
> 	void (*run)(struct kvm_vcpu *vcpu);
> -	int (*handle_exit)(struct kvm_vcpu *vcpu);
> +	int (*handle_exit)(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason);
> 	int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
> 	void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
> 	u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
> @@ -1134,7 +1134,7 @@ struct kvm_x86_ops {
> 	int (*check_intercept)(struct kvm_vcpu *vcpu,
> 			       struct x86_instruction_info *info,
> 			       enum x86_intercept_stage stage);
> -	void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
> +	void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason);
> 	bool (*mpx_supported)(void);
> 	bool (*xsaves_supported)(void);
> 	bool (*umip_emulated)(void);
> diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
> index 3eb8411..b33c6e1 100644
> --- a/arch/x86/include/uapi/asm/vmx.h
> +++ b/arch/x86/include/uapi/asm/vmx.h
> @@ -88,6 +88,7 @@
> #define EXIT_REASON_XRSTORS             64
> #define EXIT_REASON_UMWAIT              67
> #define EXIT_REASON_TPAUSE              68
> +#define EXIT_REASON_NEED_SKIP_EMULATED_INSN -1
> 
> #define VMX_EXIT_REASONS \
> 	{ EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index d02a73a..c8e063a 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -4929,7 +4929,7 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
> 	*info2 = control->exit_info_2;
> }
> 
> -static int handle_exit(struct kvm_vcpu *vcpu)
> +static int handle_exit(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason)
> {
> 	struct vcpu_svm *svm = to_svm(vcpu);
> 	struct kvm_run *kvm_run = vcpu->run;
> @@ -6187,7 +6187,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
> 	return ret;
> }
> 
> -static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
> +static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason)
> {
> 
> }
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 621142e5..b98198d 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -5792,7 +5792,7 @@ void dump_vmcs(void)
>  * The guest has exited.  See if we can fix it or if we need userspace
>  * assistance.
>  */
> -static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> +static int vmx_handle_exit(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason)

vmx_handle_exit() should get the second parameter by value and not by pointer, as it doesn’t need to modify it.

I would also rename the parameter to “accel_exit_completion” to indicate this is additional work that needs to happen to complete accelerated-exit handling.
This parameter should be an enum that currently has only 2 values: ACCEL_EXIT_NONE and ACCEL_EXIT_SKIP_EMUL_INS.
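
For illustration, a minimal sketch of those definitions and how the check
could look (names as suggested above; the surrounding code is only assumed):

enum accel_exit_completion {
	ACCEL_EXIT_NONE,
	ACCEL_EXIT_SKIP_EMUL_INS,
};

static int vmx_handle_exit(struct kvm_vcpu *vcpu,
			   enum accel_exit_completion accel_exit)
{
	/* ... existing sanity checks and exit bookkeeping ... */

	if (accel_exit == ACCEL_EXIT_SKIP_EMUL_INS) {
		kvm_skip_emulated_instruction(vcpu);
		return 1;
	}

	/* ... otherwise fall through to the regular exit handlers ... */
}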

> {
> 	struct vcpu_vmx *vmx = to_vmx(vcpu);
> 	u32 exit_reason = vmx->exit_reason;
> @@ -5878,7 +5878,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> 		}
> 	}
> 
> -	if (exit_reason < kvm_vmx_max_exit_handlers
> +	if (*vcpu_exit_reason == EXIT_REASON_NEED_SKIP_EMULATED_INSN) {
> +		kvm_skip_emulated_instruction(vcpu);
> +		return 1;
> +	} else if (exit_reason < kvm_vmx_max_exit_handlers
> 	    && kvm_vmx_exit_handlers[exit_reason]) {
> #ifdef CONFIG_RETPOLINE
> 		if (exit_reason == EXIT_REASON_MSR_WRITE)
> @@ -6223,7 +6226,36 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
> }
> STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
> 
> -static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
> +static u32 handle_ipi_fastpath(struct kvm_vcpu *vcpu)
> +{
> +	u32 index;
> +	u64 data;
> +	int ret = 0;
> +
> +	if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic)) {
> +		/*
> +		 * fastpath to IPI target, FIXED+PHYSICAL which is popular
> +		 */
> +		index = kvm_rcx_read(vcpu);
> +		data = kvm_read_edx_eax(vcpu);
> +
> +		if (((index - APIC_BASE_MSR) << 4 == APIC_ICR) &&
> +			((data & KVM_APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
> +			((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
> +
> +			trace_kvm_msr_write(index, data);

On a standard EXIT_REASON_MSR_WRITE VMExit, this trace will be printed only after LAPIC emulation logic happens.
You should preserve the same ordering.

> +			kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
> +			ret = kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data);
> +
> +			if (ret == 0)
> +				return EXIT_REASON_NEED_SKIP_EMULATED_INSN;
> +		}
> +	}
> +
> +	return ret;
> +}

Maybe it would be more elegant to modify this function as follows?

static int handle_accel_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
    if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic) &&
        ((data & KVM_APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
        ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {

        kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
        return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data);
    }

    return 1;
}

static enum accel_exit_completion handle_accel_set_msr_irqoff(struct kvm_vcpu *vcpu)
{
    u32 msr = kvm_rcx_read(vcpu);
    u64 data = kvm_read_edx_eax(vcpu);
    int ret = 0;

    switch (msr) {
    case APIC_BASE_MSR + (APIC_ICR >> 4):
        ret = handle_accel_set_x2apic_icr_irqoff(vcpu, msr, data);
        break;
    default:
        return ACCEL_EXIT_NONE;
    }

    if (!ret) {
        trace_kvm_msr_write(msr, data);
        return ACCEL_EXIT_SKIP_EMUL_INS;
    }

    return ACCEL_EXIT_NONE;
}

> +
> +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu, u32 *exit_reason)
> {
> 	struct vcpu_vmx *vmx = to_vmx(vcpu);
> 
> @@ -6231,6 +6263,8 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
> 		handle_external_interrupt_irqoff(vcpu);
> 	else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
> 		handle_exception_nmi_irqoff(vmx);
> +	else if (vmx->exit_reason == EXIT_REASON_MSR_WRITE)
> +		*exit_reason = handle_ipi_fastpath(vcpu);

1) This case requires a comment, as the only reason it is called here is an optimisation,
in contrast to the other cases, which must be called before interrupts are enabled on the host.

2) I would rename the handler to handle_accel_set_msr_irqoff(), to signal that this handler runs with host
interrupts disabled and to make it a general place for accelerating WRMSRs in case we would require more
in the future. (A sketch of such a call site follows below.)

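For illustration, such a call site might look roughly like this (names taken
from the suggestions above; the surrounding function body is assumed):

static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu,
				   enum accel_exit_completion *accel_exit)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
		handle_external_interrupt_irqoff(vcpu);
	else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
		handle_exception_nmi_irqoff(vmx);
	else if (vmx->exit_reason == EXIT_REASON_MSR_WRITE)
		/*
		 * Purely an optimisation: unlike the two cases above, this
		 * does not have to run before host interrupts are enabled.
		 */
		*accel_exit = handle_accel_set_msr_irqoff(vcpu);
}
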
-Liran

> }
> 
> static bool vmx_has_emulated_msr(int index)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 991dd01..a53bce3 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7981,6 +7981,7 @@ EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
> static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> {
> 	int r;
> +	u32 exit_reason = 0;
> 	bool req_int_win =
> 		dm_request_for_irq_injection(vcpu) &&
> 		kvm_cpu_accept_dm_intr(vcpu);
> @@ -8226,7 +8227,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> 	vcpu->mode = OUTSIDE_GUEST_MODE;
> 	smp_wmb();
> 
> -	kvm_x86_ops->handle_exit_irqoff(vcpu);
> +	kvm_x86_ops->handle_exit_irqoff(vcpu, &exit_reason);
> 
> 	/*
> 	 * Consume any pending interrupts, including the possible source of
> @@ -8270,7 +8271,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> 		kvm_lapic_sync_from_vapic(vcpu);
> 
> 	vcpu->arch.gpa_available = false;
> -	r = kvm_x86_ops->handle_exit(vcpu);
> +	r = kvm_x86_ops->handle_exit(vcpu, &exit_reason);
> 	return r;
> 
> cancel_injection:
> -- 
> 2.7.4
>
Vitaly Kuznetsov Nov. 19, 2019, 12:26 p.m. UTC | #4
Wanpeng Li <kernellwp@gmail.com> writes:

> On Tue, 19 Nov 2019 at 19:54, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>>
>> Wanpeng Li <kernellwp@gmail.com> writes:
>>
>> > From: Wanpeng Li <wanpengli@tencent.com>
>> >
>> > +     if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic)) {
>> > +             /*
>> > +              * fastpath to IPI target, FIXED+PHYSICAL which is popular
>> > +              */
>> > +             index = kvm_rcx_read(vcpu);
>> > +             data = kvm_read_edx_eax(vcpu);
>> > +
>> > +             if (((index - APIC_BASE_MSR) << 4 == APIC_ICR) &&
>>
>> What if index (RCX) is < APIC_BASE_MSR?
>
> How about if (index == (APIC_BASE_MSR + 0x300) &&
>

What about ' << 4', don't we still need it? :-) And better APIC_ICR
instead of 0x300...

Personally, I'd write something like

if (index > APIC_BASE_MSR && (index - APIC_BASE_MSR) == APIC_ICR >> 4)

and let compiler optimize this, I bet it's going to be equally good.
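
For reference, a sketch of the direct comparison being discussed (the helper
name is made up; the aliasing example is only illustrative):

/*
 * The shift-based check drops the top four bits of the u32 difference,
 * so an unrelated index can alias: e.g. (0x10000830 - APIC_BASE_MSR) << 4
 * truncates to 0x300 == APIC_ICR.  A direct comparison against the x2APIC
 * ICR MSR index (0x830) sidesteps both the wrap-around and the aliasing.
 */
static bool is_x2apic_icr_msr(u32 index)
{
	return index == APIC_BASE_MSR + (APIC_ICR >> 4);
}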
Sean Christopherson Nov. 19, 2019, 6:36 p.m. UTC | #5
On Tue, Nov 19, 2019 at 02:36:28PM +0800, Wanpeng Li wrote:
> From: Wanpeng Li <wanpengli@tencent.com>
> 
> ICR and TSCDEADLINE MSRs write cause the main MSRs write vmexits in 
> our product observation, multicast IPIs are not as common as unicast 
> IPI like RESCHEDULE_VECTOR and CALL_FUNCTION_SINGLE_VECTOR etc.
> 
> This patch tries to optimize x2apic physical destination mode, fixed 
> delivery mode single target IPI by delivering IPI to receiver as soon 
> as possible after sender writes ICR vmexit to avoid various checks 
> when possible, especially when running guest w/ --overcommit cpu-pm=on
> or guest can keep running, IPI can be injected to target vCPU by 
> posted-interrupt immediately.
> 
> Testing on Xeon Skylake server:
> 
> The virtual IPI latency from sender send to receiver receive reduces 
> more than 200+ cpu cycles.
> 
> Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> ---
> v1 -> v2:
>  * add tracepoint
>  * Instead of a separate vcpu->fast_vmexit, set exit_reason
>    to vmx->exit_reason to -1 if the fast path succeeds.
>  * move the "kvm_skip_emulated_instruction(vcpu)" to vmx_handle_exit
>  * moving the handling into vmx_handle_exit_irqoff()
> 
>  arch/x86/include/asm/kvm_host.h |  4 ++--
>  arch/x86/include/uapi/asm/vmx.h |  1 +
>  arch/x86/kvm/svm.c              |  4 ++--
>  arch/x86/kvm/vmx/vmx.c          | 40 +++++++++++++++++++++++++++++++++++++---
>  arch/x86/kvm/x86.c              |  5 +++--
>  5 files changed, 45 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 898ab9e..0daafa9 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1084,7 +1084,7 @@ struct kvm_x86_ops {
>  	void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
>  
>  	void (*run)(struct kvm_vcpu *vcpu);
> -	int (*handle_exit)(struct kvm_vcpu *vcpu);
> +	int (*handle_exit)(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason);
>  	int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
>  	void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
>  	u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
> @@ -1134,7 +1134,7 @@ struct kvm_x86_ops {
>  	int (*check_intercept)(struct kvm_vcpu *vcpu,
>  			       struct x86_instruction_info *info,
>  			       enum x86_intercept_stage stage);
> -	void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
> +	void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason);
>  	bool (*mpx_supported)(void);
>  	bool (*xsaves_supported)(void);
>  	bool (*umip_emulated)(void);
> diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
> index 3eb8411..b33c6e1 100644
> --- a/arch/x86/include/uapi/asm/vmx.h
> +++ b/arch/x86/include/uapi/asm/vmx.h
> @@ -88,6 +88,7 @@
>  #define EXIT_REASON_XRSTORS             64
>  #define EXIT_REASON_UMWAIT              67
>  #define EXIT_REASON_TPAUSE              68
> +#define EXIT_REASON_NEED_SKIP_EMULATED_INSN -1
>  
>  #define VMX_EXIT_REASONS \
>  	{ EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \

Rather than pass a custom exit reason around, can we simply handle *all*
x2apic ICR writes during handle_exit_irqoff() for both VMX and SVM?  The
only risk I can think of is that KVM could stall too long before enabling
IRQs.


From 1ea8ff1aa766928c869ef7c1eb437fe4f7b8daf9 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 19 Nov 2019 09:50:42 -0800
Subject: [PATCH] KVM: x86: Add a fast path for sending virtual IPIs in x2APIC
 mode

Add a fast path to handle writes to the ICR when the local APIC is
emulated in the kernel and x2APIC is enabled.  The fast path is invoked
at ->handle_exit_irqoff() to emulate only the effect of the ICR write
itself, i.e. the sending of IPIs.  Sending IPIs early in the VM-Exit
flow reduces the latency of virtual IPIs by avoiding the expensive bits
of transitioning from guest to host, e.g. reacquiring KVM's SRCU lock.

Suggested-by: Wanpeng Li <wanpengli@tencent.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
---
 arch/x86/kvm/emulate.c |  1 -
 arch/x86/kvm/lapic.c   |  5 +++--
 arch/x86/kvm/lapic.h   | 25 +++++++++++++++++++++++++
 arch/x86/kvm/svm.c     |  3 +++
 arch/x86/kvm/vmx/vmx.c |  2 ++
 5 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 952d1a4f4d7e..8313234e7d64 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -19,7 +19,6 @@
  */
 
 #include <linux/kvm_host.h>
-#include "kvm_cache_regs.h"
 #include <asm/kvm_emulate.h>
 #include <linux/stringify.h>
 #include <asm/debugreg.h>
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 452cedd6382b..0f02820332d4 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2655,9 +2655,10 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	if (reg == APIC_ICR2)
 		return 1;
 
-	/* if this is ICR write vector before command */
+	/* ICR writes are handled early by kvm_x2apic_fast_icr_write(). */
 	if (reg == APIC_ICR)
-		kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
+		return 0;
+
 	return kvm_lapic_reg_write(apic, reg, (u32)data);
 }
 
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index c1d77436126a..19fd2734d9e6 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -6,6 +6,8 @@
 
 #include <linux/kvm_host.h>
 
+#include "kvm_cache_regs.h"
+
 #define KVM_APIC_INIT		0
 #define KVM_APIC_SIPI		1
 #define KVM_APIC_LVT_NUM	6
@@ -245,4 +247,27 @@ static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
 	return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
 }
 
+/*
+ * Fast path for sending virtual IPIs immediately after VM-Exit.  Fault
+ * detection and injection, e.g. if x2apic is disabled, tracing and/or skipping
+ * of the emulated instruction are all handled in the standard WRMSR path,
+ * kvm_emulate_wrmsr().
+ */
+static inline void kvm_x2apic_fast_icr_write(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	u64 data;
+
+	if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
+		return;
+
+	if (kvm_rcx_read(vcpu) != (APIC_BASE_MSR + (APIC_ICR >> 4)))
+		return;
+
+	data = kvm_read_edx_eax(vcpu);
+
+	kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
+	WARN_ON_ONCE(kvm_lapic_reg_write(apic, APIC_ICR, (u32)data));
+}
+
 #endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d02a73a48461..713510210b29 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -6189,7 +6189,10 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
 
 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 {
+	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (svm->vmcb->control.exit_code == SVM_EXIT_MSR && svm->vmcb->control.exit_info_1)
+		kvm_x2apic_fast_icr_write(vcpu);
 }
 
 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 621142e55e28..82412c4085fc 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6231,6 +6231,8 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 		handle_external_interrupt_irqoff(vcpu);
 	else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
 		handle_exception_nmi_irqoff(vmx);
+	else if (vmx->exit_reason == EXIT_REASON_MSR_WRITE)
+		kvm_x2apic_fast_icr_write(vcpu);
 }
 
 static bool vmx_has_emulated_msr(int index)
Liran Alon Nov. 19, 2019, 6:58 p.m. UTC | #6
> On 19 Nov 2019, at 20:36, Sean Christopherson <sean.j.christopherson@intel.com> wrote:
> 
> On Tue, Nov 19, 2019 at 02:36:28PM +0800, Wanpeng Li wrote:
>> From: Wanpeng Li <wanpengli@tencent.com>
>> 
>> ICR and TSCDEADLINE MSRs write cause the main MSRs write vmexits in 
>> our product observation, multicast IPIs are not as common as unicast 
>> IPI like RESCHEDULE_VECTOR and CALL_FUNCTION_SINGLE_VECTOR etc.
>> 
>> This patch tries to optimize x2apic physical destination mode, fixed 
>> delivery mode single target IPI by delivering IPI to receiver as soon 
>> as possible after sender writes ICR vmexit to avoid various checks 
>> when possible, especially when running guest w/ --overcommit cpu-pm=on
>> or guest can keep running, IPI can be injected to target vCPU by 
>> posted-interrupt immediately.
>> 
>> Testing on Xeon Skylake server:
>> 
>> The virtual IPI latency from sender send to receiver receive reduces 
>> more than 200+ cpu cycles.
>> 
>> Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
>> ---
>> v1 -> v2:
>> * add tracepoint
>> * Instead of a separate vcpu->fast_vmexit, set exit_reason
>>   to vmx->exit_reason to -1 if the fast path succeeds.
>> * move the "kvm_skip_emulated_instruction(vcpu)" to vmx_handle_exit
>> * moving the handling into vmx_handle_exit_irqoff()
>> 
>> arch/x86/include/asm/kvm_host.h |  4 ++--
>> arch/x86/include/uapi/asm/vmx.h |  1 +
>> arch/x86/kvm/svm.c              |  4 ++--
>> arch/x86/kvm/vmx/vmx.c          | 40 +++++++++++++++++++++++++++++++++++++---
>> arch/x86/kvm/x86.c              |  5 +++--
>> 5 files changed, 45 insertions(+), 9 deletions(-)
>> 
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 898ab9e..0daafa9 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -1084,7 +1084,7 @@ struct kvm_x86_ops {
>> 	void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
>> 
>> 	void (*run)(struct kvm_vcpu *vcpu);
>> -	int (*handle_exit)(struct kvm_vcpu *vcpu);
>> +	int (*handle_exit)(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason);
>> 	int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
>> 	void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
>> 	u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
>> @@ -1134,7 +1134,7 @@ struct kvm_x86_ops {
>> 	int (*check_intercept)(struct kvm_vcpu *vcpu,
>> 			       struct x86_instruction_info *info,
>> 			       enum x86_intercept_stage stage);
>> -	void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
>> +	void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason);
>> 	bool (*mpx_supported)(void);
>> 	bool (*xsaves_supported)(void);
>> 	bool (*umip_emulated)(void);
>> diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
>> index 3eb8411..b33c6e1 100644
>> --- a/arch/x86/include/uapi/asm/vmx.h
>> +++ b/arch/x86/include/uapi/asm/vmx.h
>> @@ -88,6 +88,7 @@
>> #define EXIT_REASON_XRSTORS             64
>> #define EXIT_REASON_UMWAIT              67
>> #define EXIT_REASON_TPAUSE              68
>> +#define EXIT_REASON_NEED_SKIP_EMULATED_INSN -1
>> 
>> #define VMX_EXIT_REASONS \
>> 	{ EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
> 
> Rather than pass a custom exit reason around, can we simply handle *all*
> x2apic ICR writes during handle_exit_irqoff() for both VMX and SVM?  The
> only risk I can think of is that KVM could stall too long before enabling
> IRQs.
> 

I agree that if it doesn’t cause us to run for too long with interrupts disabled then this is a nicer approach.
However, I think we may generalise this patch a bit into a clear code-path where accelerated exit handling
should be put. See my other reply in this email thread and tell me what you think:
https://www.spinics.net/lists/kernel/msg3322282.html

-Liran

> 
> From 1ea8ff1aa766928c869ef7c1eb437fe4f7b8daf9 Mon Sep 17 00:00:00 2001
> From: Sean Christopherson <sean.j.christopherson@intel.com>
> Date: Tue, 19 Nov 2019 09:50:42 -0800
> Subject: [PATCH] KVM: x86: Add a fast path for sending virtual IPIs in x2APIC
> mode
> 
> Add a fast path to handle writes to the ICR when the local APIC is
> emulated in the kernel and x2APIC is enabled.  The fast path is invoked
> at ->handle_exit_irqoff() to emulate only the effect of the ICR write
> itself, i.e. the sending of IPIs.  Sending IPIs early in the VM-Exit
> flow reduces the latency of virtual IPIs by avoiding the expensive bits
> of transitioning from guest to host, e.g. reacquiring KVM's SRCU lock.
> 
> Suggested-by: Wanpeng Li <wanpengli@tencent.com>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> ---
> arch/x86/kvm/emulate.c |  1 -
> arch/x86/kvm/lapic.c   |  5 +++--
> arch/x86/kvm/lapic.h   | 25 +++++++++++++++++++++++++
> arch/x86/kvm/svm.c     |  3 +++
> arch/x86/kvm/vmx/vmx.c |  2 ++
> 5 files changed, 33 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index 952d1a4f4d7e..8313234e7d64 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -19,7 +19,6 @@
>  */
> 
> #include <linux/kvm_host.h>
> -#include "kvm_cache_regs.h"
> #include <asm/kvm_emulate.h>
> #include <linux/stringify.h>
> #include <asm/debugreg.h>
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 452cedd6382b..0f02820332d4 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -2655,9 +2655,10 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> 	if (reg == APIC_ICR2)
> 		return 1;
> 
> -	/* if this is ICR write vector before command */
> +	/* ICR writes are handled early by kvm_x2apic_fast_icr_write(). */
> 	if (reg == APIC_ICR)
> -		kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
> +		return 0;
> +
> 	return kvm_lapic_reg_write(apic, reg, (u32)data);
> }
> 
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index c1d77436126a..19fd2734d9e6 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -6,6 +6,8 @@
> 
> #include <linux/kvm_host.h>
> 
> +#include "kvm_cache_regs.h"
> +
> #define KVM_APIC_INIT		0
> #define KVM_APIC_SIPI		1
> #define KVM_APIC_LVT_NUM	6
> @@ -245,4 +247,27 @@ static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
> 	return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
> }
> 
> +/*
> + * Fast path for sending virtual IPIs immediately after VM-Exit.  Fault
> + * detection and injection, e.g. if x2apic is disabled, tracing and/or skipping
> + * of the emulated instruction are all handled in the standard WRMSR path,
> + * kvm_emulate_wrmsr().
> + */
> +static inline void kvm_x2apic_fast_icr_write(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +	u64 data;
> +
> +	if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
> +		return;
> +
> +	if (kvm_rcx_read(vcpu) != (APIC_BASE_MSR + (APIC_ICR >> 4)))
> +		return;
> +
> +	data = kvm_read_edx_eax(vcpu);
> +
> +	kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
> +	WARN_ON_ONCE(kvm_lapic_reg_write(apic, APIC_ICR, (u32)data));
> +}
> +
> #endif
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index d02a73a48461..713510210b29 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -6189,7 +6189,10 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
> 
> static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
> {
> +	struct vcpu_svm *svm = to_svm(vcpu);
> 
> +	if (svm->vmcb->control.exit_code == SVM_EXIT_MSR && svm->vmcb->control.exit_info_1)
> +		kvm_x2apic_fast_icr_write(vcpu);
> }
> 
> static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 621142e55e28..82412c4085fc 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -6231,6 +6231,8 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
> 		handle_external_interrupt_irqoff(vcpu);
> 	else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
> 		handle_exception_nmi_irqoff(vmx);
> +	else if (vmx->exit_reason == EXIT_REASON_MSR_WRITE)
> +		kvm_x2apic_fast_icr_write(vcpu);
> }
> 
> static bool vmx_has_emulated_msr(int index)
> -- 
> 2.24.0
>
Wanpeng Li Nov. 20, 2019, 12:24 a.m. UTC | #7
On Wed, 20 Nov 2019 at 02:36, Sean Christopherson
<sean.j.christopherson@intel.com> wrote:
[...]
>
> From 1ea8ff1aa766928c869ef7c1eb437fe4f7b8daf9 Mon Sep 17 00:00:00 2001
> From: Sean Christopherson <sean.j.christopherson@intel.com>
> Date: Tue, 19 Nov 2019 09:50:42 -0800
> Subject: [PATCH] KVM: x86: Add a fast path for sending virtual IPIs in x2APIC
>  mode
>
> Add a fast path to handle writes to the ICR when the local APIC is
> emulated in the kernel and x2APIC is enabled.  The fast path is invoked
> at ->handle_exit_irqoff() to emulate only the effect of the ICR write
> itself, i.e. the sending of IPIs.  Sending IPIs early in the VM-Exit
> flow reduces the latency of virtual IPIs by avoiding the expensive bits
> of transitioning from guest to host, e.g. reacquiring KVM's SRCU lock.
>
> Suggested-by: Wanpeng Li <wanpengli@tencent.com>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>

Hmm, I welcome the idea to improve the original patch, but this is too
much for me. :(
Wanpeng Li Nov. 20, 2019, 12:34 a.m. UTC | #8
On Wed, 20 Nov 2019 at 02:58, Liran Alon <liran.alon@oracle.com> wrote:
>
>
>
> > On 19 Nov 2019, at 20:36, Sean Christopherson <sean.j.christopherson@intel.com> wrote:
> >
> > On Tue, Nov 19, 2019 at 02:36:28PM +0800, Wanpeng Li wrote:
> >> From: Wanpeng Li <wanpengli@tencent.com>
> >>
> >> ICR and TSCDEADLINE MSRs write cause the main MSRs write vmexits in
> >> our product observation, multicast IPIs are not as common as unicast
> >> IPI like RESCHEDULE_VECTOR and CALL_FUNCTION_SINGLE_VECTOR etc.
> >>
> >> This patch tries to optimize x2apic physical destination mode, fixed
> >> delivery mode single target IPI by delivering IPI to receiver as soon
> >> as possible after sender writes ICR vmexit to avoid various checks
> >> when possible, especially when running guest w/ --overcommit cpu-pm=on
> >> or guest can keep running, IPI can be injected to target vCPU by
> >> posted-interrupt immediately.
> >>
> >> Testing on Xeon Skylake server:
> >>
> >> The virtual IPI latency from sender send to receiver receive reduces
> >> more than 200+ cpu cycles.
> >>
> >> Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> >> ---
> >> v1 -> v2:
> >> * add tracepoint
> >> * Instead of a separate vcpu->fast_vmexit, set exit_reason
> >>   to vmx->exit_reason to -1 if the fast path succeeds.
> >> * move the "kvm_skip_emulated_instruction(vcpu)" to vmx_handle_exit
> >> * moving the handling into vmx_handle_exit_irqoff()
> >>
> >> arch/x86/include/asm/kvm_host.h |  4 ++--
> >> arch/x86/include/uapi/asm/vmx.h |  1 +
> >> arch/x86/kvm/svm.c              |  4 ++--
> >> arch/x86/kvm/vmx/vmx.c          | 40 +++++++++++++++++++++++++++++++++++++---
> >> arch/x86/kvm/x86.c              |  5 +++--
> >> 5 files changed, 45 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >> index 898ab9e..0daafa9 100644
> >> --- a/arch/x86/include/asm/kvm_host.h
> >> +++ b/arch/x86/include/asm/kvm_host.h
> >> @@ -1084,7 +1084,7 @@ struct kvm_x86_ops {
> >>      void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
> >>
> >>      void (*run)(struct kvm_vcpu *vcpu);
> >> -    int (*handle_exit)(struct kvm_vcpu *vcpu);
> >> +    int (*handle_exit)(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason);
> >>      int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
> >>      void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
> >>      u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
> >> @@ -1134,7 +1134,7 @@ struct kvm_x86_ops {
> >>      int (*check_intercept)(struct kvm_vcpu *vcpu,
> >>                             struct x86_instruction_info *info,
> >>                             enum x86_intercept_stage stage);
> >> -    void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
> >> +    void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason);
> >>      bool (*mpx_supported)(void);
> >>      bool (*xsaves_supported)(void);
> >>      bool (*umip_emulated)(void);
> >> diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
> >> index 3eb8411..b33c6e1 100644
> >> --- a/arch/x86/include/uapi/asm/vmx.h
> >> +++ b/arch/x86/include/uapi/asm/vmx.h
> >> @@ -88,6 +88,7 @@
> >> #define EXIT_REASON_XRSTORS             64
> >> #define EXIT_REASON_UMWAIT              67
> >> #define EXIT_REASON_TPAUSE              68
> >> +#define EXIT_REASON_NEED_SKIP_EMULATED_INSN -1
> >>
> >> #define VMX_EXIT_REASONS \
> >>      { EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
> >
> > Rather than pass a custom exit reason around, can we simply handle *all*
> > x2apic ICR writes during handle_exit_irqoff() for both VMX and SVM?  The
> > only risk I can think of is that KVM could stall too long before enabling
> > IRQs.
> >
>
> I agree that if it doesn’t cause to run with interrupts disabled then this is a nicer approach.

In x2apic cluster mode, each cluster can contain up to 16 logical IDs; in
the worst case the target vCPUs have to be woken up one by one, and
select_idle_sibling() in the scheduler is a well-known CPU-burning search.
I'm afraid we would extend the interrupts-disabled and preemption-off time
too much.

> However, I think we may generalise a bit this patch to a clear code-path where accelerated exit handling
> should be put. See my other reply in this email thread and tell me what you think:
> https://www.spinics.net/lists/kernel/msg3322282.html

Thanks for the nicer code-path suggestion, I will try it. :)

    Wanpeng
Wanpeng Li Nov. 20, 2019, 3:49 a.m. UTC | #9
On Tue, 19 Nov 2019 at 20:11, Liran Alon <liran.alon@oracle.com> wrote:
>
>
>
> > On 19 Nov 2019, at 8:36, Wanpeng Li <kernellwp@gmail.com> wrote:
> >
> > From: Wanpeng Li <wanpengli@tencent.com>
> >
> > ICR and TSCDEADLINE MSRs write cause the main MSRs write vmexits in
> > our product observation, multicast IPIs are not as common as unicast
> > IPI like RESCHEDULE_VECTOR and CALL_FUNCTION_SINGLE_VECTOR etc.
> >
> > This patch tries to optimize x2apic physical destination mode, fixed
> > delivery mode single target IPI by delivering IPI to receiver as soon
> > as possible after sender writes ICR vmexit to avoid various checks
> > when possible, especially when running guest w/ --overcommit cpu-pm=on
> > or guest can keep running, IPI can be injected to target vCPU by
> > posted-interrupt immediately.
> >
> > Testing on Xeon Skylake server:
> >
> > The virtual IPI latency from sender send to receiver receive reduces
> > more than 200+ cpu cycles.
> >
> > Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> > ---
> > v1 -> v2:
> > * add tracepoint
> > * Instead of a separate vcpu->fast_vmexit, set exit_reason
> >   to vmx->exit_reason to -1 if the fast path succeeds.
> > * move the "kvm_skip_emulated_instruction(vcpu)" to vmx_handle_exit
> > * moving the handling into vmx_handle_exit_irqoff()
> >
> > arch/x86/include/asm/kvm_host.h |  4 ++--
> > arch/x86/include/uapi/asm/vmx.h |  1 +
> > arch/x86/kvm/svm.c              |  4 ++--
> > arch/x86/kvm/vmx/vmx.c          | 40 +++++++++++++++++++++++++++++++++++++---
> > arch/x86/kvm/x86.c              |  5 +++--
> > 5 files changed, 45 insertions(+), 9 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 898ab9e..0daafa9 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1084,7 +1084,7 @@ struct kvm_x86_ops {
> >       void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
> >
> >       void (*run)(struct kvm_vcpu *vcpu);
> > -     int (*handle_exit)(struct kvm_vcpu *vcpu);
> > +     int (*handle_exit)(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason);
> >       int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
> >       void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
> >       u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
> > @@ -1134,7 +1134,7 @@ struct kvm_x86_ops {
> >       int (*check_intercept)(struct kvm_vcpu *vcpu,
> >                              struct x86_instruction_info *info,
> >                              enum x86_intercept_stage stage);
> > -     void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
> > +     void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason);
> >       bool (*mpx_supported)(void);
> >       bool (*xsaves_supported)(void);
> >       bool (*umip_emulated)(void);
> > diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
> > index 3eb8411..b33c6e1 100644
> > --- a/arch/x86/include/uapi/asm/vmx.h
> > +++ b/arch/x86/include/uapi/asm/vmx.h
> > @@ -88,6 +88,7 @@
> > #define EXIT_REASON_XRSTORS             64
> > #define EXIT_REASON_UMWAIT              67
> > #define EXIT_REASON_TPAUSE              68
> > +#define EXIT_REASON_NEED_SKIP_EMULATED_INSN -1
> >
> > #define VMX_EXIT_REASONS \
> >       { EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
> > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > index d02a73a..c8e063a 100644
> > --- a/arch/x86/kvm/svm.c
> > +++ b/arch/x86/kvm/svm.c
> > @@ -4929,7 +4929,7 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
> >       *info2 = control->exit_info_2;
> > }
> >
> > -static int handle_exit(struct kvm_vcpu *vcpu)
> > +static int handle_exit(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason)
> > {
> >       struct vcpu_svm *svm = to_svm(vcpu);
> >       struct kvm_run *kvm_run = vcpu->run;
> > @@ -6187,7 +6187,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
> >       return ret;
> > }
> >
> > -static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
> > +static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason)
> > {
> >
> > }
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 621142e5..b98198d 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -5792,7 +5792,7 @@ void dump_vmcs(void)
> >  * The guest has exited.  See if we can fix it or if we need userspace
> >  * assistance.
> >  */
> > -static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> > +static int vmx_handle_exit(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason)
>
> vmx_handle_exit() should get second parameter by value and not by pointer. As it doesn’t need to modify it.
>
> I would also rename parameter to “accel_exit_completion” to indicate this is additional work that needs to happen to complete accelerated-exit handling.
> This parameter should be an enum that currently only have 2 values: ACCEL_EXIT_NONE and ACCEL_EXIT_SKIP_EMUL_INS.
>
> > {
> >       struct vcpu_vmx *vmx = to_vmx(vcpu);
> >       u32 exit_reason = vmx->exit_reason;
> > @@ -5878,7 +5878,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> >               }
> >       }
> >
> > -     if (exit_reason < kvm_vmx_max_exit_handlers
> > +     if (*vcpu_exit_reason == EXIT_REASON_NEED_SKIP_EMULATED_INSN) {
> > +             kvm_skip_emulated_instruction(vcpu);
> > +             return 1;
> > +     } else if (exit_reason < kvm_vmx_max_exit_handlers
> >           && kvm_vmx_exit_handlers[exit_reason]) {
> > #ifdef CONFIG_RETPOLINE
> >               if (exit_reason == EXIT_REASON_MSR_WRITE)
> > @@ -6223,7 +6226,36 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
> > }
> > STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
> >
> > -static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
> > +static u32 handle_ipi_fastpath(struct kvm_vcpu *vcpu)
> > +{
> > +     u32 index;
> > +     u64 data;
> > +     int ret = 0;
> > +
> > +     if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic)) {
> > +             /*
> > +              * fastpath to IPI target, FIXED+PHYSICAL which is popular
> > +              */
> > +             index = kvm_rcx_read(vcpu);
> > +             data = kvm_read_edx_eax(vcpu);
> > +
> > +             if (((index - APIC_BASE_MSR) << 4 == APIC_ICR) &&
> > +                     ((data & KVM_APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
> > +                     ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
> > +
> > +                     trace_kvm_msr_write(index, data);
>
> On a standard EXIT_REASON_MSR_WRITE VMExit, this trace will be printed only after LAPIC emulation logic happens.
> You should preserve same ordering.
>
> > +                     kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
> > +                     ret = kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data);
> > +
> > +                     if (ret == 0)
> > +                             return EXIT_REASON_NEED_SKIP_EMULATED_INSN;
> > +             }
> > +     }
> > +
> > +     return ret;
> > +}
>
> Maybe it would be more elegant to modify this function as follows?
>
> static int handle_accel_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> {
>     if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic) &&
>         ((data & KVM_APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
>         ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
>
>         kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
>         return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data);
>     }
>
>     return 1;
> }
>
> static enum accel_exit_completion handle_accel_set_msr_irqoff(struct kvm_vcpu *vcpu)
> {
>     u32 msr = kvm_rcx_read(vcpu);
>     u64 data = kvm_read_edx_eax(vcpu);
>     int ret = 0;
>
>     switch (msr) {
>     case APIC_BASE_MSR + (APIC_ICR >> 4):
>         ret = handle_accel_set_x2apic_icr_irqoff(vcpu, msr, data);
>         break;
>     default:
>         return ACCEL_EXIT_NONE;
>     }
>
>     if (!ret) {
>         trace_kvm_msr_write(msr, data);
>         return ACCEL_EXIT_SKIP_EMUL_INS;
>     }
>
>     return ACCEL_EXIT_NONE;
> }
>
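With that shape, the consumer in vmx_handle_exit() could key off the enum instead of the magic -1 exit reason; a sketch under those assumptions, not the final code:

    if (accel_exit == ACCEL_EXIT_SKIP_EMUL_INS) {
        /* The WRMSR side effects were already handled in the irqoff path. */
        kvm_skip_emulated_instruction(vcpu);
        return 1;
    }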
> > +
> > +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu, u32 *exit_reason)
> > {
> >       struct vcpu_vmx *vmx = to_vmx(vcpu);
> >
> > @@ -6231,6 +6263,8 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
> >               handle_external_interrupt_irqoff(vcpu);
> >       else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
> >               handle_exception_nmi_irqoff(vmx);
> > +     else if (vmx->exit_reason == EXIT_REASON_MSR_WRITE)
> > +             *exit_reason = handle_ipi_fastpath(vcpu);
>
> 1) This case requires a comment as the only reason it is called here is an optimisation.
> In contrast to the other cases which must be called before interrupts are enabled on the host.
>
> 2) I would rename handler to handle_accel_set_msr_irqoff().
> To signal this handler runs with host interrupts disabled and to make it a general place for accelerating WRMSRs in case we would require more in the future.

Yes, TSCDEADLINE/VMX PREEMPTION TIMER is on my todo list after this is
merged upstream; I'll handle all the comments in v3. Thanks for making this
even nicer. :)

    Wanpeng
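For reference, points (1) and (2) could land roughly as below in v3 — the handler name is the one suggested above; the comment wording and the pointer-to-enum parameter are only illustrative:

static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu,
                                   enum accel_exit_completion *accel_exit)
{
    struct vcpu_vmx *vmx = to_vmx(vcpu);

    if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
        handle_external_interrupt_irqoff(vcpu);
    else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
        handle_exception_nmi_irqoff(vmx);
    /*
     * Handling the MSR write here is purely an optimization: unlike the
     * two cases above it does not have to run before host interrupts are
     * re-enabled, but doing so lets the IPI reach the target vCPU sooner.
     */
    else if (vmx->exit_reason == EXIT_REASON_MSR_WRITE)
        *accel_exit = handle_accel_set_msr_irqoff(vcpu);
}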
Sean Christopherson Nov. 20, 2019, 5:02 p.m. UTC | #10
On Wed, Nov 20, 2019 at 11:49:36AM +0800, Wanpeng Li wrote:
> On Tue, 19 Nov 2019 at 20:11, Liran Alon <liran.alon@oracle.com> wrote:
> > > +
> > > +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu, u32 *exit_reason)
> > > {
> > >       struct vcpu_vmx *vmx = to_vmx(vcpu);
> > >
> > > @@ -6231,6 +6263,8 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
> > >               handle_external_interrupt_irqoff(vcpu);
> > >       else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
> > >               handle_exception_nmi_irqoff(vmx);
> > > +     else if (vmx->exit_reason == EXIT_REASON_MSR_WRITE)
> > > +             *exit_reason = handle_ipi_fastpath(vcpu);
> >
> > 1) This case requires a comment as the only reason it is called here is an
> > optimisation.  In contrast to the other cases which must be called before
> > interrupts are enabled on the host.
> >
> > 2) I would rename handler to handle_accel_set_msr_irqoff().  To signal this
> > handler runs with host interrupts disabled and to make it a general place
> > for accelerating WRMSRs in case we would require more in the future.
> 
> > Yes, TSCDEADLINE/VMX PREEMPTION TIMER is on my todo list after this is merged
> > upstream; I'll handle all the comments in v3. Thanks for making this even
> > nicer. :)

Handling those is very different than what is being proposed here though.
For this case, only the side effect of the WRMSR is being expedited, KVM
still goes through the heavy VM-Exit handler path to handle emulating the
WRMSR itself.

To truly expedite things like TSCDEADLINE, the entire emulation of WRMSR
would need to be handled without going through the standard VM-Exit handler,
which is a much more fundamental change to vcpu_enter_guest() and has
different requirements.  For example, keeping IRQs disabled is pointless
for generic WRMSR emulation since the interrupt will fire as soon as KVM
resumes the guest, whereas keeping IRQs disabled for processing ICR writes
is a valid optimization since recognition of the IPI on the dest vCPU
isn't dependent on KVM resuming the current vCPU.

Rather than optimizing full emulation flows one at a time, i.e. exempting
the ICR case, I wonder if we're better off figuring out a way to improve
the performance of VM-Exit handling at a larger scale, e.g. avoid locking
kvm->srcu unnecessarily, Andrea's retpoline changes, etc...

Oh, a random thought, this fast path needs to be skipped if KVM is
running L2, i.e. is_guest_mode(vcpu) is true.
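In terms of the v2 handle_ipi_fastpath(), that could be an early bail-out at the top of the function; returning 0 keeps the normal exit handling (a sketch, the exact placement is an assumption):

    /* Never take the fastpath while running a nested (L2) guest. */
    if (is_guest_mode(vcpu))
        return 0;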
Paolo Bonzini Nov. 20, 2019, 5:43 p.m. UTC | #11
On 19/11/19 13:26, Vitaly Kuznetsov wrote:
> What about ' << 4', don't we still need it? :-) And better APIC_ICR
> instead of 0x300...
> 
> Personally, I'd write something like
> 
> if (index > APIC_BASE_MSR && (index - APIC_BASE_MSR) == APIC_ICR >> 4)
> 
> and let the compiler optimize this; I bet it's going to be equally good.

Or "index == APIC_BASE_MSR + (APIC_ICR >> 4)".

Paolo
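For concreteness, both spellings collapse to the same constant compare — APIC_BASE_MSR is 0x800 and APIC_ICR is 0x300 in apicdef.h, so the x2APIC ICR MSR index is 0x830 (the local variable below is only illustrative):

    u32 index = kvm_rcx_read(vcpu);

    /* 0x800 + (0x300 >> 4) == 0x830, i.e. the x2APIC ICR MSR. */
    bool is_icr_write = (index == APIC_BASE_MSR + (APIC_ICR >> 4));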
Wanpeng Li Nov. 21, 2019, 1:34 a.m. UTC | #12
On Thu, 21 Nov 2019 at 01:02, Sean Christopherson
<sean.j.christopherson@intel.com> wrote:
>
> On Wed, Nov 20, 2019 at 11:49:36AM +0800, Wanpeng Li wrote:
> > On Tue, 19 Nov 2019 at 20:11, Liran Alon <liran.alon@oracle.com> wrote:
> > > > +
> > > > +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu, u32 *exit_reason)
> > > > {
> > > >       struct vcpu_vmx *vmx = to_vmx(vcpu);
> > > >
> > > > @@ -6231,6 +6263,8 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
> > > >               handle_external_interrupt_irqoff(vcpu);
> > > >       else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
> > > >               handle_exception_nmi_irqoff(vmx);
> > > > +     else if (vmx->exit_reason == EXIT_REASON_MSR_WRITE)
> > > > +             *exit_reason = handle_ipi_fastpath(vcpu);
> > >
> > > 1) This case requires a comment as the only reason it is called here is an
> > > optimisation.  In contrast to the other cases which must be called before
> > > interrupts are enabled on the host.
> > >
> > > 2) I would rename handler to handle_accel_set_msr_irqoff().  To signal this
> > > handler runs with host interrupts disabled and to make it a general place
> > > for accelerating WRMSRs in case we would require more in the future.
> >
> > Yes, TSCDEADLINE/VMX PREEMPTION TIMER is on my todo list after this is merged
> > upstream; I'll handle all the comments in v3. Thanks for making this even
> > nicer. :)
>
> Handling those is very different than what is being proposed here though.
> For this case, only the side effect of the WRMSR is being expedited, KVM
> still goes through the heavy VM-Exit handler path to handle emulating the
> WRMSR itself.
>
> To truly expedite things like TSCDEADLINE, the entire emulation of WRMSR
> would need to be handled without going through the standard VM-Exit handler,
> which is a much more fundamental change to vcpu_enter_guest() and has
> different requirements.  For example, keeping IRQs disabled is pointless
> for generic WRMSR emulation since the interrupt will fire as soon as KVM
> resumes the guest, whereas keeping IRQs disabled for processing ICR writes
> is a valid optimization since recognition of the IPI on the dest vCPU
> isn't dependent on KVM resuming the current vCPU.
>
> Rather than optimizing full emulation flows one at a time, i.e. exempting
> the ICR case, I wonder if we're better off figuring out a way to improve
> the performance of VM-Exit handling at a larger scale, e.g. avoid locking
> kvm->srcu unnecessarily, Andrea's retpoline changes, etc...

I use the latest kvm/queue, so Andrea's patch is there. As you know,
improving the performance of vmexit is a long-term effort. But let's
get v4 upstream first. :)

    Wanpeng
Wanpeng Li Nov. 21, 2019, 3:19 a.m. UTC | #13
On Thu, 21 Nov 2019 at 01:43, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 19/11/19 13:26, Vitaly Kuznetsov wrote:
> > What about ' << 4', don't we still need it? :-) And better APIC_ICR
> > instead of 0x300...
> >
> > Personally, I'd write something like
> >
> > if (index > APIC_BASE_MSR && (index - APIC_BASE_MSR) == APIC_ICR >> 4)
> >
> > and let the compiler optimize this; I bet it's going to be equally good.
>
> Or "index == APIC_BASE_MSR + (APIC_ICR >> 4)".

It is done in v3 and v4. Please have a look. :)

    Wanpeng
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 898ab9e..0daafa9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1084,7 +1084,7 @@  struct kvm_x86_ops {
 	void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
 
 	void (*run)(struct kvm_vcpu *vcpu);
-	int (*handle_exit)(struct kvm_vcpu *vcpu);
+	int (*handle_exit)(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason);
 	int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
 	void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
 	u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
@@ -1134,7 +1134,7 @@  struct kvm_x86_ops {
 	int (*check_intercept)(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage);
-	void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
+	void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason);
 	bool (*mpx_supported)(void);
 	bool (*xsaves_supported)(void);
 	bool (*umip_emulated)(void);
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 3eb8411..b33c6e1 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -88,6 +88,7 @@ 
 #define EXIT_REASON_XRSTORS             64
 #define EXIT_REASON_UMWAIT              67
 #define EXIT_REASON_TPAUSE              68
+#define EXIT_REASON_NEED_SKIP_EMULATED_INSN -1
 
 #define VMX_EXIT_REASONS \
 	{ EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d02a73a..c8e063a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4929,7 +4929,7 @@  static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 	*info2 = control->exit_info_2;
 }
 
-static int handle_exit(struct kvm_vcpu *vcpu)
+static int handle_exit(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct kvm_run *kvm_run = vcpu->run;
@@ -6187,7 +6187,7 @@  static int svm_check_intercept(struct kvm_vcpu *vcpu,
 	return ret;
 }
 
-static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason)
 {
 
 }
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 621142e5..b98198d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5792,7 +5792,7 @@  void dump_vmcs(void)
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
  */
-static int vmx_handle_exit(struct kvm_vcpu *vcpu)
+static int vmx_handle_exit(struct kvm_vcpu *vcpu, u32 *vcpu_exit_reason)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 exit_reason = vmx->exit_reason;
@@ -5878,7 +5878,10 @@  static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		}
 	}
 
-	if (exit_reason < kvm_vmx_max_exit_handlers
+	if (*vcpu_exit_reason == EXIT_REASON_NEED_SKIP_EMULATED_INSN) {
+		kvm_skip_emulated_instruction(vcpu);
+		return 1;
+	} else if (exit_reason < kvm_vmx_max_exit_handlers
 	    && kvm_vmx_exit_handlers[exit_reason]) {
 #ifdef CONFIG_RETPOLINE
 		if (exit_reason == EXIT_REASON_MSR_WRITE)
@@ -6223,7 +6226,36 @@  static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 }
 STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
 
-static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+static u32 handle_ipi_fastpath(struct kvm_vcpu *vcpu)
+{
+	u32 index;
+	u64 data;
+	int ret = 0;
+
+	if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic)) {
+		/*
+		 * fastpath to IPI target, FIXED+PHYSICAL which is popular
+		 */
+		index = kvm_rcx_read(vcpu);
+		data = kvm_read_edx_eax(vcpu);
+
+		if (((index - APIC_BASE_MSR) << 4 == APIC_ICR) &&
+			((data & KVM_APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
+			((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
+
+			trace_kvm_msr_write(index, data);
+			kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
+			ret = kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data);
+
+			if (ret == 0)
+				return EXIT_REASON_NEED_SKIP_EMULATED_INSN;
+		}
+	}
+
+	return ret;
+}
+
+static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu, u32 *exit_reason)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -6231,6 +6263,8 @@  static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 		handle_external_interrupt_irqoff(vcpu);
 	else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
 		handle_exception_nmi_irqoff(vmx);
+	else if (vmx->exit_reason == EXIT_REASON_MSR_WRITE)
+		*exit_reason = handle_ipi_fastpath(vcpu);
 }
 
 static bool vmx_has_emulated_msr(int index)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 991dd01..a53bce3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7981,6 +7981,7 @@  EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
+	u32 exit_reason = 0;
 	bool req_int_win =
 		dm_request_for_irq_injection(vcpu) &&
 		kvm_cpu_accept_dm_intr(vcpu);
@@ -8226,7 +8227,7 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
 
-	kvm_x86_ops->handle_exit_irqoff(vcpu);
+	kvm_x86_ops->handle_exit_irqoff(vcpu, &exit_reason);
 
 	/*
 	 * Consume any pending interrupts, including the possible source of
@@ -8270,7 +8271,7 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		kvm_lapic_sync_from_vapic(vcpu);
 
 	vcpu->arch.gpa_available = false;
-	r = kvm_x86_ops->handle_exit(vcpu);
+	r = kvm_x86_ops->handle_exit(vcpu, &exit_reason);
 	return r;
 
 cancel_injection: