
[v3] KVM: LAPIC: Recalculate apic map in batch

Message ID 1582684862-10880-1-git-send-email-wanpengli@tencent.com (mailing list archive)
State New, archived
Series [v3] KVM: LAPIC: Recalculate apic map in batch

Commit Message

Wanpeng Li Feb. 26, 2020, 2:41 a.m. UTC
From: Wanpeng Li <wanpengli@tencent.com>

In the vCPU reset and set APIC_BASE MSR paths, the apic map is recalculated
several times; each recalculation takes 10+ us, as observed by ftrace in my
non-overcommit environment, because of the expensive memory allocation, mutex
and RCU operations involved. This patch optimizes it by recalculating the apic
map in batch. I hope this can benefit serverless scenarios, which frequently
create/destroy VMs.

Before patch:

kvm_lapic_reset  ~27us

After patch:

kvm_lapic_reset  ~14us 

Observed by ftrace, an improvement of ~48%.

Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
---
v2 -> v3:
 * move apic_map_dirty to kvm_arch
 * add the suggestions from Paolo

v1 -> v2:
 * add apic_map_dirty to kvm_lapic
 * on the error path in kvm_apic_set_state, recalculate unconditionally

 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/lapic.c            | 46 ++++++++++++++++++++++++++++++++---------
 arch/x86/kvm/lapic.h            |  1 +
 arch/x86/kvm/x86.c              |  1 +
 4 files changed, 39 insertions(+), 10 deletions(-)
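
The batching idea above, in isolation, is just "mark the map dirty on the hot
path, rebuild it once at the end of the path". Below is a minimal
self-contained userspace sketch of that pattern (plain C with pthreads; all
names are made up and do not correspond to the kernel implementation),
including the re-check of the dirty flag under the lock that the patch below
also performs:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;
static bool map_dirty;
static int map_generation;      /* stands in for the expensive map rebuild */

static void set_xapic_id(void)  /* cheap: only marks the map stale */
{
        map_dirty = true;
}

static void set_ldr(void)       /* likewise */
{
        map_dirty = true;
}

static void recalculate(void)
{
        if (!map_dirty)         /* fast path: nothing changed */
                return;

        pthread_mutex_lock(&map_lock);
        if (map_dirty) {        /* re-check under the lock */
                map_generation++;       /* the expensive rebuild runs once */
                map_dirty = false;
        }
        pthread_mutex_unlock(&map_lock);
}

int main(void)
{
        /* A reset path touches several registers... */
        set_xapic_id();
        set_ldr();
        set_xapic_id();
        /* ...but the map is rebuilt only once, at the end of the path. */
        recalculate();
        printf("map rebuilds: %d\n", map_generation);   /* prints 1 */
        return 0;
}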

Comments

Paolo Bonzini March 2, 2020, 4:32 p.m. UTC | #1
On 26/02/20 03:41, Wanpeng Li wrote:
> From: Wanpeng Li <wanpengli@tencent.com>
> 
> In the vCPU reset and set APIC_BASE MSR paths, the apic map is recalculated
> several times; each recalculation takes 10+ us, as observed by ftrace in my
> non-overcommit environment, because of the expensive memory allocation, mutex
> and RCU operations involved. This patch optimizes it by recalculating the apic
> map in batch. I hope this can benefit serverless scenarios, which frequently
> create/destroy VMs.
> 
> Before patch:
> 
> kvm_lapic_reset  ~27us
> 
> After patch:
> 
> kvm_lapic_reset  ~14us 
> 
> Observed by ftrace, an improvement of ~48%.
> 
> Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> ---
> v2 -> v3:
>  * move apic_map_dirty to kvm_arch
>  * add the suggestions from Paolo
> 
> v1 -> v2:
>  * add apic_map_dirty to kvm_lapic
>  * on the error path in kvm_apic_set_state, recalculate unconditionally
> 
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/lapic.c            | 46 ++++++++++++++++++++++++++++++++---------
>  arch/x86/kvm/lapic.h            |  1 +
>  arch/x86/kvm/x86.c              |  1 +
>  4 files changed, 39 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 40a0c0f..4380ed1 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -920,6 +920,7 @@ struct kvm_arch {
>  	atomic_t vapics_in_nmi_mode;
>  	struct mutex apic_map_lock;
>  	struct kvm_apic_map *apic_map;
> +	bool apic_map_dirty;
>  
>  	bool apic_access_page_done;
>  	unsigned long apicv_inhibit_reasons;
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index afcd30d..de832aa 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -164,14 +164,28 @@ static void kvm_apic_map_free(struct rcu_head *rcu)
>  	kvfree(map);
>  }
>  
> -static void recalculate_apic_map(struct kvm *kvm)
> +void kvm_recalculate_apic_map(struct kvm *kvm)
>  {
>  	struct kvm_apic_map *new, *old = NULL;
>  	struct kvm_vcpu *vcpu;
>  	int i;
>  	u32 max_id = 255; /* enough space for any xAPIC ID */
>  
> +	if (!kvm->arch.apic_map_dirty) {
> +		/*
> +		 * Read kvm->arch.apic_map_dirty before
> +		 * kvm->arch.apic_map
> +		 */
> +		smp_rmb();
> +		return;
> +	}
> +
>  	mutex_lock(&kvm->arch.apic_map_lock);
> +	if (!kvm->arch.apic_map_dirty) {
> +		/* Someone else has updated the map. */
> +		mutex_unlock(&kvm->arch.apic_map_lock);
> +		return;
> +	}
>  
>  	kvm_for_each_vcpu(i, vcpu, kvm)
>  		if (kvm_apic_present(vcpu))
> @@ -236,6 +250,12 @@ static void recalculate_apic_map(struct kvm *kvm)
>  	old = rcu_dereference_protected(kvm->arch.apic_map,
>  			lockdep_is_held(&kvm->arch.apic_map_lock));
>  	rcu_assign_pointer(kvm->arch.apic_map, new);
> +	/*
> +	 * Write kvm->arch.apic_map before
> +	 * clearing apic->apic_map_dirty
> +	 */
> +	smp_wmb();
> +	kvm->arch.apic_map_dirty = false;
>  	mutex_unlock(&kvm->arch.apic_map_lock);
>  
>  	if (old)
> @@ -257,20 +277,20 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
>  		else
>  			static_key_slow_inc(&apic_sw_disabled.key);
>  
> -		recalculate_apic_map(apic->vcpu->kvm);
> +		apic->vcpu->kvm->arch.apic_map_dirty = true;
>  	}
>  }
>  
>  static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
>  {
>  	kvm_lapic_set_reg(apic, APIC_ID, id << 24);
> -	recalculate_apic_map(apic->vcpu->kvm);
> +	apic->vcpu->kvm->arch.apic_map_dirty = true;
>  }
>  
>  static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
>  {
>  	kvm_lapic_set_reg(apic, APIC_LDR, id);
> -	recalculate_apic_map(apic->vcpu->kvm);
> +	apic->vcpu->kvm->arch.apic_map_dirty = true;
>  }
>  
>  static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
> @@ -286,7 +306,7 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
>  
>  	kvm_lapic_set_reg(apic, APIC_ID, id);
>  	kvm_lapic_set_reg(apic, APIC_LDR, ldr);
> -	recalculate_apic_map(apic->vcpu->kvm);
> +	apic->vcpu->kvm->arch.apic_map_dirty = true;
>  }
>  
>  static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
> @@ -1912,7 +1932,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
>  	case APIC_DFR:
>  		if (!apic_x2apic_mode(apic)) {
>  			kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
> -			recalculate_apic_map(apic->vcpu->kvm);
> +			apic->vcpu->kvm->arch.apic_map_dirty = true;
>  		} else
>  			ret = 1;
>  		break;
> @@ -2018,6 +2038,8 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
>  		break;
>  	}
>  
> +	kvm_recalculate_apic_map(apic->vcpu->kvm);
> +
>  	return ret;
>  }
>  EXPORT_SYMBOL_GPL(kvm_lapic_reg_write);
> @@ -2166,7 +2188,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
>  			static_key_slow_dec_deferred(&apic_hw_disabled);
>  		} else {
>  			static_key_slow_inc(&apic_hw_disabled.key);
> -			recalculate_apic_map(vcpu->kvm);
> +			vcpu->kvm->arch.apic_map_dirty = true;
>  		}
>  	}
>  
> @@ -2207,6 +2229,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
>  	if (!apic)
>  		return;
>  
> +	vcpu->kvm->arch.apic_map_dirty = false;
>  	/* Stop the timer in case it's a reset to an active apic */
>  	hrtimer_cancel(&apic->lapic_timer.timer);
>  
> @@ -2258,6 +2281,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
>  
>  	vcpu->arch.apic_arb_prio = 0;
>  	vcpu->arch.apic_attention = 0;
> +
> +	kvm_recalculate_apic_map(vcpu->kvm);
>  }
>  
>  /*
> @@ -2479,17 +2504,18 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
>  	struct kvm_lapic *apic = vcpu->arch.apic;
>  	int r;
>  
> -
>  	kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
>  	/* set SPIV separately to get count of SW disabled APICs right */
>  	apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
>  
>  	r = kvm_apic_state_fixup(vcpu, s, true);
> -	if (r)
> +	if (r) {
> +		kvm_recalculate_apic_map(vcpu->kvm);
>  		return r;
> +	}
>  	memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
>  
> -	recalculate_apic_map(vcpu->kvm);
> +	kvm_recalculate_apic_map(vcpu->kvm);
>  	kvm_apic_set_version(vcpu);
>  
>  	apic_update_ppr(apic);
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index ec6fbfe..7581bc2 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -78,6 +78,7 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
>  void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
>  void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
>  u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
> +void kvm_recalculate_apic_map(struct kvm *kvm);
>  void kvm_apic_set_version(struct kvm_vcpu *vcpu);
>  int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val);
>  int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 79bc995..d3802a2 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -350,6 +350,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  	}
>  
>  	kvm_lapic_set_base(vcpu, msr_info->data);
> +	kvm_recalculate_apic_map(vcpu->kvm);
>  	return 0;
>  }
>  EXPORT_SYMBOL_GPL(kvm_set_apic_base);
> 

Queued, thanks.

Paolo
Igor Mammedov June 19, 2020, 12:36 p.m. UTC | #2
On Wed, 26 Feb 2020 10:41:02 +0800
Wanpeng Li <kernellwp@gmail.com> wrote:

> From: Wanpeng Li <wanpengli@tencent.com>
> 
> In the vCPU reset and set APIC_BASE MSR paths, the apic map is recalculated
> several times; each recalculation takes 10+ us, as observed by ftrace in my
> non-overcommit environment, because of the expensive memory allocation, mutex
> and RCU operations involved. This patch optimizes it by recalculating the apic
> map in batch. I hope this can benefit serverless scenarios, which frequently
> create/destroy VMs.
> 
> Before patch:
> 
> kvm_lapic_reset  ~27us
> 
> After patch:
> 
> kvm_lapic_reset  ~14us 
> 
> Observed by ftrace, an improvement of ~48%.
> 
> Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
This broke CPU hotplug:

qemu-kvm -m 2G -smp 4,maxcpus=8  -monitor stdio
(qemu) device_add qemu64-x86_64-cpu,socket-id=4,core-id=0,thread-id=0

in guest fails with:

 smpboot: do_boot_cpu failed(-1) to wakeup CPU#4

which makes me suspect that  INIT/SIPI wasn't delivered

Is it a known issue?

> ---
> v2 -> v3:
>  * move apic_map_dirty to kvm_arch
>  * add the suggestions from Paolo
> 
> v1 -> v2:
>  * add apic_map_dirty to kvm_lapic
>  * on the error path in kvm_apic_set_state, recalculate unconditionally
> 
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/lapic.c            | 46 ++++++++++++++++++++++++++++++++---------
>  arch/x86/kvm/lapic.h            |  1 +
>  arch/x86/kvm/x86.c              |  1 +
>  4 files changed, 39 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 40a0c0f..4380ed1 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -920,6 +920,7 @@ struct kvm_arch {
>  	atomic_t vapics_in_nmi_mode;
>  	struct mutex apic_map_lock;
>  	struct kvm_apic_map *apic_map;
> +	bool apic_map_dirty;
>  
>  	bool apic_access_page_done;
>  	unsigned long apicv_inhibit_reasons;
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index afcd30d..de832aa 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -164,14 +164,28 @@ static void kvm_apic_map_free(struct rcu_head *rcu)
>  	kvfree(map);
>  }
>  
> -static void recalculate_apic_map(struct kvm *kvm)
> +void kvm_recalculate_apic_map(struct kvm *kvm)
>  {
>  	struct kvm_apic_map *new, *old = NULL;
>  	struct kvm_vcpu *vcpu;
>  	int i;
>  	u32 max_id = 255; /* enough space for any xAPIC ID */
>  
> +	if (!kvm->arch.apic_map_dirty) {
> +		/*
> +		 * Read kvm->arch.apic_map_dirty before
> +		 * kvm->arch.apic_map
> +		 */
> +		smp_rmb();
> +		return;
> +	}
> +
>  	mutex_lock(&kvm->arch.apic_map_lock);
> +	if (!kvm->arch.apic_map_dirty) {
> +		/* Someone else has updated the map. */
> +		mutex_unlock(&kvm->arch.apic_map_lock);
> +		return;
> +	}
>  
>  	kvm_for_each_vcpu(i, vcpu, kvm)
>  		if (kvm_apic_present(vcpu))
> @@ -236,6 +250,12 @@ static void recalculate_apic_map(struct kvm *kvm)
>  	old = rcu_dereference_protected(kvm->arch.apic_map,
>  			lockdep_is_held(&kvm->arch.apic_map_lock));
>  	rcu_assign_pointer(kvm->arch.apic_map, new);
> +	/*
> +	 * Write kvm->arch.apic_map before
> +	 * clearing apic->apic_map_dirty
> +	 */
> +	smp_wmb();
> +	kvm->arch.apic_map_dirty = false;
>  	mutex_unlock(&kvm->arch.apic_map_lock);
>  
>  	if (old)
> @@ -257,20 +277,20 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
>  		else
>  			static_key_slow_inc(&apic_sw_disabled.key);
>  
> -		recalculate_apic_map(apic->vcpu->kvm);
> +		apic->vcpu->kvm->arch.apic_map_dirty = true;
>  	}
>  }
>  
>  static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
>  {
>  	kvm_lapic_set_reg(apic, APIC_ID, id << 24);
> -	recalculate_apic_map(apic->vcpu->kvm);
> +	apic->vcpu->kvm->arch.apic_map_dirty = true;
>  }
>  
>  static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
>  {
>  	kvm_lapic_set_reg(apic, APIC_LDR, id);
> -	recalculate_apic_map(apic->vcpu->kvm);
> +	apic->vcpu->kvm->arch.apic_map_dirty = true;
>  }
>  
>  static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
> @@ -286,7 +306,7 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
>  
>  	kvm_lapic_set_reg(apic, APIC_ID, id);
>  	kvm_lapic_set_reg(apic, APIC_LDR, ldr);
> -	recalculate_apic_map(apic->vcpu->kvm);
> +	apic->vcpu->kvm->arch.apic_map_dirty = true;
>  }
>  
>  static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
> @@ -1912,7 +1932,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
>  	case APIC_DFR:
>  		if (!apic_x2apic_mode(apic)) {
>  			kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
> -			recalculate_apic_map(apic->vcpu->kvm);
> +			apic->vcpu->kvm->arch.apic_map_dirty = true;
>  		} else
>  			ret = 1;
>  		break;
> @@ -2018,6 +2038,8 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
>  		break;
>  	}
>  
> +	kvm_recalculate_apic_map(apic->vcpu->kvm);
> +
>  	return ret;
>  }
>  EXPORT_SYMBOL_GPL(kvm_lapic_reg_write);
> @@ -2166,7 +2188,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
>  			static_key_slow_dec_deferred(&apic_hw_disabled);
>  		} else {
>  			static_key_slow_inc(&apic_hw_disabled.key);
> -			recalculate_apic_map(vcpu->kvm);
> +			vcpu->kvm->arch.apic_map_dirty = true;
>  		}
>  	}
>  
> @@ -2207,6 +2229,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
>  	if (!apic)
>  		return;
>  
> +	vcpu->kvm->arch.apic_map_dirty = false;
>  	/* Stop the timer in case it's a reset to an active apic */
>  	hrtimer_cancel(&apic->lapic_timer.timer);
>  
> @@ -2258,6 +2281,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
>  
>  	vcpu->arch.apic_arb_prio = 0;
>  	vcpu->arch.apic_attention = 0;
> +
> +	kvm_recalculate_apic_map(vcpu->kvm);
>  }
>  
>  /*
> @@ -2479,17 +2504,18 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
>  	struct kvm_lapic *apic = vcpu->arch.apic;
>  	int r;
>  
> -
>  	kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
>  	/* set SPIV separately to get count of SW disabled APICs right */
>  	apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
>  
>  	r = kvm_apic_state_fixup(vcpu, s, true);
> -	if (r)
> +	if (r) {
> +		kvm_recalculate_apic_map(vcpu->kvm);
>  		return r;
> +	}
>  	memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
>  
> -	recalculate_apic_map(vcpu->kvm);
> +	kvm_recalculate_apic_map(vcpu->kvm);
>  	kvm_apic_set_version(vcpu);
>  
>  	apic_update_ppr(apic);
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index ec6fbfe..7581bc2 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -78,6 +78,7 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
>  void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
>  void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
>  u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
> +void kvm_recalculate_apic_map(struct kvm *kvm);
>  void kvm_apic_set_version(struct kvm_vcpu *vcpu);
>  int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val);
>  int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 79bc995..d3802a2 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -350,6 +350,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  	}
>  
>  	kvm_lapic_set_base(vcpu, msr_info->data);
> +	kvm_recalculate_apic_map(vcpu->kvm);
>  	return 0;
>  }
>  EXPORT_SYMBOL_GPL(kvm_set_apic_base);
Paolo Bonzini June 19, 2020, 2:10 p.m. UTC | #3
On 19/06/20 14:36, Igor Mammedov wrote:
> qemu-kvm -m 2G -smp 4,maxcpus=8  -monitor stdio
> (qemu) device_add qemu64-x86_64-cpu,socket-id=4,core-id=0,thread-id=0
> 
> in guest fails with:
> 
>  smpboot: do_boot_cpu failed(-1) to wakeup CPU#4
> 
> which makes me suspect that  INIT/SIPI wasn't delivered
> 
> > Is it a known issue?
> 

No, it isn't.  I'll revert.

Paolo
Igor Mammedov June 19, 2020, 3:38 p.m. UTC | #4
On Fri, 19 Jun 2020 16:10:43 +0200
Paolo Bonzini <pbonzini@redhat.com> wrote:

> On 19/06/20 14:36, Igor Mammedov wrote:
> > qemu-kvm -m 2G -smp 4,maxcpus=8  -monitor stdio
> > (qemu) device_add qemu64-x86_64-cpu,socket-id=4,core-id=0,thread-id=0
> > 
> > in guest fails with:
> > 
> >  smpboot: do_boot_cpu failed(-1) to wakeup CPU#4
> > 
> > which makes me suspect that  INIT/SIPI wasn't delivered
> > 
> > Is it a known issue?
> >   
> 
> No, it isn't.  I'll revert.
Wait for a day or two,

I'll try to come up with a fix over the weekend.

> Paolo
>
Igor Mammedov June 21, 2020, 10:26 p.m. UTC | #5
On Fri, 19 Jun 2020 16:10:43 +0200
Paolo Bonzini <pbonzini@redhat.com> wrote:

> On 19/06/20 14:36, Igor Mammedov wrote:
> > qemu-kvm -m 2G -smp 4,maxcpus=8  -monitor stdio
> > (qemu) device_add qemu64-x86_64-cpu,socket-id=4,core-id=0,thread-id=0
> > 
> > in guest fails with:
> > 
> >  smpboot: do_boot_cpu failed(-1) to wakeup CPU#4
> > 
> > which makes me suspect that  INIT/SIPI wasn't delivered
> > 
> > Is it a known issue?
> >   
> 
> No, it isn't.  I'll revert.
> 
> Paolo
> 

The following fixes the immediate issue:

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 34a7e0533dad..6dc177da19da 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2567,6 +2567,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
        }
        memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
 
+       apic->vcpu->kvm->arch.apic_map_dirty = true;
        kvm_recalculate_apic_map(vcpu->kvm);
        kvm_apic_set_version(vcpu);

The problem is that during kvm_arch_vcpu_create() the new vcpu is not visible
to kvm_recalculate_apic_map(), so however many times the map update was
requested during it, it didn't affect the apic map.

What broke hotplug is that kvm_vcpu_ioctl_set_lapic -> kvm_apic_set_state,
which is called after the new vcpu becomes visible, used to do an unconditional
update that pulled in the new vcpu. With this patch the map update is gone,
since the state hasn't actually changed, so we lost the one call to
kvm_recalculate_apic_map() that actually mattered.

It happens to work for vcpus present at boot just by luck
(the BSP updates SPIV after all vcpus have been created, which triggers
kvm_recalculate_apic_map()).

I'm not sending a formal patch yet, since I have doubts about the subject patch.

The following sequence looks like a race that can cause lost map update events:

         cpu1                            cpu2
                             
                                apic_map_dirty = true     
  ------------------------------------------------------------   
                                kvm_recalculate_apic_map:
                                     pass check
                                         mutex_lock(&kvm->arch.apic_map_lock);
                                         if (!kvm->arch.apic_map_dirty)
                                     and in process of updating map
  -------------------------------------------------------------
    other calls to
       apic_map_dirty = true         might be too late for affected cpu
  -------------------------------------------------------------
                                     apic_map_dirty = false
  -------------------------------------------------------------
    kvm_recalculate_apic_map:
    bail out on
      if (!kvm->arch.apic_map_dirty)

It's safer to revert this patch for now, as you suggested earlier.

If you prefer to keep it, I'll post the above fixup as a patch.
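
As a rough illustration of one way to close that window (not necessarily the
fix that ends up being posted): the rebuilder claims the dirty flag before
doing the rebuild, so a request that races in mid-rebuild leaves the flag set
and forces another recalculation later. A self-contained userspace sketch with
made-up names:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool map_dirty;
static int map_generation;      /* stands in for the real map rebuild */

static void mark_dirty(void)
{
        atomic_store(&map_dirty, true);
}

static void recalculate(void)
{
        if (!atomic_load(&map_dirty))
                return;

        pthread_mutex_lock(&map_lock);
        /*
         * Claim the pending request *before* rebuilding.  A mark_dirty()
         * that races with the rebuild sets the flag again, so the next
         * recalculate() is not skipped; clearing the flag only after the
         * rebuild (as in the patch above) is what makes the update lossy.
         */
        if (atomic_exchange(&map_dirty, false))
                map_generation++;       /* the expensive rebuild goes here */
        pthread_mutex_unlock(&map_lock);
}

int main(void)
{
        mark_dirty();
        recalculate();
        printf("map rebuilds: %d\n", map_generation);   /* prints 1 */
        return 0;
}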
Paolo Bonzini June 22, 2020, 2:37 p.m. UTC | #6
On 22/06/20 00:26, Igor Mammedov wrote:
> 
> The following sequence looks like a race that can cause lost map update events:
> 
>          cpu1                            cpu2
>                              
>                                 apic_map_dirty = true     
>   ------------------------------------------------------------   
>                                 kvm_recalculate_apic_map:
>                                      pass check
>                                          mutex_lock(&kvm->arch.apic_map_lock);
>                                          if (!kvm->arch.apic_map_dirty)
>                                      and in process of updating map
>   -------------------------------------------------------------
>     other calls to
>        apic_map_dirty = true         might be too late for affected cpu
>   -------------------------------------------------------------
>                                      apic_map_dirty = false
>   -------------------------------------------------------------
>     kvm_recalculate_apic_map:
>     bail out on
>       if (!kvm->arch.apic_map_dirty)

I will post a fix for that.  Thanks for the analysis!

Paolo

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 40a0c0f..4380ed1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -920,6 +920,7 @@  struct kvm_arch {
 	atomic_t vapics_in_nmi_mode;
 	struct mutex apic_map_lock;
 	struct kvm_apic_map *apic_map;
+	bool apic_map_dirty;
 
 	bool apic_access_page_done;
 	unsigned long apicv_inhibit_reasons;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index afcd30d..de832aa 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -164,14 +164,28 @@  static void kvm_apic_map_free(struct rcu_head *rcu)
 	kvfree(map);
 }
 
-static void recalculate_apic_map(struct kvm *kvm)
+void kvm_recalculate_apic_map(struct kvm *kvm)
 {
 	struct kvm_apic_map *new, *old = NULL;
 	struct kvm_vcpu *vcpu;
 	int i;
 	u32 max_id = 255; /* enough space for any xAPIC ID */
 
+	if (!kvm->arch.apic_map_dirty) {
+		/*
+		 * Read kvm->arch.apic_map_dirty before
+		 * kvm->arch.apic_map
+		 */
+		smp_rmb();
+		return;
+	}
+
 	mutex_lock(&kvm->arch.apic_map_lock);
+	if (!kvm->arch.apic_map_dirty) {
+		/* Someone else has updated the map. */
+		mutex_unlock(&kvm->arch.apic_map_lock);
+		return;
+	}
 
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		if (kvm_apic_present(vcpu))
@@ -236,6 +250,12 @@  static void recalculate_apic_map(struct kvm *kvm)
 	old = rcu_dereference_protected(kvm->arch.apic_map,
 			lockdep_is_held(&kvm->arch.apic_map_lock));
 	rcu_assign_pointer(kvm->arch.apic_map, new);
+	/*
+	 * Write kvm->arch.apic_map before
+	 * clearing apic->apic_map_dirty
+	 */
+	smp_wmb();
+	kvm->arch.apic_map_dirty = false;
 	mutex_unlock(&kvm->arch.apic_map_lock);
 
 	if (old)
@@ -257,20 +277,20 @@  static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
 		else
 			static_key_slow_inc(&apic_sw_disabled.key);
 
-		recalculate_apic_map(apic->vcpu->kvm);
+		apic->vcpu->kvm->arch.apic_map_dirty = true;
 	}
 }
 
 static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
 {
 	kvm_lapic_set_reg(apic, APIC_ID, id << 24);
-	recalculate_apic_map(apic->vcpu->kvm);
+	apic->vcpu->kvm->arch.apic_map_dirty = true;
 }
 
 static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
 {
 	kvm_lapic_set_reg(apic, APIC_LDR, id);
-	recalculate_apic_map(apic->vcpu->kvm);
+	apic->vcpu->kvm->arch.apic_map_dirty = true;
 }
 
 static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
@@ -286,7 +306,7 @@  static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
 
 	kvm_lapic_set_reg(apic, APIC_ID, id);
 	kvm_lapic_set_reg(apic, APIC_LDR, ldr);
-	recalculate_apic_map(apic->vcpu->kvm);
+	apic->vcpu->kvm->arch.apic_map_dirty = true;
 }
 
 static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
@@ -1912,7 +1932,7 @@  int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 	case APIC_DFR:
 		if (!apic_x2apic_mode(apic)) {
 			kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
-			recalculate_apic_map(apic->vcpu->kvm);
+			apic->vcpu->kvm->arch.apic_map_dirty = true;
 		} else
 			ret = 1;
 		break;
@@ -2018,6 +2038,8 @@  int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 		break;
 	}
 
+	kvm_recalculate_apic_map(apic->vcpu->kvm);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_reg_write);
@@ -2166,7 +2188,7 @@  void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 			static_key_slow_dec_deferred(&apic_hw_disabled);
 		} else {
 			static_key_slow_inc(&apic_hw_disabled.key);
-			recalculate_apic_map(vcpu->kvm);
+			vcpu->kvm->arch.apic_map_dirty = true;
 		}
 	}
 
@@ -2207,6 +2229,7 @@  void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 	if (!apic)
 		return;
 
+	vcpu->kvm->arch.apic_map_dirty = false;
 	/* Stop the timer in case it's a reset to an active apic */
 	hrtimer_cancel(&apic->lapic_timer.timer);
 
@@ -2258,6 +2281,8 @@  void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 
 	vcpu->arch.apic_arb_prio = 0;
 	vcpu->arch.apic_attention = 0;
+
+	kvm_recalculate_apic_map(vcpu->kvm);
 }
 
 /*
@@ -2479,17 +2504,18 @@  int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	int r;
 
-
 	kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
 	/* set SPIV separately to get count of SW disabled APICs right */
 	apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
 
 	r = kvm_apic_state_fixup(vcpu, s, true);
-	if (r)
+	if (r) {
+		kvm_recalculate_apic_map(vcpu->kvm);
 		return r;
+	}
 	memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
 
-	recalculate_apic_map(vcpu->kvm);
+	kvm_recalculate_apic_map(vcpu->kvm);
 	kvm_apic_set_version(vcpu);
 
 	apic_update_ppr(apic);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index ec6fbfe..7581bc2 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -78,6 +78,7 @@  void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
 void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
 u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
+void kvm_recalculate_apic_map(struct kvm *kvm);
 void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val);
 int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 79bc995..d3802a2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -350,6 +350,7 @@  int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	}
 
 	kvm_lapic_set_base(vcpu, msr_info->data);
+	kvm_recalculate_apic_map(vcpu->kvm);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_apic_base);