
[RFC,V10,15/18] kvm : Paravirtual ticketlocks support for linux guests running on KVM hypervisor

Message ID 20130624124342.27508.44656.sendpatchset@codeblue.in.ibm.com (mailing list archive)
State New, archived

Commit Message

Raghavendra K T June 24, 2013, 12:43 p.m. UTC
kvm : Paravirtual ticketlocks support for linux guests running on KVM hypervisor

From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>

During smp_boot_cpus, a paravirtualized KVM guest detects whether the hypervisor
has the required feature (KVM_FEATURE_PV_UNHALT) to support pv-ticketlocks. If so,
support for pv-ticketlocks is registered via pv_lock_ops.

Use the KVM_HC_KICK_CPU hypercall to wake up a waiting/halted vcpu.

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Suzuki Poulose <suzuki@in.ibm.com>
[Raghu: check_zero race fix, enum for kvm_contention_stat
jumplabel related changes ]
Signed-off-by: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
---
 arch/x86/include/asm/kvm_para.h |   14 ++
 arch/x86/kernel/kvm.c           |  255 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 267 insertions(+), 2 deletions(-)



Comments

Gleb Natapov July 14, 2013, 1:12 p.m. UTC | #1
On Mon, Jun 24, 2013 at 06:13:42PM +0530, Raghavendra K T wrote:
> kvm : Paravirtual ticketlocks support for linux guests running on KVM hypervisor
> 
> From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
> 
> During smp_boot_cpus  paravirtualied KVM guest detects if the hypervisor has
> required feature (KVM_FEATURE_PV_UNHALT) to support pv-ticketlocks. If so,
>  support for pv-ticketlocks is registered via pv_lock_ops.
> 
> Use KVM_HC_KICK_CPU hypercall to wakeup waiting/halted vcpu.
> 
> Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
> Signed-off-by: Suzuki Poulose <suzuki@in.ibm.com>
> [Raghu: check_zero race fix, enum for kvm_contention_stat
> jumplabel related changes ]
> Signed-off-by: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
> ---
>  arch/x86/include/asm/kvm_para.h |   14 ++
>  arch/x86/kernel/kvm.c           |  255 +++++++++++++++++++++++++++++++++++++++
>  2 files changed, 267 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
> index 695399f..427afcb 100644
> --- a/arch/x86/include/asm/kvm_para.h
> +++ b/arch/x86/include/asm/kvm_para.h
> @@ -118,10 +118,20 @@ void kvm_async_pf_task_wait(u32 token);
>  void kvm_async_pf_task_wake(u32 token);
>  u32 kvm_read_and_reset_pf_reason(void);
>  extern void kvm_disable_steal_time(void);
> -#else
> -#define kvm_guest_init() do { } while (0)
> +
> +#ifdef CONFIG_PARAVIRT_SPINLOCKS
> +void __init kvm_spinlock_init(void);
> +#else /* !CONFIG_PARAVIRT_SPINLOCKS */
> +static inline void kvm_spinlock_init(void)
> +{
> +}
> +#endif /* CONFIG_PARAVIRT_SPINLOCKS */
> +
> +#else /* CONFIG_KVM_GUEST */
> +#define kvm_guest_init() do {} while (0)
>  #define kvm_async_pf_task_wait(T) do {} while(0)
>  #define kvm_async_pf_task_wake(T) do {} while(0)
> +
>  static inline u32 kvm_read_and_reset_pf_reason(void)
>  {
>  	return 0;
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index cd6d9a5..97ade5a 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -34,6 +34,7 @@
>  #include <linux/sched.h>
>  #include <linux/slab.h>
>  #include <linux/kprobes.h>
> +#include <linux/debugfs.h>
>  #include <asm/timer.h>
>  #include <asm/cpu.h>
>  #include <asm/traps.h>
> @@ -419,6 +420,7 @@ static void __init kvm_smp_prepare_boot_cpu(void)
>  	WARN_ON(kvm_register_clock("primary cpu clock"));
>  	kvm_guest_cpu_init();
>  	native_smp_prepare_boot_cpu();
> +	kvm_spinlock_init();
>  }
>  
>  static void __cpuinit kvm_guest_cpu_online(void *dummy)
> @@ -523,3 +525,256 @@ static __init int activate_jump_labels(void)
>  	return 0;
>  }
>  arch_initcall(activate_jump_labels);
> +
> +/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
> +void kvm_kick_cpu(int cpu)
> +{
> +	int apicid;
> +
> +	apicid = per_cpu(x86_cpu_to_apicid, cpu);
> +	kvm_hypercall1(KVM_HC_KICK_CPU, apicid);
> +}
> +
> +#ifdef CONFIG_PARAVIRT_SPINLOCKS
> +
> +enum kvm_contention_stat {
> +	TAKEN_SLOW,
> +	TAKEN_SLOW_PICKUP,
> +	RELEASED_SLOW,
> +	RELEASED_SLOW_KICKED,
> +	NR_CONTENTION_STATS
> +};
> +
> +#ifdef CONFIG_KVM_DEBUG_FS
> +#define HISTO_BUCKETS	30
> +
> +static struct kvm_spinlock_stats
> +{
> +	u32 contention_stats[NR_CONTENTION_STATS];
> +	u32 histo_spin_blocked[HISTO_BUCKETS+1];
> +	u64 time_blocked;
> +} spinlock_stats;
> +
> +static u8 zero_stats;
> +
> +static inline void check_zero(void)
> +{
> +	u8 ret;
> +	u8 old;
> +
> +	old = ACCESS_ONCE(zero_stats);
> +	if (unlikely(old)) {
> +		ret = cmpxchg(&zero_stats, old, 0);
> +		/* This ensures only one fellow resets the stat */
> +		if (ret == old)
> +			memset(&spinlock_stats, 0, sizeof(spinlock_stats));
> +	}
> +}
> +
> +static inline void add_stats(enum kvm_contention_stat var, u32 val)
> +{
> +	check_zero();
> +	spinlock_stats.contention_stats[var] += val;
> +}
> +
> +
> +static inline u64 spin_time_start(void)
> +{
> +	return sched_clock();
> +}
> +
> +static void __spin_time_accum(u64 delta, u32 *array)
> +{
> +	unsigned index;
> +
> +	index = ilog2(delta);
> +	check_zero();
> +
> +	if (index < HISTO_BUCKETS)
> +		array[index]++;
> +	else
> +		array[HISTO_BUCKETS]++;
> +}
> +
> +static inline void spin_time_accum_blocked(u64 start)
> +{
> +	u32 delta;
> +
> +	delta = sched_clock() - start;
> +	__spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
> +	spinlock_stats.time_blocked += delta;
> +}
> +
> +static struct dentry *d_spin_debug;
> +static struct dentry *d_kvm_debug;
> +
> +struct dentry *kvm_init_debugfs(void)
> +{
> +	d_kvm_debug = debugfs_create_dir("kvm", NULL);
> +	if (!d_kvm_debug)
> +		printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n");
> +
> +	return d_kvm_debug;
> +}
> +
> +static int __init kvm_spinlock_debugfs(void)
> +{
> +	struct dentry *d_kvm;
> +
> +	d_kvm = kvm_init_debugfs();
> +	if (d_kvm == NULL)
> +		return -ENOMEM;
> +
> +	d_spin_debug = debugfs_create_dir("spinlocks", d_kvm);
> +
> +	debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
> +
> +	debugfs_create_u32("taken_slow", 0444, d_spin_debug,
> +		   &spinlock_stats.contention_stats[TAKEN_SLOW]);
> +	debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
> +		   &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
> +
> +	debugfs_create_u32("released_slow", 0444, d_spin_debug,
> +		   &spinlock_stats.contention_stats[RELEASED_SLOW]);
> +	debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
> +		   &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
> +
> +	debugfs_create_u64("time_blocked", 0444, d_spin_debug,
> +			   &spinlock_stats.time_blocked);
> +
> +	debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
> +		     spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
> +
> +	return 0;
> +}
> +fs_initcall(kvm_spinlock_debugfs);
> +#else  /* !CONFIG_KVM_DEBUG_FS */
> +static inline void add_stats(enum kvm_contention_stat var, u32 val)
> +{
> +}
> +
> +static inline u64 spin_time_start(void)
> +{
> +	return 0;
> +}
> +
> +static inline void spin_time_accum_blocked(u64 start)
> +{
> +}
> +#endif  /* CONFIG_KVM_DEBUG_FS */
> +
> +struct kvm_lock_waiting {
> +	struct arch_spinlock *lock;
> +	__ticket_t want;
> +};
> +
> +/* cpus 'waiting' on a spinlock to become available */
> +static cpumask_t waiting_cpus;
> +
> +/* Track spinlock on which a cpu is waiting */
> +static DEFINE_PER_CPU(struct kvm_lock_waiting, lock_waiting);
> +
> +static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
> +{
> +	struct kvm_lock_waiting *w;
> +	int cpu;
> +	u64 start;
> +	unsigned long flags;
> +
> +	w = &__get_cpu_var(lock_waiting);
> +	cpu = smp_processor_id();
> +	start = spin_time_start();
> +
> +	/*
> +	 * Make sure an interrupt handler can't upset things in a
> +	 * partially setup state.
> +	 */
> +	local_irq_save(flags);
> +
> +	/*
> +	 * The ordering protocol on this is that the "lock" pointer
> +	 * may only be set non-NULL if the "want" ticket is correct.
> +	 * If we're updating "want", we must first clear "lock".
> +	 */
> +	w->lock = NULL;
> +	smp_wmb();
> +	w->want = want;
> +	smp_wmb();
> +	w->lock = lock;
> +
> +	add_stats(TAKEN_SLOW, 1);
> +
> +	/*
> +	 * This uses set_bit, which is atomic but we should not rely on its
> +	 * reordering gurantees. So barrier is needed after this call.
> +	 */
> +	cpumask_set_cpu(cpu, &waiting_cpus);
> +
> +	barrier();
> +
> +	/*
> +	 * Mark entry to slowpath before doing the pickup test to make
> +	 * sure we don't deadlock with an unlocker.
> +	 */
> +	__ticket_enter_slowpath(lock);
> +
> +	/*
> +	 * check again make sure it didn't become free while
> +	 * we weren't looking.
> +	 */
> +	if (ACCESS_ONCE(lock->tickets.head) == want) {
> +		add_stats(TAKEN_SLOW_PICKUP, 1);
> +		goto out;
> +	}
> +
> +	/* Allow interrupts while blocked */
> +	local_irq_restore(flags);
> +
So what happens if an interrupt comes here and the interrupt handler
takes another spinlock that goes into the slow path? As far as I can see,
lock_waiting will be overwritten and the cpu will be cleared from the
waiting_cpus bitmap by the nested kvm_lock_spinning(), so when halt is
called here after returning from the interrupt handler, nobody is going
to wake this lock waiter. The next random interrupt will "fix" it, but it
may be several milliseconds away, or never. We should probably check
whether interrupts were enabled and call native_safe_halt() here.

> +	/* halt until it's our turn and kicked. */
> +	halt();
> +
> +	local_irq_save(flags);
> +out:
> +	cpumask_clear_cpu(cpu, &waiting_cpus);
> +	w->lock = NULL;
> +	local_irq_restore(flags);
> +	spin_time_accum_blocked(start);
> +}
> +PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
> +
> +/* Kick vcpu waiting on @lock->head to reach value @ticket */
> +static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
> +{
> +	int cpu;
> +
> +	add_stats(RELEASED_SLOW, 1);
> +	for_each_cpu(cpu, &waiting_cpus) {
> +		const struct kvm_lock_waiting *w = &per_cpu(lock_waiting, cpu);
> +		if (ACCESS_ONCE(w->lock) == lock &&
> +		    ACCESS_ONCE(w->want) == ticket) {
> +			add_stats(RELEASED_SLOW_KICKED, 1);
> +			kvm_kick_cpu(cpu);
> +			break;
> +		}
> +	}
> +}
> +
> +/*
> + * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
> + */
> +void __init kvm_spinlock_init(void)
> +{
> +	if (!kvm_para_available())
> +		return;
> +	/* Does host kernel support KVM_FEATURE_PV_UNHALT? */
> +	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
> +		return;
> +
> +	printk(KERN_INFO "KVM setup paravirtual spinlock\n");
> +
> +	static_key_slow_inc(&paravirt_ticketlocks_enabled);
> +
> +	pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
> +	pv_lock_ops.unlock_kick = kvm_unlock_kick;
> +}
> +#endif	/* CONFIG_PARAVIRT_SPINLOCKS */

--
			Gleb.
Raghavendra K T July 15, 2013, 9:50 a.m. UTC | #2
On 07/14/2013 06:42 PM, Gleb Natapov wrote:
> On Mon, Jun 24, 2013 at 06:13:42PM +0530, Raghavendra K T wrote:
>> kvm : Paravirtual ticketlocks support for linux guests running on KVM hypervisor
>>
>> From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
>>
trimming
[...]
>> +
>> +static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
>> +{
>> +	struct kvm_lock_waiting *w;
>> +	int cpu;
>> +	u64 start;
>> +	unsigned long flags;
>> +
>> +	w = &__get_cpu_var(lock_waiting);
>> +	cpu = smp_processor_id();
>> +	start = spin_time_start();
>> +
>> +	/*
>> +	 * Make sure an interrupt handler can't upset things in a
>> +	 * partially setup state.
>> +	 */
>> +	local_irq_save(flags);
>> +
>> +	/*
>> +	 * The ordering protocol on this is that the "lock" pointer
>> +	 * may only be set non-NULL if the "want" ticket is correct.
>> +	 * If we're updating "want", we must first clear "lock".
>> +	 */
>> +	w->lock = NULL;
>> +	smp_wmb();
>> +	w->want = want;
>> +	smp_wmb();
>> +	w->lock = lock;
>> +
>> +	add_stats(TAKEN_SLOW, 1);
>> +
>> +	/*
>> +	 * This uses set_bit, which is atomic but we should not rely on its
>> +	 * reordering gurantees. So barrier is needed after this call.
>> +	 */
>> +	cpumask_set_cpu(cpu, &waiting_cpus);
>> +
>> +	barrier();
>> +
>> +	/*
>> +	 * Mark entry to slowpath before doing the pickup test to make
>> +	 * sure we don't deadlock with an unlocker.
>> +	 */
>> +	__ticket_enter_slowpath(lock);
>> +
>> +	/*
>> +	 * check again make sure it didn't become free while
>> +	 * we weren't looking.
>> +	 */
>> +	if (ACCESS_ONCE(lock->tickets.head) == want) {
>> +		add_stats(TAKEN_SLOW_PICKUP, 1);
>> +		goto out;
>> +	}
>> +
>> +	/* Allow interrupts while blocked */
>> +	local_irq_restore(flags);
>> +
> So what happens if an interrupt comes here and an interrupt handler
> takes another spinlock that goes into the slow path? As far as I see
> lock_waiting will become overwritten and cpu will be cleared from
> waiting_cpus bitmap by nested kvm_lock_spinning(), so when halt is
> called here after returning from the interrupt handler nobody is going
> to wake this lock holder. Next random interrupt will "fix" it, but it
> may be several milliseconds away, or never. We should probably check
> if interrupt were enabled and call native_safe_halt() here.
>

Okay, you mean something like the below should be done:
if irq_enabled()
   native_safe_halt()
else
   halt()

This has been complex stuff for me to analyze.

So in our discussion the stack would look like this:

spinlock()
   kvm_lock_spinning()
                   <------ interrupt here
           halt()


From the halt, if we trace:

   halt()
     kvm_vcpu_block()
        kvm_arch_vcpu_runnable()
	 kvm_make_request(KVM_REQ_UNHALT)
		
This would drive us out of the halt handler, and we are fine when that
happens since we would revisit kvm_lock_spinning.

But I see that

kvm_arch_vcpu_runnable() has this condition
  (kvm_arch_interrupt_allowed(vcpu) &&  kvm_cpu_has_interrupt(vcpu));

which means that if we are going to process the interrupt here we would set
KVM_REQ_UNHALT, and we are fine.

But if we are in the situation where kvm_arch_interrupt_allowed(vcpu) = true,
but we have already processed the interrupt and kvm_cpu_has_interrupt(vcpu) is
false, we have a problem until the next random interrupt.

The confusing part to me is the case where kvm_cpu_has_interrupt(vcpu) = false
and the irq has already been handled and has overwritten lock_waiting. Can this
situation happen? Or is it that we will process the interrupt only
after this point (kvm_vcpu_block)? Because if that is the case, we are
fine.

Please let me know.

Gleb Natapov July 15, 2013, 10:36 a.m. UTC | #3
On Mon, Jul 15, 2013 at 03:20:06PM +0530, Raghavendra K T wrote:
> On 07/14/2013 06:42 PM, Gleb Natapov wrote:
> >On Mon, Jun 24, 2013 at 06:13:42PM +0530, Raghavendra K T wrote:
> >>kvm : Paravirtual ticketlocks support for linux guests running on KVM hypervisor
> >>
> >>From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
> >>
> trimming
> [...]
> >>+
> >>+static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
> >>+{
> >>+	struct kvm_lock_waiting *w;
> >>+	int cpu;
> >>+	u64 start;
> >>+	unsigned long flags;
> >>+
> >>+	w = &__get_cpu_var(lock_waiting);
> >>+	cpu = smp_processor_id();
> >>+	start = spin_time_start();
> >>+
> >>+	/*
> >>+	 * Make sure an interrupt handler can't upset things in a
> >>+	 * partially setup state.
> >>+	 */
> >>+	local_irq_save(flags);
> >>+
> >>+	/*
> >>+	 * The ordering protocol on this is that the "lock" pointer
> >>+	 * may only be set non-NULL if the "want" ticket is correct.
> >>+	 * If we're updating "want", we must first clear "lock".
> >>+	 */
> >>+	w->lock = NULL;
> >>+	smp_wmb();
> >>+	w->want = want;
> >>+	smp_wmb();
> >>+	w->lock = lock;
> >>+
> >>+	add_stats(TAKEN_SLOW, 1);
> >>+
> >>+	/*
> >>+	 * This uses set_bit, which is atomic but we should not rely on its
> >>+	 * reordering gurantees. So barrier is needed after this call.
> >>+	 */
> >>+	cpumask_set_cpu(cpu, &waiting_cpus);
> >>+
> >>+	barrier();
> >>+
> >>+	/*
> >>+	 * Mark entry to slowpath before doing the pickup test to make
> >>+	 * sure we don't deadlock with an unlocker.
> >>+	 */
> >>+	__ticket_enter_slowpath(lock);
> >>+
> >>+	/*
> >>+	 * check again make sure it didn't become free while
> >>+	 * we weren't looking.
> >>+	 */
> >>+	if (ACCESS_ONCE(lock->tickets.head) == want) {
> >>+		add_stats(TAKEN_SLOW_PICKUP, 1);
> >>+		goto out;
> >>+	}
> >>+
> >>+	/* Allow interrupts while blocked */
> >>+	local_irq_restore(flags);
> >>+
> >So what happens if an interrupt comes here and an interrupt handler
> >takes another spinlock that goes into the slow path? As far as I see
> >lock_waiting will become overwritten and cpu will be cleared from
> >waiting_cpus bitmap by nested kvm_lock_spinning(), so when halt is
> >called here after returning from the interrupt handler nobody is going
> >to wake this lock holder. Next random interrupt will "fix" it, but it
> >may be several milliseconds away, or never. We should probably check
> >if interrupt were enabled and call native_safe_halt() here.
> >
> 
> Okay you mean something like below should be done.
> if irq_enabled()
>   native_safe_halt()
> else
>   halt()
> 
> It is been a complex stuff for analysis for me.
> 
> So in our discussion stack would looking like this.
> 
> spinlock()
>   kvm_lock_spinning()
>                   <------ interrupt here
>           halt()
> 
> 
> From the halt if we trace
> 
It is too early to trace the halt since it was not executed yet. The guest
stack trace will look something like this:

spinlock(a)
  kvm_lock_spinning(a)
   lock_waiting = a
   set bit in waiting_cpus
                <------ interrupt here
                spinlock(b)
                  kvm_lock_spinning(b)
                    lock_waiting = b
                    set bit in waiting_cpus
                    halt()
                    unset bit in waiting_cpus
                    lock_waiting = NULL
     ----------> ret from interrupt
   halt()

Now, at the time of the last halt above, lock_waiting == NULL and
waiting_cpus is empty and no interrupt is pending, so who will unhalt
the waiter?

>   halt()
>     kvm_vcpu_block()
>        kvm_arch_vcpu_runnable())
> 	 kvm_make_request(KVM_REQ_UNHALT)
> 		
> This would drive us out of halt handler, and we are fine when it
> happens since we would revisit kvm_lock_spinning.
> 
> But I see that
> 
> kvm_arch_vcpu_runnable() has this condition
>  (kvm_arch_interrupt_allowed(vcpu) &&  kvm_cpu_has_interrupt(vcpu));
> 
> which means that if we going process the interrupt here we would set
> KVM_REQ_UNHALT. and we are fine.
> 
> But if we are in the situation kvm_arch_interrupt_allowed(vcpu) =
> true, but we already processed interrupt and
> kvm_cpu_has_interrupt(vcpu) is false, we have problem till next
> random interrupt.
> 
> The confusing part to me is the case
> kvm_cpu_has_interrupt(vcpu)=false and irq
> already handled and overwritten the lock_waiting. can this
> situation happen? or is it that we will process the interrupt only
> after this point (kvm_vcpu_block). Because if that is the case we are
> fine.
kvm_vcpu_block has nothing to do with processing interrupts. All the code in
kvm_arch_vcpu_runnable() is just to make sure that an interrupt unhalts the
vcpu if the vcpu is already halted. Interrupts are injected as soon as they
happen and the CPU is in the right state to receive them, which will be after
local_irq_restore() and before halt. x86 inhibits interrupts until the next
instruction after sti to solve exactly this kind of race. native_safe_halt()
evaluates to "sti; hlt" to make interrupt enablement and halt atomic with
regard to interrupts and NMIs.
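
For reference, the two primitives are defined in arch/x86/include/asm/irqflags.h
roughly as follows (a from-memory sketch for illustration, not part of this patch):

static inline void native_halt(void)
{
	asm volatile("hlt" : : : "memory");
}

static inline void native_safe_halt(void)
{
	/*
	 * sti takes effect only after the next instruction, so no interrupt
	 * can slip in between enabling interrupts and halting.
	 */
	asm volatile("sti; hlt" : : : "memory");
}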

> 
> Please let me know.

--
			Gleb.
Raghavendra K T July 16, 2013, 3:37 a.m. UTC | #4
On 07/15/2013 04:06 PM, Gleb Natapov wrote:
> On Mon, Jul 15, 2013 at 03:20:06PM +0530, Raghavendra K T wrote:
>> On 07/14/2013 06:42 PM, Gleb Natapov wrote:
>>> On Mon, Jun 24, 2013 at 06:13:42PM +0530, Raghavendra K T wrote:
>>>> kvm : Paravirtual ticketlocks support for linux guests running on KVM hypervisor
>>>>
>>>> From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
>>>>
>> trimming
>> [...]
>>>> +
>>>> +static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
>>>> +{
>>>> +	struct kvm_lock_waiting *w;
>>>> +	int cpu;
>>>> +	u64 start;
>>>> +	unsigned long flags;
>>>> +
>>>> +	w = &__get_cpu_var(lock_waiting);
>>>> +	cpu = smp_processor_id();
>>>> +	start = spin_time_start();
>>>> +
>>>> +	/*
>>>> +	 * Make sure an interrupt handler can't upset things in a
>>>> +	 * partially setup state.
>>>> +	 */
>>>> +	local_irq_save(flags);
>>>> +
>>>> +	/*
>>>> +	 * The ordering protocol on this is that the "lock" pointer
>>>> +	 * may only be set non-NULL if the "want" ticket is correct.
>>>> +	 * If we're updating "want", we must first clear "lock".
>>>> +	 */
>>>> +	w->lock = NULL;
>>>> +	smp_wmb();
>>>> +	w->want = want;
>>>> +	smp_wmb();
>>>> +	w->lock = lock;
>>>> +
>>>> +	add_stats(TAKEN_SLOW, 1);
>>>> +
>>>> +	/*
>>>> +	 * This uses set_bit, which is atomic but we should not rely on its
>>>> +	 * reordering gurantees. So barrier is needed after this call.
>>>> +	 */
>>>> +	cpumask_set_cpu(cpu, &waiting_cpus);
>>>> +
>>>> +	barrier();
>>>> +
>>>> +	/*
>>>> +	 * Mark entry to slowpath before doing the pickup test to make
>>>> +	 * sure we don't deadlock with an unlocker.
>>>> +	 */
>>>> +	__ticket_enter_slowpath(lock);
>>>> +
>>>> +	/*
>>>> +	 * check again make sure it didn't become free while
>>>> +	 * we weren't looking.
>>>> +	 */
>>>> +	if (ACCESS_ONCE(lock->tickets.head) == want) {
>>>> +		add_stats(TAKEN_SLOW_PICKUP, 1);
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	/* Allow interrupts while blocked */
>>>> +	local_irq_restore(flags);
>>>> +
>>> So what happens if an interrupt comes here and an interrupt handler
>>> takes another spinlock that goes into the slow path? As far as I see
>>> lock_waiting will become overwritten and cpu will be cleared from
>>> waiting_cpus bitmap by nested kvm_lock_spinning(), so when halt is
>>> called here after returning from the interrupt handler nobody is going
>>> to wake this lock holder. Next random interrupt will "fix" it, but it
>>> may be several milliseconds away, or never. We should probably check
>>> if interrupt were enabled and call native_safe_halt() here.
>>>
>>
>> Okay you mean something like below should be done.
>> if irq_enabled()
>>    native_safe_halt()
>> else
>>    halt()
>>
>> It is been a complex stuff for analysis for me.
>>
>> So in our discussion stack would looking like this.
>>
>> spinlock()
>>    kvm_lock_spinning()
>>                    <------ interrupt here
>>            halt()
>>
>>
>>  From the halt if we trace
>>
> It is to early to trace the halt since it was not executed yet. Guest
> stack trace will look something like this:
>
> spinlock(a)
>    kvm_lock_spinning(a)
>     lock_waiting = a
>     set bit in waiting_cpus
>                  <------ interrupt here
>                  spinlock(b)
>                    kvm_lock_spinning(b)
>                      lock_waiting = b
>                      set bit in waiting_cpus
>                      halt()
>                      unset bit in waiting_cpus
>                      lock_waiting = NULL
>       ----------> ret from interrupt
>     halt()
>
> Now at the time of the last halt above lock_waiting == NULL and
> waiting_cpus is empty and not interrupt it pending, so who will unhalt
> the waiter?
>

Yes. If an interrupt occurs between
local_irq_restore() and halt(), this is possible. And since this is the
rarest of the rare (the possibility of an irq entering the slowpath and then no
random irq to do a spurious wakeup), we have never hit this problem in the past.

So I am:
1. trying to artificially reproduce this.

2. I replaced the halt with the code below,
        if (arch_irqs_disabled())
                 halt();

and ran benchmarks.
But this results in degradation because it means we again go back and
spin in the irq-enabled case.

3. Now I am analyzing the performance overhead of safe_halt in the
irq-enabled case:
       if (arch_irqs_disabled())
                halt();
       else
                safe_halt();

Gleb Natapov July 16, 2013, 6:02 a.m. UTC | #5
On Tue, Jul 16, 2013 at 09:07:53AM +0530, Raghavendra K T wrote:
> On 07/15/2013 04:06 PM, Gleb Natapov wrote:
> >On Mon, Jul 15, 2013 at 03:20:06PM +0530, Raghavendra K T wrote:
> >>On 07/14/2013 06:42 PM, Gleb Natapov wrote:
> >>>On Mon, Jun 24, 2013 at 06:13:42PM +0530, Raghavendra K T wrote:
> >>>>kvm : Paravirtual ticketlocks support for linux guests running on KVM hypervisor
> >>>>
> >>>>From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
> >>>>
> >>trimming
> >>[...]
> >>>>+
> >>>>+static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
> >>>>+{
> >>>>+	struct kvm_lock_waiting *w;
> >>>>+	int cpu;
> >>>>+	u64 start;
> >>>>+	unsigned long flags;
> >>>>+
> >>>>+	w = &__get_cpu_var(lock_waiting);
> >>>>+	cpu = smp_processor_id();
> >>>>+	start = spin_time_start();
> >>>>+
> >>>>+	/*
> >>>>+	 * Make sure an interrupt handler can't upset things in a
> >>>>+	 * partially setup state.
> >>>>+	 */
> >>>>+	local_irq_save(flags);
> >>>>+
> >>>>+	/*
> >>>>+	 * The ordering protocol on this is that the "lock" pointer
> >>>>+	 * may only be set non-NULL if the "want" ticket is correct.
> >>>>+	 * If we're updating "want", we must first clear "lock".
> >>>>+	 */
> >>>>+	w->lock = NULL;
> >>>>+	smp_wmb();
> >>>>+	w->want = want;
> >>>>+	smp_wmb();
> >>>>+	w->lock = lock;
> >>>>+
> >>>>+	add_stats(TAKEN_SLOW, 1);
> >>>>+
> >>>>+	/*
> >>>>+	 * This uses set_bit, which is atomic but we should not rely on its
> >>>>+	 * reordering gurantees. So barrier is needed after this call.
> >>>>+	 */
> >>>>+	cpumask_set_cpu(cpu, &waiting_cpus);
> >>>>+
> >>>>+	barrier();
> >>>>+
> >>>>+	/*
> >>>>+	 * Mark entry to slowpath before doing the pickup test to make
> >>>>+	 * sure we don't deadlock with an unlocker.
> >>>>+	 */
> >>>>+	__ticket_enter_slowpath(lock);
> >>>>+
> >>>>+	/*
> >>>>+	 * check again make sure it didn't become free while
> >>>>+	 * we weren't looking.
> >>>>+	 */
> >>>>+	if (ACCESS_ONCE(lock->tickets.head) == want) {
> >>>>+		add_stats(TAKEN_SLOW_PICKUP, 1);
> >>>>+		goto out;
> >>>>+	}
> >>>>+
> >>>>+	/* Allow interrupts while blocked */
> >>>>+	local_irq_restore(flags);
> >>>>+
> >>>So what happens if an interrupt comes here and an interrupt handler
> >>>takes another spinlock that goes into the slow path? As far as I see
> >>>lock_waiting will become overwritten and cpu will be cleared from
> >>>waiting_cpus bitmap by nested kvm_lock_spinning(), so when halt is
> >>>called here after returning from the interrupt handler nobody is going
> >>>to wake this lock holder. Next random interrupt will "fix" it, but it
> >>>may be several milliseconds away, or never. We should probably check
> >>>if interrupt were enabled and call native_safe_halt() here.
> >>>
> >>
> >>Okay you mean something like below should be done.
> >>if irq_enabled()
> >>   native_safe_halt()
> >>else
> >>   halt()
> >>
> >>It is been a complex stuff for analysis for me.
> >>
> >>So in our discussion stack would looking like this.
> >>
> >>spinlock()
> >>   kvm_lock_spinning()
> >>                   <------ interrupt here
> >>           halt()
> >>
> >>
> >> From the halt if we trace
> >>
> >It is to early to trace the halt since it was not executed yet. Guest
> >stack trace will look something like this:
> >
> >spinlock(a)
> >   kvm_lock_spinning(a)
> >    lock_waiting = a
> >    set bit in waiting_cpus
> >                 <------ interrupt here
> >                 spinlock(b)
> >                   kvm_lock_spinning(b)
> >                     lock_waiting = b
> >                     set bit in waiting_cpus
> >                     halt()
> >                     unset bit in waiting_cpus
> >                     lock_waiting = NULL
> >      ----------> ret from interrupt
> >    halt()
> >
> >Now at the time of the last halt above lock_waiting == NULL and
> >waiting_cpus is empty and not interrupt it pending, so who will unhalt
> >the waiter?
> >
> 
> Yes. if an interrupt occurs between
> local_irq_restore() and halt(), this is possible. and since this is
> rarest of rare (possiility of irq entering slowpath and then no
> random irq to do spurious wakeup), we had never hit this problem in
> the past.
I do not think it is very rare to get an interrupt between
local_irq_restore() and halt() under load, since any interrupt that
occurs between local_irq_save() and local_irq_restore() will be delivered
immediately after local_irq_restore(). Of course the chance that no other
random interrupt wakes the lock waiter is very low, but the waiter can sleep
for much longer than needed and this will be noticeable in performance.

BTW, can an NMI handler take spinlocks? If it can, what happens if an NMI is
delivered in a section protected by local_irq_save()/local_irq_restore()?

> 
> So I am,
> 1. trying to artificially reproduce this.
> 
> 2. I replaced the halt with below code,
>        if (arch_irqs_disabled())
>                 halt();
> 
> and ran benchmarks.
> But this results in degradation because, it means we again go back
> and spin in irq enabled case.
> 
Yes, this is not what I proposed.

> 3. Now I am analyzing the performance overhead of safe_halt in irq
> enabled case.
>       if (arch_irqs_disabled())
>                halt();
>       else
>                safe_halt();
Use of arch_irqs_disabled() is incorrect here. If you are doing it before
local_irq_restore() it will always be false since you disabled interrupts
yourself; if you do it after, then it is too late since an interrupt can come
between local_irq_restore() and halt(), so enabling interrupts and halt
are still not atomic. You should drop local_irq_restore() and do

  if (arch_irqs_disabled_flags(flags))
	halt();
  else
	safe_halt();

instead.
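
With that change the tail of kvm_lock_spinning() would look roughly like the
sketch below (an illustration of the suggestion, not the submitted patch):

	/*
	 * Halt until it's our turn and we get kicked.  Do not re-enable
	 * interrupts separately: safe_halt() is "sti; hlt", so enabling
	 * interrupts and halting are atomic, and an interrupt handler that
	 * overwrites lock_waiting cannot sneak in between the two and
	 * leave us halted with nobody left to kick us.
	 */
	if (arch_irqs_disabled_flags(flags))
		halt();
	else
		safe_halt();
out:
	cpumask_clear_cpu(cpu, &waiting_cpus);
	w->lock = NULL;
	local_irq_restore(flags);
	spin_time_accum_blocked(start);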

--
			Gleb.
Peter Zijlstra July 16, 2013, 3:48 p.m. UTC | #6
On Tue, Jul 16, 2013 at 09:02:15AM +0300, Gleb Natapov wrote:
> BTW can NMI handler take spinlocks? 

No -- that is, yes you can using trylock, but you still shouldn't.

> If it can what happens if NMI is
> delivered in a section protected by local_irq_save()/local_irq_restore()?

You deadlock.
Gleb Natapov July 16, 2013, 4:31 p.m. UTC | #7
On Tue, Jul 16, 2013 at 05:48:52PM +0200, Peter Zijlstra wrote:
> On Tue, Jul 16, 2013 at 09:02:15AM +0300, Gleb Natapov wrote:
> > BTW can NMI handler take spinlocks? 
> 
> No -- that is, yes you can using trylock, but you still shouldn't.
> 
Great news for this code. Thanks.

--
			Gleb.
Raghavendra K T July 16, 2013, 6:42 p.m. UTC | #8
On 07/16/2013 11:32 AM, Gleb Natapov wrote:
> On Tue, Jul 16, 2013 at 09:07:53AM +0530, Raghavendra K T wrote:
>> On 07/15/2013 04:06 PM, Gleb Natapov wrote:
>>> On Mon, Jul 15, 2013 at 03:20:06PM +0530, Raghavendra K T wrote:
>>>> On 07/14/2013 06:42 PM, Gleb Natapov wrote:
>>>>> On Mon, Jun 24, 2013 at 06:13:42PM +0530, Raghavendra K T wrote:
>>>>>> kvm : Paravirtual ticketlocks support for linux guests running on KVM hypervisor
>>>>>>
>>>>>> From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
>>>>>>
>>>> trimming
>>>> [...]
>>>>>> +
>>>>>> +static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
>>>>>> +{
>>>>>> +	struct kvm_lock_waiting *w;
>>>>>> +	int cpu;
>>>>>> +	u64 start;
>>>>>> +	unsigned long flags;
>>>>>> +
>>>>>> +	w = &__get_cpu_var(lock_waiting);
>>>>>> +	cpu = smp_processor_id();
>>>>>> +	start = spin_time_start();
>>>>>> +
>>>>>> +	/*
>>>>>> +	 * Make sure an interrupt handler can't upset things in a
>>>>>> +	 * partially setup state.
>>>>>> +	 */
>>>>>> +	local_irq_save(flags);
>>>>>> +
>>>>>> +	/*
>>>>>> +	 * The ordering protocol on this is that the "lock" pointer
>>>>>> +	 * may only be set non-NULL if the "want" ticket is correct.
>>>>>> +	 * If we're updating "want", we must first clear "lock".
>>>>>> +	 */
>>>>>> +	w->lock = NULL;
>>>>>> +	smp_wmb();
>>>>>> +	w->want = want;
>>>>>> +	smp_wmb();
>>>>>> +	w->lock = lock;
>>>>>> +
>>>>>> +	add_stats(TAKEN_SLOW, 1);
>>>>>> +
>>>>>> +	/*
>>>>>> +	 * This uses set_bit, which is atomic but we should not rely on its
>>>>>> +	 * reordering gurantees. So barrier is needed after this call.
>>>>>> +	 */
>>>>>> +	cpumask_set_cpu(cpu, &waiting_cpus);
>>>>>> +
>>>>>> +	barrier();
>>>>>> +
>>>>>> +	/*
>>>>>> +	 * Mark entry to slowpath before doing the pickup test to make
>>>>>> +	 * sure we don't deadlock with an unlocker.
>>>>>> +	 */
>>>>>> +	__ticket_enter_slowpath(lock);
>>>>>> +
>>>>>> +	/*
>>>>>> +	 * check again make sure it didn't become free while
>>>>>> +	 * we weren't looking.
>>>>>> +	 */
>>>>>> +	if (ACCESS_ONCE(lock->tickets.head) == want) {
>>>>>> +		add_stats(TAKEN_SLOW_PICKUP, 1);
>>>>>> +		goto out;
>>>>>> +	}
>>>>>> +
>>>>>> +	/* Allow interrupts while blocked */
>>>>>> +	local_irq_restore(flags);
>>>>>> +
>>>>> So what happens if an interrupt comes here and an interrupt handler
>>>>> takes another spinlock that goes into the slow path? As far as I see
>>>>> lock_waiting will become overwritten and cpu will be cleared from
>>>>> waiting_cpus bitmap by nested kvm_lock_spinning(), so when halt is
>>>>> called here after returning from the interrupt handler nobody is going
>>>>> to wake this lock holder. Next random interrupt will "fix" it, but it
>>>>> may be several milliseconds away, or never. We should probably check
>>>>> if interrupt were enabled and call native_safe_halt() here.
>>>>>
>>>>
>>>> Okay you mean something like below should be done.
>>>> if irq_enabled()
>>>>    native_safe_halt()
>>>> else
>>>>    halt()
>>>>
>>>> It is been a complex stuff for analysis for me.
>>>>
>>>> So in our discussion stack would looking like this.
>>>>
>>>> spinlock()
>>>>    kvm_lock_spinning()
>>>>                    <------ interrupt here
>>>>            halt()
>>>>
>>>>
>>>>  From the halt if we trace
>>>>
>>> It is to early to trace the halt since it was not executed yet. Guest
>>> stack trace will look something like this:
>>>
>>> spinlock(a)
>>>    kvm_lock_spinning(a)
>>>     lock_waiting = a
>>>     set bit in waiting_cpus
>>>                  <------ interrupt here
>>>                  spinlock(b)
>>>                    kvm_lock_spinning(b)
>>>                      lock_waiting = b
>>>                      set bit in waiting_cpus
>>>                      halt()
>>>                      unset bit in waiting_cpus
>>>                      lock_waiting = NULL
>>>       ----------> ret from interrupt
>>>     halt()
>>>
>>> Now at the time of the last halt above lock_waiting == NULL and
>>> waiting_cpus is empty and not interrupt it pending, so who will unhalt
>>> the waiter?
>>>
>>
>> Yes. if an interrupt occurs between
>> local_irq_restore() and halt(), this is possible. and since this is
>> rarest of rare (possiility of irq entering slowpath and then no
>> random irq to do spurious wakeup), we had never hit this problem in
>> the past.
> I do not think it is very rare to get interrupt between
> local_irq_restore() and halt() under load since any interrupt that
> occurs between local_irq_save() and local_irq_restore() will be delivered
> immediately after local_irq_restore(). Of course the chance of no other
> random interrupt waking lock waiter is very low, but waiter can sleep
> for much longer then needed and this will be noticeable in performance.

Yes, I meant the entire thing. I did in fact turn on a WARN for w->lock == NULL
before halt() [though we can potentially have an irq right after that],
but have not hit it so far.

> BTW can NMI handler take spinlocks? If it can what happens if NMI is
> delivered in a section protected by local_irq_save()/local_irq_restore()?
>

I had another idea, in case NMIs and halts are causing a problem, until I saw
PeterZ's reply: something similar to V2 of pvspinlock posted here:

  https://lkml.org/lkml/2011/10/23/211

Instead of halt we started with a sleep hypercall in those versions. We
changed to halt() once Avi suggested reusing the existing sleep.

If we use the older hypercall with a few changes like below:

kvm_pv_wait_for_kick_op(flags, vcpu, w->lock )
{
  // a0 reserved for flags
if (!w->lock)
return;
DEFINE_WAIT
...
end_wait
}

The only question is how to retry immediately with lock_spinning in the
w->lock == NULL cases.

/me needs to experiment with that again, perhaps to see if we get some benefit.

>>
>> So I am,
>> 1. trying to artificially reproduce this.
>>
>> 2. I replaced the halt with below code,
>>         if (arch_irqs_disabled())
>>                  halt();
>>
>> and ran benchmarks.
>> But this results in degradation because, it means we again go back
>> and spin in irq enabled case.
>>
> Yes, this is not what I proposed.

True.

>
>> 3. Now I am analyzing the performance overhead of safe_halt in irq
>> enabled case.
>>        if (arch_irqs_disabled())
>>                 halt();
>>        else
>>                 safe_halt();
> Use of arch_irqs_disabled() is incorrect here.

Oops! Silly me.

> If you are doing it before
> local_irq_restore() it will always be false since you disabled interrupt
> yourself,

This was not the case, but the latter is the one I missed.

>  if you do it after then it is to late since interrupt can come
> between local_irq_restore() and halt() so enabling interrupt and halt
> are still not atomic.  You should drop local_irq_restore() and do
>
>    if (arch_irqs_disabled_flags(flags))
> 	halt();
>    else
> 	safe_halt();
>
> instead.
>

Yes, I tested with the below as suggested:

//local_irq_restore(flags);

         /* halt until it's our turn and kicked. */
         if (arch_irqs_disabled_flags(flags))
                 halt();
         else
                 safe_halt();

  //local_irq_save(flags);
I am seeing only a slight overhead, but I want to give it a full run to
check the performance.

Raghavendra K T July 16, 2013, 6:49 p.m. UTC | #9
On 07/16/2013 09:18 PM, Peter Zijlstra wrote:
> On Tue, Jul 16, 2013 at 09:02:15AM +0300, Gleb Natapov wrote:
>> BTW can NMI handler take spinlocks?
>
> No -- that is, yes you can using trylock, but you still shouldn't.

Thanks Peter for the clarification.

I had started checking the code of a few NMI handlers to confirm.
I did see a raw spinlock in drivers/acpi/apei/ghes.c, but then stopped.

>
>> If it can what happens if NMI is
>> delivered in a section protected by local_irq_save()/local_irq_restore()?
>
> You deadlock.
>

Gleb Natapov July 17, 2013, 9:34 a.m. UTC | #10
On Wed, Jul 17, 2013 at 12:12:35AM +0530, Raghavendra K T wrote:
>> I do not think it is very rare to get interrupt between
>> local_irq_restore() and halt() under load since any interrupt that
>> occurs between local_irq_save() and local_irq_restore() will be
>> delivered
>> immediately after local_irq_restore(). Of course the chance of no
>> other
>> random interrupt waking lock waiter is very low, but waiter can sleep
>> for much longer then needed and this will be noticeable in
>> performance.
>
>Yes, I meant the entire thing. I did infact turned WARN on
>w->lock==null before halt() [ though we can potentially have irq right
>after that ], but did not hit so far.
That depends on your workload of course. To hit it you not only need to get an
interrupt in there, but the interrupt handler also needs to take a contended
spinlock.

>
> >BTW can NMI handler take spinlocks? If it can what happens if NMI is
> >delivered in a section protected by local_irq_save()/local_irq_restore()?
> >
> 
> Had another idea if NMI, halts are causing problem until I saw
> PeterZ's reply similar to V2 of pvspinlock posted  here:
> 
>  https://lkml.org/lkml/2011/10/23/211
> 
> Instead of halt we started with a sleep hypercall in those
>  versions. Changed to halt() once Avi suggested to reuse existing sleep.
> 
> If we use older hypercall with few changes like below:
> 
> kvm_pv_wait_for_kick_op(flags, vcpu, w->lock )
> {
>  // a0 reserved for flags
> if (!w->lock)
> return;
> DEFINE_WAIT
> ...
> end_wait
> }
> 
How would this help if an NMI takes a lock in a critical section? The thing
that may happen is that lock_waiting->want may have the NMI lock's value, but
lock_waiting->lock will point to the non-NMI lock. The setting of want and lock
has to be atomic.

kvm_pv_wait_for_kick_op() is incorrect in other ways. It will spuriously
return to the guest since not all events that wake up the vcpu thread
correspond to work for the guest to do.

> Only question is how to retry immediately with lock_spinning in
> w->lock=null cases.
> 
> /me need to experiment that again perhaps to see if we get some benefit.
> 
> >>
> >>So I am,
> >>1. trying to artificially reproduce this.
> >>
> >>2. I replaced the halt with below code,
> >>        if (arch_irqs_disabled())
> >>                 halt();
> >>
> >>and ran benchmarks.
> >>But this results in degradation because, it means we again go back
> >>and spin in irq enabled case.
> >>
> >Yes, this is not what I proposed.
> 
> True.
> 
> >
> >>3. Now I am analyzing the performance overhead of safe_halt in irq
> >>enabled case.
> >>       if (arch_irqs_disabled())
> >>                halt();
> >>       else
> >>                safe_halt();
> >Use of arch_irqs_disabled() is incorrect here.
> 
> Oops! sill me.
> 
> If you are doing it before
> >local_irq_restore() it will always be false since you disabled interrupt
> >yourself,
> 
> This was not the case. but latter is the one I missed.
> 
>  if you do it after then it is to late since interrupt can come
> >between local_irq_restore() and halt() so enabling interrupt and halt
> >are still not atomic.  You should drop local_irq_restore() and do
> >
> >   if (arch_irqs_disabled_flags(flags))
> >	halt();
> >   else
> >	safe_halt();
> >
> >instead.
> >
> 
> Yes, I tested with below as suggested:
> 
> //local_irq_restore(flags);
> 
>         /* halt until it's our turn and kicked. */
>         if (arch_irqs_disabled_flags(flags))
>                 halt();
>         else
>                 safe_halt();
> 
>  //local_irq_save(flags);
> I am seeing only a slight overhead, but want to give a full run to
> check the performance.
Without compiling and checking myself, the difference between the previous
code and this one should be a couple of asm instructions. I would be
surprised if you could measure it, especially as the vcpu is going to halt
(and do an expensive vmexit in the process) anyway.

--
			Gleb.
Raghavendra K T July 17, 2013, 10:05 a.m. UTC | #11
On 07/17/2013 03:04 PM, Gleb Natapov wrote:
> On Wed, Jul 17, 2013 at 12:12:35AM +0530, Raghavendra K T wrote:
>>> I do not think it is very rare to get interrupt between
>>> local_irq_restore() and halt() under load since any interrupt that
>>> occurs between local_irq_save() and local_irq_restore() will be
>>> delivered
>>> immediately after local_irq_restore(). Of course the chance of no
>>> other
>>> random interrupt waking lock waiter is very low, but waiter can sleep
>>> for much longer then needed and this will be noticeable in
>>> performance.
>>
>> Yes, I meant the entire thing. I did infact turned WARN on
>> w->lock==null before halt() [ though we can potentially have irq right
>> after that ], but did not hit so far.
> Depends on your workload of course. To hit that you not only need to get
> interrupt in there, but the interrupt handler needs to take contended
> spinlock.
>

Yes. Agreed.

>>
>>> BTW can NMI handler take spinlocks? If it can what happens if NMI is
>>> delivered in a section protected by local_irq_save()/local_irq_restore()?
>>>
>>
>> Had another idea if NMI, halts are causing problem until I saw
>> PeterZ's reply similar to V2 of pvspinlock posted  here:
>>
>>   https://lkml.org/lkml/2011/10/23/211
>>
>> Instead of halt we started with a sleep hypercall in those
>>   versions. Changed to halt() once Avi suggested to reuse existing sleep.
>>
>> If we use older hypercall with few changes like below:
>>
>> kvm_pv_wait_for_kick_op(flags, vcpu, w->lock )
>> {
>>   // a0 reserved for flags
>> if (!w->lock)
>> return;
>> DEFINE_WAIT
>> ...
>> end_wait
>> }
>>
> How would this help if NMI takes lock in critical section. The thing
> that may happen is that lock_waiting->want may have NMI lock value, but
> lock_waiting->lock will point to non NMI lock. Setting of want and lock
> have to be atomic.

True. So we are here:

         non NMI lock(a)
         w->lock = NULL;
         smp_wmb();
         w->want = want;
                                NMI
                          <---------------------
                           NMI lock(b)
                           w->lock = NULL;
                           smp_wmb();
                           w->want = want;
                           smp_wmb();
                           w->lock = lock;
                          ---------------------->
         smp_wmb();
         w->lock = lock;

So how about fixing it like this?

again:
         w->lock = NULL;
         smp_wmb();
         w->want = want;
         smp_wmb();
         w->lock = lock;

if (!lock || w->want != want) goto again;

>
> kvm_pv_wait_for_kick_op() is incorrect in other ways. It will spuriously
> return to a guest since not all events that wake up vcpu thread
> correspond to work for guest to do.
>

Okay, agreed.

>> Only question is how to retry immediately with lock_spinning in
>> w->lock=null cases.
>>
>> /me need to experiment that again perhaps to see if we get some benefit.
>>
>>>>
>>>> So I am,
>>>> 1. trying to artificially reproduce this.
>>>>
>>>> 2. I replaced the halt with below code,
>>>>         if (arch_irqs_disabled())
>>>>                  halt();
>>>>
>>>> and ran benchmarks.
>>>> But this results in degradation because, it means we again go back
>>>> and spin in irq enabled case.
>>>>
>>> Yes, this is not what I proposed.
>>
>> True.
>>
>>>
>>>> 3. Now I am analyzing the performance overhead of safe_halt in irq
>>>> enabled case.
>>>>        if (arch_irqs_disabled())
>>>>                 halt();
>>>>        else
>>>>                 safe_halt();
>>> Use of arch_irqs_disabled() is incorrect here.
>>
>> Oops! sill me.
>>
>> If you are doing it before
>>> local_irq_restore() it will always be false since you disabled interrupt
>>> yourself,
>>
>> This was not the case. but latter is the one I missed.
>>
>>   if you do it after then it is to late since interrupt can come
>>> between local_irq_restore() and halt() so enabling interrupt and halt
>>> are still not atomic.  You should drop local_irq_restore() and do
>>>
>>>    if (arch_irqs_disabled_flags(flags))
>>> 	halt();
>>>    else
>>> 	safe_halt();
>>>
>>> instead.
>>>
>>
>> Yes, I tested with below as suggested:
>>
>> //local_irq_restore(flags);
>>
>>          /* halt until it's our turn and kicked. */
>>          if (arch_irqs_disabled_flags(flags))
>>                  halt();
>>          else
>>                  safe_halt();
>>
>>   //local_irq_save(flags);
>> I am seeing only a slight overhead, but want to give a full run to
>> check the performance.
> Without compiling and checking myself the different between previous
> code and this one should be a couple asm instruction. I would be
> surprised if you can measure it especially as vcpu is going to halt
> (and do expensive vmexit in the process) anyway.
>

Yes, right.

Raghavendra K T July 17, 2013, 10:38 a.m. UTC | #12
On 07/17/2013 03:35 PM, Raghavendra K T wrote:
> On 07/17/2013 03:04 PM, Gleb Natapov wrote:
>> On Wed, Jul 17, 2013 at 12:12:35AM +0530, Raghavendra K T wrote:
>>>> I do not think it is very rare to get interrupt between
>>>> local_irq_restore() and halt() under load since any interrupt that
>>>> occurs between local_irq_save() and local_irq_restore() will be
>>>> delivered
>>>> immediately after local_irq_restore(). Of course the chance of no
>>>> other
>>>> random interrupt waking lock waiter is very low, but waiter can sleep
>>>> for much longer then needed and this will be noticeable in
>>>> performance.
>>>
>>> Yes, I meant the entire thing. I did infact turned WARN on
>>> w->lock==null before halt() [ though we can potentially have irq right
>>> after that ], but did not hit so far.
>> Depends on your workload of course. To hit that you not only need to get
>> interrupt in there, but the interrupt handler needs to take contended
>> spinlock.
>>
>
> Yes. Agree.
>
>>>
>>>> BTW can NMI handler take spinlocks? If it can what happens if NMI is
>>>> delivered in a section protected by
>>>> local_irq_save()/local_irq_restore()?
>>>>
>>>
>>> Had another idea if NMI, halts are causing problem until I saw
>>> PeterZ's reply similar to V2 of pvspinlock posted  here:
>>>
>>>   https://lkml.org/lkml/2011/10/23/211
>>>
>>> Instead of halt we started with a sleep hypercall in those
>>>   versions. Changed to halt() once Avi suggested to reuse existing
>>> sleep.
>>>
>>> If we use older hypercall with few changes like below:
>>>
>>> kvm_pv_wait_for_kick_op(flags, vcpu, w->lock )
>>> {
>>>   // a0 reserved for flags
>>> if (!w->lock)
>>> return;
>>> DEFINE_WAIT
>>> ...
>>> end_wait
>>> }
>>>
>> How would this help if NMI takes lock in critical section. The thing
>> that may happen is that lock_waiting->want may have NMI lock value, but
>> lock_waiting->lock will point to non NMI lock. Setting of want and lock
>> have to be atomic.
>
> True. so we are here
>
>          non NMI lock(a)
>          w->lock = NULL;
>          smp_wmb();
>          w->want = want;
>                                 NMI
>                           <---------------------
>                            NMI lock(b)
>                            w->lock = NULL;
>                            smp_wmb();
>                            w->want = want;
>                            smp_wmb();
>                            w->lock = lock;
>                           ---------------------->
>          smp_wmb();
>          w->lock = lock;
>
> so how about fixing like this?
>
> again:
>          w->lock = NULL;
>          smp_wmb();
>          w->want = want;
>          smp_wmb();
>          w->lock = lock;
>
> if (!lock || w->want != want) goto again;
>
Sorry,
I meant if (!w->lock || w->want != want) here.


[...]

Gleb Natapov July 17, 2013, 12:45 p.m. UTC | #13
On Wed, Jul 17, 2013 at 03:35:37PM +0530, Raghavendra K T wrote:
> >>Instead of halt we started with a sleep hypercall in those
> >>  versions. Changed to halt() once Avi suggested to reuse existing sleep.
> >>
> >>If we use older hypercall with few changes like below:
> >>
> >>kvm_pv_wait_for_kick_op(flags, vcpu, w->lock )
> >>{
> >>  // a0 reserved for flags
> >>if (!w->lock)
> >>return;
> >>DEFINE_WAIT
> >>...
> >>end_wait
> >>}
> >>
> >How would this help if NMI takes lock in critical section. The thing
> >that may happen is that lock_waiting->want may have NMI lock value, but
> >lock_waiting->lock will point to non NMI lock. Setting of want and lock
> >have to be atomic.
> 
> True. so we are here
> 
>         non NMI lock(a)
>         w->lock = NULL;
>         smp_wmb();
>         w->want = want;
>                                NMI
>                          <---------------------
>                           NMI lock(b)
>                           w->lock = NULL;
>                           smp_wmb();
>                           w->want = want;
>                           smp_wmb();
>                           w->lock = lock;
>                          ---------------------->
>         smp_wmb();
>         w->lock = lock;
> 
> so how about fixing like this?
> 
> again:
>         w->lock = NULL;
>         smp_wmb();
>         w->want = want;
>         smp_wmb();
>         w->lock = lock;
> 
> if (!lock || w->want != want) goto again;
> 
An NMI can happen after the if() but before halt and the same situation
we are trying to prevent with IRQs will occur. But if NMI handlers do not
take locks we shouldn't worry.

--
			Gleb.
Raghavendra K T July 17, 2013, 12:55 p.m. UTC | #14
On 07/17/2013 06:15 PM, Gleb Natapov wrote:
> On Wed, Jul 17, 2013 at 03:35:37PM +0530, Raghavendra K T wrote:
>>>> Instead of halt we started with a sleep hypercall in those
>>>>   versions. Changed to halt() once Avi suggested to reuse existing sleep.
>>>>
>>>> If we use older hypercall with few changes like below:
>>>>
>>>> kvm_pv_wait_for_kick_op(flags, vcpu, w->lock )
>>>> {
>>>>   // a0 reserved for flags
>>>> if (!w->lock)
>>>> return;
>>>> DEFINE_WAIT
>>>> ...
>>>> end_wait
>>>> }
>>>>
>>> How would this help if NMI takes lock in critical section. The thing
>>> that may happen is that lock_waiting->want may have NMI lock value, but
>>> lock_waiting->lock will point to non NMI lock. Setting of want and lock
>>> have to be atomic.
>>
>> True. so we are here
>>
>>          non NMI lock(a)
>>          w->lock = NULL;
>>          smp_wmb();
>>          w->want = want;
>>                                 NMI
>>                           <---------------------
>>                            NMI lock(b)
>>                            w->lock = NULL;
>>                            smp_wmb();
>>                            w->want = want;
>>                            smp_wmb();
>>                            w->lock = lock;
>>                           ---------------------->
>>          smp_wmb();
>>          w->lock = lock;
>>
>> so how about fixing like this?
>>
>> again:
>>          w->lock = NULL;
>>          smp_wmb();
>>          w->want = want;
>>          smp_wmb();
>>          w->lock = lock;
>>
>> if (!lock || w->want != want) goto again;
>>
> NMI can happen after the if() but before halt and the same situation
> we are trying to prevent with IRQs will occur.

True, we cannot fix that. I was thinking of fixing the inconsistency of the
lock,want pair.
But an NMI could happen after the first OR condition also.
/me thinks again

>  But if NMI handler do not
> take locks we shouldn't worry.

Okay. Thanks for the reviews.
I'll spin the next version with all the suggested changes.

Gleb Natapov July 17, 2013, 1:25 p.m. UTC | #15
On Wed, Jul 17, 2013 at 06:25:05PM +0530, Raghavendra K T wrote:
> On 07/17/2013 06:15 PM, Gleb Natapov wrote:
> >On Wed, Jul 17, 2013 at 03:35:37PM +0530, Raghavendra K T wrote:
> >>>>Instead of halt we started with a sleep hypercall in those
> >>>>  versions. Changed to halt() once Avi suggested to reuse existing sleep.
> >>>>
> >>>>If we use older hypercall with few changes like below:
> >>>>
> >>>>kvm_pv_wait_for_kick_op(flags, vcpu, w->lock )
> >>>>{
> >>>>  // a0 reserved for flags
> >>>>if (!w->lock)
> >>>>return;
> >>>>DEFINE_WAIT
> >>>>...
> >>>>end_wait
> >>>>}
> >>>>
> >>>How would this help if NMI takes lock in critical section. The thing
> >>>that may happen is that lock_waiting->want may have NMI lock value, but
> >>>lock_waiting->lock will point to non NMI lock. Setting of want and lock
> >>>have to be atomic.
> >>
> >>True. so we are here
> >>
> >>         non NMI lock(a)
> >>         w->lock = NULL;
> >>         smp_wmb();
> >>         w->want = want;
> >>                                NMI
> >>                          <---------------------
> >>                           NMI lock(b)
> >>                           w->lock = NULL;
> >>                           smp_wmb();
> >>                           w->want = want;
> >>                           smp_wmb();
> >>                           w->lock = lock;
> >>                          ---------------------->
> >>         smp_wmb();
> >>         w->lock = lock;
> >>
> >>so how about fixing like this?
> >>
> >>again:
> >>         w->lock = NULL;
> >>         smp_wmb();
> >>         w->want = want;
> >>         smp_wmb();
> >>         w->lock = lock;
> >>
> >>if (!lock || w->want != want) goto again;
> >>
> >NMI can happen after the if() but before halt and the same situation
> >we are trying to prevent with IRQs will occur.
> 
> True, we can not fix that. I thought to fix the inconsistency of
> lock,want pair.
> But NMI could happen after the first OR condition also.
> /me thinks again
> 
lock_spinning() can check that it is called in NMI context and bail out.
How often will this happen anyway?
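
A minimal sketch of that check, assuming in_nmi() from linux/hardirq.h is
acceptable in this path (illustrative only, names as in the patch):

	static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
	{
		/* Never halt from NMI context; fall back to plain spinning. */
		if (in_nmi())
			return;
		...
	}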

--
			Gleb.
Raghavendra K T July 17, 2013, 2:13 p.m. UTC | #16
On 07/17/2013 06:55 PM, Gleb Natapov wrote:
> On Wed, Jul 17, 2013 at 06:25:05PM +0530, Raghavendra K T wrote:
>> On 07/17/2013 06:15 PM, Gleb Natapov wrote:
>>> On Wed, Jul 17, 2013 at 03:35:37PM +0530, Raghavendra K T wrote:
>>>>>> Instead of halt we started with a sleep hypercall in those
>>>>>>   versions. Changed to halt() once Avi suggested to reuse existing sleep.
>>>>>>
>>>>>> If we use older hypercall with few changes like below:
>>>>>>
>>>>>> kvm_pv_wait_for_kick_op(flags, vcpu, w->lock )
>>>>>> {
>>>>>>   // a0 reserved for flags
>>>>>> if (!w->lock)
>>>>>> return;
>>>>>> DEFINE_WAIT
>>>>>> ...
>>>>>> end_wait
>>>>>> }
>>>>>>
>>>>> How would this help if NMI takes lock in critical section. The thing
>>>>> that may happen is that lock_waiting->want may have NMI lock value, but
>>>>> lock_waiting->lock will point to non NMI lock. Setting of want and lock
>>>>> have to be atomic.
>>>>
>>>> True. so we are here
>>>>
>>>>          non NMI lock(a)
>>>>          w->lock = NULL;
>>>>          smp_wmb();
>>>>          w->want = want;
>>>>                                 NMI
>>>>                           <---------------------
>>>>                            NMI lock(b)
>>>>                            w->lock = NULL;
>>>>                            smp_wmb();
>>>>                            w->want = want;
>>>>                            smp_wmb();
>>>>                            w->lock = lock;
>>>>                           ---------------------->
>>>>          smp_wmb();
>>>>          w->lock = lock;
>>>>
>>>> so how about fixing like this?
>>>>
>>>> again:
>>>>          w->lock = NULL;
>>>>          smp_wmb();
>>>>          w->want = want;
>>>>          smp_wmb();
>>>>          w->lock = lock;
>>>>
>>>> if (!lock || w->want != want) goto again;
>>>>
>>> NMI can happen after the if() but before halt and the same situation
>>> we are trying to prevent with IRQs will occur.
>>
>> True, we can not fix that. I thought to fix the inconsistency of
>> lock,want pair.
>> But NMI could happen after the first OR condition also.
>> /me thinks again
>>
> lock_spinning() can check that it is called in nmi context and bail out.

Good point.
I think we can check for irq context as well and bail out, so that in irq
context we continue spinning instead of taking the slowpath. No?

> How often this will happens anyway.
>

I know NMIs occur frequently with watchdogs, or are used by sysrq-trigger
etc., but I am not an expert on how frequent they are otherwise. Even
then, if they do not take spinlocks, we have no problem, as already pointed out.

I can measure with a debugfs counter how often it happens.
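
A rough sketch of such a counter, reusing the kvm_spinlock_stats/debugfs
machinery from the patch (the NMI_SLOW name and its placement are
hypothetical):

	/* new entry before NR_CONTENTION_STATS in enum kvm_contention_stat */
	NMI_SLOW,

	/* early in kvm_lock_spinning() */
	if (in_nmi())
		add_stats(NMI_SLOW, 1);

	/* in kvm_spinlock_debugfs() */
	debugfs_create_u32("nmi_slow", 0444, d_spin_debug,
		   &spinlock_stats.contention_stats[NMI_SLOW]);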

Raghavendra K T July 17, 2013, 2:14 p.m. UTC | #17
On 07/17/2013 07:43 PM, Raghavendra K T wrote:
> On 07/17/2013 06:55 PM, Gleb Natapov wrote:
>> On Wed, Jul 17, 2013 at 06:25:05PM +0530, Raghavendra K T wrote:
>>> On 07/17/2013 06:15 PM, Gleb Natapov wrote:
>>>> On Wed, Jul 17, 2013 at 03:35:37PM +0530, Raghavendra K T wrote:
>>>>>>> Instead of halt we started with a sleep hypercall in those
>>>>>>>   versions. Changed to halt() once Avi suggested to reuse
>>>>>>> existing sleep.
>>>>>>>
>>>>>>> If we use older hypercall with few changes like below:
>>>>>>>
>>>>>>> kvm_pv_wait_for_kick_op(flags, vcpu, w->lock )
>>>>>>> {
>>>>>>>   // a0 reserved for flags
>>>>>>> if (!w->lock)
>>>>>>> return;
>>>>>>> DEFINE_WAIT
>>>>>>> ...
>>>>>>> end_wait
>>>>>>> }
>>>>>>>
>>>>>> How would this help if NMI takes lock in critical section. The thing
>>>>>> that may happen is that lock_waiting->want may have NMI lock
>>>>>> value, but
>>>>>> lock_waiting->lock will point to non NMI lock. Setting of want and
>>>>>> lock
>>>>>> have to be atomic.
>>>>>
>>>>> True. so we are here
>>>>>
>>>>>          non NMI lock(a)
>>>>>          w->lock = NULL;
>>>>>          smp_wmb();
>>>>>          w->want = want;
>>>>>                                 NMI
>>>>>                           <---------------------
>>>>>                            NMI lock(b)
>>>>>                            w->lock = NULL;
>>>>>                            smp_wmb();
>>>>>                            w->want = want;
>>>>>                            smp_wmb();
>>>>>                            w->lock = lock;
>>>>>                           ---------------------->
>>>>>          smp_wmb();
>>>>>          w->lock = lock;
>>>>>
>>>>> so how about fixing like this?
>>>>>
>>>>> again:
>>>>>          w->lock = NULL;
>>>>>          smp_wmb();
>>>>>          w->want = want;
>>>>>          smp_wmb();
>>>>>          w->lock = lock;
>>>>>
>>>>> if (!lock || w->want != want) goto again;
>>>>>
>>>> NMI can happen after the if() but before halt and the same situation
>>>> we are trying to prevent with IRQs will occur.
>>>
>>> True, we can not fix that. I thought to fix the inconsistency of
>>> lock,want pair.
>>> But NMI could happen after the first OR condition also.
>>> /me thinks again
>>>
>> lock_spinning() can check that it is called in nmi context and bail out.
>
> Good point.
> I think we can check for even irq context and bailout so that in irq
> context we continue spinning instead of slowpath. no ?
>
>> How often this will happens anyway.
>>
>
> I know NMIs occur frequently with watchdogs. or used by sysrq-trigger
> etc.. But I am not an expert how frequent it is otherwise.

I forgot to ask whether Peter has any input on NMI frequency.

> But even
> then if they do not use spinlock, we have no problem as already pointed.
>
> I can measure with debugfs counter how often it happens.

Gleb Natapov July 17, 2013, 2:44 p.m. UTC | #18
On Wed, Jul 17, 2013 at 07:43:01PM +0530, Raghavendra K T wrote:
> On 07/17/2013 06:55 PM, Gleb Natapov wrote:
> >On Wed, Jul 17, 2013 at 06:25:05PM +0530, Raghavendra K T wrote:
> >>On 07/17/2013 06:15 PM, Gleb Natapov wrote:
> >>>On Wed, Jul 17, 2013 at 03:35:37PM +0530, Raghavendra K T wrote:
> >>>>>>Instead of halt we started with a sleep hypercall in those
> >>>>>>  versions. Changed to halt() once Avi suggested to reuse existing sleep.
> >>>>>>
> >>>>>>If we use older hypercall with few changes like below:
> >>>>>>
> >>>>>>kvm_pv_wait_for_kick_op(flags, vcpu, w->lock )
> >>>>>>{
> >>>>>>  // a0 reserved for flags
> >>>>>>if (!w->lock)
> >>>>>>return;
> >>>>>>DEFINE_WAIT
> >>>>>>...
> >>>>>>end_wait
> >>>>>>}
> >>>>>>
> >>>>>How would this help if NMI takes lock in critical section. The thing
> >>>>>that may happen is that lock_waiting->want may have NMI lock value, but
> >>>>>lock_waiting->lock will point to non NMI lock. Setting of want and lock
> >>>>>have to be atomic.
> >>>>
> >>>>True. so we are here
> >>>>
> >>>>         non NMI lock(a)
> >>>>         w->lock = NULL;
> >>>>         smp_wmb();
> >>>>         w->want = want;
> >>>>                                NMI
> >>>>                          <---------------------
> >>>>                           NMI lock(b)
> >>>>                           w->lock = NULL;
> >>>>                           smp_wmb();
> >>>>                           w->want = want;
> >>>>                           smp_wmb();
> >>>>                           w->lock = lock;
> >>>>                          ---------------------->
> >>>>         smp_wmb();
> >>>>         w->lock = lock;
> >>>>
> >>>>so how about fixing like this?
> >>>>
> >>>>again:
> >>>>         w->lock = NULL;
> >>>>         smp_wmb();
> >>>>         w->want = want;
> >>>>         smp_wmb();
> >>>>         w->lock = lock;
> >>>>
> >>>>if (!lock || w->want != want) goto again;
> >>>>
> >>>NMI can happen after the if() but before halt and the same situation
> >>>we are trying to prevent with IRQs will occur.
> >>
> >>True, we can not fix that. I thought to fix the inconsistency of
> >>lock,want pair.
> >>But NMI could happen after the first OR condition also.
> >>/me thinks again
> >>
> >lock_spinning() can check that it is called in nmi context and bail out.
> 
> Good point.
> I think we can check for even irq context and bailout so that in irq
> context we continue spinning instead of slowpath. no ?
> 
That will happen much more often, and irq context is not a problem anyway.

> >How often this will happens anyway.
> >
> 
> I know NMIs occur frequently with watchdogs. or used by sysrq-trigger
> etc.. But I am not an expert how frequent it is otherwise. But even
> then if they do not use spinlock, we have no problem as already pointed.
> 
> I can measure with debugfs counter how often it happens.
> 
When you run perf you will see a lot of NMIs, but those should not take
any locks.

--
			Gleb.
Raghavendra K T July 17, 2013, 2:55 p.m. UTC | #19
On 07/17/2013 08:14 PM, Gleb Natapov wrote:
> On Wed, Jul 17, 2013 at 07:43:01PM +0530, Raghavendra K T wrote:
>> On 07/17/2013 06:55 PM, Gleb Natapov wrote:
>>> On Wed, Jul 17, 2013 at 06:25:05PM +0530, Raghavendra K T wrote:
>>>> On 07/17/2013 06:15 PM, Gleb Natapov wrote:
>>>>> On Wed, Jul 17, 2013 at 03:35:37PM +0530, Raghavendra K T wrote:
>>>>>>>> Instead of halt we started with a sleep hypercall in those
>>>>>>>>   versions. Changed to halt() once Avi suggested to reuse existing sleep.
>>>>>>>>
>>>>>>>> If we use older hypercall with few changes like below:
>>>>>>>>
>>>>>>>> kvm_pv_wait_for_kick_op(flags, vcpu, w->lock )
>>>>>>>> {
>>>>>>>>   // a0 reserved for flags
>>>>>>>> if (!w->lock)
>>>>>>>> return;
>>>>>>>> DEFINE_WAIT
>>>>>>>> ...
>>>>>>>> end_wait
>>>>>>>> }
>>>>>>>>
>>>>>>> How would this help if NMI takes lock in critical section. The thing
>>>>>>> that may happen is that lock_waiting->want may have NMI lock value, but
>>>>>>> lock_waiting->lock will point to non NMI lock. Setting of want and lock
>>>>>>> have to be atomic.
>>>>>>
>>>>>> True. so we are here
>>>>>>
>>>>>>          non NMI lock(a)
>>>>>>          w->lock = NULL;
>>>>>>          smp_wmb();
>>>>>>          w->want = want;
>>>>>>                                 NMI
>>>>>>                           <---------------------
>>>>>>                            NMI lock(b)
>>>>>>                            w->lock = NULL;
>>>>>>                            smp_wmb();
>>>>>>                            w->want = want;
>>>>>>                            smp_wmb();
>>>>>>                            w->lock = lock;
>>>>>>                           ---------------------->
>>>>>>          smp_wmb();
>>>>>>          w->lock = lock;
>>>>>>
>>>>>> so how about fixing like this?
>>>>>>
>>>>>> again:
>>>>>>          w->lock = NULL;
>>>>>>          smp_wmb();
>>>>>>          w->want = want;
>>>>>>          smp_wmb();
>>>>>>          w->lock = lock;
>>>>>>
>>>>>> if (!lock || w->want != want) goto again;
>>>>>>
>>>>> NMI can happen after the if() but before halt and the same situation
>>>>> we are trying to prevent with IRQs will occur.
>>>>
>>>> True, we can not fix that. I thought to fix the inconsistency of
>>>> lock,want pair.
>>>> But NMI could happen after the first OR condition also.
>>>> /me thinks again
>>>>
>>> lock_spinning() can check that it is called in nmi context and bail out.
>>
>> Good point.
>> I think we can check for even irq context and bailout so that in irq
>> context we continue spinning instead of slowpath. no ?
>>
> That will happen much more often and irq context is no a problem anyway.
>

Yes, it is not a problem. But my idea was not to enter the slowpath
during irq processing. Do you think that is a good idea?

I'll now experiment with how often we enter the slowpath in irq context.
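
A sketch of the instrumentation, along the same lines as the existing
contention stats (the IRQ_SLOW name is hypothetical):

	/* early in kvm_lock_spinning() */
	if (in_interrupt())
		add_stats(IRQ_SLOW, 1);	/* new kvm_contention_stat entry */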

>>> How often this will happens anyway.
>>>
>>
>> I know NMIs occur frequently with watchdogs. or used by sysrq-trigger
>> etc.. But I am not an expert how frequent it is otherwise. But even
>> then if they do not use spinlock, we have no problem as already pointed.
>>
>> I can measure with debugfs counter how often it happens.
>>
> When you run perf you will see a lot of NMIs, but those should not take
> any locks.

Yes. I just verified that with benchmark runs: with perf running,
there was not even a single NMI hitting lock_spinning().


Gleb Natapov July 17, 2013, 3:11 p.m. UTC | #20
On Wed, Jul 17, 2013 at 08:25:19PM +0530, Raghavendra K T wrote:
> On 07/17/2013 08:14 PM, Gleb Natapov wrote:
> >On Wed, Jul 17, 2013 at 07:43:01PM +0530, Raghavendra K T wrote:
> >>On 07/17/2013 06:55 PM, Gleb Natapov wrote:
> >>>On Wed, Jul 17, 2013 at 06:25:05PM +0530, Raghavendra K T wrote:
> >>>>On 07/17/2013 06:15 PM, Gleb Natapov wrote:
> >>>>>On Wed, Jul 17, 2013 at 03:35:37PM +0530, Raghavendra K T wrote:
> >>>>>>>>Instead of halt we started with a sleep hypercall in those
> >>>>>>>>  versions. Changed to halt() once Avi suggested to reuse existing sleep.
> >>>>>>>>
> >>>>>>>>If we use older hypercall with few changes like below:
> >>>>>>>>
> >>>>>>>>kvm_pv_wait_for_kick_op(flags, vcpu, w->lock )
> >>>>>>>>{
> >>>>>>>>  // a0 reserved for flags
> >>>>>>>>if (!w->lock)
> >>>>>>>>return;
> >>>>>>>>DEFINE_WAIT
> >>>>>>>>...
> >>>>>>>>end_wait
> >>>>>>>>}
> >>>>>>>>
> >>>>>>>How would this help if NMI takes lock in critical section. The thing
> >>>>>>>that may happen is that lock_waiting->want may have NMI lock value, but
> >>>>>>>lock_waiting->lock will point to non NMI lock. Setting of want and lock
> >>>>>>>have to be atomic.
> >>>>>>
> >>>>>>True. so we are here
> >>>>>>
> >>>>>>         non NMI lock(a)
> >>>>>>         w->lock = NULL;
> >>>>>>         smp_wmb();
> >>>>>>         w->want = want;
> >>>>>>                                NMI
> >>>>>>                          <---------------------
> >>>>>>                           NMI lock(b)
> >>>>>>                           w->lock = NULL;
> >>>>>>                           smp_wmb();
> >>>>>>                           w->want = want;
> >>>>>>                           smp_wmb();
> >>>>>>                           w->lock = lock;
> >>>>>>                          ---------------------->
> >>>>>>         smp_wmb();
> >>>>>>         w->lock = lock;
> >>>>>>
> >>>>>>so how about fixing like this?
> >>>>>>
> >>>>>>again:
> >>>>>>         w->lock = NULL;
> >>>>>>         smp_wmb();
> >>>>>>         w->want = want;
> >>>>>>         smp_wmb();
> >>>>>>         w->lock = lock;
> >>>>>>
> >>>>>>if (!lock || w->want != want) goto again;
> >>>>>>
> >>>>>NMI can happen after the if() but before halt and the same situation
> >>>>>we are trying to prevent with IRQs will occur.
> >>>>
> >>>>True, we can not fix that. I thought to fix the inconsistency of
> >>>>lock,want pair.
> >>>>But NMI could happen after the first OR condition also.
> >>>>/me thinks again
> >>>>
> >>>lock_spinning() can check that it is called in nmi context and bail out.
> >>
> >>Good point.
> >>I think we can check for even irq context and bailout so that in irq
> >>context we continue spinning instead of slowpath. no ?
> >>
> >That will happen much more often and irq context is no a problem anyway.
> >
> 
> Yes. It is not a problem. But my idea was to not to enter slowpath lock
> during irq processing. Do you think that is a good idea?
> 
Why would we disable it if its purpose is to improve handling of
contended locks? NMI is only special because it is impossible to handle
and should not happen anyway.

--
			Gleb.
Raghavendra K T July 17, 2013, 3:20 p.m. UTC | #21
On 07/17/2013 08:25 PM, Raghavendra K T wrote:
> On 07/17/2013 08:14 PM, Gleb Natapov wrote:
>> On Wed, Jul 17, 2013 at 07:43:01PM +0530, Raghavendra K T wrote:
>>> On 07/17/2013 06:55 PM, Gleb Natapov wrote:
>>>> On Wed, Jul 17, 2013 at 06:25:05PM +0530, Raghavendra K T wrote:
>>>>> On 07/17/2013 06:15 PM, Gleb Natapov wrote:
>>>>>> On Wed, Jul 17, 2013 at 03:35:37PM +0530, Raghavendra K T wrote:
>>>>>>>>> Instead of halt we started with a sleep hypercall in those
>>>>>>>>>   versions. Changed to halt() once Avi suggested to reuse
>>>>>>>>> existing sleep.
>>>>>>>>>
>>>>>>>>> If we use older hypercall with few changes like below:
>>>>>>>>>
>>>>>>>>> kvm_pv_wait_for_kick_op(flags, vcpu, w->lock )
>>>>>>>>> {
>>>>>>>>>   // a0 reserved for flags
>>>>>>>>> if (!w->lock)
>>>>>>>>> return;
>>>>>>>>> DEFINE_WAIT
>>>>>>>>> ...
>>>>>>>>> end_wait
>>>>>>>>> }
>>>>>>>>>
>>>>>>>> How would this help if NMI takes lock in critical section. The
>>>>>>>> thing
>>>>>>>> that may happen is that lock_waiting->want may have NMI lock
>>>>>>>> value, but
>>>>>>>> lock_waiting->lock will point to non NMI lock. Setting of want
>>>>>>>> and lock
>>>>>>>> have to be atomic.
>>>>>>>
>>>>>>> True. so we are here
>>>>>>>
>>>>>>>          non NMI lock(a)
>>>>>>>          w->lock = NULL;
>>>>>>>          smp_wmb();
>>>>>>>          w->want = want;
>>>>>>>                                 NMI
>>>>>>>                           <---------------------
>>>>>>>                            NMI lock(b)
>>>>>>>                            w->lock = NULL;
>>>>>>>                            smp_wmb();
>>>>>>>                            w->want = want;
>>>>>>>                            smp_wmb();
>>>>>>>                            w->lock = lock;
>>>>>>>                           ---------------------->
>>>>>>>          smp_wmb();
>>>>>>>          w->lock = lock;
>>>>>>>
>>>>>>> so how about fixing like this?
>>>>>>>
>>>>>>> again:
>>>>>>>          w->lock = NULL;
>>>>>>>          smp_wmb();
>>>>>>>          w->want = want;
>>>>>>>          smp_wmb();
>>>>>>>          w->lock = lock;
>>>>>>>
>>>>>>> if (!lock || w->want != want) goto again;
>>>>>>>
>>>>>> NMI can happen after the if() but before halt and the same situation
>>>>>> we are trying to prevent with IRQs will occur.
>>>>>
>>>>> True, we can not fix that. I thought to fix the inconsistency of
>>>>> lock,want pair.
>>>>> But NMI could happen after the first OR condition also.
>>>>> /me thinks again
>>>>>
>>>> lock_spinning() can check that it is called in nmi context and bail
>>>> out.
>>>
>>> Good point.
>>> I think we can check for even irq context and bailout so that in irq
>>> context we continue spinning instead of slowpath. no ?
>>>
>> That will happen much more often and irq context is no a problem anyway.
>>
>
> Yes. It is not a problem. But my idea was to not to enter slowpath lock
> during irq processing. Do you think that is a good idea?
>
> I 'll now experiment how often we enter slowpath in irq context.
>

With a dbench 1.5x run on my 32 cpu / 16 core Sandy Bridge, I saw
around 10 spinlock slowpath entries from irq context.

Raghavendra K T July 17, 2013, 3:22 p.m. UTC | #22
On 07/17/2013 08:41 PM, Gleb Natapov wrote:
> On Wed, Jul 17, 2013 at 08:25:19PM +0530, Raghavendra K T wrote:
>> On 07/17/2013 08:14 PM, Gleb Natapov wrote:
>>> On Wed, Jul 17, 2013 at 07:43:01PM +0530, Raghavendra K T wrote:
>>>> On 07/17/2013 06:55 PM, Gleb Natapov wrote:
>>>>> On Wed, Jul 17, 2013 at 06:25:05PM +0530, Raghavendra K T wrote:
>>>>>> On 07/17/2013 06:15 PM, Gleb Natapov wrote:
>>>>>>> On Wed, Jul 17, 2013 at 03:35:37PM +0530, Raghavendra K T wrote:
>>>>>>>>>> Instead of halt we started with a sleep hypercall in those
>>>>>>>>>>   versions. Changed to halt() once Avi suggested to reuse existing sleep.
>>>>>>>>>>
>>>>>>>>>> If we use older hypercall with few changes like below:
>>>>>>>>>>
>>>>>>>>>> kvm_pv_wait_for_kick_op(flags, vcpu, w->lock )
>>>>>>>>>> {
>>>>>>>>>>   // a0 reserved for flags
>>>>>>>>>> if (!w->lock)
>>>>>>>>>> return;
>>>>>>>>>> DEFINE_WAIT
>>>>>>>>>> ...
>>>>>>>>>> end_wait
>>>>>>>>>> }
>>>>>>>>>>
>>>>>>>>> How would this help if NMI takes lock in critical section. The thing
>>>>>>>>> that may happen is that lock_waiting->want may have NMI lock value, but
>>>>>>>>> lock_waiting->lock will point to non NMI lock. Setting of want and lock
>>>>>>>>> have to be atomic.
>>>>>>>>
>>>>>>>> True. so we are here
>>>>>>>>
>>>>>>>>          non NMI lock(a)
>>>>>>>>          w->lock = NULL;
>>>>>>>>          smp_wmb();
>>>>>>>>          w->want = want;
>>>>>>>>                                 NMI
>>>>>>>>                           <---------------------
>>>>>>>>                            NMI lock(b)
>>>>>>>>                            w->lock = NULL;
>>>>>>>>                            smp_wmb();
>>>>>>>>                            w->want = want;
>>>>>>>>                            smp_wmb();
>>>>>>>>                            w->lock = lock;
>>>>>>>>                           ---------------------->
>>>>>>>>          smp_wmb();
>>>>>>>>          w->lock = lock;
>>>>>>>>
>>>>>>>> so how about fixing like this?
>>>>>>>>
>>>>>>>> again:
>>>>>>>>          w->lock = NULL;
>>>>>>>>          smp_wmb();
>>>>>>>>          w->want = want;
>>>>>>>>          smp_wmb();
>>>>>>>>          w->lock = lock;
>>>>>>>>
>>>>>>>> if (!lock || w->want != want) goto again;
>>>>>>>>
>>>>>>> NMI can happen after the if() but before halt and the same situation
>>>>>>> we are trying to prevent with IRQs will occur.
>>>>>>
>>>>>> True, we can not fix that. I thought to fix the inconsistency of
>>>>>> lock,want pair.
>>>>>> But NMI could happen after the first OR condition also.
>>>>>> /me thinks again
>>>>>>
>>>>> lock_spinning() can check that it is called in nmi context and bail out.
>>>>
>>>> Good point.
>>>> I think we can check for even irq context and bailout so that in irq
>>>> context we continue spinning instead of slowpath. no ?
>>>>
>>> That will happen much more often and irq context is no a problem anyway.
>>>
>>
>> Yes. It is not a problem. But my idea was to not to enter slowpath lock
>> during irq processing. Do you think that is a good idea?
>>
> Why would we disable it if its purpose is to improve handling of
> contended locks? NMI is only special because it is impossible to handle
> and should not happen anyway.
>

Yes, agreed. Indeed, I saw degradation if we allow the slowpath spinlock
to loop again.

diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 695399f..427afcb 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -118,10 +118,20 @@  void kvm_async_pf_task_wait(u32 token);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
 extern void kvm_disable_steal_time(void);
-#else
-#define kvm_guest_init() do { } while (0)
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void __init kvm_spinlock_init(void);
+#else /* !CONFIG_PARAVIRT_SPINLOCKS */
+static inline void kvm_spinlock_init(void)
+{
+}
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+#else /* CONFIG_KVM_GUEST */
+#define kvm_guest_init() do {} while (0)
 #define kvm_async_pf_task_wait(T) do {} while(0)
 #define kvm_async_pf_task_wake(T) do {} while(0)
+
 static inline u32 kvm_read_and_reset_pf_reason(void)
 {
 	return 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index cd6d9a5..97ade5a 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -34,6 +34,7 @@ 
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/kprobes.h>
+#include <linux/debugfs.h>
 #include <asm/timer.h>
 #include <asm/cpu.h>
 #include <asm/traps.h>
@@ -419,6 +420,7 @@  static void __init kvm_smp_prepare_boot_cpu(void)
 	WARN_ON(kvm_register_clock("primary cpu clock"));
 	kvm_guest_cpu_init();
 	native_smp_prepare_boot_cpu();
+	kvm_spinlock_init();
 }
 
 static void __cpuinit kvm_guest_cpu_online(void *dummy)
@@ -523,3 +525,256 @@  static __init int activate_jump_labels(void)
 	return 0;
 }
 arch_initcall(activate_jump_labels);
+
+/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
+void kvm_kick_cpu(int cpu)
+{
+	int apicid;
+
+	apicid = per_cpu(x86_cpu_to_apicid, cpu);
+	kvm_hypercall1(KVM_HC_KICK_CPU, apicid);
+}
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+
+enum kvm_contention_stat {
+	TAKEN_SLOW,
+	TAKEN_SLOW_PICKUP,
+	RELEASED_SLOW,
+	RELEASED_SLOW_KICKED,
+	NR_CONTENTION_STATS
+};
+
+#ifdef CONFIG_KVM_DEBUG_FS
+#define HISTO_BUCKETS	30
+
+static struct kvm_spinlock_stats
+{
+	u32 contention_stats[NR_CONTENTION_STATS];
+	u32 histo_spin_blocked[HISTO_BUCKETS+1];
+	u64 time_blocked;
+} spinlock_stats;
+
+static u8 zero_stats;
+
+static inline void check_zero(void)
+{
+	u8 ret;
+	u8 old;
+
+	old = ACCESS_ONCE(zero_stats);
+	if (unlikely(old)) {
+		ret = cmpxchg(&zero_stats, old, 0);
+		/* This ensures only one fellow resets the stat */
+		if (ret == old)
+			memset(&spinlock_stats, 0, sizeof(spinlock_stats));
+	}
+}
+
+static inline void add_stats(enum kvm_contention_stat var, u32 val)
+{
+	check_zero();
+	spinlock_stats.contention_stats[var] += val;
+}
+
+
+static inline u64 spin_time_start(void)
+{
+	return sched_clock();
+}
+
+static void __spin_time_accum(u64 delta, u32 *array)
+{
+	unsigned index;
+
+	index = ilog2(delta);
+	check_zero();
+
+	if (index < HISTO_BUCKETS)
+		array[index]++;
+	else
+		array[HISTO_BUCKETS]++;
+}
+
+static inline void spin_time_accum_blocked(u64 start)
+{
+	u32 delta;
+
+	delta = sched_clock() - start;
+	__spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
+	spinlock_stats.time_blocked += delta;
+}
+
+static struct dentry *d_spin_debug;
+static struct dentry *d_kvm_debug;
+
+struct dentry *kvm_init_debugfs(void)
+{
+	d_kvm_debug = debugfs_create_dir("kvm", NULL);
+	if (!d_kvm_debug)
+		printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n");
+
+	return d_kvm_debug;
+}
+
+static int __init kvm_spinlock_debugfs(void)
+{
+	struct dentry *d_kvm;
+
+	d_kvm = kvm_init_debugfs();
+	if (d_kvm == NULL)
+		return -ENOMEM;
+
+	d_spin_debug = debugfs_create_dir("spinlocks", d_kvm);
+
+	debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
+
+	debugfs_create_u32("taken_slow", 0444, d_spin_debug,
+		   &spinlock_stats.contention_stats[TAKEN_SLOW]);
+	debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
+		   &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
+
+	debugfs_create_u32("released_slow", 0444, d_spin_debug,
+		   &spinlock_stats.contention_stats[RELEASED_SLOW]);
+	debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
+		   &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
+
+	debugfs_create_u64("time_blocked", 0444, d_spin_debug,
+			   &spinlock_stats.time_blocked);
+
+	debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
+		     spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
+
+	return 0;
+}
+fs_initcall(kvm_spinlock_debugfs);
+#else  /* !CONFIG_KVM_DEBUG_FS */
+static inline void add_stats(enum kvm_contention_stat var, u32 val)
+{
+}
+
+static inline u64 spin_time_start(void)
+{
+	return 0;
+}
+
+static inline void spin_time_accum_blocked(u64 start)
+{
+}
+#endif  /* CONFIG_KVM_DEBUG_FS */
+
+struct kvm_lock_waiting {
+	struct arch_spinlock *lock;
+	__ticket_t want;
+};
+
+/* cpus 'waiting' on a spinlock to become available */
+static cpumask_t waiting_cpus;
+
+/* Track spinlock on which a cpu is waiting */
+static DEFINE_PER_CPU(struct kvm_lock_waiting, lock_waiting);
+
+static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
+{
+	struct kvm_lock_waiting *w;
+	int cpu;
+	u64 start;
+	unsigned long flags;
+
+	w = &__get_cpu_var(lock_waiting);
+	cpu = smp_processor_id();
+	start = spin_time_start();
+
+	/*
+	 * Make sure an interrupt handler can't upset things in a
+	 * partially setup state.
+	 */
+	local_irq_save(flags);
+
+	/*
+	 * The ordering protocol on this is that the "lock" pointer
+	 * may only be set non-NULL if the "want" ticket is correct.
+	 * If we're updating "want", we must first clear "lock".
+	 */
+	w->lock = NULL;
+	smp_wmb();
+	w->want = want;
+	smp_wmb();
+	w->lock = lock;
+
+	add_stats(TAKEN_SLOW, 1);
+
+	/*
+	 * This uses set_bit, which is atomic but we should not rely on its
+	 * reordering guarantees. So barrier is needed after this call.
+	 */
+	cpumask_set_cpu(cpu, &waiting_cpus);
+
+	barrier();
+
+	/*
+	 * Mark entry to slowpath before doing the pickup test to make
+	 * sure we don't deadlock with an unlocker.
+	 */
+	__ticket_enter_slowpath(lock);
+
+	/*
+	 * check again make sure it didn't become free while
+	 * we weren't looking.
+	 */
+	if (ACCESS_ONCE(lock->tickets.head) == want) {
+		add_stats(TAKEN_SLOW_PICKUP, 1);
+		goto out;
+	}
+
+	/* Allow interrupts while blocked */
+	local_irq_restore(flags);
+
+	/* halt until it's our turn and kicked. */
+	halt();
+
+	local_irq_save(flags);
+out:
+	cpumask_clear_cpu(cpu, &waiting_cpus);
+	w->lock = NULL;
+	local_irq_restore(flags);
+	spin_time_accum_blocked(start);
+}
+PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
+
+/* Kick vcpu waiting on @lock->head to reach value @ticket */
+static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
+{
+	int cpu;
+
+	add_stats(RELEASED_SLOW, 1);
+	for_each_cpu(cpu, &waiting_cpus) {
+		const struct kvm_lock_waiting *w = &per_cpu(lock_waiting, cpu);
+		if (ACCESS_ONCE(w->lock) == lock &&
+		    ACCESS_ONCE(w->want) == ticket) {
+			add_stats(RELEASED_SLOW_KICKED, 1);
+			kvm_kick_cpu(cpu);
+			break;
+		}
+	}
+}
+
+/*
+ * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
+ */
+void __init kvm_spinlock_init(void)
+{
+	if (!kvm_para_available())
+		return;
+	/* Does host kernel support KVM_FEATURE_PV_UNHALT? */
+	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
+		return;
+
+	printk(KERN_INFO "KVM setup paravirtual spinlock\n");
+
+	static_key_slow_inc(&paravirt_ticketlocks_enabled);
+
+	pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
+	pv_lock_ops.unlock_kick = kvm_unlock_kick;
+}
+#endif	/* CONFIG_PARAVIRT_SPINLOCKS */