
[RFC,V11,15/18] kvm : Paravirtual ticketlocks support for linux guests running on KVM hypervisor

Message ID 20130722062016.24737.54554.sendpatchset@codeblue (mailing list archive)
State New, archived

Commit Message

Raghavendra K T July 22, 2013, 6:20 a.m. UTC
kvm : Paravirtual ticketlocks support for linux guests running on KVM hypervisor

From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>

During smp_boot_cpus, the paravirtualized KVM guest detects whether the hypervisor has
the required feature (KVM_FEATURE_PV_UNHALT) to support pv-ticketlocks. If so,
support for pv-ticketlocks is registered via pv_lock_ops.

Use KVM_HC_KICK_CPU hypercall to wakeup waiting/halted vcpu.

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Suzuki Poulose <suzuki@in.ibm.com>
[Raghu: check_zero race fix, enum for kvm_contention_stat, jumplabel related changes,
addition of safe_halt for irq enabled case(Gleb)]
Signed-off-by: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
---
 arch/x86/include/asm/kvm_para.h |   14 ++
 arch/x86/kernel/kvm.c           |  259 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 271 insertions(+), 2 deletions(-)
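
For context, the kick primitive added by this patch (and used by the unlock path
quoted in the review below) is essentially a thin wrapper that resolves the target
vcpu's apicid and issues the new hypercall. A rough sketch, reconstructed from the
description above rather than copied from the diff:

static void kvm_kick_cpu(int cpu)
{
	int apicid;
	unsigned long flags = 0;

	/* resolve the target vcpu's apicid and ask the host to unhalt it */
	apicid = per_cpu(x86_cpu_to_apicid, cpu);
	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
}

The hypervisor side of KVM_HC_KICK_CPU, which actually wakes the halted vcpu, is
implemented elsewhere in this series.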



Comments

Gleb Natapov July 23, 2013, 3:07 p.m. UTC | #1
On Mon, Jul 22, 2013 at 11:50:16AM +0530, Raghavendra K T wrote:
> +static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
> +{
> +	struct kvm_lock_waiting *w;
> +	int cpu;
> +	u64 start;
> +	unsigned long flags;
> +
Why don't you bail out here if in NMI, like we discussed?
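
A minimal sketch of the bailout being asked about, assuming it sits right at the
top of kvm_lock_spinning(); it matches the in_nmi() check Raghavendra mentions
resending later in this thread:

	/*
	 * Sketch: fall back to plain spinning when called from NMI context,
	 * since halt() and the per-cpu lock_waiting bookkeeping below are
	 * not safe to re-enter from an NMI handler.
	 */
	if (in_nmi())
		return;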

> +	w = &__get_cpu_var(lock_waiting);
> +	cpu = smp_processor_id();
> +	start = spin_time_start();
> +
> +	/*
> +	 * Make sure an interrupt handler can't upset things in a
> +	 * partially setup state.
> +	 */
> +	local_irq_save(flags);
> +
> +	/*
> +	 * The ordering protocol on this is that the "lock" pointer
> +	 * may only be set non-NULL if the "want" ticket is correct.
> +	 * If we're updating "want", we must first clear "lock".
> +	 */
> +	w->lock = NULL;
> +	smp_wmb();
> +	w->want = want;
> +	smp_wmb();
> +	w->lock = lock;
> +
> +	add_stats(TAKEN_SLOW, 1);
> +
> +	/*
> +	 * This uses set_bit, which is atomic but we should not rely on its
> +	 * reordering guarantees. So a barrier is needed after this call.
> +	 */
> +	cpumask_set_cpu(cpu, &waiting_cpus);
> +
> +	barrier();
> +
> +	/*
> +	 * Mark entry to slowpath before doing the pickup test to make
> +	 * sure we don't deadlock with an unlocker.
> +	 */
> +	__ticket_enter_slowpath(lock);
> +
> +	/*
> +	 * check again make sure it didn't become free while
> +	 * we weren't looking.
> +	 */
> +	if (ACCESS_ONCE(lock->tickets.head) == want) {
> +		add_stats(TAKEN_SLOW_PICKUP, 1);
> +		goto out;
> +	}
> +
> +	/*
> +	 * halt until it's our turn and kicked. Note that we do safe halt
> +	 * for irq enabled case to avoid hang when lock info is overwritten
> +	 * in irq spinlock slowpath and no spurious interrupt occur to save us.
> +	 */
> +	if (arch_irqs_disabled_flags(flags))
> +		halt();
> +	else
> +		safe_halt();
> +
> +out:
So here now interrupts can be either disabled or enabled. Previous
version disabled interrupts here, so are we sure it is safe to have them
enabled at this point? I do not see any problem yet, will keep thinking.

> +	cpumask_clear_cpu(cpu, &waiting_cpus);
> +	w->lock = NULL;
> +	local_irq_restore(flags);
> +	spin_time_accum_blocked(start);
> +}
> +PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
> +
> +/* Kick vcpu waiting on @lock->head to reach value @ticket */
> +static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
> +{
> +	int cpu;
> +
> +	add_stats(RELEASED_SLOW, 1);
> +	for_each_cpu(cpu, &waiting_cpus) {
> +		const struct kvm_lock_waiting *w = &per_cpu(lock_waiting, cpu);
> +		if (ACCESS_ONCE(w->lock) == lock &&
> +		    ACCESS_ONCE(w->want) == ticket) {
> +			add_stats(RELEASED_SLOW_KICKED, 1);
> +			kvm_kick_cpu(cpu);
What about using NMI to wake sleepers? I think it was discussed, but
forgot why it was dismissed.

> +			break;
> +		}
> +	}
> +}
> +
> +/*
> + * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
> + */
> +void __init kvm_spinlock_init(void)
> +{
> +	if (!kvm_para_available())
> +		return;
> +	/* Does host kernel support KVM_FEATURE_PV_UNHALT? */
> +	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
> +		return;
> +
> +	printk(KERN_INFO "KVM setup paravirtual spinlock\n");
> +
> +	static_key_slow_inc(&paravirt_ticketlocks_enabled);
> +
> +	pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
> +	pv_lock_ops.unlock_kick = kvm_unlock_kick;
> +}
> +#endif	/* CONFIG_PARAVIRT_SPINLOCKS */

--
			Gleb.
Raghavendra K T July 24, 2013, 9:45 a.m. UTC | #2
On 07/23/2013 08:37 PM, Gleb Natapov wrote:
> On Mon, Jul 22, 2013 at 11:50:16AM +0530, Raghavendra K T wrote:
>> +static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
[...]
>> +
>> +	/*
>> +	 * halt until it's our turn and kicked. Note that we do safe halt
>> +	 * for irq enabled case to avoid hang when lock info is overwritten
>> +	 * in irq spinlock slowpath and no spurious interrupt occur to save us.
>> +	 */
>> +	if (arch_irqs_disabled_flags(flags))
>> +		halt();
>> +	else
>> +		safe_halt();
>> +
>> +out:
> So here now interrupts can be either disabled or enabled. Previous
> version disabled interrupts here, so are we sure it is safe to have them
> enabled at this point? I do not see any problem yet, will keep thinking.

If we enable interrupts here, then


>> +	cpumask_clear_cpu(cpu, &waiting_cpus);

and if we start serving a lock for an interrupt that comes in here, the
cpumask clear and w->lock = NULL may not happen atomically.
If the irq spinlock does not take the slow path, we would have a non-NULL
value for lock, but with no information in waiting_cpus.

I am still thinking about what the problem with that would be.

>> +	w->lock = NULL;
>> +	local_irq_restore(flags);
>> +	spin_time_accum_blocked(start);
>> +}
>> +PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
>> +
>> +/* Kick vcpu waiting on @lock->head to reach value @ticket */
>> +static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
>> +{
>> +	int cpu;
>> +
>> +	add_stats(RELEASED_SLOW, 1);
>> +	for_each_cpu(cpu, &waiting_cpus) {
>> +		const struct kvm_lock_waiting *w = &per_cpu(lock_waiting, cpu);
>> +		if (ACCESS_ONCE(w->lock) == lock &&
>> +		    ACCESS_ONCE(w->want) == ticket) {
>> +			add_stats(RELEASED_SLOW_KICKED, 1);
>> +			kvm_kick_cpu(cpu);
> What about using NMI to wake sleepers? I think it was discussed, but
> forgot why it was dismissed.

I think I have missed that discussion. I'll go back and check. So what is
the idea here? That we can easily wake up the halted vcpus that have
interrupts disabled?

Gleb Natapov July 24, 2013, 10:39 a.m. UTC | #3
On Wed, Jul 24, 2013 at 03:15:50PM +0530, Raghavendra K T wrote:
> On 07/23/2013 08:37 PM, Gleb Natapov wrote:
> >On Mon, Jul 22, 2013 at 11:50:16AM +0530, Raghavendra K T wrote:
> >>+static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
> [...]
> >>+
> >>+	/*
> >>+	 * halt until it's our turn and kicked. Note that we do safe halt
> >>+	 * for irq enabled case to avoid hang when lock info is overwritten
> >>+	 * in irq spinlock slowpath and no spurious interrupt occur to save us.
> >>+	 */
> >>+	if (arch_irqs_disabled_flags(flags))
> >>+		halt();
> >>+	else
> >>+		safe_halt();
> >>+
> >>+out:
> >So here now interrupts can be either disabled or enabled. Previous
> >version disabled interrupts here, so are we sure it is safe to have them
> >enabled at this point? I do not see any problem yet, will keep thinking.
> 
> If we enable interrupt here, then
> 
> 
> >>+	cpumask_clear_cpu(cpu, &waiting_cpus);
> 
> and if we start serving lock for an interrupt that came here,
> cpumask clear and w->lock=null may not happen atomically.
> if irq spinlock does not take slow path we would have non null value
> for lock, but with no information in waitingcpu.
> 
> I am still thinking what would be problem with that.
> 
Exactly; for the kicker, the waiting_cpus and w->lock updates are
non-atomic anyway.

> >>+	w->lock = NULL;
> >>+	local_irq_restore(flags);
> >>+	spin_time_accum_blocked(start);
> >>+}
> >>+PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
> >>+
> >>+/* Kick vcpu waiting on @lock->head to reach value @ticket */
> >>+static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
> >>+{
> >>+	int cpu;
> >>+
> >>+	add_stats(RELEASED_SLOW, 1);
> >>+	for_each_cpu(cpu, &waiting_cpus) {
> >>+		const struct kvm_lock_waiting *w = &per_cpu(lock_waiting, cpu);
> >>+		if (ACCESS_ONCE(w->lock) == lock &&
> >>+		    ACCESS_ONCE(w->want) == ticket) {
> >>+			add_stats(RELEASED_SLOW_KICKED, 1);
> >>+			kvm_kick_cpu(cpu);
> >What about using NMI to wake sleepers? I think it was discussed, but
> >forgot why it was dismissed.
> 
> I think I have missed that discussion. 'll go back and check. so
> what is the idea here? we can easily wake up the halted vcpus that
> have interrupt disabled?
We can, of course. IIRC the objection was that the NMI handling path is very
fragile and handling an NMI on each wakeup will be more expensive than
waking up a guest without injecting an event, but it is still interesting
to see the numbers.

--
			Gleb.
Raghavendra K T July 24, 2013, noon UTC | #4
On 07/24/2013 04:09 PM, Gleb Natapov wrote:
> On Wed, Jul 24, 2013 at 03:15:50PM +0530, Raghavendra K T wrote:
>> On 07/23/2013 08:37 PM, Gleb Natapov wrote:
>>> On Mon, Jul 22, 2013 at 11:50:16AM +0530, Raghavendra K T wrote:
>>>> +static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
>> [...]
>>>> +
>>>> +	/*
>>>> +	 * halt until it's our turn and kicked. Note that we do safe halt
>>>> +	 * for irq enabled case to avoid hang when lock info is overwritten
>>>> +	 * in irq spinlock slowpath and no spurious interrupt occur to save us.
>>>> +	 */
>>>> +	if (arch_irqs_disabled_flags(flags))
>>>> +		halt();
>>>> +	else
>>>> +		safe_halt();
>>>> +
>>>> +out:
>>> So here now interrupts can be either disabled or enabled. Previous
>>> version disabled interrupts here, so are we sure it is safe to have them
>>> enabled at this point? I do not see any problem yet, will keep thinking.
>>
>> If we enable interrupt here, then
>>
>>
>>>> +	cpumask_clear_cpu(cpu, &waiting_cpus);
>>
>> and if we start serving lock for an interrupt that came here,
>> cpumask clear and w->lock=null may not happen atomically.
>> if irq spinlock does not take slow path we would have non null value
>> for lock, but with no information in waitingcpu.
>>
>> I am still thinking what would be problem with that.
>>
> Exactly, for kicker waiting_cpus and w->lock updates are
> non atomic anyway.
>
>>>> +	w->lock = NULL;
>>>> +	local_irq_restore(flags);
>>>> +	spin_time_accum_blocked(start);
>>>> +}
>>>> +PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
>>>> +
>>>> +/* Kick vcpu waiting on @lock->head to reach value @ticket */
>>>> +static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
>>>> +{
>>>> +	int cpu;
>>>> +
>>>> +	add_stats(RELEASED_SLOW, 1);
>>>> +	for_each_cpu(cpu, &waiting_cpus) {
>>>> +		const struct kvm_lock_waiting *w = &per_cpu(lock_waiting, cpu);
>>>> +		if (ACCESS_ONCE(w->lock) == lock &&
>>>> +		    ACCESS_ONCE(w->want) == ticket) {
>>>> +			add_stats(RELEASED_SLOW_KICKED, 1);
>>>> +			kvm_kick_cpu(cpu);
>>> What about using NMI to wake sleepers? I think it was discussed, but
>>> forgot why it was dismissed.
>>
>> I think I have missed that discussion. 'll go back and check. so
>> what is the idea here? we can easily wake up the halted vcpus that
>> have interrupt disabled?
> We can of course. IIRC the objection was that NMI handling path is very
> fragile and handling NMI on each wakeup will be more expensive then
> waking up a guest without injecting an event, but it is still interesting
> to see the numbers.
>

Haam, now I remember: we had tried a request-based mechanism (a new
request like REQ_UNHALT) and processed that. It had worked, but needed some
complex hacks in vcpu_enter_guest to avoid a guest hang in case the
request got cleared, so we had left it there:

https://lkml.org/lkml/2012/4/30/67

But I do not remember the performance impact, though.

Gleb Natapov July 24, 2013, 12:06 p.m. UTC | #5
On Wed, Jul 24, 2013 at 05:30:20PM +0530, Raghavendra K T wrote:
> On 07/24/2013 04:09 PM, Gleb Natapov wrote:
> >On Wed, Jul 24, 2013 at 03:15:50PM +0530, Raghavendra K T wrote:
> >>On 07/23/2013 08:37 PM, Gleb Natapov wrote:
> >>>On Mon, Jul 22, 2013 at 11:50:16AM +0530, Raghavendra K T wrote:
> >>>>+static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
> >>[...]
> >>>>+
> >>>>+	/*
> >>>>+	 * halt until it's our turn and kicked. Note that we do safe halt
> >>>>+	 * for irq enabled case to avoid hang when lock info is overwritten
> >>>>+	 * in irq spinlock slowpath and no spurious interrupt occur to save us.
> >>>>+	 */
> >>>>+	if (arch_irqs_disabled_flags(flags))
> >>>>+		halt();
> >>>>+	else
> >>>>+		safe_halt();
> >>>>+
> >>>>+out:
> >>>So here now interrupts can be either disabled or enabled. Previous
> >>>version disabled interrupts here, so are we sure it is safe to have them
> >>>enabled at this point? I do not see any problem yet, will keep thinking.
> >>
> >>If we enable interrupt here, then
> >>
> >>
> >>>>+	cpumask_clear_cpu(cpu, &waiting_cpus);
> >>
> >>and if we start serving lock for an interrupt that came here,
> >>cpumask clear and w->lock=null may not happen atomically.
> >>if irq spinlock does not take slow path we would have non null value
> >>for lock, but with no information in waitingcpu.
> >>
> >>I am still thinking what would be problem with that.
> >>
> >Exactly, for kicker waiting_cpus and w->lock updates are
> >non atomic anyway.
> >
> >>>>+	w->lock = NULL;
> >>>>+	local_irq_restore(flags);
> >>>>+	spin_time_accum_blocked(start);
> >>>>+}
> >>>>+PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
> >>>>+
> >>>>+/* Kick vcpu waiting on @lock->head to reach value @ticket */
> >>>>+static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
> >>>>+{
> >>>>+	int cpu;
> >>>>+
> >>>>+	add_stats(RELEASED_SLOW, 1);
> >>>>+	for_each_cpu(cpu, &waiting_cpus) {
> >>>>+		const struct kvm_lock_waiting *w = &per_cpu(lock_waiting, cpu);
> >>>>+		if (ACCESS_ONCE(w->lock) == lock &&
> >>>>+		    ACCESS_ONCE(w->want) == ticket) {
> >>>>+			add_stats(RELEASED_SLOW_KICKED, 1);
> >>>>+			kvm_kick_cpu(cpu);
> >>>What about using NMI to wake sleepers? I think it was discussed, but
> >>>forgot why it was dismissed.
> >>
> >>I think I have missed that discussion. 'll go back and check. so
> >>what is the idea here? we can easily wake up the halted vcpus that
> >>have interrupt disabled?
> >We can of course. IIRC the objection was that NMI handling path is very
> >fragile and handling NMI on each wakeup will be more expensive then
> >waking up a guest without injecting an event, but it is still interesting
> >to see the numbers.
> >
> 
> Haam, now I remember, We had tried request based mechanism. (new
> request like REQ_UNHALT) and process that. It had worked, but had some
> complex hacks in vcpu_enter_guest to avoid guest hang in case of
> request cleared.  So had left it there..
> 
> https://lkml.org/lkml/2012/4/30/67
> 
> But I do not remember performance impact though.
No, this is something different. Wakeup with NMI does not need KVM changes at
all. Instead of kvm_kick_cpu(cpu) in kvm_unlock_kick you send an NMI IPI.

--
			Gleb.
Raghavendra K T July 24, 2013, 12:36 p.m. UTC | #6
On 07/24/2013 05:36 PM, Gleb Natapov wrote:
> On Wed, Jul 24, 2013 at 05:30:20PM +0530, Raghavendra K T wrote:
>> On 07/24/2013 04:09 PM, Gleb Natapov wrote:
>>> On Wed, Jul 24, 2013 at 03:15:50PM +0530, Raghavendra K T wrote:
>>>> On 07/23/2013 08:37 PM, Gleb Natapov wrote:
>>>>> On Mon, Jul 22, 2013 at 11:50:16AM +0530, Raghavendra K T wrote:
>>>>>> +static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
>>>> [...]
>>>>>> +
>>>>>> +	/*
>>>>>> +	 * halt until it's our turn and kicked. Note that we do safe halt
>>>>>> +	 * for irq enabled case to avoid hang when lock info is overwritten
>>>>>> +	 * in irq spinlock slowpath and no spurious interrupt occur to save us.
>>>>>> +	 */
>>>>>> +	if (arch_irqs_disabled_flags(flags))
>>>>>> +		halt();
>>>>>> +	else
>>>>>> +		safe_halt();
>>>>>> +
>>>>>> +out:
>>>>> So here now interrupts can be either disabled or enabled. Previous
>>>>> version disabled interrupts here, so are we sure it is safe to have them
>>>>> enabled at this point? I do not see any problem yet, will keep thinking.
>>>>
>>>> If we enable interrupt here, then
>>>>
>>>>
>>>>>> +	cpumask_clear_cpu(cpu, &waiting_cpus);
>>>>
>>>> and if we start serving lock for an interrupt that came here,
>>>> cpumask clear and w->lock=null may not happen atomically.
>>>> if irq spinlock does not take slow path we would have non null value
>>>> for lock, but with no information in waitingcpu.
>>>>
>>>> I am still thinking what would be problem with that.
>>>>
>>> Exactly, for kicker waiting_cpus and w->lock updates are
>>> non atomic anyway.
>>>
>>>>>> +	w->lock = NULL;
>>>>>> +	local_irq_restore(flags);
>>>>>> +	spin_time_accum_blocked(start);
>>>>>> +}
>>>>>> +PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
>>>>>> +
>>>>>> +/* Kick vcpu waiting on @lock->head to reach value @ticket */
>>>>>> +static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
>>>>>> +{
>>>>>> +	int cpu;
>>>>>> +
>>>>>> +	add_stats(RELEASED_SLOW, 1);
>>>>>> +	for_each_cpu(cpu, &waiting_cpus) {
>>>>>> +		const struct kvm_lock_waiting *w = &per_cpu(lock_waiting, cpu);
>>>>>> +		if (ACCESS_ONCE(w->lock) == lock &&
>>>>>> +		    ACCESS_ONCE(w->want) == ticket) {
>>>>>> +			add_stats(RELEASED_SLOW_KICKED, 1);
>>>>>> +			kvm_kick_cpu(cpu);
>>>>> What about using NMI to wake sleepers? I think it was discussed, but
>>>>> forgot why it was dismissed.
>>>>
>>>> I think I have missed that discussion. 'll go back and check. so
>>>> what is the idea here? we can easily wake up the halted vcpus that
>>>> have interrupt disabled?
>>> We can of course. IIRC the objection was that NMI handling path is very
>>> fragile and handling NMI on each wakeup will be more expensive then
>>> waking up a guest without injecting an event, but it is still interesting
>>> to see the numbers.
>>>
>>
>> Haam, now I remember, We had tried request based mechanism. (new
>> request like REQ_UNHALT) and process that. It had worked, but had some
>> complex hacks in vcpu_enter_guest to avoid guest hang in case of
>> request cleared.  So had left it there..
>>
>> https://lkml.org/lkml/2012/4/30/67
>>
>> But I do not remember performance impact though.
> No, this is something different. Wakeup with NMI does not need KVM changes at
> all. Instead of kvm_kick_cpu(cpu) in kvm_unlock_kick you send NMI IPI.
>

True, it was not NMI.
Just to confirm, are you talking about trying something like this?

apic->send_IPI_mask(cpumask_of(cpu), APIC_DM_NMI);
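
Spelled out as the change to kvm_unlock_kick() it implies, a sketch of the
experiment (not a posted patch; whether APIC_DM_NMI is the right argument to
pass to send_IPI_mask() here is itself part of the question):

@@ kvm_unlock_kick()
 			add_stats(RELEASED_SLOW_KICKED, 1);
-			kvm_kick_cpu(cpu);
+			apic->send_IPI_mask(cpumask_of(cpu), APIC_DM_NMI);
 			break;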

Gleb Natapov July 25, 2013, 9:15 a.m. UTC | #7
On Thu, Jul 25, 2013 at 02:47:37PM +0530, Raghavendra K T wrote:
> On 07/24/2013 06:06 PM, Raghavendra K T wrote:
> >On 07/24/2013 05:36 PM, Gleb Natapov wrote:
> >>On Wed, Jul 24, 2013 at 05:30:20PM +0530, Raghavendra K T wrote:
> >>>On 07/24/2013 04:09 PM, Gleb Natapov wrote:
> >>>>On Wed, Jul 24, 2013 at 03:15:50PM +0530, Raghavendra K T wrote:
> >>>>>On 07/23/2013 08:37 PM, Gleb Natapov wrote:
> >>>>>>On Mon, Jul 22, 2013 at 11:50:16AM +0530, Raghavendra K T wrote:
> >>>>>>>+static void kvm_lock_spinning(struct arch_spinlock *lock,
> >>>>>>>__ticket_t want)
> >>>>>[...]
> >>>>>>>+
> >>>>>>>+    /*
> >>>>>>>+     * halt until it's our turn and kicked. Note that we do safe
> >>>>>>>halt
> >>>>>>>+     * for irq enabled case to avoid hang when lock info is
> >>>>>>>overwritten
> >>>>>>>+     * in irq spinlock slowpath and no spurious interrupt occur
> >>>>>>>to save us.
> >>>>>>>+     */
> >>>>>>>+    if (arch_irqs_disabled_flags(flags))
> >>>>>>>+        halt();
> >>>>>>>+    else
> >>>>>>>+        safe_halt();
> >>>>>>>+
> >>>>>>>+out:
> >>>>>>So here now interrupts can be either disabled or enabled. Previous
> >>>>>>version disabled interrupts here, so are we sure it is safe to
> >>>>>>have them
> >>>>>>enabled at this point? I do not see any problem yet, will keep
> >>>>>>thinking.
> >>>>>
> >>>>>If we enable interrupt here, then
> >>>>>
> >>>>>
> >>>>>>>+    cpumask_clear_cpu(cpu, &waiting_cpus);
> >>>>>
> >>>>>and if we start serving lock for an interrupt that came here,
> >>>>>cpumask clear and w->lock=null may not happen atomically.
> >>>>>if irq spinlock does not take slow path we would have non null value
> >>>>>for lock, but with no information in waitingcpu.
> >>>>>
> >>>>>I am still thinking what would be problem with that.
> >>>>>
> >>>>Exactly, for kicker waiting_cpus and w->lock updates are
> >>>>non atomic anyway.
> >>>>
> >>>>>>>+    w->lock = NULL;
> >>>>>>>+    local_irq_restore(flags);
> >>>>>>>+    spin_time_accum_blocked(start);
> >>>>>>>+}
> >>>>>>>+PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
> >>>>>>>+
> >>>>>>>+/* Kick vcpu waiting on @lock->head to reach value @ticket */
> >>>>>>>+static void kvm_unlock_kick(struct arch_spinlock *lock,
> >>>>>>>__ticket_t ticket)
> >>>>>>>+{
> >>>>>>>+    int cpu;
> >>>>>>>+
> >>>>>>>+    add_stats(RELEASED_SLOW, 1);
> >>>>>>>+    for_each_cpu(cpu, &waiting_cpus) {
> >>>>>>>+        const struct kvm_lock_waiting *w =
> >>>>>>>&per_cpu(lock_waiting, cpu);
> >>>>>>>+        if (ACCESS_ONCE(w->lock) == lock &&
> >>>>>>>+            ACCESS_ONCE(w->want) == ticket) {
> >>>>>>>+            add_stats(RELEASED_SLOW_KICKED, 1);
> >>>>>>>+            kvm_kick_cpu(cpu);
> >>>>>>What about using NMI to wake sleepers? I think it was discussed, but
> >>>>>>forgot why it was dismissed.
> >>>>>
> >>>>>I think I have missed that discussion. 'll go back and check. so
> >>>>>what is the idea here? we can easily wake up the halted vcpus that
> >>>>>have interrupt disabled?
> >>>>We can of course. IIRC the objection was that NMI handling path is very
> >>>>fragile and handling NMI on each wakeup will be more expensive then
> >>>>waking up a guest without injecting an event, but it is still
> >>>>interesting
> >>>>to see the numbers.
> >>>>
> >>>
> >>>Haam, now I remember, We had tried request based mechanism. (new
> >>>request like REQ_UNHALT) and process that. It had worked, but had some
> >>>complex hacks in vcpu_enter_guest to avoid guest hang in case of
> >>>request cleared.  So had left it there..
> >>>
> >>>https://lkml.org/lkml/2012/4/30/67
> >>>
> >>>But I do not remember performance impact though.
> >>No, this is something different. Wakeup with NMI does not need KVM
> >>changes at
> >>all. Instead of kvm_kick_cpu(cpu) in kvm_unlock_kick you send NMI IPI.
> >>
> >
> >True. It was not NMI.
> >just to confirm, are you talking about something like this to be tried ?
> >
> >apic->send_IPI_mask(cpumask_of(cpu), APIC_DM_NMI);
> 
> When I started benchmark, I started seeing
> "Dazed and confused, but trying to continue" from unknown nmi error
> handling.
> Did I miss anything (because we did not register any NMI handler)? or
> is it that spurious NMIs are trouble because we could get spurious NMIs
> if next waiter already acquired the lock.
There is a default NMI handler that tries to detect the reason why the NMI
happened (which is not so easy on x86) and prints this message if it
fails. You need to add logic there to detect the spinlock slow path. Check
the bit in waiting_cpus, for instance.
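
A minimal sketch of that check in arch/x86/kernel/nmi.c (assuming the guest makes
waiting_cpus visible there; the exact placement inside unknown_nmi_error() is
also an assumption):

static void unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
	/*
	 * A pv-ticketlock kick delivered as an NMI: the waiter set its bit
	 * in waiting_cpus before halting, so this NMI is expected and
	 * should not be reported as unknown.
	 */
	if (cpumask_test_cpu(smp_processor_id(), &waiting_cpus))
		return;

	/* ... existing "Dazed and confused, but trying to continue" reporting ... */
}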

> 
> (note: I tried sending APIC_DM_REMRD IPI directly, which worked fine
> but hypercall way of handling still performed well from the results I
> saw).
You mean better? This is strange. Have you run the guest with x2apic?

--
			Gleb.
Raghavendra K T July 25, 2013, 9:17 a.m. UTC | #8
On 07/24/2013 06:06 PM, Raghavendra K T wrote:
> On 07/24/2013 05:36 PM, Gleb Natapov wrote:
>> On Wed, Jul 24, 2013 at 05:30:20PM +0530, Raghavendra K T wrote:
>>> On 07/24/2013 04:09 PM, Gleb Natapov wrote:
>>>> On Wed, Jul 24, 2013 at 03:15:50PM +0530, Raghavendra K T wrote:
>>>>> On 07/23/2013 08:37 PM, Gleb Natapov wrote:
>>>>>> On Mon, Jul 22, 2013 at 11:50:16AM +0530, Raghavendra K T wrote:
>>>>>>> +static void kvm_lock_spinning(struct arch_spinlock *lock,
>>>>>>> __ticket_t want)
>>>>> [...]
>>>>>>> +
>>>>>>> +    /*
>>>>>>> +     * halt until it's our turn and kicked. Note that we do safe
>>>>>>> halt
>>>>>>> +     * for irq enabled case to avoid hang when lock info is
>>>>>>> overwritten
>>>>>>> +     * in irq spinlock slowpath and no spurious interrupt occur
>>>>>>> to save us.
>>>>>>> +     */
>>>>>>> +    if (arch_irqs_disabled_flags(flags))
>>>>>>> +        halt();
>>>>>>> +    else
>>>>>>> +        safe_halt();
>>>>>>> +
>>>>>>> +out:
>>>>>> So here now interrupts can be either disabled or enabled. Previous
>>>>>> version disabled interrupts here, so are we sure it is safe to
>>>>>> have them
>>>>>> enabled at this point? I do not see any problem yet, will keep
>>>>>> thinking.
>>>>>
>>>>> If we enable interrupt here, then
>>>>>
>>>>>
>>>>>>> +    cpumask_clear_cpu(cpu, &waiting_cpus);
>>>>>
>>>>> and if we start serving lock for an interrupt that came here,
>>>>> cpumask clear and w->lock=null may not happen atomically.
>>>>> if irq spinlock does not take slow path we would have non null value
>>>>> for lock, but with no information in waitingcpu.
>>>>>
>>>>> I am still thinking what would be problem with that.
>>>>>
>>>> Exactly, for kicker waiting_cpus and w->lock updates are
>>>> non atomic anyway.
>>>>
>>>>>>> +    w->lock = NULL;
>>>>>>> +    local_irq_restore(flags);
>>>>>>> +    spin_time_accum_blocked(start);
>>>>>>> +}
>>>>>>> +PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
>>>>>>> +
>>>>>>> +/* Kick vcpu waiting on @lock->head to reach value @ticket */
>>>>>>> +static void kvm_unlock_kick(struct arch_spinlock *lock,
>>>>>>> __ticket_t ticket)
>>>>>>> +{
>>>>>>> +    int cpu;
>>>>>>> +
>>>>>>> +    add_stats(RELEASED_SLOW, 1);
>>>>>>> +    for_each_cpu(cpu, &waiting_cpus) {
>>>>>>> +        const struct kvm_lock_waiting *w =
>>>>>>> &per_cpu(lock_waiting, cpu);
>>>>>>> +        if (ACCESS_ONCE(w->lock) == lock &&
>>>>>>> +            ACCESS_ONCE(w->want) == ticket) {
>>>>>>> +            add_stats(RELEASED_SLOW_KICKED, 1);
>>>>>>> +            kvm_kick_cpu(cpu);
>>>>>> What about using NMI to wake sleepers? I think it was discussed, but
>>>>>> forgot why it was dismissed.
>>>>>
>>>>> I think I have missed that discussion. 'll go back and check. so
>>>>> what is the idea here? we can easily wake up the halted vcpus that
>>>>> have interrupt disabled?
>>>> We can of course. IIRC the objection was that NMI handling path is very
>>>> fragile and handling NMI on each wakeup will be more expensive then
>>>> waking up a guest without injecting an event, but it is still
>>>> interesting
>>>> to see the numbers.
>>>>
>>>
>>> Haam, now I remember, We had tried request based mechanism. (new
>>> request like REQ_UNHALT) and process that. It had worked, but had some
>>> complex hacks in vcpu_enter_guest to avoid guest hang in case of
>>> request cleared.  So had left it there..
>>>
>>> https://lkml.org/lkml/2012/4/30/67
>>>
>>> But I do not remember performance impact though.
>> No, this is something different. Wakeup with NMI does not need KVM
>> changes at
>> all. Instead of kvm_kick_cpu(cpu) in kvm_unlock_kick you send NMI IPI.
>>
>
> True. It was not NMI.
> just to confirm, are you talking about something like this to be tried ?
>
> apic->send_IPI_mask(cpumask_of(cpu), APIC_DM_NMI);

When I started the benchmark, I started seeing
"Dazed and confused, but trying to continue" from the unknown NMI error
handling.
Did I miss anything (because we did not register any NMI handler)? Or
is it that spurious NMIs are trouble, because we could get spurious NMIs
if the next waiter has already acquired the lock?

(Note: I tried sending the APIC_DM_REMRD IPI directly, which worked fine,
but the hypercall way of handling still performed well from the results I
saw.)
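
The direct-IPI variant in that note would presumably be the same one-line
substitution in kvm_unlock_kick() with a different delivery mode; a sketch
(that the host wakes a halted vcpu on APIC_DM_REMRD is taken from this report,
not verified here):

-			kvm_kick_cpu(cpu);
+			apic->send_IPI_mask(cpumask_of(cpu), APIC_DM_REMRD);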





Raghavendra K T July 25, 2013, 9:38 a.m. UTC | #9
On 07/25/2013 02:45 PM, Gleb Natapov wrote:
> On Thu, Jul 25, 2013 at 02:47:37PM +0530, Raghavendra K T wrote:
>> On 07/24/2013 06:06 PM, Raghavendra K T wrote:
>>> On 07/24/2013 05:36 PM, Gleb Natapov wrote:
>>>> On Wed, Jul 24, 2013 at 05:30:20PM +0530, Raghavendra K T wrote:
>>>>> On 07/24/2013 04:09 PM, Gleb Natapov wrote:
>>>>>> On Wed, Jul 24, 2013 at 03:15:50PM +0530, Raghavendra K T wrote:
>>>>>>> On 07/23/2013 08:37 PM, Gleb Natapov wrote:
>>>>>>>> On Mon, Jul 22, 2013 at 11:50:16AM +0530, Raghavendra K T wrote:
>>>>>>>>> +static void kvm_lock_spinning(struct arch_spinlock *lock,
>>>>>>>>> __ticket_t want)
>>>>>>> [...]
>>>>>>>>> +
>>>>>>>>> +    /*
>>>>>>>>> +     * halt until it's our turn and kicked. Note that we do safe
>>>>>>>>> halt
>>>>>>>>> +     * for irq enabled case to avoid hang when lock info is
>>>>>>>>> overwritten
>>>>>>>>> +     * in irq spinlock slowpath and no spurious interrupt occur
>>>>>>>>> to save us.
>>>>>>>>> +     */
>>>>>>>>> +    if (arch_irqs_disabled_flags(flags))
>>>>>>>>> +        halt();
>>>>>>>>> +    else
>>>>>>>>> +        safe_halt();
>>>>>>>>> +
>>>>>>>>> +out:
>>>>>>>> So here now interrupts can be either disabled or enabled. Previous
>>>>>>>> version disabled interrupts here, so are we sure it is safe to
>>>>>>>> have them
>>>>>>>> enabled at this point? I do not see any problem yet, will keep
>>>>>>>> thinking.
>>>>>>>
>>>>>>> If we enable interrupt here, then
>>>>>>>
>>>>>>>
>>>>>>>>> +    cpumask_clear_cpu(cpu, &waiting_cpus);
>>>>>>>
>>>>>>> and if we start serving lock for an interrupt that came here,
>>>>>>> cpumask clear and w->lock=null may not happen atomically.
>>>>>>> if irq spinlock does not take slow path we would have non null value
>>>>>>> for lock, but with no information in waitingcpu.
>>>>>>>
>>>>>>> I am still thinking what would be problem with that.
>>>>>>>
>>>>>> Exactly, for kicker waiting_cpus and w->lock updates are
>>>>>> non atomic anyway.
>>>>>>
>>>>>>>>> +    w->lock = NULL;
>>>>>>>>> +    local_irq_restore(flags);
>>>>>>>>> +    spin_time_accum_blocked(start);
>>>>>>>>> +}
>>>>>>>>> +PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
>>>>>>>>> +
>>>>>>>>> +/* Kick vcpu waiting on @lock->head to reach value @ticket */
>>>>>>>>> +static void kvm_unlock_kick(struct arch_spinlock *lock,
>>>>>>>>> __ticket_t ticket)
>>>>>>>>> +{
>>>>>>>>> +    int cpu;
>>>>>>>>> +
>>>>>>>>> +    add_stats(RELEASED_SLOW, 1);
>>>>>>>>> +    for_each_cpu(cpu, &waiting_cpus) {
>>>>>>>>> +        const struct kvm_lock_waiting *w =
>>>>>>>>> &per_cpu(lock_waiting, cpu);
>>>>>>>>> +        if (ACCESS_ONCE(w->lock) == lock &&
>>>>>>>>> +            ACCESS_ONCE(w->want) == ticket) {
>>>>>>>>> +            add_stats(RELEASED_SLOW_KICKED, 1);
>>>>>>>>> +            kvm_kick_cpu(cpu);
>>>>>>>> What about using NMI to wake sleepers? I think it was discussed, but
>>>>>>>> forgot why it was dismissed.
>>>>>>>
>>>>>>> I think I have missed that discussion. 'll go back and check. so
>>>>>>> what is the idea here? we can easily wake up the halted vcpus that
>>>>>>> have interrupt disabled?
>>>>>> We can of course. IIRC the objection was that NMI handling path is very
>>>>>> fragile and handling NMI on each wakeup will be more expensive then
>>>>>> waking up a guest without injecting an event, but it is still
>>>>>> interesting
>>>>>> to see the numbers.
>>>>>>
>>>>>
>>>>> Haam, now I remember, We had tried request based mechanism. (new
>>>>> request like REQ_UNHALT) and process that. It had worked, but had some
>>>>> complex hacks in vcpu_enter_guest to avoid guest hang in case of
>>>>> request cleared.  So had left it there..
>>>>>
>>>>> https://lkml.org/lkml/2012/4/30/67
>>>>>
>>>>> But I do not remember performance impact though.
>>>> No, this is something different. Wakeup with NMI does not need KVM
>>>> changes at
>>>> all. Instead of kvm_kick_cpu(cpu) in kvm_unlock_kick you send NMI IPI.
>>>>
>>>
>>> True. It was not NMI.
>>> just to confirm, are you talking about something like this to be tried ?
>>>
>>> apic->send_IPI_mask(cpumask_of(cpu), APIC_DM_NMI);
>>
>> When I started benchmark, I started seeing
>> "Dazed and confused, but trying to continue" from unknown nmi error
>> handling.
>> Did I miss anything (because we did not register any NMI handler)? or
>> is it that spurious NMIs are trouble because we could get spurious NMIs
>> if next waiter already acquired the lock.
> There is a default NMI handler that tries to detect the reason why NMI
> happened (which is no so easy on x86) and prints this message if it
> fails. You need to add logic to detect spinlock slow path there. Check
> bit in waiting_cpus for instance.

aha.. Okay. will check that.

>
>>
>> (note: I tried sending APIC_DM_REMRD IPI directly, which worked fine
>> but hypercall way of handling still performed well from the results I
>> saw).
> You mean better? This is strange. Have you ran guest with x2apic?
>

I had the same doubt, so I ran the full dbench benchmark.
Here is what I saw now: 1x was neck and neck (0.9% for hypercall vs
0.7% for IPI, which should boil down to no difference considering the noise
factors), but otherwise, by sending the IPI I see a few percent gain in
the overcommit cases.




Raghavendra K T July 30, 2013, 4:43 p.m. UTC | #10
On 07/25/2013 03:08 PM, Raghavendra K T wrote:
> On 07/25/2013 02:45 PM, Gleb Natapov wrote:
>> On Thu, Jul 25, 2013 at 02:47:37PM +0530, Raghavendra K T wrote:
>>> On 07/24/2013 06:06 PM, Raghavendra K T wrote:
>>>> On 07/24/2013 05:36 PM, Gleb Natapov wrote:
>>>>> On Wed, Jul 24, 2013 at 05:30:20PM +0530, Raghavendra K T wrote:
>>>>>> On 07/24/2013 04:09 PM, Gleb Natapov wrote:
>>>>>>> On Wed, Jul 24, 2013 at 03:15:50PM +0530, Raghavendra K T wrote:
>>>>>>>> On 07/23/2013 08:37 PM, Gleb Natapov wrote:
>>>>>>>>> On Mon, Jul 22, 2013 at 11:50:16AM +0530, Raghavendra K T wrote:
>>>>>>>>>> +static void kvm_lock_spinning(struct arch_spinlock *lock,
>>>>>>>>>> __ticket_t want)
>>>>>>>> [...]
>>>>>>>>>> +
>>>>>>>>>> +    /*
>>>>>>>>>> +     * halt until it's our turn and kicked. Note that we do safe
>>>>>>>>>> halt
>>>>>>>>>> +     * for irq enabled case to avoid hang when lock info is
>>>>>>>>>> overwritten
>>>>>>>>>> +     * in irq spinlock slowpath and no spurious interrupt occur
>>>>>>>>>> to save us.
>>>>>>>>>> +     */
>>>>>>>>>> +    if (arch_irqs_disabled_flags(flags))
>>>>>>>>>> +        halt();
>>>>>>>>>> +    else
>>>>>>>>>> +        safe_halt();
>>>>>>>>>> +
>>>>>>>>>> +out:
>>>>>>>>> So here now interrupts can be either disabled or enabled. Previous
>>>>>>>>> version disabled interrupts here, so are we sure it is safe to
>>>>>>>>> have them
>>>>>>>>> enabled at this point? I do not see any problem yet, will keep
>>>>>>>>> thinking.
>>>>>>>>
>>>>>>>> If we enable interrupt here, then
>>>>>>>>
>>>>>>>>
>>>>>>>>>> +    cpumask_clear_cpu(cpu, &waiting_cpus);
>>>>>>>>
>>>>>>>> and if we start serving lock for an interrupt that came here,
>>>>>>>> cpumask clear and w->lock=null may not happen atomically.
>>>>>>>> if irq spinlock does not take slow path we would have non null
>>>>>>>> value
>>>>>>>> for lock, but with no information in waitingcpu.
>>>>>>>>
>>>>>>>> I am still thinking what would be problem with that.
>>>>>>>>
>>>>>>> Exactly, for kicker waiting_cpus and w->lock updates are
>>>>>>> non atomic anyway.
>>>>>>>
>>>>>>>>>> +    w->lock = NULL;
>>>>>>>>>> +    local_irq_restore(flags);
>>>>>>>>>> +    spin_time_accum_blocked(start);
>>>>>>>>>> +}
>>>>>>>>>> +PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
>>>>>>>>>> +
>>>>>>>>>> +/* Kick vcpu waiting on @lock->head to reach value @ticket */
>>>>>>>>>> +static void kvm_unlock_kick(struct arch_spinlock *lock,
>>>>>>>>>> __ticket_t ticket)
>>>>>>>>>> +{
>>>>>>>>>> +    int cpu;
>>>>>>>>>> +
>>>>>>>>>> +    add_stats(RELEASED_SLOW, 1);
>>>>>>>>>> +    for_each_cpu(cpu, &waiting_cpus) {
>>>>>>>>>> +        const struct kvm_lock_waiting *w =
>>>>>>>>>> &per_cpu(lock_waiting, cpu);
>>>>>>>>>> +        if (ACCESS_ONCE(w->lock) == lock &&
>>>>>>>>>> +            ACCESS_ONCE(w->want) == ticket) {
>>>>>>>>>> +            add_stats(RELEASED_SLOW_KICKED, 1);
>>>>>>>>>> +            kvm_kick_cpu(cpu);
>>>>>>>>> What about using NMI to wake sleepers? I think it was
>>>>>>>>> discussed, but
>>>>>>>>> forgot why it was dismissed.
>>>>>>>>
>>>>>>>> I think I have missed that discussion. 'll go back and check. so
>>>>>>>> what is the idea here? we can easily wake up the halted vcpus that
>>>>>>>> have interrupt disabled?
>>>>>>> We can of course. IIRC the objection was that NMI handling path
>>>>>>> is very
>>>>>>> fragile and handling NMI on each wakeup will be more expensive then
>>>>>>> waking up a guest without injecting an event, but it is still
>>>>>>> interesting
>>>>>>> to see the numbers.
>>>>>>>
>>>>>>
>>>>>> Haam, now I remember, We had tried request based mechanism. (new
>>>>>> request like REQ_UNHALT) and process that. It had worked, but had
>>>>>> some
>>>>>> complex hacks in vcpu_enter_guest to avoid guest hang in case of
>>>>>> request cleared.  So had left it there..
>>>>>>
>>>>>> https://lkml.org/lkml/2012/4/30/67
>>>>>>
>>>>>> But I do not remember performance impact though.
>>>>> No, this is something different. Wakeup with NMI does not need KVM
>>>>> changes at
>>>>> all. Instead of kvm_kick_cpu(cpu) in kvm_unlock_kick you send NMI IPI.
>>>>>
>>>>
>>>> True. It was not NMI.
>>>> just to confirm, are you talking about something like this to be
>>>> tried ?
>>>>
>>>> apic->send_IPI_mask(cpumask_of(cpu), APIC_DM_NMI);
>>>
>>> When I started benchmark, I started seeing
>>> "Dazed and confused, but trying to continue" from unknown nmi error
>>> handling.
>>> Did I miss anything (because we did not register any NMI handler)? or
>>> is it that spurious NMIs are trouble because we could get spurious NMIs
>>> if next waiter already acquired the lock.
>> There is a default NMI handler that tries to detect the reason why NMI
>> happened (which is no so easy on x86) and prints this message if it
>> fails. You need to add logic to detect spinlock slow path there. Check
>> bit in waiting_cpus for instance.
>
> aha.. Okay. will check that.

Yes, thanks, that did the trick.

I did the following in unknown_nmi_error():
if (cpumask_test_cpu(smp_processor_id(), &waiting_cpus))
    return;

But I believe you asked for the NMI method only for experimental purposes, to
check the upper bound, because as I suspected above, for a spurious NMI
(i.e. when the unlocker kicks after the waiter has already got the lock), we
would still hit the unknown NMI error.

I hit spurious NMIs over 1656 times over the entire benchmark run, along with
messages like:
INFO: NMI handler (arch_trigger_all_cpu_backtrace_handler) took too long
to run: 24.886 msecs

(And we cannot simply get away with that check either, because it means we
would bypass the unknown NMI error even in genuine cases.)

Here is the result for my dbench test (32-core machine with a 32-vcpu
guest, HT off):

                  ---------- % improvement --------------
		pvspinlock      pvspin_ipi      pvspin_nmi
dbench_1x	0.9016    	0.7442    	0.7522
dbench_2x	14.7513   	18.0164   	15.9421
dbench_3x	14.7571   	17.0793   	13.3572
dbench_4x	6.3625    	8.7897    	5.3800

So I am seeing roughly a 2-4% improvement with the IPI method.

Gleb,
  does the current series look good to you? [One patch I have resent
with the in_nmi() check.] Or do you think I have to respin the series
with the IPI method etc., or are there any concerns that I have to
address? Please let me know.


PS: [Sorry for the late reply; I was quickly checking whether an unfair lock
with a lock owner is better. It did not prove to be, though, and so far
all the results are favoring this series.]

Gleb Natapov July 31, 2013, 6:24 a.m. UTC | #11
On Tue, Jul 30, 2013 at 10:13:12PM +0530, Raghavendra K T wrote:
> On 07/25/2013 03:08 PM, Raghavendra K T wrote:
> >On 07/25/2013 02:45 PM, Gleb Natapov wrote:
> >>On Thu, Jul 25, 2013 at 02:47:37PM +0530, Raghavendra K T wrote:
> >>>On 07/24/2013 06:06 PM, Raghavendra K T wrote:
> >>>>On 07/24/2013 05:36 PM, Gleb Natapov wrote:
> >>>>>On Wed, Jul 24, 2013 at 05:30:20PM +0530, Raghavendra K T wrote:
> >>>>>>On 07/24/2013 04:09 PM, Gleb Natapov wrote:
> >>>>>>>On Wed, Jul 24, 2013 at 03:15:50PM +0530, Raghavendra K T wrote:
> >>>>>>>>On 07/23/2013 08:37 PM, Gleb Natapov wrote:
> >>>>>>>>>On Mon, Jul 22, 2013 at 11:50:16AM +0530, Raghavendra K T wrote:
> >>>>>>>>>>+static void kvm_lock_spinning(struct arch_spinlock *lock,
> >>>>>>>>>>__ticket_t want)
> >>>>>>>>[...]
> >>>>>>>>>>+
> >>>>>>>>>>+    /*
> >>>>>>>>>>+     * halt until it's our turn and kicked. Note that we do safe
> >>>>>>>>>>halt
> >>>>>>>>>>+     * for irq enabled case to avoid hang when lock info is
> >>>>>>>>>>overwritten
> >>>>>>>>>>+     * in irq spinlock slowpath and no spurious interrupt occur
> >>>>>>>>>>to save us.
> >>>>>>>>>>+     */
> >>>>>>>>>>+    if (arch_irqs_disabled_flags(flags))
> >>>>>>>>>>+        halt();
> >>>>>>>>>>+    else
> >>>>>>>>>>+        safe_halt();
> >>>>>>>>>>+
> >>>>>>>>>>+out:
> >>>>>>>>>So here now interrupts can be either disabled or enabled. Previous
> >>>>>>>>>version disabled interrupts here, so are we sure it is safe to
> >>>>>>>>>have them
> >>>>>>>>>enabled at this point? I do not see any problem yet, will keep
> >>>>>>>>>thinking.
> >>>>>>>>
> >>>>>>>>If we enable interrupt here, then
> >>>>>>>>
> >>>>>>>>
> >>>>>>>>>>+    cpumask_clear_cpu(cpu, &waiting_cpus);
> >>>>>>>>
> >>>>>>>>and if we start serving lock for an interrupt that came here,
> >>>>>>>>cpumask clear and w->lock=null may not happen atomically.
> >>>>>>>>if irq spinlock does not take slow path we would have non null
> >>>>>>>>value
> >>>>>>>>for lock, but with no information in waitingcpu.
> >>>>>>>>
> >>>>>>>>I am still thinking what would be problem with that.
> >>>>>>>>
> >>>>>>>Exactly, for kicker waiting_cpus and w->lock updates are
> >>>>>>>non atomic anyway.
> >>>>>>>
> >>>>>>>>>>+    w->lock = NULL;
> >>>>>>>>>>+    local_irq_restore(flags);
> >>>>>>>>>>+    spin_time_accum_blocked(start);
> >>>>>>>>>>+}
> >>>>>>>>>>+PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
> >>>>>>>>>>+
> >>>>>>>>>>+/* Kick vcpu waiting on @lock->head to reach value @ticket */
> >>>>>>>>>>+static void kvm_unlock_kick(struct arch_spinlock *lock,
> >>>>>>>>>>__ticket_t ticket)
> >>>>>>>>>>+{
> >>>>>>>>>>+    int cpu;
> >>>>>>>>>>+
> >>>>>>>>>>+    add_stats(RELEASED_SLOW, 1);
> >>>>>>>>>>+    for_each_cpu(cpu, &waiting_cpus) {
> >>>>>>>>>>+        const struct kvm_lock_waiting *w =
> >>>>>>>>>>&per_cpu(lock_waiting, cpu);
> >>>>>>>>>>+        if (ACCESS_ONCE(w->lock) == lock &&
> >>>>>>>>>>+            ACCESS_ONCE(w->want) == ticket) {
> >>>>>>>>>>+            add_stats(RELEASED_SLOW_KICKED, 1);
> >>>>>>>>>>+            kvm_kick_cpu(cpu);
> >>>>>>>>>What about using NMI to wake sleepers? I think it was
> >>>>>>>>>discussed, but
> >>>>>>>>>forgot why it was dismissed.
> >>>>>>>>
> >>>>>>>>I think I have missed that discussion. 'll go back and check. so
> >>>>>>>>what is the idea here? we can easily wake up the halted vcpus that
> >>>>>>>>have interrupt disabled?
> >>>>>>>We can of course. IIRC the objection was that NMI handling path
> >>>>>>>is very
> >>>>>>>fragile and handling NMI on each wakeup will be more expensive then
> >>>>>>>waking up a guest without injecting an event, but it is still
> >>>>>>>interesting
> >>>>>>>to see the numbers.
> >>>>>>>
> >>>>>>
> >>>>>>Haam, now I remember, We had tried request based mechanism. (new
> >>>>>>request like REQ_UNHALT) and process that. It had worked, but had
> >>>>>>some
> >>>>>>complex hacks in vcpu_enter_guest to avoid guest hang in case of
> >>>>>>request cleared.  So had left it there..
> >>>>>>
> >>>>>>https://lkml.org/lkml/2012/4/30/67
> >>>>>>
> >>>>>>But I do not remember performance impact though.
> >>>>>No, this is something different. Wakeup with NMI does not need KVM
> >>>>>changes at
> >>>>>all. Instead of kvm_kick_cpu(cpu) in kvm_unlock_kick you send NMI IPI.
> >>>>>
> >>>>
> >>>>True. It was not NMI.
> >>>>just to confirm, are you talking about something like this to be
> >>>>tried ?
> >>>>
> >>>>apic->send_IPI_mask(cpumask_of(cpu), APIC_DM_NMI);
> >>>
> >>>When I started benchmark, I started seeing
> >>>"Dazed and confused, but trying to continue" from unknown nmi error
> >>>handling.
> >>>Did I miss anything (because we did not register any NMI handler)? or
> >>>is it that spurious NMIs are trouble because we could get spurious NMIs
> >>>if next waiter already acquired the lock.
> >>There is a default NMI handler that tries to detect the reason why NMI
> >>happened (which is no so easy on x86) and prints this message if it
> >>fails. You need to add logic to detect spinlock slow path there. Check
> >>bit in waiting_cpus for instance.
> >
> >aha.. Okay. will check that.
> 
> yes. Thanks.. that did the trick.
> 
> I did like below in unknown_nmi_error():
> if (cpumask_test_cpu(smp_processor_id(), &waiting_cpus))
>    return;
> 
> But I believe you asked NMI method only for experimental purpose to
> check the upperbound. because as I doubted above, for spurious NMI
> (i.e. when unlocker kicks when waiter already got the lock), we would
> still hit unknown NMI error.
> 
> I had hit spurious NMI over 1656 times over entire benchmark run.
> along with
> INFO: NMI handler (arch_trigger_all_cpu_backtrace_handler) took too
> long to run: 24.886 msecs etc...
> 
I wonder why this happens.

> (and we cannot get away with that too because it means we bypass the
> unknown NMI error even in genuine cases too)
> 
> Here was the result for the my dbench test( 32 core  machine with 32
> vcpu guest HT off)
> 
>                  ---------- % improvement --------------
> 		pvspinlock      pvspin_ipi      pvpsin_nmi
> dbench_1x	0.9016    	0.7442    	0.7522
> dbench_2x	14.7513   	18.0164   	15.9421
> dbench_3x	14.7571   	17.0793   	13.3572
> dbench_4x	6.3625    	8.7897    	5.3800
> 
> So I am seeing over 2-4% improvement with IPI method.
> 
Yeah, this was expected.

> Gleb,
>  do you think the current series looks good to you? [one patch I
> have resent with in_nmi() check] or do you think I have to respin the
> series with IPI method etc. or is there any concerns that I have to
> address. Please let me know..
> 
The current code looks fine to me.

--
			Gleb.
Raghavendra K T Aug. 1, 2013, 7:38 a.m. UTC | #12
On 07/31/2013 11:54 AM, Gleb Natapov wrote:
> On Tue, Jul 30, 2013 at 10:13:12PM +0530, Raghavendra K T wrote:
>> On 07/25/2013 03:08 PM, Raghavendra K T wrote:
>>> On 07/25/2013 02:45 PM, Gleb Natapov wrote:
>>>> On Thu, Jul 25, 2013 at 02:47:37PM +0530, Raghavendra K T wrote:
>>>>> On 07/24/2013 06:06 PM, Raghavendra K T wrote:
>>>>>> On 07/24/2013 05:36 PM, Gleb Natapov wrote:
>>>>>>> On Wed, Jul 24, 2013 at 05:30:20PM +0530, Raghavendra K T wrote:
>>>>>>>> On 07/24/2013 04:09 PM, Gleb Natapov wrote:
>>>>>>>>> On Wed, Jul 24, 2013 at 03:15:50PM +0530, Raghavendra K T wrote:
>>>>>>>>>> On 07/23/2013 08:37 PM, Gleb Natapov wrote:
>>>>>>>>>>> On Mon, Jul 22, 2013 at 11:50:16AM +0530, Raghavendra K T wrote:
>>>>>>>>>>>> +static void kvm_lock_spinning(struct arch_spinlock *lock,
>>>>>>>>>>>> __ticket_t want)
>>>>>>>>>> [...]
>>>>>>>>>>>> +
>>>>>>>>>>>> +    /*
>>>>>>>>>>>> +     * halt until it's our turn and kicked. Note that we do safe
>>>>>>>>>>>> halt
>>>>>>>>>>>> +     * for irq enabled case to avoid hang when lock info is
>>>>>>>>>>>> overwritten
>>>>>>>>>>>> +     * in irq spinlock slowpath and no spurious interrupt occur
>>>>>>>>>>>> to save us.
>>>>>>>>>>>> +     */
>>>>>>>>>>>> +    if (arch_irqs_disabled_flags(flags))
>>>>>>>>>>>> +        halt();
>>>>>>>>>>>> +    else
>>>>>>>>>>>> +        safe_halt();
>>>>>>>>>>>> +
>>>>>>>>>>>> +out:
>>>>>>>>>>> So here now interrupts can be either disabled or enabled. Previous
>>>>>>>>>>> version disabled interrupts here, so are we sure it is safe to
>>>>>>>>>>> have them
>>>>>>>>>>> enabled at this point? I do not see any problem yet, will keep
>>>>>>>>>>> thinking.
>>>>>>>>>>
>>>>>>>>>> If we enable interrupt here, then
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>> +    cpumask_clear_cpu(cpu, &waiting_cpus);
>>>>>>>>>>
>>>>>>>>>> and if we start serving lock for an interrupt that came here,
>>>>>>>>>> cpumask clear and w->lock=null may not happen atomically.
>>>>>>>>>> if irq spinlock does not take slow path we would have non null
>>>>>>>>>> value
>>>>>>>>>> for lock, but with no information in waitingcpu.
>>>>>>>>>>
>>>>>>>>>> I am still thinking what would be problem with that.
>>>>>>>>>>
>>>>>>>>> Exactly, for kicker waiting_cpus and w->lock updates are
>>>>>>>>> non atomic anyway.
>>>>>>>>>
>>>>>>>>>>>> +    w->lock = NULL;
>>>>>>>>>>>> +    local_irq_restore(flags);
>>>>>>>>>>>> +    spin_time_accum_blocked(start);
>>>>>>>>>>>> +}
>>>>>>>>>>>> +PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
>>>>>>>>>>>> +
>>>>>>>>>>>> +/* Kick vcpu waiting on @lock->head to reach value @ticket */
>>>>>>>>>>>> +static void kvm_unlock_kick(struct arch_spinlock *lock,
>>>>>>>>>>>> __ticket_t ticket)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +    int cpu;
>>>>>>>>>>>> +
>>>>>>>>>>>> +    add_stats(RELEASED_SLOW, 1);
>>>>>>>>>>>> +    for_each_cpu(cpu, &waiting_cpus) {
>>>>>>>>>>>> +        const struct kvm_lock_waiting *w =
>>>>>>>>>>>> &per_cpu(lock_waiting, cpu);
>>>>>>>>>>>> +        if (ACCESS_ONCE(w->lock) == lock &&
>>>>>>>>>>>> +            ACCESS_ONCE(w->want) == ticket) {
>>>>>>>>>>>> +            add_stats(RELEASED_SLOW_KICKED, 1);
>>>>>>>>>>>> +            kvm_kick_cpu(cpu);
>>>>>>>>>>> What about using NMI to wake sleepers? I think it was
>>>>>>>>>>> discussed, but
>>>>>>>>>>> forgot why it was dismissed.
>>>>>>>>>>
>>>>>>>>>> I think I have missed that discussion. 'll go back and check. so
>>>>>>>>>> what is the idea here? we can easily wake up the halted vcpus that
>>>>>>>>>> have interrupt disabled?
>>>>>>>>> We can of course. IIRC the objection was that NMI handling path
>>>>>>>>> is very
>>>>>>>>> fragile and handling NMI on each wakeup will be more expensive then
>>>>>>>>> waking up a guest without injecting an event, but it is still
>>>>>>>>> interesting
>>>>>>>>> to see the numbers.
>>>>>>>>>
>>>>>>>>
>>>>>>>> Haam, now I remember, We had tried request based mechanism. (new
>>>>>>>> request like REQ_UNHALT) and process that. It had worked, but had
>>>>>>>> some
>>>>>>>> complex hacks in vcpu_enter_guest to avoid guest hang in case of
>>>>>>>> request cleared.  So had left it there..
>>>>>>>>
>>>>>>>> https://lkml.org/lkml/2012/4/30/67
>>>>>>>>
>>>>>>>> But I do not remember performance impact though.
>>>>>>> No, this is something different. Wakeup with NMI does not need KVM
>>>>>>> changes at
>>>>>>> all. Instead of kvm_kick_cpu(cpu) in kvm_unlock_kick you send NMI IPI.
>>>>>>>
>>>>>>
>>>>>> True. It was not NMI.
>>>>>> just to confirm, are you talking about something like this to be
>>>>>> tried ?
>>>>>>
>>>>>> apic->send_IPI_mask(cpumask_of(cpu), APIC_DM_NMI);
>>>>>
>>>>> When I started benchmark, I started seeing
>>>>> "Dazed and confused, but trying to continue" from unknown nmi error
>>>>> handling.
>>>>> Did I miss anything (because we did not register any NMI handler)? or
>>>>> is it that spurious NMIs are trouble because we could get spurious NMIs
>>>>> if next waiter already acquired the lock.
>>>> There is a default NMI handler that tries to detect the reason why NMI
>>>> happened (which is no so easy on x86) and prints this message if it
>>>> fails. You need to add logic to detect spinlock slow path there. Check
>>>> bit in waiting_cpus for instance.
>>>
>>> aha.. Okay. will check that.
>>
>> yes. Thanks.. that did the trick.
>>
>> I did like below in unknown_nmi_error():
>> if (cpumask_test_cpu(smp_processor_id(), &waiting_cpus))
>>     return;
>>
>> But I believe you asked NMI method only for experimental purpose to
>> check the upperbound. because as I doubted above, for spurious NMI
>> (i.e. when unlocker kicks when waiter already got the lock), we would
>> still hit unknown NMI error.
>>
>> I had hit spurious NMI over 1656 times over entire benchmark run.
>> along with
>> INFO: NMI handler (arch_trigger_all_cpu_backtrace_handler) took too
>> long to run: 24.886 msecs etc...
>>
> I wonder why this happens.
>
>> (and we cannot get away with that too because it means we bypass the
>> unknown NMI error even in genuine cases too)
>>
>> Here was the result for the my dbench test( 32 core  machine with 32
>> vcpu guest HT off)
>>
>>                   ---------- % improvement --------------
>> 		pvspinlock      pvspin_ipi      pvpsin_nmi
>> dbench_1x	0.9016    	0.7442    	0.7522
>> dbench_2x	14.7513   	18.0164   	15.9421
>> dbench_3x	14.7571   	17.0793   	13.3572
>> dbench_4x	6.3625    	8.7897    	5.3800
>>
>> So I am seeing over 2-4% improvement with IPI method.
>>
> Yeah, this was expected.
>
>> Gleb,
>>   do you think the current series looks good to you? [one patch I
>> have resent with in_nmi() check] or do you think I have to respin the
>> series with IPI method etc. or is there any concerns that I have to
>> address. Please let me know..
>>
> The current code looks fine to me.

Gleb,

Shall I consider this as an ack for the kvm part?

Ingo,

Do you have any concerns regarding this series? Please let me know if this
now looks good to you.

Gleb Natapov Aug. 1, 2013, 7:45 a.m. UTC | #13
On Thu, Aug 01, 2013 at 01:08:47PM +0530, Raghavendra K T wrote:
> On 07/31/2013 11:54 AM, Gleb Natapov wrote:
> >On Tue, Jul 30, 2013 at 10:13:12PM +0530, Raghavendra K T wrote:
> >>[...]
> >>Gleb,
> >>  do you think the current series looks good to you? [one patch I
> >>have resent with the in_nmi() check] or do you think I have to respin the
> >>series with the IPI method etc., or are there any concerns that I have to
> >>address? Please let me know.
> >>
> >The current code looks fine to me.
> 
> Gleb,
> 
> Shall I consider this as an ack for kvm part?
> 
For everything except 18/18. For that I still want to see numbers. But
18/18 is pretty independent from the rest of the series, so it should
not stop the rest from going in.

--
			Gleb.
Raghavendra K T Aug. 1, 2013, 9:04 a.m. UTC | #14
On 08/01/2013 01:15 PM, Gleb Natapov wrote:
> On Thu, Aug 01, 2013 at 01:08:47PM +0530, Raghavendra K T wrote:
>> [...]
>> Gleb,
>>
>> Shall I consider this as an ack for kvm part?
>>
> For everything except 18/18. For that I still want to see numbers. But
> 18/18 is pretty independent from the rest of the series, so it should
> not stop the rest from going in.

Yes, agreed.
I am going to evaluate patch 18 separately and come back with results for
that. Now we can consider only patches 1-17.



Raghavendra K T Aug. 2, 2013, 3:22 a.m. UTC | #15
On 08/01/2013 02:34 PM, Raghavendra K T wrote:
> On 08/01/2013 01:15 PM, Gleb Natapov wrote:
>>> Shall I consider this as an ack for kvm part?
>>>
>> For everything except 18/18. For that I still want to see numbers. But
>> 18/18 is pretty independent from the rest of the series, so it should
>> not stop the rest from going in.
>
> Yes. agreed.
> I am going to evaluate patch 18 separately and come with results for
> that. Now we can consider only 1-17 patches.
>

Gleb,

32-core machine with HT off, 32-vcpu guests.
base = 3.11-rc + patches 1-17 (pvspinlock_v11)
patched = base + patch 18

+-----------+-----------+-----------+------------+-----------+
                   dbench  (Throughput in MB/sec higher is better)
+-----------+-----------+-----------+------------+-----------+
       base      stdev       patched    stdev       %improvement
+-----------+-----------+-----------+------------+-----------+
1x 14584.3800   146.9074   14705.1000   163.1060     0.82773
2x  1713.7300    32.8750    1717.3200    45.5979     0.20948
3x   967.8212    42.0257     971.8855    18.8532     0.41994
4x   685.2764    25.7150     694.5881     8.3907     1.35882
+-----------+-----------+-----------+------------+-----------+

I saw -0.78% to 0.84% changes in ebizzy and a 1-2% improvement in
hackbench.

Ingo Molnar Aug. 2, 2013, 9:23 a.m. UTC | #16
* Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com> wrote:

> On 08/01/2013 02:34 PM, Raghavendra K T wrote:
> >On 08/01/2013 01:15 PM, Gleb Natapov wrote:
> >>>Shall I consider this as an ack for kvm part?
> >>>
> >>For everything except 18/18. For that I still want to see numbers. But
> >>18/18 is pretty independent from the rest of the series, so it should
> >>not stop the rest from going in.
> >
> >Yes. agreed.
> >I am going to evaluate patch 18 separately and come with results for
> >that. Now we can consider only 1-17 patches.
> >
> 
> Gleb,
> 
> 32 core machine with HT off 32 vcpu guests.
> base = 3.11-rc + patch 1 -17 pvspinlock_v11
> patched = base + patch 18
> 
> +-----------+-----------+-----------+------------+-----------+
>                   dbench  (Throughput in MB/sec higher is better)
> +-----------+-----------+-----------+------------+-----------+
>       base      stdev       patched    stdev       %improvement
> +-----------+-----------+-----------+------------+-----------+
> 1x 14584.3800   146.9074   14705.1000   163.1060     0.82773
> 2x  1713.7300    32.8750    1717.3200    45.5979     0.20948
> 3x   967.8212    42.0257     971.8855    18.8532     0.41994
> 4x   685.2764    25.7150     694.5881     8.3907     1.35882
> +-----------+-----------+-----------+------------+-----------+

Please list stddev in percentage as well ...

a blind stab gave me these figures:

>       base      stdev       patched    stdev       %improvement
> 3x   967.8212    4.3%     971.8855      1.8%     0.4

That makes the improvement an order of magnitude smaller than the noise of 
the measurement ... i.e. totally inconclusive.

Also please cut the excessive decimal points: with 2-4% noise what point 
is there in 5 decimal point results??
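
To make that concrete, here is a small, self-contained sketch (not part of the
series) of the reporting being asked for: stddev expressed as a percentage of
the mean, next to the improvement in percent. Fed with the dbench 3x row from
the table above it prints roughly 4.3% and 1.9% relative stddev against a 0.4%
delta, i.e. the noise dominates:

#include <stdio.h>

int main(void)
{
	/* mean and stddev pairs taken from the dbench 3x row quoted above */
	double base = 967.8212, base_sd = 42.0257;
	double patched = 971.8855, patched_sd = 18.8532;

	printf("base    stddev: %.1f%%\n", 100.0 * base_sd / base);
	printf("patched stddev: %.1f%%\n", 100.0 * patched_sd / patched);
	printf("improvement:    %.1f%%\n", 100.0 * (patched - base) / base);
	return 0;
}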

Thanks,

	Ingo
Ingo Molnar Aug. 2, 2013, 9:25 a.m. UTC | #17
* Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com> wrote:

> On 07/31/2013 11:54 AM, Gleb Natapov wrote:
> >On Tue, Jul 30, 2013 at 10:13:12PM +0530, Raghavendra K T wrote:
> >>On 07/25/2013 03:08 PM, Raghavendra K T wrote:
> >>>On 07/25/2013 02:45 PM, Gleb Natapov wrote:
> >>>>On Thu, Jul 25, 2013 at 02:47:37PM +0530, Raghavendra K T wrote:
> >>>>>On 07/24/2013 06:06 PM, Raghavendra K T wrote:
> >>>>>>On 07/24/2013 05:36 PM, Gleb Natapov wrote:
> >>>>>>>On Wed, Jul 24, 2013 at 05:30:20PM +0530, Raghavendra K T wrote:
> >>>>>>>>On 07/24/2013 04:09 PM, Gleb Natapov wrote:
> >>>>>>>>>On Wed, Jul 24, 2013 at 03:15:50PM +0530, Raghavendra K T wrote:
> >>>>>>>>>>On 07/23/2013 08:37 PM, Gleb Natapov wrote:
> >>>>>>>>>>>On Mon, Jul 22, 2013 at 11:50:16AM +0530, Raghavendra K T wrote:
> >>>>>>>>>>>>+static void kvm_lock_spinning(struct arch_spinlock *lock,
> >>>>>>>>>>>>__ticket_t want)
> >>>>>>>>>>[...]

[ a few hundred lines of unnecessary quotation snipped. ]

> Ingo,
> 
> Do you have any concerns reg this series? please let me know if this 
> looks good now to you.

I'm inclined to NAK it for excessive quotation - who knows how many people 
left the discussion in disgust? Was it done to drive away as many 
reviewers as possible?

Anyway, see my other reply, the measurement results seem hard to interpret 
and inconclusive at the moment.

Thanks,

	Ingo
Raghavendra K T Aug. 2, 2013, 9:44 a.m. UTC | #18
On 08/02/2013 02:53 PM, Ingo Molnar wrote:
>
> * Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com> wrote:
>
>> On 08/01/2013 02:34 PM, Raghavendra K T wrote:
>>> On 08/01/2013 01:15 PM, Gleb Natapov wrote:
>>>>> Shall I consider this as an ack for kvm part?
>>>>>
>>>> For everything except 18/18. For that I still want to see numbers. But
>>>> 18/18 is pretty independent from the rest of the series, so it should
>>>> not stop the rest from going in.
>>>
>>> Yes. agreed.
>>> I am going to evaluate patch 18 separately and come with results for
>>> that. Now we can consider only 1-17 patches.
>>>
>>
>> Gleb,
>>
>> 32 core machine with HT off 32 vcpu guests.
>> base = 3.11-rc + patch 1 -17 pvspinlock_v11
>> patched = base + patch 18
>>
>> +-----------+-----------+-----------+------------+-----------+
>>                    dbench  (Throughput in MB/sec higher is better)
>> +-----------+-----------+-----------+------------+-----------+
>>        base      stdev       patched    stdev       %improvement
>> +-----------+-----------+-----------+------------+-----------+
>> 1x 14584.3800   146.9074   14705.1000   163.1060     0.82773
>> 2x  1713.7300    32.8750    1717.3200    45.5979     0.20948
>> 3x   967.8212    42.0257     971.8855    18.8532     0.41994
>> 4x   685.2764    25.7150     694.5881     8.3907     1.35882
>> +-----------+-----------+-----------+------------+-----------+
>
> Please list stddev in percentage as well ...

Sure, will do this from next time.

>
> a blind stab gave me these figures:
>
>>        base      stdev       patched    stdev       %improvement
>> 3x   967.8212    4.3%     971.8855      1.8%     0.4
>
> That makes the improvement an order of magnitude smaller than the noise of
> the measurement ... i.e. totally inconclusive.

Okay, agreed.

I had always seen a positive effect from this patch, since it uses the ple
handler heuristics and thus avoids the directed yield to vcpus in the
halt handler. But the current results clearly do not show anything in
favor of that. :(

So please drop patch 18 for now.

>
> Also please cut the excessive decimal points: with 2-4% noise what point
> is there in 5 decimal point results??

Yes.

Ingo, do you think the patch series (patches 1 to 17) is now in good
shape? Please let me know if you have any concerns to be
addressed.



Gleb Natapov Aug. 2, 2013, 9:54 a.m. UTC | #19
On Fri, Aug 02, 2013 at 11:25:39AM +0200, Ingo Molnar wrote:
> > Ingo,
> > 
> > Do you have any concerns reg this series? please let me know if this 
> > looks good now to you.
> 
> I'm inclined to NAK it for excessive quotation - who knows how many people 
> left the discussion in disgust? Was it done to drive away as many 
> reviewers as possible?
> 
> Anyway, see my other reply, the measurement results seem hard to interpret 
> and inconclusive at the moment.
> 
That result was only for patch 18 of the series, not pvspinlock in
general.

--
			Gleb.
Raghavendra K T Aug. 2, 2013, 10:57 a.m. UTC | #20
On 08/02/2013 03:24 PM, Gleb Natapov wrote:
> On Fri, Aug 02, 2013 at 11:25:39AM +0200, Ingo Molnar wrote:
>>> Ingo,
>>>
>>> Do you have any concerns reg this series? please let me know if this
>>> looks good now to you.
>>
>> I'm inclined to NAK it for excessive quotation - who knows how many people
>> left the discussion in disgust? Was it done to drive away as many
>> reviewers as possible?

Ingo, Peter,
Sorry for the confusion caused by the nesting. I should have trimmed
it, as Peter also pointed out in the other thread.
I will ensure that in future mails.


>> Anyway, see my other reply, the measurement results seem hard to interpret
>> and inconclusive at the moment.

As Gleb already pointed out, patches 1-17 as a whole give a good performance
improvement. It was only patch 18 that Gleb had concerns about.

Ingo Molnar Aug. 5, 2013, 9:46 a.m. UTC | #21
* Gleb Natapov <gleb@redhat.com> wrote:

> On Fri, Aug 02, 2013 at 11:25:39AM +0200, Ingo Molnar wrote:
> > > Ingo,
> > > 
> > > Do you have any concerns reg this series? please let me know if this 
> > > looks good now to you.
> > 
> > I'm inclined to NAK it for excessive quotation - who knows how many 
> > people left the discussion in disgust? Was it done to drive away as 
> > many reviewers as possible?
> > 
> > Anyway, see my other reply, the measurement results seem hard to 
> > interpret and inconclusive at the moment.
>
> That result was only for patch 18 of the series, not pvspinlock in 
> general.

Okay - I've re-read the performance numbers and they are impressive, so no 
objections from me.

The x86 impact seems to be a straightforward API change, with most of the 
changes on the virtualization side. So:

Acked-by: Ingo Molnar <mingo@kernel.org>

I guess you'd want to carry this in the KVM tree or so - maybe in a 
separate branch because it changes Xen as well?

Thanks,

	Ingo
Raghavendra K T Aug. 5, 2013, 10:42 a.m. UTC | #22
>> That result was only for patch 18 of the series, not pvspinlock in
>> general.
>
> Okay - I've re-read the performance numbers and they are impressive, so no
> objections from me.
>
> The x86 impact seems to be a straightforward API change, with most of the
> changes on the virtualization side. So:
>
> Acked-by: Ingo Molnar <mingo@kernel.org>
>
> I guess you'd want to carry this in the KVM tree or so - maybe in a
> separate branch because it changes Xen as well?
>

Thank you, Ingo, for taking another look.

Gleb, Please let me know if you want me to resend the first 17 patches
with acked-bys, i.e. excluding the 18th patch.




Ingo Molnar Aug. 5, 2013, 1:52 p.m. UTC | #23
* Gleb Natapov <gleb@redhat.com> wrote:

> On Mon, Aug 05, 2013 at 11:46:03AM +0200, Ingo Molnar wrote:
> > Acked-by: Ingo Molnar <mingo@kernel.org>
> > 
> > I guess you'd want to carry this in the KVM tree or so - maybe in a 
> > separate branch because it changes Xen as well?
> 
> It changes KVM host and guest side, XEN and common x86 spinlock code. I 
> think it would be best to merge common x86 spinlock bits and guest side 
> KVM/XEN bits through tip tree and host KVM part will go through KVM 
> tree. If this is OK with you, Ingo, and XEN folks Raghavendra can send 
> two separate patch series one for the tip and one for KVM host side.

Sure, that's fine - if the initial series works fine in isolation as well 
(i.e. won't break anything).

Thanks,

	Ingo
Paolo Bonzini Aug. 5, 2013, 2:05 p.m. UTC | #24
> > On Mon, Aug 05, 2013 at 11:46:03AM +0200, Ingo Molnar wrote:
> > > Acked-by: Ingo Molnar <mingo@kernel.org>
> > > 
> > > I guess you'd want to carry this in the KVM tree or so - maybe in a
> > > separate branch because it changes Xen as well?
> > 
> > It changes KVM host and guest side, XEN and common x86 spinlock code. I
> > think it would be best to merge common x86 spinlock bits and guest side
> > KVM/XEN bits through tip tree and host KVM part will go through KVM
> > tree. If this is OK with you, Ingo, and XEN folks Raghavendra can send
> > two separate patch series one for the tip and one for KVM host side.
> 
> Sure, that's fine - if the initial series works fine in isolation as well
> (i.e. won't break anything).

It would be a big problem if it didn't!  Raghavendra, please send the
two separate series as Gleb explained above.

Thanks,

Paolo
Konrad Rzeszutek Wilk Aug. 5, 2013, 3:37 p.m. UTC | #25
On Mon, Aug 05, 2013 at 11:46:03AM +0200, Ingo Molnar wrote:
> 
> * Gleb Natapov <gleb@redhat.com> wrote:
> 
> > On Fri, Aug 02, 2013 at 11:25:39AM +0200, Ingo Molnar wrote:
> > > > Ingo,
> > > > 
> > > > Do you have any concerns reg this series? please let me know if this 
> > > > looks good now to you.
> > > 
> > > I'm inclined to NAK it for excessive quotation - who knows how many 
> > > people left the discussion in disgust? Was it done to drive away as 
> > > many reviewers as possible?
> > > 
> > > Anyway, see my other reply, the measurement results seem hard to 
> > > interpret and inconclusive at the moment.
> >
> > That result was only for patch 18 of the series, not pvspinlock in 
> > general.
> 
> Okay - I've re-read the performance numbers and they are impressive, so no 
> objections from me.
> 
> The x86 impact seems to be a straightforward API change, with most of the 
> changes on the virtualization side. So:
> 
> Acked-by: Ingo Molnar <mingo@kernel.org>
> 
> I guess you'd want to carry this in the KVM tree or so - maybe in a 
> separate branch because it changes Xen as well?

May I suggest an alternate way - perhaps you can put them in a tip/spinlock
tree for v3.12 - since both KVM and Xen maintainers have acked and carefully
reviewed them?

Patch

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 695399f..427afcb 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -118,10 +118,20 @@  void kvm_async_pf_task_wait(u32 token);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
 extern void kvm_disable_steal_time(void);
-#else
-#define kvm_guest_init() do { } while (0)
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void __init kvm_spinlock_init(void);
+#else /* !CONFIG_PARAVIRT_SPINLOCKS */
+static inline void kvm_spinlock_init(void)
+{
+}
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+#else /* CONFIG_KVM_GUEST */
+#define kvm_guest_init() do {} while (0)
 #define kvm_async_pf_task_wait(T) do {} while(0)
 #define kvm_async_pf_task_wake(T) do {} while(0)
+
 static inline u32 kvm_read_and_reset_pf_reason(void)
 {
 	return 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index cd6d9a5..b5aa5f4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -34,6 +34,7 @@ 
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/kprobes.h>
+#include <linux/debugfs.h>
 #include <asm/timer.h>
 #include <asm/cpu.h>
 #include <asm/traps.h>
@@ -419,6 +420,7 @@  static void __init kvm_smp_prepare_boot_cpu(void)
 	WARN_ON(kvm_register_clock("primary cpu clock"));
 	kvm_guest_cpu_init();
 	native_smp_prepare_boot_cpu();
+	kvm_spinlock_init();
 }
 
 static void __cpuinit kvm_guest_cpu_online(void *dummy)
@@ -523,3 +525,260 @@  static __init int activate_jump_labels(void)
 	return 0;
 }
 arch_initcall(activate_jump_labels);
+
+/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
+void kvm_kick_cpu(int cpu)
+{
+	int apicid;
+	unsigned long flags = 0;
+
+	apicid = per_cpu(x86_cpu_to_apicid, cpu);
+	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
+}
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+
+enum kvm_contention_stat {
+	TAKEN_SLOW,
+	TAKEN_SLOW_PICKUP,
+	RELEASED_SLOW,
+	RELEASED_SLOW_KICKED,
+	NR_CONTENTION_STATS
+};
+
+#ifdef CONFIG_KVM_DEBUG_FS
+#define HISTO_BUCKETS	30
+
+static struct kvm_spinlock_stats
+{
+	u32 contention_stats[NR_CONTENTION_STATS];
+	u32 histo_spin_blocked[HISTO_BUCKETS+1];
+	u64 time_blocked;
+} spinlock_stats;
+
+static u8 zero_stats;
+
+static inline void check_zero(void)
+{
+	u8 ret;
+	u8 old;
+
+	old = ACCESS_ONCE(zero_stats);
+	if (unlikely(old)) {
+		ret = cmpxchg(&zero_stats, old, 0);
+		/* This ensures only one fellow resets the stat */
+		if (ret == old)
+			memset(&spinlock_stats, 0, sizeof(spinlock_stats));
+	}
+}
+
+static inline void add_stats(enum kvm_contention_stat var, u32 val)
+{
+	check_zero();
+	spinlock_stats.contention_stats[var] += val;
+}
+
+
+static inline u64 spin_time_start(void)
+{
+	return sched_clock();
+}
+
+static void __spin_time_accum(u64 delta, u32 *array)
+{
+	unsigned index;
+
+	index = ilog2(delta);
+	check_zero();
+
+	if (index < HISTO_BUCKETS)
+		array[index]++;
+	else
+		array[HISTO_BUCKETS]++;
+}
+
+static inline void spin_time_accum_blocked(u64 start)
+{
+	u32 delta;
+
+	delta = sched_clock() - start;
+	__spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
+	spinlock_stats.time_blocked += delta;
+}
+
+static struct dentry *d_spin_debug;
+static struct dentry *d_kvm_debug;
+
+struct dentry *kvm_init_debugfs(void)
+{
+	d_kvm_debug = debugfs_create_dir("kvm", NULL);
+	if (!d_kvm_debug)
+		printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n");
+
+	return d_kvm_debug;
+}
+
+static int __init kvm_spinlock_debugfs(void)
+{
+	struct dentry *d_kvm;
+
+	d_kvm = kvm_init_debugfs();
+	if (d_kvm == NULL)
+		return -ENOMEM;
+
+	d_spin_debug = debugfs_create_dir("spinlocks", d_kvm);
+
+	debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
+
+	debugfs_create_u32("taken_slow", 0444, d_spin_debug,
+		   &spinlock_stats.contention_stats[TAKEN_SLOW]);
+	debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
+		   &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
+
+	debugfs_create_u32("released_slow", 0444, d_spin_debug,
+		   &spinlock_stats.contention_stats[RELEASED_SLOW]);
+	debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
+		   &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
+
+	debugfs_create_u64("time_blocked", 0444, d_spin_debug,
+			   &spinlock_stats.time_blocked);
+
+	debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
+		     spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
+
+	return 0;
+}
+fs_initcall(kvm_spinlock_debugfs);
+#else  /* !CONFIG_KVM_DEBUG_FS */
+static inline void add_stats(enum kvm_contention_stat var, u32 val)
+{
+}
+
+static inline u64 spin_time_start(void)
+{
+	return 0;
+}
+
+static inline void spin_time_accum_blocked(u64 start)
+{
+}
+#endif  /* CONFIG_KVM_DEBUG_FS */
+
+struct kvm_lock_waiting {
+	struct arch_spinlock *lock;
+	__ticket_t want;
+};
+
+/* cpus 'waiting' on a spinlock to become available */
+static cpumask_t waiting_cpus;
+
+/* Track spinlock on which a cpu is waiting */
+static DEFINE_PER_CPU(struct kvm_lock_waiting, lock_waiting);
+
+static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
+{
+	struct kvm_lock_waiting *w;
+	int cpu;
+	u64 start;
+	unsigned long flags;
+
+	w = &__get_cpu_var(lock_waiting);
+	cpu = smp_processor_id();
+	start = spin_time_start();
+
+	/*
+	 * Make sure an interrupt handler can't upset things in a
+	 * partially setup state.
+	 */
+	local_irq_save(flags);
+
+	/*
+	 * The ordering protocol on this is that the "lock" pointer
+	 * may only be set non-NULL if the "want" ticket is correct.
+	 * If we're updating "want", we must first clear "lock".
+	 */
+	w->lock = NULL;
+	smp_wmb();
+	w->want = want;
+	smp_wmb();
+	w->lock = lock;
+
+	add_stats(TAKEN_SLOW, 1);
+
+	/*
+	 * This uses set_bit, which is atomic but we should not rely on its
+	 * reordering guarantees. So barrier is needed after this call.
+	 */
+	cpumask_set_cpu(cpu, &waiting_cpus);
+
+	barrier();
+
+	/*
+	 * Mark entry to slowpath before doing the pickup test to make
+	 * sure we don't deadlock with an unlocker.
+	 */
+	__ticket_enter_slowpath(lock);
+
+	/*
+	 * check again make sure it didn't become free while
+	 * we weren't looking.
+	 */
+	if (ACCESS_ONCE(lock->tickets.head) == want) {
+		add_stats(TAKEN_SLOW_PICKUP, 1);
+		goto out;
+	}
+
+	/*
+	 * halt until it's our turn and kicked. Note that we do safe halt
+	 * for irq enabled case to avoid hang when lock info is overwritten
+	 * in irq spinlock slowpath and no spurious interrupt occur to save us.
+	 */
+	if (arch_irqs_disabled_flags(flags))
+		halt();
+	else
+		safe_halt();
+
+out:
+	cpumask_clear_cpu(cpu, &waiting_cpus);
+	w->lock = NULL;
+	local_irq_restore(flags);
+	spin_time_accum_blocked(start);
+}
+PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
+
+/* Kick vcpu waiting on @lock->head to reach value @ticket */
+static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
+{
+	int cpu;
+
+	add_stats(RELEASED_SLOW, 1);
+	for_each_cpu(cpu, &waiting_cpus) {
+		const struct kvm_lock_waiting *w = &per_cpu(lock_waiting, cpu);
+		if (ACCESS_ONCE(w->lock) == lock &&
+		    ACCESS_ONCE(w->want) == ticket) {
+			add_stats(RELEASED_SLOW_KICKED, 1);
+			kvm_kick_cpu(cpu);
+			break;
+		}
+	}
+}
+
+/*
+ * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
+ */
+void __init kvm_spinlock_init(void)
+{
+	if (!kvm_para_available())
+		return;
+	/* Does host kernel support KVM_FEATURE_PV_UNHALT? */
+	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
+		return;
+
+	printk(KERN_INFO "KVM setup paravirtual spinlock\n");
+
+	static_key_slow_inc(&paravirt_ticketlocks_enabled);
+
+	pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
+	pv_lock_ops.unlock_kick = kvm_unlock_kick;
+}
+#endif	/* CONFIG_PARAVIRT_SPINLOCKS */