[-rt,1/2] KVM: use simple waitqueue for vcpu->wq
diff mbox

Message ID 20150218140320.GY5029@twins.programming.kicks-ass.net
State New, archived
Headers show

Commit Message

Peter Zijlstra Feb. 18, 2015, 2:03 p.m. UTC
On Tue, Feb 17, 2015 at 06:44:19PM +0100, Sebastian Andrzej Siewior wrote:
> * Peter Zijlstra | 2015-01-21 16:07:16 [+0100]:
> 
> >On Tue, Jan 20, 2015 at 01:16:13PM -0500, Steven Rostedt wrote:
> >> I'm actually wondering if we should just nuke the _interruptible()
> >> version of swait. As it should only be all interruptible or all not
> >> interruptible, that the swait_wake() should just do the wake up
> >> regardless. In which case, swait_wake() is good enough. No need to have
> >> different versions where people may think do something special.
> >> 
> >> Peter?
> >
> >Yeah, I think the lastest thing I have sitting here on my disk only has
> >the swake_up() which does TASK_NORMAL, no choice there.
> 
> what is the swait status in terms of mainline? This sounds like it
> beeing worked on.
> I could take the series but then I would drop it again if the mainline
> implementation changes…

Well, 'worked' on might be putting too much on it, its one of the many
many 'spare' time things that never get attention unless people bug me
;-)

The below is my current patch, I've not actually tried it yet, I (think
I) had one patch doing some conversions but I'm having trouble locating
it.

Mostly-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/swait.h |  172 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/swait.c  |  122 +++++++++++++++++++++++++++++++++++
 2 files changed, 294 insertions(+)

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Sebastian Andrzej Siewior Feb. 25, 2015, 9:02 p.m. UTC | #1
* Peter Zijlstra | 2015-02-18 15:03:20 [+0100]:

>On Tue, Feb 17, 2015 at 06:44:19PM +0100, Sebastian Andrzej Siewior wrote:
>> * Peter Zijlstra | 2015-01-21 16:07:16 [+0100]:
>> 
>> >On Tue, Jan 20, 2015 at 01:16:13PM -0500, Steven Rostedt wrote:
>> >> I'm actually wondering if we should just nuke the _interruptible()
>> >> version of swait. As it should only be all interruptible or all not
>> >> interruptible, that the swait_wake() should just do the wake up
>> >> regardless. In which case, swait_wake() is good enough. No need to have
>> >> different versions where people may think do something special.
>> >> 
>> >> Peter?
>> >
>> >Yeah, I think the lastest thing I have sitting here on my disk only has
>> >the swake_up() which does TASK_NORMAL, no choice there.
>> 
>> what is the swait status in terms of mainline? This sounds like it
>> beeing worked on.
>> I could take the series but then I would drop it again if the mainline
>> implementation changes…
>
>Well, 'worked' on might be putting too much on it, its one of the many
>many 'spare' time things that never get attention unless people bug me
>;-)
>
>The below is my current patch, I've not actually tried it yet, I (think
>I) had one patch doing some conversions but I'm having trouble locating
>it.
>
>Mostly-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>---
> include/linux/swait.h |  172 ++++++++++++++++++++++++++++++++++++++++++++++++++
> kernel/sched/swait.c  |  122 +++++++++++++++++++++++++++++++++++
> 2 files changed, 294 insertions(+)
>
>--- /dev/null
>+++ b/include/linux/swait.h
>@@ -0,0 +1,172 @@
>+#ifndef _LINUX_SWAIT_H
>+#define _LINUX_SWAIT_H
>+
>+#include <linux/list.h>
>+#include <linux/stddef.h>
>+#include <linux/spinlock.h>
>+#include <asm/current.h>
>+
>+/*
>+ * Simple wait queues
>+ *
>+ * While these are very similar to the other/complex wait queues (wait.h) the
>+ * most important difference is that the simple waitqueue allows for
>+ * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
>+ * times.
>+ *
>+ * In order to make this so, we had to drop a fair number of features of the
>+ * other waitqueue code; notably:
>+ *
>+ *  - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue;
>+ *    all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
>+ *    sleeper state.
>+ *
>+ *  - the exclusive mode; because this requires preserving the list order
>+ *    and this is hard.
>+ *
>+ *  - custom wake functions; because you cannot give any guarantees about
>+ *    random code.
>+ *
>+ * As a side effect of this; the data structures are slimmer.
>+ *
>+ * One would recommend using this wait queue where possible.
>+ */
>+
>+struct task_struct;
>+
>+struct swait_queue_head {
>+	raw_spinlock_t		lock;
>+	struct list_head	task_list;
>+};
>+
>+struct swait_queue {
>+	struct task_struct	*task;
>+	struct list_head	task_list;

I would prefer something different than task_list here since this is an
item. Scrolling down you tried to use node once so maybe that would be
good here :)

>+};
>+
>+#define __SWAITQUEUE_INITIALIZER(name) {				\
>+	.task		= current,					\
>+	.task_list	= LIST_HEAD_INIT((name).task_list),		\
>+}
>+
>+#define DECLARE_SWAITQUEUE(name)					\
>+	struct swait_queue name = __SWAITQUEUE_INITIALIZER(name)
>+
>+#define __SWAIT_QUEUE_HEAD_INITIALIZER(name) {				\
>+	.lock		= __RAW_SPIN_LOCK_UNLOCKED(name.lock),		\
>+	.task_list	= LIST_HEAD_INIT((name).task_list),		\
>+}
>+
>+#define DECLARE_SWAIT_QUEUE_HEAD(name)					\
>+	struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name)
>+
>+extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
>+				    struct lock_class_key *key);
>+
>+#define init_swait_queue_head(q)				\
>+	do {							\
>+		static struct lock_class_key __key;		\
>+		__init_swait_queue_head((q), #q, &__key);	\
>+	} while (0)
>+
>+#ifdef CONFIG_LOCKDEP
>+# define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)			\
>+	({ init_swait_queue_head(&name); name; })
>+# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)			\
>+	struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)
>+#else
>+# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)			\
>+	DECLARE_SWAIT_QUEUE_HEAD(name)
>+#endif
>+
>+static inline int swait_active(struct swait_queue_head *q)
>+{
>+	return !list_empty(&q->task_list);

In RT there was a smp_mb() which you dropped and I assume you had
reasons for it. I assumed that one can perform list_empty_careful()
without a lock if the items were removed with list_del_init(). But since
nothing in -RT blow up so far I guess this here is legal, too :)

>+}
>+
>+extern void swake_up(struct swait_queue_head *q);
>+extern void swake_up_all(struct swait_queue_head *q);
>+extern void swake_up_locked(struct swait_queue_head *q);
>+
>+extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
>+extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
>+extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
>+
>+extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
>+extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
>+
>+/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
>+#define ___swait_event(wq, condition, state, ret, cmd)			\
>+({									\
>+	struct swait_queue __wait;					\
>+	long __ret = ret;						\
>+									\
>+	INIT_LIST_HEAD(&__wait.task_list);				\
>+	for (;;) {							\
>+		long __int = prepare_to_swait_event(&wq, &__wait, state);\
>+									\
>+		if (condition)						\
>+			break;						\
>+									\
>+		if (___wait_is_interruptible(state) && __int) {		\
>+			__ret = __int;					\
>+			break;						\
>+		}							\
>+									\
>+		cmd;							\
>+	}								\
>+	finish_swait(&wq, &__wait);					\
>+	__ret;								\
>+})
>+
>+#define __swait_event(wq, condition)					\
>+	(void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0,	\
>+			    schedule())
>+
>+#define swait_event(wq, condition)					\
>+do {									\
>+	if (condition)							\
>+		break;							\
>+	__swait_event(wq, condition);					\
>+} while (0)
>+
>+#define __swait_event_timeout(wq, condition, timeout)			\
>+	___swait_event(wq, ___wait_cond_timeout(condition),		\
>+		      TASK_UNINTERRUPTIBLE, timeout,			\
>+		      __ret = schedule_timeout(__ret))
>+
>+#define swait_event_timeout(wq, condition, timeout)			\
>+({									\
>+	long __ret = timeout;						\
>+	if (!___wait_cond_timeout(condition))				\
>+		__ret = __swait_event_timeout(wq, condition, timeout);	\
>+	__ret;								\
>+})
>+
>+#define __swait_event_interruptible(wq, condition)			\
>+	___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0,		\
>+		      schedule())
>+
>+#define swait_event_interruptible(wq, condition)			\
>+({									\
>+	int __ret = 0;							\
>+	if (!(condition))						\
>+		__ret = __swait_event_interruptible(wq, condition);	\
>+	__ret;								\
>+})
>+
>+#define __swait_event_interruptible_timeout(wq, condition, timeout)	\
>+	___swait_event(wq, ___wait_cond_timeout(condition),		\
>+		      TASK_INTERRUPTIBLE, timeout,			\
>+		      __ret = schedule_timeout(__ret))
>+
>+#define swait_event_interruptible_timeout(wq, condition, timeout)	\
>+({									\
>+	long __ret = timeout;						\
>+	if (!___wait_cond_timeout(condition))				\
>+		__ret = __swait_event_interruptible_timeout(wq,		\
>+						condition, timeout);	\
>+	__ret;								\
>+})
>+
>+#endif /* _LINUX_SWAIT_H */
>--- /dev/null
>+++ b/kernel/sched/swait.c
>@@ -0,0 +1,122 @@
>+
>+#include <linux/swait.h>
>+
>+void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
>+			     struct lock_class_key *key)
>+{
>+	raw_spin_lock_init(&q->lock);
>+	lockdep_set_class_and_name(&q->lock, key, name);
>+	INIT_LIST_HEAD(&q->task_list);
>+}
>+EXPORT_SYMBOL(__init_swait_queue_head);
>+
>+/*
>+ * The thing about the wake_up_state() return value; I think we can ignore it.
>+ *
>+ * If for some reason it would return 0, that means the previously waiting
>+ * task is already running, so it will observe condition true (or has already).
>+ */
>+void swake_up_locked(struct swait_queue_head *q)
>+{
>+	struct swait_queue *curr;
>+
>+	list_for_each_entry(curr, &q->task_list, task_list) {
>+		wake_up_process(curr->task);

okay. So since we limit everything to TASK_NORMAL which has to sleep
while on the list there is no need to check if we actually woken up
someone.

>+		list_del_init(&curr->task_list);
>+		break;
>+	}
>+}
>+EXPORT_SYMBOL(swake_up_locked);
>+
>+void swake_up(struct swait_queue_head *q)
>+{
>+	unsigned long flags;
>+
>+	if (!swait_active(q))
>+		return;
>+
>+	raw_spin_lock_irqsave(&q->lock, flags);
>+	__swake_up_locked(q);

I thing this should have been swake_up_locked() instead since
__swake_up_locked() isn't part of this patch.

Just a nitpick: later there is __prepare_to_swait() and __finish_swait()
which have the __ prefix instead a _locked suffix. Not sure what is
better for a better for a public API but maybe one way would be good.

>+	raw_spin_unlock_irqrestore(&q->lock, flags);
>+}
>+EXPORT_SYMBOL(swake_up);
>+
>+/*
>+ * Does not allow usage from IRQ disabled, since we must be able to
>+ * release IRQs to guarantee bounded hold time.
>+ */
>+void swake_up_all(struct swait_queue_head *q)
>+{
>+	struct swait_queue *curr, *next;
>+	LIST_HEAD(tmp);

WARN_ON(irqs_disabled()) ?

>+	if (!swait_active(q))
>+		return;
>+
>+	raw_spin_lock_irq(&q->lock);
>+	list_splice_init(&q->task_list, &tmp);
>+	while (!list_empty(&tmp)) {
>+		curr = list_first_entry(&tmp, typeof(curr), task_list);
>+
>+		wake_up_state(curr->task, state);
>+		list_del_init(&curr->task_list);

So because the task may timeout and remove itself from the list at
anytime you need to hold the lock during wakeup and the removal from the
list

>+
>+		if (list_empty(&tmp))
>+			break;
>+
>+		raw_spin_unlock_irq(&q->lock);

and you drop the lock after each iteration in case there is an IRQ 
pending or the task, that has been just woken up, has a higher priority
than the current task and needs to get on the CPU.
Not sure if this case matters:
- _this_ task (wake_all) prio 120
- first task in queue prio 10, RR
- second task in queue prio 9, RR

the *old* behavior would put the second task before the first task on
CPU. The *new* behaviour puts the first task on the CPU after dropping
the lock. The second task (that has a higher priority but nobody knows)
has to wait until the first one is done (and anything else that might
been woken up in the meantime with a higher prio than 120).

>+		raw_spin_lock_irq(&q->lock);
>+	}
>+	raw_spin_unlock_irq(&q->lock);
>+}
>+EXPORT_SYMBOL(swake_up_all);
>+
>+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
>+{
>+	wait->task = current;
>+	if (list_empty(&wait->node))
>+		list_add(&wait->task_list, &q->task_list);
>+}
>+
>+void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
>+{
>+	unsigned long flags;
>+
>+	raw_spin_lock_irqsave(&q->lock, flags);
>+	__prepare_to_swait(q, wait);
>+	set_current_state(state);
>+	raw_spin_unlock_irqrestore(&q->lock, flags);
>+}
>+EXPORT_SYMBOL(prepare_to_swait);
>+
>+long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
>+{
>+	if (signal_pending_state(state, current))
>+		return -ERESTARTSYS;
>+
>+	prepare_to_swait(q, wait, state);
>+
>+	return 0;
>+}
>+EXPORT_SYMBOL(prepare_to_swait_event);
>+
>+void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
this one has no users the __ suggests that it is locked edition. Maybe
it is for the completions…

>+{
>+	__set_current_state(TASK_RUNNING);
>+	if (!list_empty(&wait->task_list))
>+		list_del_init(&wait->task_list);
>+}
>+
>+void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
>+{
>+	unsigned long flags;
>+
>+	__set_current_state(TASK_RUNNING);
>+
>+	if (!list_empty_careful(&wait->task_list)) {
>+		raw_spin_lock_irqsave(&q->lock, flags);
>+		list_del_init(&wait->task_list);
>+		raw_spin_unlock_irqrestore(&q->lock, flags);
>+	}
>+}
>+EXPORT_SYMBOL(finish_swait);

Sebastian
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Aug. 7, 2015, 10:57 a.m. UTC | #2
On Wed, Feb 25, 2015 at 10:02:50PM +0100, Sebastian Andrzej Siewior wrote:

> >+static inline int swait_active(struct swait_queue_head *q)
> >+{
> >+	return !list_empty(&q->task_list);
> 
> In RT there was a smp_mb() which you dropped and I assume you had
> reasons for it.

Yeah, RT didn't have a reason for the smp_mb() -- any barrier without a
comment is a bug :-)

Also waitqueue_active(), its counterpart, does not have a barrier there
either.

Nor did I see any reason for that mb to be there.

> I assumed that one can perform list_empty_careful()
> without a lock if the items were removed with list_del_init(). But since
> nothing in -RT blow up so far I guess this here is legal, too :)

Nobody will come and arrest us for software bugs -- yet ;-)

> >+/*
> >+ * The thing about the wake_up_state() return value; I think we can ignore it.
> >+ *
> >+ * If for some reason it would return 0, that means the previously waiting
> >+ * task is already running, so it will observe condition true (or has already).
> >+ */
> >+void swake_up_locked(struct swait_queue_head *q)
> >+{
> >+	struct swait_queue *curr;
> >+
> >+	list_for_each_entry(curr, &q->task_list, task_list) {
> >+		wake_up_process(curr->task);
> 
> okay. So since we limit everything to TASK_NORMAL which has to sleep
> while on the list there is no need to check if we actually woken up
> someone.

Partly that, also that I don't see how that return value is meaningful
in the first place.

If it were to return false, the task was/is already running and it will
observe whatever condition we just satisfied to allow waking things up.

So either way around, we'll get (at least) 1 task running.

> >+		list_del_init(&curr->task_list);
> >+		break;
> >+	}
> >+}
> >+EXPORT_SYMBOL(swake_up_locked);



> >+void swake_up(struct swait_queue_head *q)
> >+{
> >+	unsigned long flags;
> >+
> >+	if (!swait_active(q))
> >+		return;
> >+
> >+	raw_spin_lock_irqsave(&q->lock, flags);
> >+	__swake_up_locked(q);
> 
> I thing this should have been swake_up_locked() instead since
> __swake_up_locked() isn't part of this patch.
> 
> Just a nitpick: later there is __prepare_to_swait() and __finish_swait()
> which have the __ prefix instead a _locked suffix. Not sure what is
> better for a better for a public API but maybe one way would be good.

Yeah, I suppose that's true ;-)

> >+	raw_spin_unlock_irqrestore(&q->lock, flags);
> >+}
> >+EXPORT_SYMBOL(swake_up);
> >+
> >+/*
> >+ * Does not allow usage from IRQ disabled, since we must be able to
> >+ * release IRQs to guarantee bounded hold time.
> >+ */
> >+void swake_up_all(struct swait_queue_head *q)
> >+{
> >+	struct swait_queue *curr, *next;
> >+	LIST_HEAD(tmp);
> 
> WARN_ON(irqs_disabled()) ?

Lockdep should already catch that by virtue of using unconditional _irq
spinlock primitives.

> >+	if (!swait_active(q))
> >+		return;
> >+
> >+	raw_spin_lock_irq(&q->lock);
> >+	list_splice_init(&q->task_list, &tmp);
> >+	while (!list_empty(&tmp)) {
> >+		curr = list_first_entry(&tmp, typeof(curr), task_list);
> >+
> >+		wake_up_state(curr->task, state);
> >+		list_del_init(&curr->task_list);
> 
> So because the task may timeout and remove itself from the list at
> anytime you need to hold the lock during wakeup and the removal from the
> list

Indeed.

> >+
> >+		if (list_empty(&tmp))
> >+			break;
> >+
> >+		raw_spin_unlock_irq(&q->lock);
> 
> and you drop the lock after each iteration in case there is an IRQ 
> pending or the task, that has been just woken up, has a higher priority
> than the current task and needs to get on the CPU.

> Not sure if this case matters:
> - _this_ task (wake_all) prio 120
> - first task in queue prio 10, RR
> - second task in queue prio 9, RR

Why complicate things? Better to not assume anything and just do the
simple correct thing.

> the *old* behavior would put the second task before the first task on
> CPU. The *new* behaviour puts the first task on the CPU after dropping
> the lock. The second task (that has a higher priority but nobody knows)
> has to wait until the first one is done (and anything else that might
> been woken up in the meantime with a higher prio than 120).

Irrelevant, we _must_ drop the lock in order to maintain bounded
behaviour.

> >+		raw_spin_lock_irq(&q->lock);
> >+	}
> >+	raw_spin_unlock_irq(&q->lock);
> >+}
> >+EXPORT_SYMBOL(swake_up_all);

> >+void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
> this one has no users the __ suggests that it is locked edition. Maybe
> it is for the completions…

Yeah, who knows, I certainly do not anymore ;-)
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Aug. 7, 2015, 11:14 a.m. UTC | #3
On Fri, Aug 07, 2015 at 12:57:38PM +0200, Peter Zijlstra wrote:

> > >+void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)

> > this one has no users the __ suggests that it is locked edition. Maybe
> > it is for the completions…
> 
> Yeah, who knows, I certainly do not anymore ;-)

On that, we cannot convert completions to swait. Because swait wake_all
must not happen from IRQ context, and complete_all() typically is used
from just that.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Aug. 7, 2015, 4:41 p.m. UTC | #4
On Fri, Aug 07, 2015 at 01:14:15PM +0200, Peter Zijlstra wrote:
> On that, we cannot convert completions to swait. Because swait wake_all
> must not happen from IRQ context, and complete_all() typically is used
> from just that.

If swait queues aren't useable from IRQ context they will be fairly
useless.  What's the problem with making them irq safe?
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Aug. 7, 2015, 4:45 p.m. UTC | #5
On Fri, Aug 07, 2015 at 09:41:31AM -0700, Christoph Hellwig wrote:
> On Fri, Aug 07, 2015 at 01:14:15PM +0200, Peter Zijlstra wrote:
> > On that, we cannot convert completions to swait. Because swait wake_all
> > must not happen from IRQ context, and complete_all() typically is used
> > from just that.
> 
> If swait queues aren't useable from IRQ context they will be fairly
> useless.  What's the problem with making them irq safe?

Its just the swait_wake_all() that is not. The entire purpose of them
was to have something that allows bounded execution (RT and all).

Since you can have unbounded numbers of tasks waiting on a waitqueue
(well, reality has bounds of course, like total memory available etc..)
a wake_all() can end up being many many wake_process() calls.

We've had this be a problem in RT.

So the proposed swait_wake_all() requires being called from task
context, such that it can drop the lock (and IRQ disable) after every
wakeup, and thereby guarantee that higher priority things will not
experience undue latencies.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Aug. 9, 2015, 6:39 a.m. UTC | #6
On Fri, Aug 07, 2015 at 06:45:26PM +0200, Peter Zijlstra wrote:
> Its just the swait_wake_all() that is not. The entire purpose of them
> was to have something that allows bounded execution (RT and all).

Still not sure i that might be a too big burden for mainline, but at
least it's not as severe..
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thomas Gleixner Aug. 17, 2015, 8:31 p.m. UTC | #7
On Fri, 7 Aug 2015, Peter Zijlstra wrote:
> On Fri, Aug 07, 2015 at 12:57:38PM +0200, Peter Zijlstra wrote:
> 
> > > >+void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
> 
> > > this one has no users the __ suggests that it is locked edition. Maybe
> > > it is for the completions…
> > 
> > Yeah, who knows, I certainly do not anymore ;-)
> 
> On that, we cannot convert completions to swait. Because swait wake_all
> must not happen from IRQ context, and complete_all() typically is used
> from just that.

Well, completions are not the ones which have a gazillion of waiters
and we got quite some performance back from using swait in completions
on RT.

Thanks,

	tglx

Patch
diff mbox

--- /dev/null
+++ b/include/linux/swait.h
@@ -0,0 +1,172 @@ 
+#ifndef _LINUX_SWAIT_H
+#define _LINUX_SWAIT_H
+
+#include <linux/list.h>
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <asm/current.h>
+
+/*
+ * Simple wait queues
+ *
+ * While these are very similar to the other/complex wait queues (wait.h) the
+ * most important difference is that the simple waitqueue allows for
+ * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
+ * times.
+ *
+ * In order to make this so, we had to drop a fair number of features of the
+ * other waitqueue code; notably:
+ *
+ *  - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue;
+ *    all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
+ *    sleeper state.
+ *
+ *  - the exclusive mode; because this requires preserving the list order
+ *    and this is hard.
+ *
+ *  - custom wake functions; because you cannot give any guarantees about
+ *    random code.
+ *
+ * As a side effect of this; the data structures are slimmer.
+ *
+ * One would recommend using this wait queue where possible.
+ */
+
+struct task_struct;
+
+struct swait_queue_head {
+	raw_spinlock_t		lock;
+	struct list_head	task_list;
+};
+
+struct swait_queue {
+	struct task_struct	*task;
+	struct list_head	task_list;
+};
+
+#define __SWAITQUEUE_INITIALIZER(name) {				\
+	.task		= current,					\
+	.task_list	= LIST_HEAD_INIT((name).task_list),		\
+}
+
+#define DECLARE_SWAITQUEUE(name)					\
+	struct swait_queue name = __SWAITQUEUE_INITIALIZER(name)
+
+#define __SWAIT_QUEUE_HEAD_INITIALIZER(name) {				\
+	.lock		= __RAW_SPIN_LOCK_UNLOCKED(name.lock),		\
+	.task_list	= LIST_HEAD_INIT((name).task_list),		\
+}
+
+#define DECLARE_SWAIT_QUEUE_HEAD(name)					\
+	struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name)
+
+extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
+				    struct lock_class_key *key);
+
+#define init_swait_queue_head(q)				\
+	do {							\
+		static struct lock_class_key __key;		\
+		__init_swait_queue_head((q), #q, &__key);	\
+	} while (0)
+
+#ifdef CONFIG_LOCKDEP
+# define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)			\
+	({ init_swait_queue_head(&name); name; })
+# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)			\
+	struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)
+#else
+# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)			\
+	DECLARE_SWAIT_QUEUE_HEAD(name)
+#endif
+
+static inline int swait_active(struct swait_queue_head *q)
+{
+	return !list_empty(&q->task_list);
+}
+
+extern void swake_up(struct swait_queue_head *q);
+extern void swake_up_all(struct swait_queue_head *q);
+extern void swake_up_locked(struct swait_queue_head *q);
+
+extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
+extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
+
+extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
+
+/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
+#define ___swait_event(wq, condition, state, ret, cmd)			\
+({									\
+	struct swait_queue __wait;					\
+	long __ret = ret;						\
+									\
+	INIT_LIST_HEAD(&__wait.task_list);				\
+	for (;;) {							\
+		long __int = prepare_to_swait_event(&wq, &__wait, state);\
+									\
+		if (condition)						\
+			break;						\
+									\
+		if (___wait_is_interruptible(state) && __int) {		\
+			__ret = __int;					\
+			break;						\
+		}							\
+									\
+		cmd;							\
+	}								\
+	finish_swait(&wq, &__wait);					\
+	__ret;								\
+})
+
+#define __swait_event(wq, condition)					\
+	(void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0,	\
+			    schedule())
+
+#define swait_event(wq, condition)					\
+do {									\
+	if (condition)							\
+		break;							\
+	__swait_event(wq, condition);					\
+} while (0)
+
+#define __swait_event_timeout(wq, condition, timeout)			\
+	___swait_event(wq, ___wait_cond_timeout(condition),		\
+		      TASK_UNINTERRUPTIBLE, timeout,			\
+		      __ret = schedule_timeout(__ret))
+
+#define swait_event_timeout(wq, condition, timeout)			\
+({									\
+	long __ret = timeout;						\
+	if (!___wait_cond_timeout(condition))				\
+		__ret = __swait_event_timeout(wq, condition, timeout);	\
+	__ret;								\
+})
+
+#define __swait_event_interruptible(wq, condition)			\
+	___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0,		\
+		      schedule())
+
+#define swait_event_interruptible(wq, condition)			\
+({									\
+	int __ret = 0;							\
+	if (!(condition))						\
+		__ret = __swait_event_interruptible(wq, condition);	\
+	__ret;								\
+})
+
+#define __swait_event_interruptible_timeout(wq, condition, timeout)	\
+	___swait_event(wq, ___wait_cond_timeout(condition),		\
+		      TASK_INTERRUPTIBLE, timeout,			\
+		      __ret = schedule_timeout(__ret))
+
+#define swait_event_interruptible_timeout(wq, condition, timeout)	\
+({									\
+	long __ret = timeout;						\
+	if (!___wait_cond_timeout(condition))				\
+		__ret = __swait_event_interruptible_timeout(wq,		\
+						condition, timeout);	\
+	__ret;								\
+})
+
+#endif /* _LINUX_SWAIT_H */
--- /dev/null
+++ b/kernel/sched/swait.c
@@ -0,0 +1,122 @@ 
+
+#include <linux/swait.h>
+
+void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
+			     struct lock_class_key *key)
+{
+	raw_spin_lock_init(&q->lock);
+	lockdep_set_class_and_name(&q->lock, key, name);
+	INIT_LIST_HEAD(&q->task_list);
+}
+EXPORT_SYMBOL(__init_swait_queue_head);
+
+/*
+ * The thing about the wake_up_state() return value; I think we can ignore it.
+ *
+ * If for some reason it would return 0, that means the previously waiting
+ * task is already running, so it will observe condition true (or has already).
+ */
+void swake_up_locked(struct swait_queue_head *q)
+{
+	struct swait_queue *curr;
+
+	list_for_each_entry(curr, &q->task_list, task_list) {
+		wake_up_process(curr->task);
+		list_del_init(&curr->task_list);
+		break;
+	}
+}
+EXPORT_SYMBOL(swake_up_locked);
+
+void swake_up(struct swait_queue_head *q)
+{
+	unsigned long flags;
+
+	if (!swait_active(q))
+		return;
+
+	raw_spin_lock_irqsave(&q->lock, flags);
+	__swake_up_locked(q);
+	raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(swake_up);
+
+/*
+ * Does not allow usage from IRQ disabled, since we must be able to
+ * release IRQs to guarantee bounded hold time.
+ */
+void swake_up_all(struct swait_queue_head *q)
+{
+	struct swait_queue *curr, *next;
+	LIST_HEAD(tmp);
+
+	if (!swait_active(q))
+		return;
+
+	raw_spin_lock_irq(&q->lock);
+	list_splice_init(&q->task_list, &tmp);
+	while (!list_empty(&tmp)) {
+		curr = list_first_entry(&tmp, typeof(curr), task_list);
+
+		wake_up_state(curr->task, state);
+		list_del_init(&curr->task_list);
+
+		if (list_empty(&tmp))
+			break;
+
+		raw_spin_unlock_irq(&q->lock);
+		raw_spin_lock_irq(&q->lock);
+	}
+	raw_spin_unlock_irq(&q->lock);
+}
+EXPORT_SYMBOL(swake_up_all);
+
+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+	wait->task = current;
+	if (list_empty(&wait->node))
+		list_add(&wait->task_list, &q->task_list);
+}
+
+void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&q->lock, flags);
+	__prepare_to_swait(q, wait);
+	set_current_state(state);
+	raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_swait);
+
+long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+	if (signal_pending_state(state, current))
+		return -ERESTARTSYS;
+
+	prepare_to_swait(q, wait, state);
+
+	return 0;
+}
+EXPORT_SYMBOL(prepare_to_swait_event);
+
+void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+	__set_current_state(TASK_RUNNING);
+	if (!list_empty(&wait->task_list))
+		list_del_init(&wait->task_list);
+}
+
+void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+	unsigned long flags;
+
+	__set_current_state(TASK_RUNNING);
+
+	if (!list_empty_careful(&wait->task_list)) {
+		raw_spin_lock_irqsave(&q->lock, flags);
+		list_del_init(&wait->task_list);
+		raw_spin_unlock_irqrestore(&q->lock, flags);
+	}
+}
+EXPORT_SYMBOL(finish_swait);