
[RFC] Improving directed yield scalability for PLE handler

Message ID 1347293035.2124.22.camel@twins (mailing list archive)
State New, archived

Commit Message

Peter Zijlstra Sept. 10, 2012, 4:03 p.m. UTC
On Mon, 2012-09-10 at 08:16 -0500, Andrew Theurer wrote:
> > > @@ -4856,8 +4859,6 @@ again:
> > >     if (curr->sched_class != p->sched_class)
> > >             goto out;
> > > 
> > > -   if (task_running(p_rq, p) || p->state)
> > > -           goto out;
> > 
> > Is it possible that, by the time the current thread takes the double rq
> > lock, thread p could actually be running?  i.e. is there merit to keeping
> > this check around even with your similar check above?
> 
> I think that's a good idea.  I'll add that back in. 

Right, it still needs to be there; the test before acquiring p_rq is an
optimistic test to avoid work, but you still have to test it once you
acquire p_rq, since the rest of the code relies on p being runnable and
not currently running.

How about something like this instead.. ?

---
 kernel/sched/core.c | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)
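
In outline, the patch does a cheap precondition test before taking any
locks and then repeats it once the locks are held.  A userspace sketch of
that pattern (illustrative only; struct target and the helpers below are
hypothetical stand-ins for the task and runqueue state, not kernel code):

#include <pthread.h>
#include <stdbool.h>

struct target {
	pthread_mutex_t lock;
	bool running;		/* stands in for task_running() / p->state */
};

static bool is_candidate(struct target *t)
{
	return !t->running;	/* cheap test; racy without the lock */
}

static bool try_yield_to(struct target *t)
{
	bool yielded = false;

	/* optimistic test to avoid taking the lock */
	if (!is_candidate(t))
		return false;

	pthread_mutex_lock(&t->lock);
	/* re-validate: holding the lock ensures the state cannot change */
	if (is_candidate(t)) {
		/* ... the actual yield work would go here ... */
		yielded = true;
	}
	pthread_mutex_unlock(&t->lock);

	return yielded;
}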



Comments

Srikar Dronamraju Sept. 10, 2012, 4:56 p.m. UTC | #1
* Peter Zijlstra <peterz@infradead.org> [2012-09-10 18:03:55]:

> On Mon, 2012-09-10 at 08:16 -0500, Andrew Theurer wrote:
> > > > @@ -4856,8 +4859,6 @@ again:
> > > >     if (curr->sched_class != p->sched_class)
> > > >             goto out;
> > > > 
> > > > -   if (task_running(p_rq, p) || p->state)
> > > > -           goto out;
> > > 
> > > Is it possible that, by the time the current thread takes the double rq
> > > lock, thread p could actually be running?  i.e. is there merit to keeping
> > > this check around even with your similar check above?
> > 
> > I think that's a good idea.  I'll add that back in. 
> 
> Right, it needs to still be there, the test before acquiring p_rq is an
> optimistic test to avoid work, but you have to still test it once you
> acquire p_rq since the rest of the code relies on this not being so.
> 
> How about something like this instead.. ?
> 
> ---
>  kernel/sched/core.c | 35 ++++++++++++++++++++++++++---------
>  1 file changed, 26 insertions(+), 9 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c46a011..c9ecab2 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4300,6 +4300,23 @@ void __sched yield(void)
>  }
>  EXPORT_SYMBOL(yield);
>  
> +/*
> + * Tests preconditions required for sched_class::yield_to().
> + */
> +static bool __yield_to_candidate(struct task_struct *curr, struct task_struct *p)
> +{
> +	if (!curr->sched_class->yield_to_task)
> +		return false;
> +
> +	if (curr->sched_class != p->sched_class)
> +		return false;


Peter, 

Should we also add a check if the runq has a skip buddy (as pointed out
by Raghu) and return if the skip buddy is already set?  Something akin
to:

	if (p_rq->cfs_rq->skip)
		return false;

So if somebody has already acquired the double runqueue lock and is about
to set the next buddy, we don't need to take the runqueue locks ourselves,
and we also avoid overwriting an already-set skip buddy.

> +
> +	if (task_running(p_rq, p) || p->state)
> +		return false;
> +
> +	return true;
> +}
> +
>  /**
>   * yield_to - yield the current processor to another thread in
>   * your thread group, or accelerate that thread toward the
> @@ -4323,6 +4340,10 @@ bool __sched yield_to(struct task_struct *p, bool preempt)
>  	rq = this_rq();
>  
>  again:
> +	/* optimistic test to avoid taking locks */
> +	if (!__yield_to_candidate(curr, p))
> +		goto out_irq;
> +
>  	p_rq = task_rq(p);
>  	double_rq_lock(rq, p_rq);
>  	while (task_rq(p) != p_rq) {
> @@ -4330,14 +4351,9 @@ bool __sched yield_to(struct task_struct *p, bool preempt)
>  		goto again;
>  	}
>  
> -	if (!curr->sched_class->yield_to_task)
> -		goto out;
> -
> -	if (curr->sched_class != p->sched_class)
> -		goto out;
> -
> -	if (task_running(p_rq, p) || p->state)
> -		goto out;
> +	/* validate state, holding p_rq ensures p's state cannot change */
> +	if (!__yield_to_candidate(curr, p))
> +		goto out_unlock;
>  
>  	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
>  	if (yielded) {
> @@ -4350,8 +4366,9 @@ bool __sched yield_to(struct task_struct *p, bool preempt)
>  			resched_task(p_rq->curr);
>  	}
>  
> -out:
> +out_unlock:
>  	double_rq_unlock(rq, p_rq);
> +out_irq:
>  	local_irq_restore(flags);
>  
>  	if (yielded)
> 

Peter Zijlstra Sept. 10, 2012, 5:12 p.m. UTC | #2
On Mon, 2012-09-10 at 22:26 +0530, Srikar Dronamraju wrote:
> > +static bool __yield_to_candidate(struct task_struct *curr, struct task_struct *p)
> > +{
> > +     if (!curr->sched_class->yield_to_task)
> > +             return false;
> > +
> > +     if (curr->sched_class != p->sched_class)
> > +             return false;
> 
> 
> Peter, 
> 
> Should we also add a check if the runq has a skip buddy (as pointed out
> by Raghu) and return if the skip buddy is already set. 

Oh right, I missed that suggestion... the performance improvement went
from 81% to 139% using this, right?

It might make more sense to keep that separate, outside of this
function, since it's not a strict prerequisite.

> > 
> > +     if (task_running(p_rq, p) || p->state)
> > +             return false;
> > +
> > +     return true;
> > +} 


> > @@ -4323,6 +4340,10 @@ bool __sched yield_to(struct task_struct *p,
> bool preempt)
> >       rq = this_rq();
> >  
> >  again:
> > +     /* optimistic test to avoid taking locks */
> > +     if (!__yield_to_candidate(curr, p))
> > +             goto out_irq;
> > +

So add something like:

	/* Optimistic, if we 'raced' with another yield_to(), don't bother */
	if (p_rq->cfs_rq->skip)
		goto out_irq;
> 
> 
> >       p_rq = task_rq(p);
> >       double_rq_lock(rq, p_rq);
> 
> 
But I do have a question on this optimization though: why do we check
p_rq->cfs_rq->skip and not rq->cfs_rq->skip?

That is, I'd like to see this thing explained a little better.

Does it go something like: p_rq is the runqueue of the task we'd like to
yield to, rq is our own, they might be the same. If we have a ->skip,
there's nothing we can do about it, OTOH p_rq having a ->skip and
failing the yield_to() simply means us picking the next VCPU thread,
which might be running on an entirely different cpu (rq) and could
succeed?


Raghavendra K T Sept. 10, 2012, 7:10 p.m. UTC | #3
On 09/10/2012 10:42 PM, Peter Zijlstra wrote:
> On Mon, 2012-09-10 at 22:26 +0530, Srikar Dronamraju wrote:
>>> +static bool __yield_to_candidate(struct task_struct *curr, struct task_struct *p)
>>> +{
>>> +     if (!curr->sched_class->yield_to_task)
>>> +             return false;
>>> +
>>> +     if (curr->sched_class != p->sched_class)
>>> +             return false;
>>
>>
>> Peter,
>>
>> Should we also add a check if the runq has a skip buddy (as pointed out
>> by Raghu) and return if the skip buddy is already set.
>
> Oh right, I missed that suggestion.. the performance improvement went
> from 81% to 139% using this, right?
>
> It might make more sense to keep that separate, outside of this
> function, since its not a strict prerequisite.
>
>>>
>>> +     if (task_running(p_rq, p) || p->state)
>>> +             return false;
>>> +
>>> +     return true;
>>> +}
>
>
>>> @@ -4323,6 +4340,10 @@ bool __sched yield_to(struct task_struct *p,
>> bool preempt)
>>>        rq = this_rq();
>>>
>>>   again:
>>> +     /* optimistic test to avoid taking locks */
>>> +     if (!__yield_to_candidate(curr, p))
>>> +             goto out_irq;
>>> +
>
> So add something like:
>
> 	/* Optimistic, if we 'raced' with another yield_to(), don't bother */
> 	if (p_rq->cfs_rq->skip)
> 		goto out_irq;
>>
>>
>>>        p_rq = task_rq(p);
>>>        double_rq_lock(rq, p_rq);
>>
>>
> But I do have a question on this optimization though,.. Why do we check
> p_rq->cfs_rq->skip and not rq->cfs_rq->skip ?
>
> That is, I'd like to see this thing explained a little better.
>
> Does it go something like: p_rq is the runqueue of the task we'd like to
> yield to, rq is our own, they might be the same. If we have a ->skip,
> there's nothing we can do about it, OTOH p_rq having a ->skip and
> failing the yield_to() simply means us picking the next VCPU thread,
> which might be running on an entirely different cpu (rq) and could
> succeed?
>

Yes, that is the intention (meaning the check of p_rq->cfs->skip).  Though
we may be overdoing this check in the scenario where multiple vCPUs of the
same VM are pinned to the same CPU.

I am testing the above patch and hope to be able to get back with the
results tomorrow.

Srikar Dronamraju Sept. 11, 2012, 7:04 a.m. UTC | #4
> > > @@ -4323,6 +4340,10 @@ bool __sched yield_to(struct task_struct *p,
> > bool preempt)
> > >       rq = this_rq();
> > >  
> > >  again:
> > > +     /* optimistic test to avoid taking locks */
> > > +     if (!__yield_to_candidate(curr, p))
> > > +             goto out_irq;
> > > +
> 
> So add something like:
> 
> 	/* Optimistic, if we 'raced' with another yield_to(), don't bother */
> 	if (p_rq->cfs_rq->skip)
> 		goto out_irq;
> > 
> > 
> > >       p_rq = task_rq(p);
> > >       double_rq_lock(rq, p_rq);
> > 
> > 
> But I do have a question on this optimization though,.. Why do we check
> p_rq->cfs_rq->skip and not rq->cfs_rq->skip ?
> 
> That is, I'd like to see this thing explained a little better.
> 
> Does it go something like: p_rq is the runqueue of the task we'd like to
> yield to, rq is our own, they might be the same. If we have a ->skip,
> there's nothing we can do about it, OTOH p_rq having a ->skip and
> failing the yield_to() simply means us picking the next VCPU thread,
> which might be running on an entirely different cpu (rq) and could
> succeed?
> 

Oh this made me look back at yield_to() again.  I had misread the
yield_to_task_fair() code. I had wrongly thought that both ->skip and
->next buddies for the p_rq would be set. But it looks like only ->next
for the p_rq is set and ->skip is set for rq.
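
For orientation, the fair-class yield path being described has roughly
this shape (a simplified sketch based on that reading, not the verbatim
kernel source):

static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
{
	struct sched_entity *se = &p->se;

	if (!se->on_rq)
		return false;

	/* ->next is set for p, i.e. on the target's (p_rq's) cfs_rq */
	set_next_buddy(se);

	/* yield_task_fair() marks current as the ->skip buddy on our rq */
	yield_task_fair(rq);

	return true;
}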

This should also explain why Andrew saw a regression when checking for
the ->skip flag instead of PF_VCPU.

Can we check for p_rq->cfs.next and bail out if it is already set?
Something like the below:


@@ -4820,6 +4820,23 @@ void __sched yield(void)
 }
 EXPORT_SYMBOL(yield);

+/*
+ * Tests preconditions required for sched_class::yield_to().
+ */
+static bool __yield_to_candidate(struct task_struct *curr, struct task_struct *p, struct rq *p_rq)
+{
+	if (!curr->sched_class->yield_to_task)
+		return false;
+
+	if (curr->sched_class != p->sched_class)
+		return false;
+
+	if (task_running(p_rq, p) || p->state)
+		return false;
+
+	return true;
+}
+
 /**
  * yield_to - yield the current processor to another thread in
  * your thread group, or accelerate that thread toward the
@@ -4844,20 +4861,24 @@ bool __sched yield_to(struct task_struct *p, bool preempt)

 again:
 	p_rq = task_rq(p);
+
+	/* optimistic test to avoid taking locks */
+	if (!__yield_to_candidate(curr, p, p_rq))
+		goto out_irq;
+
+	/* if next buddy is set, assume yield is in progress */
+	if (p_rq->cfs.next)
+		goto out_irq;
+
 	double_rq_lock(rq, p_rq);
 	while (task_rq(p) != p_rq) {
 		double_rq_unlock(rq, p_rq);
 		goto again;
 	}

-	if (!curr->sched_class->yield_to_task)
-		goto out;
-
-	if (curr->sched_class != p->sched_class)
-		goto out;
-
-	if (task_running(p_rq, p) || p->state)
-		goto out;
+	/* validate state, holding p_rq ensures p's state cannot change */
+	if (!__yield_to_candidate(curr, p, p_rq))
+		goto out_unlock;

 	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
 	if (yielded) {
@@ -4877,8 +4898,9 @@ again:
 		rq->skip_clock_update = 0;
 	}

-out:
+out_unlock:
 	double_rq_unlock(rq, p_rq);
+out_irq:
 	local_irq_restore(flags);

 	if (yielded)



Patch

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c46a011..c9ecab2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4300,6 +4300,23 @@  void __sched yield(void)
 }
 EXPORT_SYMBOL(yield);
 
+/*
+ * Tests preconditions required for sched_class::yield_to().
+ */
+static bool __yield_to_candidate(struct task_struct *curr, struct task_struct *p)
+{
+	if (!curr->sched_class->yield_to_task)
+		return false;
+
+	if (curr->sched_class != p->sched_class)
+		return false;
+
+	if (task_running(p_rq, p) || p->state)
+		return false;
+
+	return true;
+}
+
 /**
  * yield_to - yield the current processor to another thread in
  * your thread group, or accelerate that thread toward the
@@ -4323,6 +4340,10 @@  bool __sched yield_to(struct task_struct *p, bool preempt)
 	rq = this_rq();
 
 again:
+	/* optimistic test to avoid taking locks */
+	if (!__yield_to_candidate(curr, p))
+		goto out_irq;
+
 	p_rq = task_rq(p);
 	double_rq_lock(rq, p_rq);
 	while (task_rq(p) != p_rq) {
@@ -4330,14 +4351,9 @@  bool __sched yield_to(struct task_struct *p, bool preempt)
 		goto again;
 	}
 
-	if (!curr->sched_class->yield_to_task)
-		goto out;
-
-	if (curr->sched_class != p->sched_class)
-		goto out;
-
-	if (task_running(p_rq, p) || p->state)
-		goto out;
+	/* validate state, holding p_rq ensures p's state cannot change */
+	if (!__yield_to_candidate(curr, p))
+		goto out_unlock;
 
 	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
 	if (yielded) {
@@ -4350,8 +4366,9 @@  bool __sched yield_to(struct task_struct *p, bool preempt)
 			resched_task(p_rq->curr);
 	}
 
-out:
+out_unlock:
 	double_rq_unlock(rq, p_rq);
+out_irq:
 	local_irq_restore(flags);
 
 	if (yielded)