diff mbox

drm/i915: Only report a wakeup if the waiter was truly asleep

Message ID 20171209124710.1606-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson Dec. 9, 2017, 12:47 p.m. UTC
If we attempt to wake up a waiter, who is currently checking the seqno
it will be in the TASK_INTERRUPTIBLE state and ttwu will report success.
However, it is actually awake and functioning -- so delay reporting the
actual wake up until it sleeps. This fixes some spurious claims of
missed_breadcrumbs when running under heavy load; i.e. sufficient load to
preempt away the newly woken waiter before they complete their checks.
However, it does so at the cost of a rare false negative; where the
waiter changes between the check and ttwu -- the only way to fix that
would be to extend the reporting from ttwu where the check could be done
atomically.

v2: Defend against !CONFIG_SMP
v3: Don't filter out calls to wake_up_process

Testcase: igt/drv_missed_irq # sanity check we do detect missed_breadcrumb()
Testcase: igt/gem_concurrent_blit # for generating false positives
References: https://bugs.freedesktop.org/show_bug.cgi?id=100007
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/intel_breadcrumbs.c | 39 ++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 9 deletions(-)

Comments

Tvrtko Ursulin Dec. 11, 2017, 4:10 p.m. UTC | #1
On 09/12/2017 12:47, Chris Wilson wrote:
> If we attempt to wake up a waiter, who is currently checking the seqno
> it will be in the TASK_INTERRUPTIBLE state and ttwu will report success.
> However, it is actually awake and functioning -- so delay reporting the
> actual wake up until it sleeps. This fixes some spurious claims of
> missed_breadcrumbs when running under heavy load; i.e. sufficient load to
> preempt away the newly woken waiter before they complete their checks.
> However, it does so at the cost of a rare false negative; where the
> waiter changes between the check and ttwu -- the only way to fix that
> would be to extend the reporting from ttwu where the check could be done
> atomically.
> 
> v2: Defend against !CONFIG_SMP
> v3: Don't filter out calls to wake_up_process
> 
> Testcase: igt/drv_missed_irq # sanity check we do detect missed_breadcrumb()
> Testcase: igt/gem_concurrent_blit # for generating false positives
> References: https://bugs.freedesktop.org/show_bug.cgi?id=100007
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> ---
>   drivers/gpu/drm/i915/intel_breadcrumbs.c | 39 ++++++++++++++++++++++++--------
>   1 file changed, 30 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> index 24c6fefdd0b1..76e6f8e7cfd4 100644
> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -27,6 +27,12 @@
>   
>   #include "i915_drv.h"
>   
> +#ifdef CONFIG_SMP
> +#define task_asleep(tsk) ((tsk)->state & TASK_NORMAL && !(tsk)->on_cpu)
> +#else
> +#define task_asleep(tsk) ((tsk)->state & TASK_NORMAL)
> +#endif
> +

I kind of remember the on_cpu from before and I was probably complaining 
about it. Sigh, if it helps ok..

>   static unsigned int __intel_breadcrumbs_wakeup(struct intel_breadcrumbs *b)
>   {
>   	struct intel_wait *wait;
> @@ -36,8 +42,20 @@ static unsigned int __intel_breadcrumbs_wakeup(struct intel_breadcrumbs *b)
>   
>   	wait = b->irq_wait;
>   	if (wait) {
> +		/*
> +		 * N.B. Since task_asleep() and ttwu are not atomic, the
> +		 * waiter may actually go to sleep after the check, causing
> +		 * us to suppress a valid wakeup. We prefer to reduce the
> +		 * number of false positive missed_breadcrumb() warnings
> +		 * at the expense of a few false negatives, as it it easy
> +		 * to trigger a false positive under heavy load. Enough
> +		 * signal should remain from genuine missed_breadcrumb()
> +		 * for us to detect in CI.
> +		 */
> +		bool was_asleep = task_asleep(wait->tsk);
> +
>   		result = ENGINE_WAKEUP_WAITER;
> -		if (wake_up_process(wait->tsk))
> +		if (wake_up_process(wait->tsk) && was_asleep)
>   			result |= ENGINE_WAKEUP_ASLEEP;
>   	}
>   
> @@ -47,12 +65,15 @@ static unsigned int __intel_breadcrumbs_wakeup(struct intel_breadcrumbs *b)
>   unsigned int intel_engine_wakeup(struct intel_engine_cs *engine)
>   {
>   	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> -	unsigned long flags;
> -	unsigned int result;
> +	unsigned int result = 0;
>   
> -	spin_lock_irqsave(&b->irq_lock, flags);
> -	result = __intel_breadcrumbs_wakeup(b);
> -	spin_unlock_irqrestore(&b->irq_lock, flags);
> +	if (READ_ONCE(b->irq_wait)) {
> +		unsigned long flags;
> +
> +		spin_lock_irqsave(&b->irq_lock, flags);
> +		result = __intel_breadcrumbs_wakeup(b);
> +		spin_unlock_irqrestore(&b->irq_lock, flags);
> +	}

This hunk I'd leave out from the fix.

>   
>   	return result;
>   }
> @@ -77,8 +98,8 @@ static noinline void missed_breadcrumb(struct intel_engine_cs *engine)
>   
>   static void intel_breadcrumbs_hangcheck(struct timer_list *t)
>   {
> -	struct intel_engine_cs *engine = from_timer(engine, t,
> -						    breadcrumbs.hangcheck);
> +	struct intel_engine_cs *engine =
> +		from_timer(engine, t, breadcrumbs.hangcheck);
>   	struct intel_breadcrumbs *b = &engine->breadcrumbs;
>   
>   	if (!b->irq_armed)
> @@ -104,7 +125,7 @@ static void intel_breadcrumbs_hangcheck(struct timer_list *t)
>   	 */
>   	if (intel_engine_wakeup(engine) & ENGINE_WAKEUP_ASLEEP) {
>   		missed_breadcrumb(engine);
> -		mod_timer(&engine->breadcrumbs.fake_irq, jiffies + 1);
> +		mod_timer(&b->fake_irq, jiffies + 1);
>   	} else {
>   		mod_timer(&b->hangcheck, wait_timeout());
>   	}
> 

I'll turn a blind eye to this one. :)

Regards,

Tvrtko
Chris Wilson Dec. 11, 2017, 5:08 p.m. UTC | #2
Quoting Tvrtko Ursulin (2017-12-11 16:10:49)
> 
> On 09/12/2017 12:47, Chris Wilson wrote:
> > If we attempt to wake up a waiter, who is currently checking the seqno
> > it will be in the TASK_INTERRUPTIBLE state and ttwu will report success.
> > However, it is actually awake and functioning -- so delay reporting the
> > actual wake up until it sleeps. This fixes some spurious claims of
> > missed_breadcrumbs when running under heavy load; i.e. sufficient load to
> > preempt away the newly woken waiter before they complete their checks.
> > However, it does so at the cost of a rare false negative; where the
> > waiter changes between the check and ttwu -- the only way to fix that
> > would be to extend the reporting from ttwu where the check could be done
> > atomically.
> > 
> > v2: Defend against !CONFIG_SMP
> > v3: Don't filter out calls to wake_up_process
> > 
> > Testcase: igt/drv_missed_irq # sanity check we do detect missed_breadcrumb()
> > Testcase: igt/gem_concurrent_blit # for generating false positives
> > References: https://bugs.freedesktop.org/show_bug.cgi?id=100007
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> > ---
> >   drivers/gpu/drm/i915/intel_breadcrumbs.c | 39 ++++++++++++++++++++++++--------
> >   1 file changed, 30 insertions(+), 9 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> > index 24c6fefdd0b1..76e6f8e7cfd4 100644
> > --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> > +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> > @@ -27,6 +27,12 @@
> >   
> >   #include "i915_drv.h"
> >   
> > +#ifdef CONFIG_SMP
> > +#define task_asleep(tsk) ((tsk)->state & TASK_NORMAL && !(tsk)->on_cpu)
> > +#else
> > +#define task_asleep(tsk) ((tsk)->state & TASK_NORMAL)
> > +#endif
> > +
> 
> I kind of remember the on_cpu from before and I was probably complaining 
> about it. Sigh, if it helps ok..
> 
> >   static unsigned int __intel_breadcrumbs_wakeup(struct intel_breadcrumbs *b)
> >   {
> >       struct intel_wait *wait;
> > @@ -36,8 +42,20 @@ static unsigned int __intel_breadcrumbs_wakeup(struct intel_breadcrumbs *b)
> >   
> >       wait = b->irq_wait;
> >       if (wait) {
> > +             /*
> > +              * N.B. Since task_asleep() and ttwu are not atomic, the
> > +              * waiter may actually go to sleep after the check, causing
> > +              * us to suppress a valid wakeup. We prefer to reduce the
> > +              * number of false positive missed_breadcrumb() warnings
> > +              * at the expense of a few false negatives, as it it easy
> > +              * to trigger a false positive under heavy load. Enough
> > +              * signal should remain from genuine missed_breadcrumb()
> > +              * for us to detect in CI.
> > +              */
> > +             bool was_asleep = task_asleep(wait->tsk);
> > +
> >               result = ENGINE_WAKEUP_WAITER;
> > -             if (wake_up_process(wait->tsk))
> > +             if (wake_up_process(wait->tsk) && was_asleep)
> >                       result |= ENGINE_WAKEUP_ASLEEP;
> >       }
> >   
> > @@ -47,12 +65,15 @@ static unsigned int __intel_breadcrumbs_wakeup(struct intel_breadcrumbs *b)
> >   unsigned int intel_engine_wakeup(struct intel_engine_cs *engine)
> >   {
> >       struct intel_breadcrumbs *b = &engine->breadcrumbs;
> > -     unsigned long flags;
> > -     unsigned int result;
> > +     unsigned int result = 0;
> >   
> > -     spin_lock_irqsave(&b->irq_lock, flags);
> > -     result = __intel_breadcrumbs_wakeup(b);
> > -     spin_unlock_irqrestore(&b->irq_lock, flags);
> > +     if (READ_ONCE(b->irq_wait)) {
> > +             unsigned long flags;
> > +
> > +             spin_lock_irqsave(&b->irq_lock, flags);
> > +             result = __intel_breadcrumbs_wakeup(b);
> > +             spin_unlock_irqrestore(&b->irq_lock, flags);
> > +     }
> 
> This hunk I'd leave out from the fix.

And if I postpone that hunk to tomorrow, would r-b the rest?
-Chris
Tvrtko Ursulin Dec. 11, 2017, 5:21 p.m. UTC | #3
On 11/12/2017 17:08, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2017-12-11 16:10:49)
>>
>> On 09/12/2017 12:47, Chris Wilson wrote:
>>> If we attempt to wake up a waiter, who is currently checking the seqno
>>> it will be in the TASK_INTERRUPTIBLE state and ttwu will report success.
>>> However, it is actually awake and functioning -- so delay reporting the
>>> actual wake up until it sleeps. This fixes some spurious claims of
>>> missed_breadcrumbs when running under heavy load; i.e. sufficient load to
>>> preempt away the newly woken waiter before they complete their checks.
>>> However, it does so at the cost of a rare false negative; where the
>>> waiter changes between the check and ttwu -- the only way to fix that
>>> would be to extend the reporting from ttwu where the check could be done
>>> atomically.
>>>
>>> v2: Defend against !CONFIG_SMP
>>> v3: Don't filter out calls to wake_up_process
>>>
>>> Testcase: igt/drv_missed_irq # sanity check we do detect missed_breadcrumb()
>>> Testcase: igt/gem_concurrent_blit # for generating false positives
>>> References: https://bugs.freedesktop.org/show_bug.cgi?id=100007
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/intel_breadcrumbs.c | 39 ++++++++++++++++++++++++--------
>>>    1 file changed, 30 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
>>> index 24c6fefdd0b1..76e6f8e7cfd4 100644
>>> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
>>> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
>>> @@ -27,6 +27,12 @@
>>>    
>>>    #include "i915_drv.h"
>>>    
>>> +#ifdef CONFIG_SMP
>>> +#define task_asleep(tsk) ((tsk)->state & TASK_NORMAL && !(tsk)->on_cpu)
>>> +#else
>>> +#define task_asleep(tsk) ((tsk)->state & TASK_NORMAL)
>>> +#endif
>>> +
>>
>> I kind of remember the on_cpu from before and I was probably complaining
>> about it. Sigh, if it helps ok..
>>
>>>    static unsigned int __intel_breadcrumbs_wakeup(struct intel_breadcrumbs *b)
>>>    {
>>>        struct intel_wait *wait;
>>> @@ -36,8 +42,20 @@ static unsigned int __intel_breadcrumbs_wakeup(struct intel_breadcrumbs *b)
>>>    
>>>        wait = b->irq_wait;
>>>        if (wait) {
>>> +             /*
>>> +              * N.B. Since task_asleep() and ttwu are not atomic, the
>>> +              * waiter may actually go to sleep after the check, causing
>>> +              * us to suppress a valid wakeup. We prefer to reduce the
>>> +              * number of false positive missed_breadcrumb() warnings
>>> +              * at the expense of a few false negatives, as it it easy
>>> +              * to trigger a false positive under heavy load. Enough
>>> +              * signal should remain from genuine missed_breadcrumb()
>>> +              * for us to detect in CI.
>>> +              */
>>> +             bool was_asleep = task_asleep(wait->tsk);
>>> +
>>>                result = ENGINE_WAKEUP_WAITER;
>>> -             if (wake_up_process(wait->tsk))
>>> +             if (wake_up_process(wait->tsk) && was_asleep)
>>>                        result |= ENGINE_WAKEUP_ASLEEP;
>>>        }
>>>    
>>> @@ -47,12 +65,15 @@ static unsigned int __intel_breadcrumbs_wakeup(struct intel_breadcrumbs *b)
>>>    unsigned int intel_engine_wakeup(struct intel_engine_cs *engine)
>>>    {
>>>        struct intel_breadcrumbs *b = &engine->breadcrumbs;
>>> -     unsigned long flags;
>>> -     unsigned int result;
>>> +     unsigned int result = 0;
>>>    
>>> -     spin_lock_irqsave(&b->irq_lock, flags);
>>> -     result = __intel_breadcrumbs_wakeup(b);
>>> -     spin_unlock_irqrestore(&b->irq_lock, flags);
>>> +     if (READ_ONCE(b->irq_wait)) {
>>> +             unsigned long flags;
>>> +
>>> +             spin_lock_irqsave(&b->irq_lock, flags);
>>> +             result = __intel_breadcrumbs_wakeup(b);
>>> +             spin_unlock_irqrestore(&b->irq_lock, flags);
>>> +     }
>>
>> This hunk I'd leave out from the fix.
> 
> And if I postpone that hunk to tomorrow, would r-b the rest?

Yep.

Regards,

Tvrtko
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index 24c6fefdd0b1..76e6f8e7cfd4 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -27,6 +27,12 @@ 
 
 #include "i915_drv.h"
 
+#ifdef CONFIG_SMP
+#define task_asleep(tsk) ((tsk)->state & TASK_NORMAL && !(tsk)->on_cpu)
+#else
+#define task_asleep(tsk) ((tsk)->state & TASK_NORMAL)
+#endif
+
 static unsigned int __intel_breadcrumbs_wakeup(struct intel_breadcrumbs *b)
 {
 	struct intel_wait *wait;
@@ -36,8 +42,20 @@  static unsigned int __intel_breadcrumbs_wakeup(struct intel_breadcrumbs *b)
 
 	wait = b->irq_wait;
 	if (wait) {
+		/*
+		 * N.B. Since task_asleep() and ttwu are not atomic, the
+		 * waiter may actually go to sleep after the check, causing
+		 * us to suppress a valid wakeup. We prefer to reduce the
+		 * number of false positive missed_breadcrumb() warnings
+		 * at the expense of a few false negatives, as it it easy
+		 * to trigger a false positive under heavy load. Enough
+		 * signal should remain from genuine missed_breadcrumb()
+		 * for us to detect in CI.
+		 */
+		bool was_asleep = task_asleep(wait->tsk);
+
 		result = ENGINE_WAKEUP_WAITER;
-		if (wake_up_process(wait->tsk))
+		if (wake_up_process(wait->tsk) && was_asleep)
 			result |= ENGINE_WAKEUP_ASLEEP;
 	}
 
@@ -47,12 +65,15 @@  static unsigned int __intel_breadcrumbs_wakeup(struct intel_breadcrumbs *b)
 unsigned int intel_engine_wakeup(struct intel_engine_cs *engine)
 {
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-	unsigned long flags;
-	unsigned int result;
+	unsigned int result = 0;
 
-	spin_lock_irqsave(&b->irq_lock, flags);
-	result = __intel_breadcrumbs_wakeup(b);
-	spin_unlock_irqrestore(&b->irq_lock, flags);
+	if (READ_ONCE(b->irq_wait)) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&b->irq_lock, flags);
+		result = __intel_breadcrumbs_wakeup(b);
+		spin_unlock_irqrestore(&b->irq_lock, flags);
+	}
 
 	return result;
 }
@@ -77,8 +98,8 @@  static noinline void missed_breadcrumb(struct intel_engine_cs *engine)
 
 static void intel_breadcrumbs_hangcheck(struct timer_list *t)
 {
-	struct intel_engine_cs *engine = from_timer(engine, t,
-						    breadcrumbs.hangcheck);
+	struct intel_engine_cs *engine =
+		from_timer(engine, t, breadcrumbs.hangcheck);
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
 
 	if (!b->irq_armed)
@@ -104,7 +125,7 @@  static void intel_breadcrumbs_hangcheck(struct timer_list *t)
 	 */
 	if (intel_engine_wakeup(engine) & ENGINE_WAKEUP_ASLEEP) {
 		missed_breadcrumb(engine);
-		mod_timer(&engine->breadcrumbs.fake_irq, jiffies + 1);
+		mod_timer(&b->fake_irq, jiffies + 1);
 	} else {
 		mod_timer(&b->hangcheck, wait_timeout());
 	}