diff mbox

[06/18] drm/i915/breadcrumbs: Keep the fake irq armed across reset

Message ID 20180409111413.6352-7-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson April 9, 2018, 11:14 a.m. UTC
Instead of synchronously cancelling the timer and re-enabling it inside
the reset callbacks, keep the timer enabled and let it die on its next
wakeup if no longer required. This allows
intel_engine_reset_breadcrumbs() to be used from an atomic
(timer/softirq) context such as required for resetting an engine.

It also allows us to react better to the user poking around debugfs for
testing missed irqs.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/intel_breadcrumbs.c | 27 +++++++++++++++++-------
 1 file changed, 19 insertions(+), 8 deletions(-)

Comments

Mika Kuoppala April 23, 2018, 1:03 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Instead of synchronously cancelling the timer and re-enabling it inside
> the reset callbacks, keep the timer enabled and let it die on its next
> wakeup if no longer required. This allows
> intel_engine_reset_breadcrumbs() to be used from an atomic
> (timer/softirq) context such as required for resetting an engine.
>
> It also allows us to react better to the user poking around debugfs for
> testing missed irqs.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/intel_breadcrumbs.c | 27 +++++++++++++++++-------
>  1 file changed, 19 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> index 671a6d61e29d..0a2128c6b418 100644
> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -130,11 +130,12 @@ static void intel_breadcrumbs_hangcheck(struct timer_list *t)
>  
>  static void intel_breadcrumbs_fake_irq(struct timer_list *t)
>  {
> -	struct intel_engine_cs *engine = from_timer(engine, t,
> -						    breadcrumbs.fake_irq);
> +	struct intel_engine_cs *engine =
> +		from_timer(engine, t, breadcrumbs.fake_irq);
>  	struct intel_breadcrumbs *b = &engine->breadcrumbs;
>  
> -	/* The timer persists in case we cannot enable interrupts,
> +	/*
> +	 * The timer persists in case we cannot enable interrupts,
>  	 * or if we have previously seen seqno/interrupt incoherency
>  	 * ("missed interrupt" syndrome, better known as a "missed breadcrumb").
>  	 * Here the worker will wake up every jiffie in order to kick the
> @@ -148,6 +149,12 @@ static void intel_breadcrumbs_fake_irq(struct timer_list *t)
>  	if (!b->irq_armed)
>  		return;
>  
> +	/* If the user has disabled the fake-irq, restore the hangchecking */
> +	if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) {
> +		mod_timer(&b->hangcheck, wait_timeout());
> +		return;
> +	}
> +

Looking at the cancel_fake_irq() now, which we still need to keep as
sync, I think there is race introduce now as this can queue itself.

I think we want to also change the cancel_fake_irq() to do
the bit clearing first, not last after the del_timer_syncs().

-Mika

>  	mod_timer(&b->fake_irq, jiffies + 1);
>  }
>  
> @@ -840,15 +847,22 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
>  {
>  	struct intel_breadcrumbs *b = &engine->breadcrumbs;
>  
> -	cancel_fake_irq(engine);
>  	spin_lock_irq(&b->irq_lock);
>  
> +	/*
> +	 * Leave the fake_irq timer enabled (if it is running), but clear the
> +	 * bit so that it turns itself off on its next wake up and goes back
> +	 * to the long hangcheck interval if still required.
> +	 */
> +	clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
> +
>  	if (b->irq_enabled)
>  		irq_enable(engine);
>  	else
>  		irq_disable(engine);
>  
> -	/* We set the IRQ_BREADCRUMB bit when we enable the irq presuming the
> +	/*
> +	 * We set the IRQ_BREADCRUMB bit when we enable the irq presuming the
>  	 * GPU is active and may have already executed the MI_USER_INTERRUPT
>  	 * before the CPU is ready to receive. However, the engine is currently
>  	 * idle (we haven't started it yet), there is no possibility for a
> @@ -857,9 +871,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
>  	 */
>  	clear_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted);
>  
> -	if (b->irq_armed)
> -		enable_fake_irq(b);
> -
>  	spin_unlock_irq(&b->irq_lock);
>  }
>  
> -- 
> 2.17.0
Chris Wilson April 23, 2018, 2:53 p.m. UTC | #2
Quoting Mika Kuoppala (2018-04-23 14:03:02)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> > @@ -148,6 +149,12 @@ static void intel_breadcrumbs_fake_irq(struct timer_list *t)
> >       if (!b->irq_armed)
> >               return;
> >  
> > +     /* If the user has disabled the fake-irq, restore the hangchecking */
> > +     if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) {
> > +             mod_timer(&b->hangcheck, wait_timeout());
> > +             return;
> > +     }
> > +
> 
> Looking at the cancel_fake_irq() now, which we still need to keep as
> sync, I think there is race introduce now as this can queue itself.
> 
> I think we want to also change the cancel_fake_irq() to do
> the bit clearing first, not last after the del_timer_syncs().

I see what you mean, but I think we want

	del_timer_sync(&b->fake_irq); // may queue b->hangcheck
	del_timer_sync(&b->hangcheck);
	clear_bit(engine->id. missed_irq_rings);

-Chris
Mika Kuoppala April 24, 2018, 11:06 a.m. UTC | #3
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Mika Kuoppala (2018-04-23 14:03:02)
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> > @@ -148,6 +149,12 @@ static void intel_breadcrumbs_fake_irq(struct timer_list *t)
>> >       if (!b->irq_armed)
>> >               return;
>> >  
>> > +     /* If the user has disabled the fake-irq, restore the hangchecking */
>> > +     if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) {
>> > +             mod_timer(&b->hangcheck, wait_timeout());
>> > +             return;
>> > +     }
>> > +
>> 
>> Looking at the cancel_fake_irq() now, which we still need to keep as
>> sync, I think there is race introduce now as this can queue itself.
>> 
>> I think we want to also change the cancel_fake_irq() to do
>> the bit clearing first, not last after the del_timer_syncs().
>
> I see what you mean, but I think we want
>
> 	del_timer_sync(&b->fake_irq); // may queue b->hangcheck
> 	del_timer_sync(&b->hangcheck);
> 	clear_bit(engine->id. missed_irq_rings);
>

Ok got it. Swapping the order makes the newly
queued hangcheck from fake_irq to be cancelled and
the clearing should be last.

But now as the cancel is done only in fini path,
consider adding assertion for !irq_armed in start of
cancel_fake_irq().

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

-Mika
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index 671a6d61e29d..0a2128c6b418 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -130,11 +130,12 @@  static void intel_breadcrumbs_hangcheck(struct timer_list *t)
 
 static void intel_breadcrumbs_fake_irq(struct timer_list *t)
 {
-	struct intel_engine_cs *engine = from_timer(engine, t,
-						    breadcrumbs.fake_irq);
+	struct intel_engine_cs *engine =
+		from_timer(engine, t, breadcrumbs.fake_irq);
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
 
-	/* The timer persists in case we cannot enable interrupts,
+	/*
+	 * The timer persists in case we cannot enable interrupts,
 	 * or if we have previously seen seqno/interrupt incoherency
 	 * ("missed interrupt" syndrome, better known as a "missed breadcrumb").
 	 * Here the worker will wake up every jiffie in order to kick the
@@ -148,6 +149,12 @@  static void intel_breadcrumbs_fake_irq(struct timer_list *t)
 	if (!b->irq_armed)
 		return;
 
+	/* If the user has disabled the fake-irq, restore the hangchecking */
+	if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) {
+		mod_timer(&b->hangcheck, wait_timeout());
+		return;
+	}
+
 	mod_timer(&b->fake_irq, jiffies + 1);
 }
 
@@ -840,15 +847,22 @@  void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
 {
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
 
-	cancel_fake_irq(engine);
 	spin_lock_irq(&b->irq_lock);
 
+	/*
+	 * Leave the fake_irq timer enabled (if it is running), but clear the
+	 * bit so that it turns itself off on its next wake up and goes back
+	 * to the long hangcheck interval if still required.
+	 */
+	clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
+
 	if (b->irq_enabled)
 		irq_enable(engine);
 	else
 		irq_disable(engine);
 
-	/* We set the IRQ_BREADCRUMB bit when we enable the irq presuming the
+	/*
+	 * We set the IRQ_BREADCRUMB bit when we enable the irq presuming the
 	 * GPU is active and may have already executed the MI_USER_INTERRUPT
 	 * before the CPU is ready to receive. However, the engine is currently
 	 * idle (we haven't started it yet), there is no possibility for a
@@ -857,9 +871,6 @@  void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
 	 */
 	clear_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted);
 
-	if (b->irq_armed)
-		enable_fake_irq(b);
-
 	spin_unlock_irq(&b->irq_lock);
 }