diff mbox series

[29/34] drm/i915: Drop fake breadcrumb irq

Message ID 20190121222117.23305-30-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [01/34] drm/i915/execlists: Mark up priority boost on preemption | expand

Commit Message

Chris Wilson Jan. 21, 2019, 10:21 p.m. UTC
Missed breadcrumb detection is defunct due to the tight coupling with
dma_fence signaling and the myriad ways we may signal fences from
everywhere but from an interrupt, i.e. we frequently signal a fence
before we even see its interrupt. This means that even if we miss an
interrupt for a fence, it still is signaled before our breadcrumb
hangcheck fires, so simplify the breadcrumb hangchecking by moving it
into the GPU hangcheck and forgo fake interrupts.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |  93 -----------
 drivers/gpu/drm/i915/i915_gpu_error.c         |   2 -
 drivers/gpu/drm/i915/i915_gpu_error.h         |   5 -
 drivers/gpu/drm/i915/intel_breadcrumbs.c      | 147 +-----------------
 drivers/gpu/drm/i915/intel_hangcheck.c        |   2 +
 drivers/gpu/drm/i915/intel_ringbuffer.h       |   5 -
 .../gpu/drm/i915/selftests/igt_live_test.c    |   7 -
 7 files changed, 5 insertions(+), 256 deletions(-)

Comments

Tvrtko Ursulin Jan. 24, 2019, 5:55 p.m. UTC | #1
On 21/01/2019 22:21, Chris Wilson wrote:
> Missed breadcrumb detection is defunct due to the tight coupling with

How it is defunct.. oh because there is no irq_count any more... could 
you describe the transition from irq_count to irq_fired and then to 
nothing briefly?

> dma_fence signaling and the myriad ways we may signal fences from
> everywhere but from an interrupt, i.e. we frequently signal a fence
> before we even see its interrupt. This means that even if we miss an
> interrupt for a fence, it still is signaled before our breadcrumb
> hangcheck fires, so simplify the breadcrumb hangchecking by moving it
> into the GPU hangcheck and forgo fake interrupts.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_debugfs.c           |  93 -----------
>   drivers/gpu/drm/i915/i915_gpu_error.c         |   2 -
>   drivers/gpu/drm/i915/i915_gpu_error.h         |   5 -
>   drivers/gpu/drm/i915/intel_breadcrumbs.c      | 147 +-----------------
>   drivers/gpu/drm/i915/intel_hangcheck.c        |   2 +
>   drivers/gpu/drm/i915/intel_ringbuffer.h       |   5 -
>   .../gpu/drm/i915/selftests/igt_live_test.c    |   7 -
>   7 files changed, 5 insertions(+), 256 deletions(-)

With this balance of insertions and deletions I cannot decide if this is 
easy or hard to review.

IGT uses intel_detect_and_clear_missed_interrupts a lot. What is the 
plan there?

> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index d7764e62e9b4..c2aaf010c3d1 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1321,9 +1321,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
>   			   intel_engine_last_submit(engine),
>   			   jiffies_to_msecs(jiffies -
>   					    engine->hangcheck.action_timestamp));
> -		seq_printf(m, "\tfake irq active? %s\n",
> -			   yesno(test_bit(engine->id,
> -					  &dev_priv->gpu_error.missed_irq_rings)));
>   
>   		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
>   			   (long long)engine->hangcheck.acthd,
> @@ -3874,94 +3871,6 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
>   			i915_wedged_get, i915_wedged_set,
>   			"%llu\n");
>   
> -static int
> -fault_irq_set(struct drm_i915_private *i915,
> -	      unsigned long *irq,
> -	      unsigned long val)
> -{
> -	int err;
> -
> -	err = mutex_lock_interruptible(&i915->drm.struct_mutex);
> -	if (err)
> -		return err;
> -
> -	err = i915_gem_wait_for_idle(i915,
> -				     I915_WAIT_LOCKED |
> -				     I915_WAIT_INTERRUPTIBLE,
> -				     MAX_SCHEDULE_TIMEOUT);
> -	if (err)
> -		goto err_unlock;
> -
> -	*irq = val;
> -	mutex_unlock(&i915->drm.struct_mutex);
> -
> -	/* Flush idle worker to disarm irq */
> -	drain_delayed_work(&i915->gt.idle_work);
> -
> -	return 0;
> -
> -err_unlock:
> -	mutex_unlock(&i915->drm.struct_mutex);
> -	return err;
> -}
> -
> -static int
> -i915_ring_missed_irq_get(void *data, u64 *val)
> -{
> -	struct drm_i915_private *dev_priv = data;
> -
> -	*val = dev_priv->gpu_error.missed_irq_rings;
> -	return 0;
> -}
> -
> -static int
> -i915_ring_missed_irq_set(void *data, u64 val)
> -{
> -	struct drm_i915_private *i915 = data;
> -
> -	return fault_irq_set(i915, &i915->gpu_error.missed_irq_rings, val);
> -}
> -
> -DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
> -			i915_ring_missed_irq_get, i915_ring_missed_irq_set,
> -			"0x%08llx\n");
> -
> -static int
> -i915_ring_test_irq_get(void *data, u64 *val)
> -{
> -	struct drm_i915_private *dev_priv = data;
> -
> -	*val = dev_priv->gpu_error.test_irq_rings;
> -
> -	return 0;
> -}
> -
> -static int
> -i915_ring_test_irq_set(void *data, u64 val)
> -{
> -	struct drm_i915_private *i915 = data;
> -
> -	/* GuC keeps the user interrupt permanently enabled for submission */
> -	if (USES_GUC_SUBMISSION(i915))
> -		return -ENODEV;
> -
> -	/*
> -	 * From icl, we can no longer individually mask interrupt generation
> -	 * from each engine.
> -	 */
> -	if (INTEL_GEN(i915) >= 11)
> -		return -ENODEV;
> -
> -	val &= INTEL_INFO(i915)->ring_mask;
> -	DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
> -
> -	return fault_irq_set(i915, &i915->gpu_error.test_irq_rings, val);
> -}
> -
> -DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
> -			i915_ring_test_irq_get, i915_ring_test_irq_set,
> -			"0x%08llx\n");
> -
>   #define DROP_UNBOUND	BIT(0)
>   #define DROP_BOUND	BIT(1)
>   #define DROP_RETIRE	BIT(2)
> @@ -4724,8 +4633,6 @@ static const struct i915_debugfs_files {
>   } i915_debugfs_files[] = {
>   	{"i915_wedged", &i915_wedged_fops},
>   	{"i915_cache_sharing", &i915_cache_sharing_fops},
> -	{"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
> -	{"i915_ring_test_irq", &i915_ring_test_irq_fops},
>   	{"i915_gem_drop_caches", &i915_drop_caches_fops},
>   #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
>   	{"i915_error_state", &i915_error_state_fops},
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 825572127029..0584c8dfa6ae 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -718,8 +718,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
>   	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
>   	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
>   	err_printf(m, "CCID: 0x%08x\n", error->ccid);
> -	err_printf(m, "Missed interrupts: 0x%08lx\n",
> -		   m->i915->gpu_error.missed_irq_rings);
>   
>   	for (i = 0; i < error->nfence; i++)
>   		err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index 0e184712cbcc..99a53c0cd6da 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -203,8 +203,6 @@ struct i915_gpu_error {
>   
>   	atomic_t pending_fb_pin;
>   
> -	unsigned long missed_irq_rings;
> -
>   	/**
>   	 * State variable controlling the reset flow and count
>   	 *
> @@ -273,9 +271,6 @@ struct i915_gpu_error {
>   	 */
>   	wait_queue_head_t reset_queue;
>   
> -	/* For missed irq/seqno simulation. */
> -	unsigned long test_irq_rings;
> -
>   	struct i915_gpu_restart *restart;
>   };
>   
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> index faeb0083b561..3bdfa63ea4a1 100644
> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -91,7 +91,6 @@ bool intel_engine_breadcrumbs_irq(struct intel_engine_cs *engine)
>   
>   	spin_lock(&b->irq_lock);
>   
> -	b->irq_fired = true;
>   	if (b->irq_armed && list_empty(&b->signalers))
>   		__intel_breadcrumbs_disarm_irq(b);
>   
> @@ -155,86 +154,6 @@ static void signal_irq_work(struct irq_work *work)
>   	intel_engine_breadcrumbs_irq(engine);
>   }
>   
> -static unsigned long wait_timeout(void)
> -{
> -	return round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES);
> -}
> -
> -static noinline void missed_breadcrumb(struct intel_engine_cs *engine)
> -{
> -	if (GEM_SHOW_DEBUG()) {
> -		struct drm_printer p = drm_debug_printer(__func__);
> -
> -		intel_engine_dump(engine, &p,
> -				  "%s missed breadcrumb at %pS\n",
> -				  engine->name, __builtin_return_address(0));
> -	}
> -
> -	set_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
> -}
> -
> -static void intel_breadcrumbs_hangcheck(struct timer_list *t)
> -{
> -	struct intel_engine_cs *engine =
> -		from_timer(engine, t, breadcrumbs.hangcheck);
> -	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> -
> -	if (!b->irq_armed)
> -		return;
> -
> -	if (b->irq_fired)
> -		goto rearm;
> -
> -	/*
> -	 * We keep the hangcheck timer alive until we disarm the irq, even
> -	 * if there are no waiters at present.
> -	 *
> -	 * If the waiter was currently running, assume it hasn't had a chance
> -	 * to process the pending interrupt (e.g, low priority task on a loaded
> -	 * system) and wait until it sleeps before declaring a missed interrupt.
> -	 *
> -	 * If the waiter was asleep (and not even pending a wakeup), then we
> -	 * must have missed an interrupt as the GPU has stopped advancing
> -	 * but we still have a waiter. Assuming all batches complete within
> -	 * DRM_I915_HANGCHECK_JIFFIES [1.5s]!
> -	 */
> -	synchronize_hardirq(engine->i915->drm.irq);
> -	if (intel_engine_signal_breadcrumbs(engine)) {
> -		missed_breadcrumb(engine);
> -		mod_timer(&b->fake_irq, jiffies + 1);
> -	} else {
> -rearm:
> -		b->irq_fired = false;
> -		mod_timer(&b->hangcheck, wait_timeout());
> -	}
> -}
> -
> -static void intel_breadcrumbs_fake_irq(struct timer_list *t)
> -{
> -	struct intel_engine_cs *engine =
> -		from_timer(engine, t, breadcrumbs.fake_irq);
> -	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> -
> -	/*
> -	 * The timer persists in case we cannot enable interrupts,
> -	 * or if we have previously seen seqno/interrupt incoherency
> -	 * ("missed interrupt" syndrome, better known as a "missed breadcrumb").
> -	 * Here the worker will wake up every jiffie in order to kick the
> -	 * oldest waiter to do the coherent seqno check.
> -	 */
> -
> -	if (!intel_engine_signal_breadcrumbs(engine) && !b->irq_armed)
> -		return;
> -
> -	/* If the user has disabled the fake-irq, restore the hangchecking */
> -	if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) {
> -		mod_timer(&b->hangcheck, wait_timeout());
> -		return;
> -	}
> -
> -	mod_timer(&b->fake_irq, jiffies + 1);
> -}
> -
>   void intel_engine_pin_breadcrumbs_irq(struct intel_engine_cs *engine)
>   {
>   	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> @@ -257,43 +176,14 @@ void intel_engine_unpin_breadcrumbs_irq(struct intel_engine_cs *engine)
>   	spin_unlock_irq(&b->irq_lock);
>   }
>   
> -static bool use_fake_irq(const struct intel_breadcrumbs *b)
> -{
> -	const struct intel_engine_cs *engine =
> -		container_of(b, struct intel_engine_cs, breadcrumbs);
> -
> -	if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings))
> -		return false;
> -
> -	/*
> -	 * Only start with the heavy weight fake irq timer if we have not
> -	 * seen any interrupts since enabling it the first time. If the
> -	 * interrupts are still arriving, it means we made a mistake in our
> -	 * engine->seqno_barrier(), a timing error that should be transient
> -	 * and unlikely to reoccur.
> -	 */
> -	return !b->irq_fired;
> -}
> -
> -static void enable_fake_irq(struct intel_breadcrumbs *b)
> -{
> -	/* Ensure we never sleep indefinitely */
> -	if (!b->irq_enabled || use_fake_irq(b))
> -		mod_timer(&b->fake_irq, jiffies + 1);
> -	else
> -		mod_timer(&b->hangcheck, wait_timeout());
> -}
> -
> -static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
> +static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
>   {
>   	struct intel_engine_cs *engine =
>   		container_of(b, struct intel_engine_cs, breadcrumbs);
> -	struct drm_i915_private *i915 = engine->i915;
> -	bool enabled;
>   
>   	lockdep_assert_held(&b->irq_lock);
>   	if (b->irq_armed)
> -		return false;
> +		return;
>   
>   	/*
>   	 * The breadcrumb irq will be disarmed on the interrupt after the
> @@ -311,16 +201,8 @@ static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
>   	 * the driver is idle) we disarm the breadcrumbs.
>   	 */
>   
> -	/* No interrupts? Kick the waiter every jiffie! */
> -	enabled = false;
> -	if (!b->irq_enabled++ &&
> -	    !test_bit(engine->id, &i915->gpu_error.test_irq_rings)) {
> +	if (!b->irq_enabled++)
>   		irq_enable(engine);
> -		enabled = true;
> -	}
> -
> -	enable_fake_irq(b);
> -	return enabled;
>   }
>   
>   void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
> @@ -331,18 +213,6 @@ void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
>   	INIT_LIST_HEAD(&b->signalers);
>   
>   	init_irq_work(&b->irq_work, signal_irq_work);
> -
> -	timer_setup(&b->fake_irq, intel_breadcrumbs_fake_irq, 0);
> -	timer_setup(&b->hangcheck, intel_breadcrumbs_hangcheck, 0);
> -}
> -
> -static void cancel_fake_irq(struct intel_engine_cs *engine)
> -{
> -	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> -
> -	del_timer_sync(&b->fake_irq); /* may queue b->hangcheck */
> -	del_timer_sync(&b->hangcheck);
> -	clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
>   }
>   
>   void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
> @@ -352,13 +222,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
>   
>   	spin_lock_irqsave(&b->irq_lock, flags);
>   
> -	/*
> -	 * Leave the fake_irq timer enabled (if it is running), but clear the
> -	 * bit so that it turns itself off on its next wake up and goes back
> -	 * to the long hangcheck interval if still required.
> -	 */
> -	clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
> -
>   	if (b->irq_enabled)
>   		irq_enable(engine);
>   	else
> @@ -369,7 +232,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
>   
>   void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
>   {
> -	cancel_fake_irq(engine);
>   }
>   
>   bool intel_engine_enable_signaling(struct i915_request *rq)
> @@ -451,7 +313,4 @@ void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine,
>   		}
>   	}
>   	spin_unlock_irq(&b->irq_lock);
> -
> -	if (test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings))
> -		drm_printf(p, "Fake irq active\n");
>   }
> diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
> index 5662d6fed523..a219c796e56d 100644
> --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> @@ -275,6 +275,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>   	for_each_engine(engine, dev_priv, id) {
>   		struct hangcheck hc;
>   
> +		intel_engine_signal_breadcrumbs(engine);
> +

Sounds fine. So only downside is detecting missed interrupts gets 
slower. But in practice they don't happen often?

>   		hangcheck_load_sample(engine, &hc);
>   		hangcheck_accumulate_sample(engine, &hc);
>   		hangcheck_store_sample(engine, &hc);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index b78cb9bd4bc2..7eec96cf2a0b 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -382,14 +382,9 @@ struct intel_engine_cs {
>   
>   		struct irq_work irq_work;
>   
> -		struct timer_list fake_irq; /* used after a missed interrupt */
> -		struct timer_list hangcheck; /* detect missed interrupts */
> -
> -		unsigned int hangcheck_interrupts;
>   		unsigned int irq_enabled;
>   
>   		bool irq_armed;
> -		bool irq_fired;
>   	} breadcrumbs;
>   
>   	struct {
> diff --git a/drivers/gpu/drm/i915/selftests/igt_live_test.c b/drivers/gpu/drm/i915/selftests/igt_live_test.c
> index 5deb485fb942..3e902761cd16 100644
> --- a/drivers/gpu/drm/i915/selftests/igt_live_test.c
> +++ b/drivers/gpu/drm/i915/selftests/igt_live_test.c
> @@ -35,7 +35,6 @@ int igt_live_test_begin(struct igt_live_test *t,
>   		return err;
>   	}
>   
> -	i915->gpu_error.missed_irq_rings = 0;
>   	t->reset_global = i915_reset_count(&i915->gpu_error);
>   
>   	for_each_engine(engine, i915, id)
> @@ -75,11 +74,5 @@ int igt_live_test_end(struct igt_live_test *t)
>   		return -EIO;
>   	}
>   
> -	if (i915->gpu_error.missed_irq_rings) {
> -		pr_err("%s(%s): Missed interrupts on engines %lx\n",
> -		       t->func, t->name, i915->gpu_error.missed_irq_rings);
> -		return -EIO;
> -	}
> -
>   	return 0;
>   }
> 

Regards,

Tvrtko
Chris Wilson Jan. 24, 2019, 6:18 p.m. UTC | #2
Quoting Tvrtko Ursulin (2019-01-24 17:55:20)
> 
> On 21/01/2019 22:21, Chris Wilson wrote:
> > Missed breadcrumb detection is defunct due to the tight coupling with
> 
> How it is defunct.. oh because there is no irq_count any more... could 
> you describe the transition from irq_count to irq_fired and then to 
> nothing briefly?

We don't have an independent intel_wait to distinguish irq completions
from dma_fence_signals. Everytime we call dma_fence_signal() we think we
saw an interrupt, and we complete fences very often before we see
interrupts... And then our test completely fails to setup the machine to
detect the missed breadcrumb as the requests get completed by anything
but the missed breadcrumb timer.

> > dma_fence signaling and the myriad ways we may signal fences from
> > everywhere but from an interrupt, i.e. we frequently signal a fence
> > before we even see its interrupt. This means that even if we miss an
> > interrupt for a fence, it still is signaled before our breadcrumb
> > hangcheck fires, so simplify the breadcrumb hangchecking by moving it
> > into the GPU hangcheck and forgo fake interrupts.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >   drivers/gpu/drm/i915/i915_debugfs.c           |  93 -----------
> >   drivers/gpu/drm/i915/i915_gpu_error.c         |   2 -
> >   drivers/gpu/drm/i915/i915_gpu_error.h         |   5 -
> >   drivers/gpu/drm/i915/intel_breadcrumbs.c      | 147 +-----------------
> >   drivers/gpu/drm/i915/intel_hangcheck.c        |   2 +
> >   drivers/gpu/drm/i915/intel_ringbuffer.h       |   5 -
> >   .../gpu/drm/i915/selftests/igt_live_test.c    |   7 -
> >   7 files changed, 5 insertions(+), 256 deletions(-)
> 
> With this balance of insertions and deletions I cannot decide if this is 
> easy or hard to review.
> 
> IGT uses intel_detect_and_clear_missed_interrupts a lot. What is the 
> plan there?

They are defunct. They no longer detect anything useful from the
previous patch, this just makes it official. igt has been prepped for
the loss of the debugfs.

Without this patch we get false positives from i915_missed_interrupt
instead.

I've tried and failed to replace the detection in an acceptable manner,
without a separate irq completion tracker it seems hopeless.

> > diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
> > index 5662d6fed523..a219c796e56d 100644
> > --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> > +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> > @@ -275,6 +275,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
> >       for_each_engine(engine, dev_priv, id) {
> >               struct hangcheck hc;
> >   
> > +             intel_engine_signal_breadcrumbs(engine);
> > +
> 
> Sounds fine. So only downside is detecting missed interrupts gets 
> slower. But in practice they don't happen often?

In practice, no missed interrupts. *touch wood*
That was why fixing gen5-gen7 beforehand was so important.

Having a signal here and in retire_work, means that in the worst case
everything updates once a second. Enough for users to be able to
complain. But more than likely every frame update will flush the earlier
requests anyway, hence why we couldn't detect missed breadcrumbs in igt
in the first place.
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index d7764e62e9b4..c2aaf010c3d1 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1321,9 +1321,6 @@  static int i915_hangcheck_info(struct seq_file *m, void *unused)
 			   intel_engine_last_submit(engine),
 			   jiffies_to_msecs(jiffies -
 					    engine->hangcheck.action_timestamp));
-		seq_printf(m, "\tfake irq active? %s\n",
-			   yesno(test_bit(engine->id,
-					  &dev_priv->gpu_error.missed_irq_rings)));
 
 		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
 			   (long long)engine->hangcheck.acthd,
@@ -3874,94 +3871,6 @@  DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
 			i915_wedged_get, i915_wedged_set,
 			"%llu\n");
 
-static int
-fault_irq_set(struct drm_i915_private *i915,
-	      unsigned long *irq,
-	      unsigned long val)
-{
-	int err;
-
-	err = mutex_lock_interruptible(&i915->drm.struct_mutex);
-	if (err)
-		return err;
-
-	err = i915_gem_wait_for_idle(i915,
-				     I915_WAIT_LOCKED |
-				     I915_WAIT_INTERRUPTIBLE,
-				     MAX_SCHEDULE_TIMEOUT);
-	if (err)
-		goto err_unlock;
-
-	*irq = val;
-	mutex_unlock(&i915->drm.struct_mutex);
-
-	/* Flush idle worker to disarm irq */
-	drain_delayed_work(&i915->gt.idle_work);
-
-	return 0;
-
-err_unlock:
-	mutex_unlock(&i915->drm.struct_mutex);
-	return err;
-}
-
-static int
-i915_ring_missed_irq_get(void *data, u64 *val)
-{
-	struct drm_i915_private *dev_priv = data;
-
-	*val = dev_priv->gpu_error.missed_irq_rings;
-	return 0;
-}
-
-static int
-i915_ring_missed_irq_set(void *data, u64 val)
-{
-	struct drm_i915_private *i915 = data;
-
-	return fault_irq_set(i915, &i915->gpu_error.missed_irq_rings, val);
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
-			i915_ring_missed_irq_get, i915_ring_missed_irq_set,
-			"0x%08llx\n");
-
-static int
-i915_ring_test_irq_get(void *data, u64 *val)
-{
-	struct drm_i915_private *dev_priv = data;
-
-	*val = dev_priv->gpu_error.test_irq_rings;
-
-	return 0;
-}
-
-static int
-i915_ring_test_irq_set(void *data, u64 val)
-{
-	struct drm_i915_private *i915 = data;
-
-	/* GuC keeps the user interrupt permanently enabled for submission */
-	if (USES_GUC_SUBMISSION(i915))
-		return -ENODEV;
-
-	/*
-	 * From icl, we can no longer individually mask interrupt generation
-	 * from each engine.
-	 */
-	if (INTEL_GEN(i915) >= 11)
-		return -ENODEV;
-
-	val &= INTEL_INFO(i915)->ring_mask;
-	DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
-
-	return fault_irq_set(i915, &i915->gpu_error.test_irq_rings, val);
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
-			i915_ring_test_irq_get, i915_ring_test_irq_set,
-			"0x%08llx\n");
-
 #define DROP_UNBOUND	BIT(0)
 #define DROP_BOUND	BIT(1)
 #define DROP_RETIRE	BIT(2)
@@ -4724,8 +4633,6 @@  static const struct i915_debugfs_files {
 } i915_debugfs_files[] = {
 	{"i915_wedged", &i915_wedged_fops},
 	{"i915_cache_sharing", &i915_cache_sharing_fops},
-	{"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
-	{"i915_ring_test_irq", &i915_ring_test_irq_fops},
 	{"i915_gem_drop_caches", &i915_drop_caches_fops},
 #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
 	{"i915_error_state", &i915_error_state_fops},
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 825572127029..0584c8dfa6ae 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -718,8 +718,6 @@  static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
 	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
 	err_printf(m, "CCID: 0x%08x\n", error->ccid);
-	err_printf(m, "Missed interrupts: 0x%08lx\n",
-		   m->i915->gpu_error.missed_irq_rings);
 
 	for (i = 0; i < error->nfence; i++)
 		err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 0e184712cbcc..99a53c0cd6da 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -203,8 +203,6 @@  struct i915_gpu_error {
 
 	atomic_t pending_fb_pin;
 
-	unsigned long missed_irq_rings;
-
 	/**
 	 * State variable controlling the reset flow and count
 	 *
@@ -273,9 +271,6 @@  struct i915_gpu_error {
 	 */
 	wait_queue_head_t reset_queue;
 
-	/* For missed irq/seqno simulation. */
-	unsigned long test_irq_rings;
-
 	struct i915_gpu_restart *restart;
 };
 
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index faeb0083b561..3bdfa63ea4a1 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -91,7 +91,6 @@  bool intel_engine_breadcrumbs_irq(struct intel_engine_cs *engine)
 
 	spin_lock(&b->irq_lock);
 
-	b->irq_fired = true;
 	if (b->irq_armed && list_empty(&b->signalers))
 		__intel_breadcrumbs_disarm_irq(b);
 
@@ -155,86 +154,6 @@  static void signal_irq_work(struct irq_work *work)
 	intel_engine_breadcrumbs_irq(engine);
 }
 
-static unsigned long wait_timeout(void)
-{
-	return round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES);
-}
-
-static noinline void missed_breadcrumb(struct intel_engine_cs *engine)
-{
-	if (GEM_SHOW_DEBUG()) {
-		struct drm_printer p = drm_debug_printer(__func__);
-
-		intel_engine_dump(engine, &p,
-				  "%s missed breadcrumb at %pS\n",
-				  engine->name, __builtin_return_address(0));
-	}
-
-	set_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
-}
-
-static void intel_breadcrumbs_hangcheck(struct timer_list *t)
-{
-	struct intel_engine_cs *engine =
-		from_timer(engine, t, breadcrumbs.hangcheck);
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-	if (!b->irq_armed)
-		return;
-
-	if (b->irq_fired)
-		goto rearm;
-
-	/*
-	 * We keep the hangcheck timer alive until we disarm the irq, even
-	 * if there are no waiters at present.
-	 *
-	 * If the waiter was currently running, assume it hasn't had a chance
-	 * to process the pending interrupt (e.g, low priority task on a loaded
-	 * system) and wait until it sleeps before declaring a missed interrupt.
-	 *
-	 * If the waiter was asleep (and not even pending a wakeup), then we
-	 * must have missed an interrupt as the GPU has stopped advancing
-	 * but we still have a waiter. Assuming all batches complete within
-	 * DRM_I915_HANGCHECK_JIFFIES [1.5s]!
-	 */
-	synchronize_hardirq(engine->i915->drm.irq);
-	if (intel_engine_signal_breadcrumbs(engine)) {
-		missed_breadcrumb(engine);
-		mod_timer(&b->fake_irq, jiffies + 1);
-	} else {
-rearm:
-		b->irq_fired = false;
-		mod_timer(&b->hangcheck, wait_timeout());
-	}
-}
-
-static void intel_breadcrumbs_fake_irq(struct timer_list *t)
-{
-	struct intel_engine_cs *engine =
-		from_timer(engine, t, breadcrumbs.fake_irq);
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-	/*
-	 * The timer persists in case we cannot enable interrupts,
-	 * or if we have previously seen seqno/interrupt incoherency
-	 * ("missed interrupt" syndrome, better known as a "missed breadcrumb").
-	 * Here the worker will wake up every jiffie in order to kick the
-	 * oldest waiter to do the coherent seqno check.
-	 */
-
-	if (!intel_engine_signal_breadcrumbs(engine) && !b->irq_armed)
-		return;
-
-	/* If the user has disabled the fake-irq, restore the hangchecking */
-	if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) {
-		mod_timer(&b->hangcheck, wait_timeout());
-		return;
-	}
-
-	mod_timer(&b->fake_irq, jiffies + 1);
-}
-
 void intel_engine_pin_breadcrumbs_irq(struct intel_engine_cs *engine)
 {
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
@@ -257,43 +176,14 @@  void intel_engine_unpin_breadcrumbs_irq(struct intel_engine_cs *engine)
 	spin_unlock_irq(&b->irq_lock);
 }
 
-static bool use_fake_irq(const struct intel_breadcrumbs *b)
-{
-	const struct intel_engine_cs *engine =
-		container_of(b, struct intel_engine_cs, breadcrumbs);
-
-	if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings))
-		return false;
-
-	/*
-	 * Only start with the heavy weight fake irq timer if we have not
-	 * seen any interrupts since enabling it the first time. If the
-	 * interrupts are still arriving, it means we made a mistake in our
-	 * engine->seqno_barrier(), a timing error that should be transient
-	 * and unlikely to reoccur.
-	 */
-	return !b->irq_fired;
-}
-
-static void enable_fake_irq(struct intel_breadcrumbs *b)
-{
-	/* Ensure we never sleep indefinitely */
-	if (!b->irq_enabled || use_fake_irq(b))
-		mod_timer(&b->fake_irq, jiffies + 1);
-	else
-		mod_timer(&b->hangcheck, wait_timeout());
-}
-
-static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
+static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
 {
 	struct intel_engine_cs *engine =
 		container_of(b, struct intel_engine_cs, breadcrumbs);
-	struct drm_i915_private *i915 = engine->i915;
-	bool enabled;
 
 	lockdep_assert_held(&b->irq_lock);
 	if (b->irq_armed)
-		return false;
+		return;
 
 	/*
 	 * The breadcrumb irq will be disarmed on the interrupt after the
@@ -311,16 +201,8 @@  static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
 	 * the driver is idle) we disarm the breadcrumbs.
 	 */
 
-	/* No interrupts? Kick the waiter every jiffie! */
-	enabled = false;
-	if (!b->irq_enabled++ &&
-	    !test_bit(engine->id, &i915->gpu_error.test_irq_rings)) {
+	if (!b->irq_enabled++)
 		irq_enable(engine);
-		enabled = true;
-	}
-
-	enable_fake_irq(b);
-	return enabled;
 }
 
 void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
@@ -331,18 +213,6 @@  void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
 	INIT_LIST_HEAD(&b->signalers);
 
 	init_irq_work(&b->irq_work, signal_irq_work);
-
-	timer_setup(&b->fake_irq, intel_breadcrumbs_fake_irq, 0);
-	timer_setup(&b->hangcheck, intel_breadcrumbs_hangcheck, 0);
-}
-
-static void cancel_fake_irq(struct intel_engine_cs *engine)
-{
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-	del_timer_sync(&b->fake_irq); /* may queue b->hangcheck */
-	del_timer_sync(&b->hangcheck);
-	clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
 }
 
 void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
@@ -352,13 +222,6 @@  void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
 
 	spin_lock_irqsave(&b->irq_lock, flags);
 
-	/*
-	 * Leave the fake_irq timer enabled (if it is running), but clear the
-	 * bit so that it turns itself off on its next wake up and goes back
-	 * to the long hangcheck interval if still required.
-	 */
-	clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
-
 	if (b->irq_enabled)
 		irq_enable(engine);
 	else
@@ -369,7 +232,6 @@  void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
 
 void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
 {
-	cancel_fake_irq(engine);
 }
 
 bool intel_engine_enable_signaling(struct i915_request *rq)
@@ -451,7 +313,4 @@  void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine,
 		}
 	}
 	spin_unlock_irq(&b->irq_lock);
-
-	if (test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings))
-		drm_printf(p, "Fake irq active\n");
 }
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 5662d6fed523..a219c796e56d 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -275,6 +275,8 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 	for_each_engine(engine, dev_priv, id) {
 		struct hangcheck hc;
 
+		intel_engine_signal_breadcrumbs(engine);
+
 		hangcheck_load_sample(engine, &hc);
 		hangcheck_accumulate_sample(engine, &hc);
 		hangcheck_store_sample(engine, &hc);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index b78cb9bd4bc2..7eec96cf2a0b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -382,14 +382,9 @@  struct intel_engine_cs {
 
 		struct irq_work irq_work;
 
-		struct timer_list fake_irq; /* used after a missed interrupt */
-		struct timer_list hangcheck; /* detect missed interrupts */
-
-		unsigned int hangcheck_interrupts;
 		unsigned int irq_enabled;
 
 		bool irq_armed;
-		bool irq_fired;
 	} breadcrumbs;
 
 	struct {
diff --git a/drivers/gpu/drm/i915/selftests/igt_live_test.c b/drivers/gpu/drm/i915/selftests/igt_live_test.c
index 5deb485fb942..3e902761cd16 100644
--- a/drivers/gpu/drm/i915/selftests/igt_live_test.c
+++ b/drivers/gpu/drm/i915/selftests/igt_live_test.c
@@ -35,7 +35,6 @@  int igt_live_test_begin(struct igt_live_test *t,
 		return err;
 	}
 
-	i915->gpu_error.missed_irq_rings = 0;
 	t->reset_global = i915_reset_count(&i915->gpu_error);
 
 	for_each_engine(engine, i915, id)
@@ -75,11 +74,5 @@  int igt_live_test_end(struct igt_live_test *t)
 		return -EIO;
 	}
 
-	if (i915->gpu_error.missed_irq_rings) {
-		pr_err("%s(%s): Missed interrupts on engines %lx\n",
-		       t->func, t->name, i915->gpu_error.missed_irq_rings);
-		return -EIO;
-	}
-
 	return 0;
 }