Message ID | 20190125023005.1007-28-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [01/33] drm/i915/execlists: Move RPCS setup to context pin | expand |
On 25/01/2019 02:30, Chris Wilson wrote: > Missed breadcrumb detection is defunct due to the tight coupling with > dma_fence signaling and the myriad ways we may signal fences from > everywhere but from an interrupt, i.e. we frequently signal a fence > before we even see its interrupt. This means that even if we miss an > interrupt for a fence, it still is signaled before our breadcrumb > hangcheck fires, so simplify the breadcrumb hangchecking by moving it > into the GPU hangcheck and forgo fake interrupts. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Regards, Tvrtko > --- > drivers/gpu/drm/i915/i915_debugfs.c | 93 ----------- > drivers/gpu/drm/i915/i915_gpu_error.c | 2 - > drivers/gpu/drm/i915/i915_gpu_error.h | 5 - > drivers/gpu/drm/i915/intel_breadcrumbs.c | 147 +----------------- > drivers/gpu/drm/i915/intel_hangcheck.c | 2 + > drivers/gpu/drm/i915/intel_ringbuffer.h | 5 - > .../gpu/drm/i915/selftests/igt_live_test.c | 7 - > 7 files changed, 5 insertions(+), 256 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > index e0ca3987eed6..fb216ce8dd60 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -1322,9 +1322,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) > intel_engine_last_submit(engine), > jiffies_to_msecs(jiffies - > engine->hangcheck.action_timestamp)); > - seq_printf(m, "\tfake irq active? %s\n", > - yesno(test_bit(engine->id, > - &dev_priv->gpu_error.missed_irq_rings))); > > seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n", > (long long)engine->hangcheck.acthd, > @@ -3900,94 +3897,6 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops, > i915_wedged_get, i915_wedged_set, > "%llu\n"); > > -static int > -fault_irq_set(struct drm_i915_private *i915, > - unsigned long *irq, > - unsigned long val) > -{ > - int err; > - > - err = mutex_lock_interruptible(&i915->drm.struct_mutex); > - if (err) > - return err; > - > - err = i915_gem_wait_for_idle(i915, > - I915_WAIT_LOCKED | > - I915_WAIT_INTERRUPTIBLE, > - MAX_SCHEDULE_TIMEOUT); > - if (err) > - goto err_unlock; > - > - *irq = val; > - mutex_unlock(&i915->drm.struct_mutex); > - > - /* Flush idle worker to disarm irq */ > - drain_delayed_work(&i915->gt.idle_work); > - > - return 0; > - > -err_unlock: > - mutex_unlock(&i915->drm.struct_mutex); > - return err; > -} > - > -static int > -i915_ring_missed_irq_get(void *data, u64 *val) > -{ > - struct drm_i915_private *dev_priv = data; > - > - *val = dev_priv->gpu_error.missed_irq_rings; > - return 0; > -} > - > -static int > -i915_ring_missed_irq_set(void *data, u64 val) > -{ > - struct drm_i915_private *i915 = data; > - > - return fault_irq_set(i915, &i915->gpu_error.missed_irq_rings, val); > -} > - > -DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops, > - i915_ring_missed_irq_get, i915_ring_missed_irq_set, > - "0x%08llx\n"); > - > -static int > -i915_ring_test_irq_get(void *data, u64 *val) > -{ > - struct drm_i915_private *dev_priv = data; > - > - *val = dev_priv->gpu_error.test_irq_rings; > - > - return 0; > -} > - > -static int > -i915_ring_test_irq_set(void *data, u64 val) > -{ > - struct drm_i915_private *i915 = data; > - > - /* GuC keeps the user interrupt permanently enabled for submission */ > - if (USES_GUC_SUBMISSION(i915)) > - return -ENODEV; > - > - /* > - * From icl, we can no longer individually mask interrupt generation > - * from each engine. > - */ > - if (INTEL_GEN(i915) >= 11) > - return -ENODEV; > - > - val &= INTEL_INFO(i915)->ring_mask; > - DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val); > - > - return fault_irq_set(i915, &i915->gpu_error.test_irq_rings, val); > -} > - > -DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops, > - i915_ring_test_irq_get, i915_ring_test_irq_set, > - "0x%08llx\n"); > - > #define DROP_UNBOUND BIT(0) > #define DROP_BOUND BIT(1) > #define DROP_RETIRE BIT(2) > @@ -4750,8 +4659,6 @@ static const struct i915_debugfs_files { > } i915_debugfs_files[] = { > {"i915_wedged", &i915_wedged_fops}, > {"i915_cache_sharing", &i915_cache_sharing_fops}, > - {"i915_ring_missed_irq", &i915_ring_missed_irq_fops}, > - {"i915_ring_test_irq", &i915_ring_test_irq_fops}, > {"i915_gem_drop_caches", &i915_drop_caches_fops}, > #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) > {"i915_error_state", &i915_error_state_fops}, > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > index 420b94341433..965356c460ce 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -723,8 +723,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, > err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); > err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); > err_printf(m, "CCID: 0x%08x\n", error->ccid); > - err_printf(m, "Missed interrupts: 0x%08lx\n", > - m->i915->gpu_error.missed_irq_rings); > > for (i = 0; i < error->nfence; i++) > err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h > index 74757c424aab..53b1f22dd365 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.h > +++ b/drivers/gpu/drm/i915/i915_gpu_error.h > @@ -204,8 +204,6 @@ struct i915_gpu_error { > > atomic_t pending_fb_pin; > > - unsigned long missed_irq_rings; > - > /** > * State variable controlling the reset flow and count > * > @@ -274,9 +272,6 @@ struct i915_gpu_error { > */ > wait_queue_head_t reset_queue; > > - /* For missed irq/seqno simulation. */ > - unsigned long test_irq_rings; > - > struct i915_gpu_restart *restart; > }; > > diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c > index b0795b0ad227..cacaa1d04d17 100644 > --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c > +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c > @@ -91,7 +91,6 @@ bool intel_engine_breadcrumbs_irq(struct intel_engine_cs *engine) > > spin_lock(&b->irq_lock); > > - b->irq_fired = true; > if (b->irq_armed && list_empty(&b->signalers)) > __intel_breadcrumbs_disarm_irq(b); > > @@ -172,86 +171,6 @@ static void signal_irq_work(struct irq_work *work) > intel_engine_breadcrumbs_irq(engine); > } > > -static unsigned long wait_timeout(void) > -{ > - return round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES); > -} > - > -static noinline void missed_breadcrumb(struct intel_engine_cs *engine) > -{ > - if (GEM_SHOW_DEBUG()) { > - struct drm_printer p = drm_debug_printer(__func__); > - > - intel_engine_dump(engine, &p, > - "%s missed breadcrumb at %pS\n", > - engine->name, __builtin_return_address(0)); > - } > - > - set_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings); > -} > - > -static void intel_breadcrumbs_hangcheck(struct timer_list *t) > -{ > - struct intel_engine_cs *engine = > - from_timer(engine, t, breadcrumbs.hangcheck); > - struct intel_breadcrumbs *b = &engine->breadcrumbs; > - > - if (!b->irq_armed) > - return; > - > - if (b->irq_fired) > - goto rearm; > - > - /* > - * We keep the hangcheck timer alive until we disarm the irq, even > - * if there are no waiters at present. > - * > - * If the waiter was currently running, assume it hasn't had a chance > - * to process the pending interrupt (e.g, low priority task on a loaded > - * system) and wait until it sleeps before declaring a missed interrupt. > - * > - * If the waiter was asleep (and not even pending a wakeup), then we > - * must have missed an interrupt as the GPU has stopped advancing > - * but we still have a waiter. Assuming all batches complete within > - * DRM_I915_HANGCHECK_JIFFIES [1.5s]! > - */ > - synchronize_hardirq(engine->i915->drm.irq); > - if (intel_engine_signal_breadcrumbs(engine)) { > - missed_breadcrumb(engine); > - mod_timer(&b->fake_irq, jiffies + 1); > - } else { > -rearm: > - b->irq_fired = false; > - mod_timer(&b->hangcheck, wait_timeout()); > - } > -} > - > -static void intel_breadcrumbs_fake_irq(struct timer_list *t) > -{ > - struct intel_engine_cs *engine = > - from_timer(engine, t, breadcrumbs.fake_irq); > - struct intel_breadcrumbs *b = &engine->breadcrumbs; > - > - /* > - * The timer persists in case we cannot enable interrupts, > - * or if we have previously seen seqno/interrupt incoherency > - * ("missed interrupt" syndrome, better known as a "missed breadcrumb"). > - * Here the worker will wake up every jiffie in order to kick the > - * oldest waiter to do the coherent seqno check. > - */ > - > - if (!intel_engine_signal_breadcrumbs(engine) && !b->irq_armed) > - return; > - > - /* If the user has disabled the fake-irq, restore the hangchecking */ > - if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) { > - mod_timer(&b->hangcheck, wait_timeout()); > - return; > - } > - > - mod_timer(&b->fake_irq, jiffies + 1); > -} > - > void intel_engine_pin_breadcrumbs_irq(struct intel_engine_cs *engine) > { > struct intel_breadcrumbs *b = &engine->breadcrumbs; > @@ -274,43 +193,14 @@ void intel_engine_unpin_breadcrumbs_irq(struct intel_engine_cs *engine) > spin_unlock_irq(&b->irq_lock); > } > > -static bool use_fake_irq(const struct intel_breadcrumbs *b) > -{ > - const struct intel_engine_cs *engine = > - container_of(b, struct intel_engine_cs, breadcrumbs); > - > - if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) > - return false; > - > - /* > - * Only start with the heavy weight fake irq timer if we have not > - * seen any interrupts since enabling it the first time. If the > - * interrupts are still arriving, it means we made a mistake in our > - * engine->seqno_barrier(), a timing error that should be transient > - * and unlikely to reoccur. > - */ > - return !b->irq_fired; > -} > - > -static void enable_fake_irq(struct intel_breadcrumbs *b) > -{ > - /* Ensure we never sleep indefinitely */ > - if (!b->irq_enabled || use_fake_irq(b)) > - mod_timer(&b->fake_irq, jiffies + 1); > - else > - mod_timer(&b->hangcheck, wait_timeout()); > -} > - > -static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b) > +static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b) > { > struct intel_engine_cs *engine = > container_of(b, struct intel_engine_cs, breadcrumbs); > - struct drm_i915_private *i915 = engine->i915; > - bool enabled; > > lockdep_assert_held(&b->irq_lock); > if (b->irq_armed) > - return false; > + return; > > /* > * The breadcrumb irq will be disarmed on the interrupt after the > @@ -328,16 +218,8 @@ static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b) > * the driver is idle) we disarm the breadcrumbs. > */ > > - /* No interrupts? Kick the waiter every jiffie! */ > - enabled = false; > - if (!b->irq_enabled++ && > - !test_bit(engine->id, &i915->gpu_error.test_irq_rings)) { > + if (!b->irq_enabled++) > irq_enable(engine); > - enabled = true; > - } > - > - enable_fake_irq(b); > - return enabled; > } > > void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine) > @@ -348,18 +230,6 @@ void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine) > INIT_LIST_HEAD(&b->signalers); > > init_irq_work(&b->irq_work, signal_irq_work); > - > - timer_setup(&b->fake_irq, intel_breadcrumbs_fake_irq, 0); > - timer_setup(&b->hangcheck, intel_breadcrumbs_hangcheck, 0); > -} > - > -static void cancel_fake_irq(struct intel_engine_cs *engine) > -{ > - struct intel_breadcrumbs *b = &engine->breadcrumbs; > - > - del_timer_sync(&b->fake_irq); /* may queue b->hangcheck */ > - del_timer_sync(&b->hangcheck); > - clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings); > } > > void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine) > @@ -369,13 +239,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine) > > spin_lock_irqsave(&b->irq_lock, flags); > > - /* > - * Leave the fake_irq timer enabled (if it is running), but clear the > - * bit so that it turns itself off on its next wake up and goes back > - * to the long hangcheck interval if still required. > - */ > - clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings); > - > if (b->irq_enabled) > irq_enable(engine); > else > @@ -386,7 +249,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine) > > void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine) > { > - cancel_fake_irq(engine); > } > > bool i915_request_enable_breadcrumb(struct i915_request *rq) > @@ -482,7 +344,4 @@ void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine, > } > } > spin_unlock_irq(&b->irq_lock); > - > - if (test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) > - drm_printf(p, "Fake irq active\n"); > } > diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c > index 5662d6fed523..a219c796e56d 100644 > --- a/drivers/gpu/drm/i915/intel_hangcheck.c > +++ b/drivers/gpu/drm/i915/intel_hangcheck.c > @@ -275,6 +275,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work) > for_each_engine(engine, dev_priv, id) { > struct hangcheck hc; > > + intel_engine_signal_breadcrumbs(engine); > + > hangcheck_load_sample(engine, &hc); > hangcheck_accumulate_sample(engine, &hc); > hangcheck_store_sample(engine, &hc); > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h > index eb0f8667a246..8038397239ee 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h > @@ -382,14 +382,9 @@ struct intel_engine_cs { > > struct irq_work irq_work; /* for use from inside irq_lock */ > > - struct timer_list fake_irq; /* used after a missed interrupt */ > - struct timer_list hangcheck; /* detect missed interrupts */ > - > - unsigned int hangcheck_interrupts; > unsigned int irq_enabled; > > bool irq_armed; > - bool irq_fired; > } breadcrumbs; > > struct { > diff --git a/drivers/gpu/drm/i915/selftests/igt_live_test.c b/drivers/gpu/drm/i915/selftests/igt_live_test.c > index 5deb485fb942..3e902761cd16 100644 > --- a/drivers/gpu/drm/i915/selftests/igt_live_test.c > +++ b/drivers/gpu/drm/i915/selftests/igt_live_test.c > @@ -35,7 +35,6 @@ int igt_live_test_begin(struct igt_live_test *t, > return err; > } > > - i915->gpu_error.missed_irq_rings = 0; > t->reset_global = i915_reset_count(&i915->gpu_error); > > for_each_engine(engine, i915, id) > @@ -75,11 +74,5 @@ int igt_live_test_end(struct igt_live_test *t) > return -EIO; > } > > - if (i915->gpu_error.missed_irq_rings) { > - pr_err("%s(%s): Missed interrupts on engines %lx\n", > - t->func, t->name, i915->gpu_error.missed_irq_rings); > - return -EIO; > - } > - > return 0; > } >
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index e0ca3987eed6..fb216ce8dd60 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -1322,9 +1322,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) intel_engine_last_submit(engine), jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp)); - seq_printf(m, "\tfake irq active? %s\n", - yesno(test_bit(engine->id, - &dev_priv->gpu_error.missed_irq_rings))); seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n", (long long)engine->hangcheck.acthd, @@ -3900,94 +3897,6 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops, i915_wedged_get, i915_wedged_set, "%llu\n"); -static int -fault_irq_set(struct drm_i915_private *i915, - unsigned long *irq, - unsigned long val) -{ - int err; - - err = mutex_lock_interruptible(&i915->drm.struct_mutex); - if (err) - return err; - - err = i915_gem_wait_for_idle(i915, - I915_WAIT_LOCKED | - I915_WAIT_INTERRUPTIBLE, - MAX_SCHEDULE_TIMEOUT); - if (err) - goto err_unlock; - - *irq = val; - mutex_unlock(&i915->drm.struct_mutex); - - /* Flush idle worker to disarm irq */ - drain_delayed_work(&i915->gt.idle_work); - - return 0; - -err_unlock: - mutex_unlock(&i915->drm.struct_mutex); - return err; -} - -static int -i915_ring_missed_irq_get(void *data, u64 *val) -{ - struct drm_i915_private *dev_priv = data; - - *val = dev_priv->gpu_error.missed_irq_rings; - return 0; -} - -static int -i915_ring_missed_irq_set(void *data, u64 val) -{ - struct drm_i915_private *i915 = data; - - return fault_irq_set(i915, &i915->gpu_error.missed_irq_rings, val); -} - -DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops, - i915_ring_missed_irq_get, i915_ring_missed_irq_set, - "0x%08llx\n"); - -static int -i915_ring_test_irq_get(void *data, u64 *val) -{ - struct drm_i915_private *dev_priv = data; - - *val = dev_priv->gpu_error.test_irq_rings; - - return 0; -} - -static int -i915_ring_test_irq_set(void *data, u64 val) -{ - struct drm_i915_private *i915 = data; - - /* GuC keeps the user interrupt permanently enabled for submission */ - if (USES_GUC_SUBMISSION(i915)) - return -ENODEV; - - /* - * From icl, we can no longer individually mask interrupt generation - * from each engine. - */ - if (INTEL_GEN(i915) >= 11) - return -ENODEV; - - val &= INTEL_INFO(i915)->ring_mask; - DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val); - - return fault_irq_set(i915, &i915->gpu_error.test_irq_rings, val); -} - -DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops, - i915_ring_test_irq_get, i915_ring_test_irq_set, - "0x%08llx\n"); - #define DROP_UNBOUND BIT(0) #define DROP_BOUND BIT(1) #define DROP_RETIRE BIT(2) @@ -4750,8 +4659,6 @@ static const struct i915_debugfs_files { } i915_debugfs_files[] = { {"i915_wedged", &i915_wedged_fops}, {"i915_cache_sharing", &i915_cache_sharing_fops}, - {"i915_ring_missed_irq", &i915_ring_missed_irq_fops}, - {"i915_ring_test_irq", &i915_ring_test_irq_fops}, {"i915_gem_drop_caches", &i915_drop_caches_fops}, #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) {"i915_error_state", &i915_error_state_fops}, diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 420b94341433..965356c460ce 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -723,8 +723,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); err_printf(m, "CCID: 0x%08x\n", error->ccid); - err_printf(m, "Missed interrupts: 0x%08lx\n", - m->i915->gpu_error.missed_irq_rings); for (i = 0; i < error->nfence; i++) err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index 74757c424aab..53b1f22dd365 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -204,8 +204,6 @@ struct i915_gpu_error { atomic_t pending_fb_pin; - unsigned long missed_irq_rings; - /** * State variable controlling the reset flow and count * @@ -274,9 +272,6 @@ struct i915_gpu_error { */ wait_queue_head_t reset_queue; - /* For missed irq/seqno simulation. */ - unsigned long test_irq_rings; - struct i915_gpu_restart *restart; }; diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c index b0795b0ad227..cacaa1d04d17 100644 --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c @@ -91,7 +91,6 @@ bool intel_engine_breadcrumbs_irq(struct intel_engine_cs *engine) spin_lock(&b->irq_lock); - b->irq_fired = true; if (b->irq_armed && list_empty(&b->signalers)) __intel_breadcrumbs_disarm_irq(b); @@ -172,86 +171,6 @@ static void signal_irq_work(struct irq_work *work) intel_engine_breadcrumbs_irq(engine); } -static unsigned long wait_timeout(void) -{ - return round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES); -} - -static noinline void missed_breadcrumb(struct intel_engine_cs *engine) -{ - if (GEM_SHOW_DEBUG()) { - struct drm_printer p = drm_debug_printer(__func__); - - intel_engine_dump(engine, &p, - "%s missed breadcrumb at %pS\n", - engine->name, __builtin_return_address(0)); - } - - set_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings); -} - -static void intel_breadcrumbs_hangcheck(struct timer_list *t) -{ - struct intel_engine_cs *engine = - from_timer(engine, t, breadcrumbs.hangcheck); - struct intel_breadcrumbs *b = &engine->breadcrumbs; - - if (!b->irq_armed) - return; - - if (b->irq_fired) - goto rearm; - - /* - * We keep the hangcheck timer alive until we disarm the irq, even - * if there are no waiters at present. - * - * If the waiter was currently running, assume it hasn't had a chance - * to process the pending interrupt (e.g, low priority task on a loaded - * system) and wait until it sleeps before declaring a missed interrupt. - * - * If the waiter was asleep (and not even pending a wakeup), then we - * must have missed an interrupt as the GPU has stopped advancing - * but we still have a waiter. Assuming all batches complete within - * DRM_I915_HANGCHECK_JIFFIES [1.5s]! - */ - synchronize_hardirq(engine->i915->drm.irq); - if (intel_engine_signal_breadcrumbs(engine)) { - missed_breadcrumb(engine); - mod_timer(&b->fake_irq, jiffies + 1); - } else { -rearm: - b->irq_fired = false; - mod_timer(&b->hangcheck, wait_timeout()); - } -} - -static void intel_breadcrumbs_fake_irq(struct timer_list *t) -{ - struct intel_engine_cs *engine = - from_timer(engine, t, breadcrumbs.fake_irq); - struct intel_breadcrumbs *b = &engine->breadcrumbs; - - /* - * The timer persists in case we cannot enable interrupts, - * or if we have previously seen seqno/interrupt incoherency - * ("missed interrupt" syndrome, better known as a "missed breadcrumb"). - * Here the worker will wake up every jiffie in order to kick the - * oldest waiter to do the coherent seqno check. - */ - - if (!intel_engine_signal_breadcrumbs(engine) && !b->irq_armed) - return; - - /* If the user has disabled the fake-irq, restore the hangchecking */ - if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) { - mod_timer(&b->hangcheck, wait_timeout()); - return; - } - - mod_timer(&b->fake_irq, jiffies + 1); -} - void intel_engine_pin_breadcrumbs_irq(struct intel_engine_cs *engine) { struct intel_breadcrumbs *b = &engine->breadcrumbs; @@ -274,43 +193,14 @@ void intel_engine_unpin_breadcrumbs_irq(struct intel_engine_cs *engine) spin_unlock_irq(&b->irq_lock); } -static bool use_fake_irq(const struct intel_breadcrumbs *b) -{ - const struct intel_engine_cs *engine = - container_of(b, struct intel_engine_cs, breadcrumbs); - - if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) - return false; - - /* - * Only start with the heavy weight fake irq timer if we have not - * seen any interrupts since enabling it the first time. If the - * interrupts are still arriving, it means we made a mistake in our - * engine->seqno_barrier(), a timing error that should be transient - * and unlikely to reoccur. - */ - return !b->irq_fired; -} - -static void enable_fake_irq(struct intel_breadcrumbs *b) -{ - /* Ensure we never sleep indefinitely */ - if (!b->irq_enabled || use_fake_irq(b)) - mod_timer(&b->fake_irq, jiffies + 1); - else - mod_timer(&b->hangcheck, wait_timeout()); -} - -static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b) +static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b) { struct intel_engine_cs *engine = container_of(b, struct intel_engine_cs, breadcrumbs); - struct drm_i915_private *i915 = engine->i915; - bool enabled; lockdep_assert_held(&b->irq_lock); if (b->irq_armed) - return false; + return; /* * The breadcrumb irq will be disarmed on the interrupt after the @@ -328,16 +218,8 @@ static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b) * the driver is idle) we disarm the breadcrumbs. */ - /* No interrupts? Kick the waiter every jiffie! */ - enabled = false; - if (!b->irq_enabled++ && - !test_bit(engine->id, &i915->gpu_error.test_irq_rings)) { + if (!b->irq_enabled++) irq_enable(engine); - enabled = true; - } - - enable_fake_irq(b); - return enabled; } void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine) @@ -348,18 +230,6 @@ void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine) INIT_LIST_HEAD(&b->signalers); init_irq_work(&b->irq_work, signal_irq_work); - - timer_setup(&b->fake_irq, intel_breadcrumbs_fake_irq, 0); - timer_setup(&b->hangcheck, intel_breadcrumbs_hangcheck, 0); -} - -static void cancel_fake_irq(struct intel_engine_cs *engine) -{ - struct intel_breadcrumbs *b = &engine->breadcrumbs; - - del_timer_sync(&b->fake_irq); /* may queue b->hangcheck */ - del_timer_sync(&b->hangcheck); - clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings); } void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine) @@ -369,13 +239,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine) spin_lock_irqsave(&b->irq_lock, flags); - /* - * Leave the fake_irq timer enabled (if it is running), but clear the - * bit so that it turns itself off on its next wake up and goes back - * to the long hangcheck interval if still required. - */ - clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings); - if (b->irq_enabled) irq_enable(engine); else @@ -386,7 +249,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine) void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine) { - cancel_fake_irq(engine); } bool i915_request_enable_breadcrumb(struct i915_request *rq) @@ -482,7 +344,4 @@ void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine, } } spin_unlock_irq(&b->irq_lock); - - if (test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) - drm_printf(p, "Fake irq active\n"); } diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c index 5662d6fed523..a219c796e56d 100644 --- a/drivers/gpu/drm/i915/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/intel_hangcheck.c @@ -275,6 +275,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work) for_each_engine(engine, dev_priv, id) { struct hangcheck hc; + intel_engine_signal_breadcrumbs(engine); + hangcheck_load_sample(engine, &hc); hangcheck_accumulate_sample(engine, &hc); hangcheck_store_sample(engine, &hc); diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index eb0f8667a246..8038397239ee 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -382,14 +382,9 @@ struct intel_engine_cs { struct irq_work irq_work; /* for use from inside irq_lock */ - struct timer_list fake_irq; /* used after a missed interrupt */ - struct timer_list hangcheck; /* detect missed interrupts */ - - unsigned int hangcheck_interrupts; unsigned int irq_enabled; bool irq_armed; - bool irq_fired; } breadcrumbs; struct { diff --git a/drivers/gpu/drm/i915/selftests/igt_live_test.c b/drivers/gpu/drm/i915/selftests/igt_live_test.c index 5deb485fb942..3e902761cd16 100644 --- a/drivers/gpu/drm/i915/selftests/igt_live_test.c +++ b/drivers/gpu/drm/i915/selftests/igt_live_test.c @@ -35,7 +35,6 @@ int igt_live_test_begin(struct igt_live_test *t, return err; } - i915->gpu_error.missed_irq_rings = 0; t->reset_global = i915_reset_count(&i915->gpu_error); for_each_engine(engine, i915, id) @@ -75,11 +74,5 @@ int igt_live_test_end(struct igt_live_test *t) return -EIO; } - if (i915->gpu_error.missed_irq_rings) { - pr_err("%s(%s): Missed interrupts on engines %lx\n", - t->func, t->name, i915->gpu_error.missed_irq_rings); - return -EIO; - } - return 0; }
Missed breadcrumb detection is defunct due to the tight coupling with dma_fence signaling and the myriad ways we may signal fences from everywhere but from an interrupt, i.e. we frequently signal a fence before we even see its interrupt. This means that even if we miss an interrupt for a fence, it still is signaled before our breadcrumb hangcheck fires, so simplify the breadcrumb hangchecking by moving it into the GPU hangcheck and forgo fake interrupts. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> --- drivers/gpu/drm/i915/i915_debugfs.c | 93 ----------- drivers/gpu/drm/i915/i915_gpu_error.c | 2 - drivers/gpu/drm/i915/i915_gpu_error.h | 5 - drivers/gpu/drm/i915/intel_breadcrumbs.c | 147 +----------------- drivers/gpu/drm/i915/intel_hangcheck.c | 2 + drivers/gpu/drm/i915/intel_ringbuffer.h | 5 - .../gpu/drm/i915/selftests/igt_live_test.c | 7 - 7 files changed, 5 insertions(+), 256 deletions(-)