@@ -1886,6 +1886,8 @@ void i915_reset(struct drm_i915_private *i915)
int ret;
int i;
+ GEM_TRACE("flags=%lx\n", error->flags);
+
might_sleep();
lockdep_assert_held(&i915->drm.struct_mutex);
GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
@@ -2016,6 +2018,7 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
struct i915_request *active_request;
int ret;
+ GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
active_request = i915_gem_reset_prepare_engine(engine);
@@ -979,6 +979,23 @@ static int igt_wait_reset(void *arg)
return err;
}
+static int wait_for_others(struct drm_i915_private *i915,
+ struct intel_engine_cs *exclude)
+{
+ struct intel_engine_cs *engine;
+ enum intel_engine_id id;
+
+ for_each_engine(engine, i915, id) {
+ if (engine == exclude)
+ continue;
+
+ if (wait_for(intel_engine_is_idle(engine), 10))
+ return -EIO;
+ }
+
+ return 0;
+}
+
static int igt_reset_queue(void *arg)
{
struct drm_i915_private *i915 = arg;
@@ -1027,13 +1044,36 @@ static int igt_reset_queue(void *arg)
i915_request_get(rq);
__i915_request_add(rq, true);
+ /*
+ * XXX We don't handle resetting the kernel context
+ * very well. If we trigger a device reset twice in
+ * quick succession while the kernel context is
+ * executing, we may end up skipping the breadcrumb.
+ * This is really only a problem for the selftest as
+ * normally there is a large interlude between resets
+ * (hangcheck), or we focus on resetting just one
+ * engine and so avoid repeatedly resetting innocents.
+ */
+ err = wait_for_others(i915, engine);
+ if (err) {
+ pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
+ __func__, engine->name);
+ i915_request_put(rq);
+ i915_request_put(prev);
+
+ GEM_TRACE_DUMP();
+ i915_gem_set_wedged(i915);
+ goto fini;
+ }
+
if (!wait_for_hang(&h, prev)) {
struct drm_printer p = drm_info_printer(i915->drm.dev);
- pr_err("%s: Failed to start request %x, at %x\n",
- __func__, prev->fence.seqno, hws_seqno(&h, prev));
- intel_engine_dump(prev->engine, &p,
- "%s\n", prev->engine->name);
+ pr_err("%s(%s): Failed to start request %x, at %x\n",
+ __func__, engine->name,
+ prev->fence.seqno, hws_seqno(&h, prev));
+ intel_engine_dump(engine, &p,
+ "%s\n", engine->name);
i915_request_put(rq);
i915_request_put(prev);
We don't handle resetting the kernel context very well, or presumably any context executing its breadcrumb commands in the ring as opposed to the batchbuffer and flush. If we trigger a device reset twice in quick succession while the kernel context is executing, we may end up skipping the breadcrumb. This is really only a problem for the selftest as normally there is a large interlude between resets (hangcheck), or we focus on resetting just one engine and so avoid repeatedly resetting innocents. Something to try would be a preempt-to-idle to quiesce the engine before reset, so that innocent contexts would be spared the reset. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> Cc: MichaĆ Winiarski <michal.winiarski@intel.com> CC: Michel Thierry <michel.thierry@intel.com> --- drivers/gpu/drm/i915/i915_drv.c | 3 ++ drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 48 ++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 4 deletions(-)