@@ -3833,11 +3833,15 @@ static const struct file_operations i915_cur_wm_latency_fops = {
static int
i915_wedged_get(void *data, u64 *val)
{
- struct drm_i915_private *dev_priv = data;
+ int ret;
- *val = i915_terminally_wedged(&dev_priv->gpu_error);
+ ret = i915_terminally_wedged(data);
+ if (ret == -EIO) {
+ *val = 1;
+ ret = 0;
+ }
- return 0;
+ return ret;
}
static int
@@ -3918,7 +3922,7 @@ i915_drop_caches_set(void *data, u64 val)
mutex_unlock(&i915->drm.struct_mutex);
}
- if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(&i915->gpu_error))
+ if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(i915))
i915_handle_error(i915, ALL_ENGINES, 0, NULL);
fs_reclaim_acquire(GFP_KERNEL);
@@ -3020,11 +3020,16 @@ int __must_check i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno);
struct i915_request *
i915_gem_find_active_request(struct intel_engine_cs *engine);
-static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
+static inline bool __i915_wedged(struct i915_gpu_error *error)
{
return unlikely(test_bit(I915_WEDGED, &error->flags));
}
+static inline bool i915_reset_failed(struct drm_i915_private *i915)
+{
+ return __i915_wedged(&i915->gpu_error);
+}
+
static inline u32 i915_reset_count(struct i915_gpu_error *error)
{
return READ_ONCE(error->reset_count);
@@ -1928,7 +1928,7 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
* fail). But any other -EIO isn't ours (e.g. swap in failure)
* and so needs to be reported.
*/
- if (!i915_terminally_wedged(&dev_priv->gpu_error))
+ if (!i915_terminally_wedged(dev_priv))
return VM_FAULT_SIGBUS;
/* else: fall through */
case -EAGAIN:
@@ -2958,7 +2958,7 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915)
struct intel_engine_cs *engine;
enum intel_engine_id id;
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_reset_failed(i915))
return;
GEM_BUG_ON(i915->gt.active_requests);
@@ -3806,8 +3806,9 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
long ret;
/* ABI: return -EIO if already wedged */
- if (i915_terminally_wedged(&dev_priv->gpu_error))
- return -EIO;
+ ret = i915_terminally_wedged(dev_priv);
+ if (ret)
+ return ret;
spin_lock(&file_priv->mm.lock);
list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
@@ -4460,7 +4461,7 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
* back to defaults, recovering from whatever wedged state we left it
* in and so worth trying to use the device once more.
*/
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_terminally_wedged(i915))
i915_gem_unset_wedged(i915);
/*
@@ -4504,7 +4505,7 @@ int i915_gem_suspend(struct drm_i915_private *i915)
* state. Fortunately, the kernel_context is disposable and we do
* not rely on its state.
*/
- if (!i915_terminally_wedged(&i915->gpu_error)) {
+ if (!i915_reset_failed(i915)) {
ret = i915_gem_switch_to_kernel_context(i915);
if (ret)
goto err_unlock;
@@ -4625,7 +4626,7 @@ void i915_gem_resume(struct drm_i915_private *i915)
return;
err_wedged:
- if (!i915_terminally_wedged(&i915->gpu_error)) {
+ if (!i915_terminally_wedged(i915)) {
DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
i915_gem_set_wedged(i915);
}
@@ -4731,10 +4732,9 @@ int i915_gem_init_hw(struct drm_i915_private *dev_priv)
init_unused_rings(dev_priv);
BUG_ON(!dev_priv->kernel_context);
- if (i915_terminally_wedged(&dev_priv->gpu_error)) {
- ret = -EIO;
+ ret = i915_terminally_wedged(dev_priv);
+ if (ret)
goto out;
- }
ret = i915_ppgtt_init_hw(dev_priv);
if (ret) {
@@ -5107,7 +5107,7 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
* wedged. But we only want to do this where the GPU is angry,
* for all other failure, such as an allocation failure, bail.
*/
- if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
+ if (!i915_reset_failed(dev_priv)) {
i915_load_error(dev_priv,
"Failed to initialize GPU, declaring it wedged!\n");
i915_gem_set_wedged(dev_priv);
@@ -559,8 +559,9 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
* ABI: Before userspace accesses the GPU (e.g. execbuffer), report
* EIO if the GPU is already wedged.
*/
- if (i915_terminally_wedged(&i915->gpu_error))
- return ERR_PTR(-EIO);
+ ret = i915_terminally_wedged(i915);
+ if (ret)
+ return ERR_PTR(ret);
/*
* Pinning the contexts may generate requests in order to acquire
@@ -1032,7 +1032,7 @@ void i915_reset(struct drm_i915_private *i915,
finish:
reset_finish(i915);
- if (!i915_terminally_wedged(error))
+ if (!__i915_wedged(error))
reset_restart(i915);
return;
@@ -1253,7 +1253,7 @@ void i915_handle_error(struct drm_i915_private *i915,
* Try engine reset when available. We fall back to full reset if
* single reset fails.
*/
- if (intel_has_reset_engine(i915) && !i915_terminally_wedged(error)) {
+ if (intel_has_reset_engine(i915) && !__i915_wedged(error)) {
for_each_engine_masked(engine, i915, engine_mask, tmp) {
BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
@@ -1339,6 +1339,29 @@ __releases(&i915->gpu_error.reset_backoff_srcu)
srcu_read_unlock(&error->reset_backoff_srcu, tag);
}
+int i915_terminally_wedged(struct drm_i915_private *i915)
+{
+ struct i915_gpu_error *error = &i915->gpu_error;
+
+ if (!__i915_wedged(error))
+ return 0;
+
+ /* Reset still in progress? Maybe we will recover? */
+ if (!test_bit(I915_RESET_BACKOFF, &error->flags))
+ return -EIO;
+
+ /* XXX intel_reset_finish() still takes struct_mutex!!! */
+ if (mutex_is_locked(&i915->drm.struct_mutex))
+ return -EAGAIN;
+
+ if (wait_event_interruptible(error->reset_queue,
+ !test_bit(I915_RESET_BACKOFF,
+ &error->flags)))
+ return -EINTR;
+
+ return __i915_wedged(error) ? -EIO : 0;
+}
+
bool i915_reset_flush(struct drm_i915_private *i915)
{
int err;
@@ -36,6 +36,8 @@ bool i915_reset_flush(struct drm_i915_private *i915);
int __must_check i915_reset_trylock(struct drm_i915_private *i915);
void i915_reset_unlock(struct drm_i915_private *i915, int tag);
+int i915_terminally_wedged(struct drm_i915_private *i915);
+
bool intel_has_gpu_reset(struct drm_i915_private *i915);
bool intel_has_reset_engine(struct drm_i915_private *i915);
@@ -1007,10 +1007,8 @@ static bool ring_is_idle(struct intel_engine_cs *engine)
*/
bool intel_engine_is_idle(struct intel_engine_cs *engine)
{
- struct drm_i915_private *dev_priv = engine->i915;
-
/* More white lies, if wedged, hw state is inconsistent */
- if (i915_terminally_wedged(&dev_priv->gpu_error))
+ if (i915_reset_failed(engine->i915))
return true;
/* Any inflight/incomplete requests? */
@@ -1054,7 +1052,7 @@ bool intel_engines_are_idle(struct drm_i915_private *dev_priv)
* If the driver is wedged, HW state may be very inconsistent and
* report that it is still busy, even though we have stopped using it.
*/
- if (i915_terminally_wedged(&dev_priv->gpu_error))
+ if (i915_reset_failed(dev_priv))
return true;
for_each_engine(engine, dev_priv, id) {
@@ -1496,7 +1494,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
va_end(ap);
}
- if (i915_terminally_wedged(&engine->i915->gpu_error))
+ if (i915_reset_failed(engine->i915))
drm_printf(m, "*** WEDGED ***\n");
drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
@@ -263,7 +263,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
if (!READ_ONCE(dev_priv->gt.awake))
return;
- if (i915_terminally_wedged(&dev_priv->gpu_error))
+ if (i915_terminally_wedged(dev_priv))
return;
/* As enabling the GPU requires fairly extensive mmio access,
@@ -1764,7 +1764,7 @@ int i915_gem_huge_page_live_selftests(struct drm_i915_private *dev_priv)
return 0;
}
- if (i915_terminally_wedged(&dev_priv->gpu_error))
+ if (i915_terminally_wedged(dev_priv))
return 0;
file = mock_file(dev_priv);
@@ -150,7 +150,7 @@ int i915_active_live_selftests(struct drm_i915_private *i915)
SUBTEST(live_active_retire),
};
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_terminally_wedged(i915))
return 0;
return i915_subtests(tests, i915);
@@ -248,12 +248,12 @@ static bool always_valid(struct drm_i915_private *i915)
static bool needs_fence_registers(struct drm_i915_private *i915)
{
- return !i915_terminally_wedged(&i915->gpu_error);
+ return !i915_terminally_wedged(i915);
}
static bool needs_mi_store_dword(struct drm_i915_private *i915)
{
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_terminally_wedged(i915))
return false;
return intel_engine_can_store_dword(i915->engine[RCS]);
@@ -1632,7 +1632,7 @@ int i915_gem_context_live_selftests(struct drm_i915_private *dev_priv)
SUBTEST(igt_vm_isolation),
};
- if (i915_terminally_wedged(&dev_priv->gpu_error))
+ if (i915_terminally_wedged(dev_priv))
return 0;
return i915_subtests(tests, dev_priv);
@@ -547,7 +547,7 @@ int i915_gem_evict_live_selftests(struct drm_i915_private *i915)
SUBTEST(igt_evict_contexts),
};
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_terminally_wedged(i915))
return 0;
return i915_subtests(tests, i915);
@@ -583,7 +583,7 @@ static int igt_mmap_offset_exhaustion(void *arg)
for (loop = 0; loop < 3; loop++) {
intel_wakeref_t wakeref;
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_terminally_wedged(i915))
break;
obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
@@ -1246,7 +1246,7 @@ int i915_request_live_selftests(struct drm_i915_private *i915)
SUBTEST(live_breadcrumbs_smoketest),
};
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_terminally_wedged(i915))
return 0;
return i915_subtests(tests, i915);
@@ -29,5 +29,5 @@ int igt_flush_test(struct drm_i915_private *i915, unsigned int flags)
i915_gem_set_wedged(i915);
}
- return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
+ return i915_terminally_wedged(i915);
}
@@ -342,7 +342,7 @@ static int igt_hang_sanitycheck(void *arg)
timeout = i915_request_wait(rq,
I915_WAIT_LOCKED,
MAX_SCHEDULE_TIMEOUT);
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_reset_failed(i915))
timeout = -EIO;
i915_request_put(rq);
@@ -383,7 +383,7 @@ static int igt_global_reset(void *arg)
igt_global_reset_unlock(i915);
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_reset_failed(i915))
err = -EIO;
return err;
@@ -401,13 +401,13 @@ static int igt_wedged_reset(void *arg)
i915_gem_set_wedged(i915);
- GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
+ GEM_BUG_ON(!i915_reset_failed(i915));
i915_reset(i915, ALL_ENGINES, NULL);
intel_runtime_pm_put(i915, wakeref);
igt_global_reset_unlock(i915);
- return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
+ return i915_reset_failed(i915) ? -EIO : 0;
}
static bool wait_for_idle(struct intel_engine_cs *engine)
@@ -529,7 +529,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
break;
}
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_reset_failed(i915))
err = -EIO;
if (active) {
@@ -843,7 +843,7 @@ static int __igt_reset_engines(struct drm_i915_private *i915,
break;
}
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_reset_failed(i915))
err = -EIO;
if (flags & TEST_ACTIVE) {
@@ -969,7 +969,7 @@ static int igt_reset_wait(void *arg)
mutex_unlock(&i915->drm.struct_mutex);
igt_global_reset_unlock(i915);
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_reset_failed(i915))
return -EIO;
return err;
@@ -1166,7 +1166,7 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
unlock:
mutex_unlock(&i915->drm.struct_mutex);
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_reset_failed(i915))
return -EIO;
return err;
@@ -1372,7 +1372,7 @@ static int igt_reset_queue(void *arg)
mutex_unlock(&i915->drm.struct_mutex);
igt_global_reset_unlock(i915);
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_reset_failed(i915))
return -EIO;
return err;
@@ -1552,7 +1552,7 @@ static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
i915_request_wait(rq,
I915_WAIT_LOCKED,
MAX_SCHEDULE_TIMEOUT);
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_reset_failed(i915))
err = -EIO;
}
@@ -1591,7 +1591,7 @@ static int igt_atomic_reset(void *arg)
/* Flush any requests before we get started and check basics */
force_reset(i915);
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_reset_failed(i915))
goto unlock;
if (intel_has_gpu_reset(i915)) {
@@ -1665,7 +1665,7 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
if (!intel_has_gpu_reset(i915))
return 0;
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_terminally_wedged(i915))
return -EIO; /* we're long past hope of a successful reset */
wakeref = intel_runtime_pm_get(i915);
@@ -895,7 +895,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
if (!HAS_EXECLISTS(i915))
return 0;
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_terminally_wedged(i915))
return 0;
return i915_subtests(tests, i915);
@@ -181,7 +181,7 @@ static int check_whitelist(struct i915_gem_context *ctx,
err = 0;
igt_wedge_on_timeout(&wedge, ctx->i915, HZ / 5) /* a safety net! */
err = i915_gem_object_set_to_cpu_domain(results, false);
- if (i915_terminally_wedged(&ctx->i915->gpu_error))
+ if (i915_terminally_wedged(ctx->i915))
err = -EIO;
if (err)
goto out_put;
@@ -510,7 +510,7 @@ int intel_workarounds_live_selftests(struct drm_i915_private *i915)
};
int err;
- if (i915_terminally_wedged(&i915->gpu_error))
+ if (i915_terminally_wedged(i915))
return 0;
mutex_lock(&i915->drm.struct_mutex);
At a few points in our uABI, we check to see if the driver is wedged and report -EIO back to the user in that case. However, as we perform the check and reset asynchronously, we may instead see the temporary wedging used to cancel inflight rendering to avoid a deadlock during reset. If we suspect this is the case, that is we see a wedged driver and reset in progress, then wait until the reset is resolved before reporting upon the wedged status. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109580 Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> --- drivers/gpu/drm/i915/i915_debugfs.c | 12 ++++++--- drivers/gpu/drm/i915/i915_drv.h | 7 ++++- drivers/gpu/drm/i915/i915_gem.c | 22 +++++++-------- drivers/gpu/drm/i915/i915_request.c | 5 ++-- drivers/gpu/drm/i915/i915_reset.c | 27 +++++++++++++++++-- drivers/gpu/drm/i915/i915_reset.h | 2 ++ drivers/gpu/drm/i915/intel_engine_cs.c | 8 +++--- drivers/gpu/drm/i915/intel_hangcheck.c | 2 +- drivers/gpu/drm/i915/selftests/huge_pages.c | 2 +- drivers/gpu/drm/i915/selftests/i915_active.c | 2 +- .../drm/i915/selftests/i915_gem_coherency.c | 4 +-- .../gpu/drm/i915/selftests/i915_gem_context.c | 2 +- .../gpu/drm/i915/selftests/i915_gem_evict.c | 2 +- .../gpu/drm/i915/selftests/i915_gem_object.c | 2 +- drivers/gpu/drm/i915/selftests/i915_request.c | 2 +- .../gpu/drm/i915/selftests/igt_flush_test.c | 2 +- .../gpu/drm/i915/selftests/intel_hangcheck.c | 24 ++++++++--------- drivers/gpu/drm/i915/selftests/intel_lrc.c | 2 +- .../drm/i915/selftests/intel_workarounds.c | 4 +-- 19 files changed, 83 insertions(+), 50 deletions(-)