[10/18] drm/i915: Perform a direct reset of the GPU from the waiter

Message ID	20160830081812.4110-11-chris@chris-wilson.co.uk (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces@lists.freedesktop.org> From: Chris Wilson <chris@chris-wilson.co.uk> To: intel-gfx@lists.freedesktop.org Date: Tue, 30 Aug 2016 09:18:04 +0100 Message-Id: <20160830081812.4110-11-chris@chris-wilson.co.uk> In-Reply-To: <20160830081812.4110-1-chris@chris-wilson.co.uk> References: <20160830081812.4110-1-chris@chris-wilson.co.uk> Cc: mika.kuoppala@intel.com Subject: [Intel-gfx] [PATCH 10/18] drm/i915: Perform a direct reset of the GPU from the waiter Precedence: list MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 01b518dcbd7a..ea7d3e87815c 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -1752,6 +1752,8 @@ int i915_resume_switcheroo(struct drm_device *dev) * Reset the chip. Useful if a hang is detected. Returns zero on successful * reset or otherwise an error code. * + * Caller must hold the struct_mutex. + * * Procedure is fairly simple: * - reset the chip using the reset reg * - re-init context state @@ -1766,7 +1768,10 @@ int i915_reset(struct drm_i915_private *dev_priv) struct i915_gpu_error *error = &dev_priv->gpu_error; int ret; - mutex_lock(&dev->struct_mutex); + lockdep_assert_held(&dev->struct_mutex); + + if (!test_and_clear_bit(I915_RESET_IN_PROGRESS, &error->flags)) + return 0; /* Clear any previous failed attempts at recovery. Time to try again. */ __clear_bit(I915_WEDGED, &error->flags); @@ -1807,9 +1812,6 @@ int i915_reset(struct drm_i915_private *dev_priv) goto error; } - clear_bit(I915_RESET_IN_PROGRESS, &error->flags); - mutex_unlock(&dev->struct_mutex); - /* * rps/rc6 re-init is necessary to restore state lost after the * reset and the re-install of gt irqs. 
Skip for ironlake per @@ -1823,7 +1825,6 @@ int i915_reset(struct drm_i915_private *dev_priv) error: set_bit(I915_WEDGED, &error->flags); - mutex_unlock(&dev->struct_mutex); return ret; } diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e574eaa65c4d..c6dbc6b5798a 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3849,7 +3849,9 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms) schedule_timeout_uninterruptible(remaining_jiffies); } } -static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req) + +static inline bool +__i915_request_irq_complete(struct drm_i915_gem_request *req) { struct intel_engine_cs *engine = req->engine; @@ -3911,17 +3913,6 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req) return true; } - /* We need to check whether any gpu reset happened in between - * the request being submitted and now. If a reset has occurred, - * the seqno will have been advance past ours and our request - * is complete. If we are in the process of handling a reset, - * the request is effectively complete as the rendering will - * be discarded, but we need to return in order to drop the - * struct_mutex. 
- */ - if (i915_reset_in_progress(&req->i915->gpu_error)) - return true; - return false; } diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 51bb05f62686..cc565f785888 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2800,7 +2800,8 @@ __i915_gem_object_sync(struct drm_i915_gem_request *to, if (!i915.semaphores) { ret = i915_wait_request(from, - from->i915->mm.interruptible, + from->i915->mm.interruptible | + I915_WAIT_LOCKED, NULL, NO_WAITBOOST); if (ret) diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index 3c600ff3e9d5..10a11b7d1114 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -666,8 +666,8 @@ int i915_wait_request(struct drm_i915_gem_request *req, if (i915_spin_request(req, state, 5)) goto complete; - set_current_state(state); - add_wait_queue(&req->i915->gpu_error.wait_queue, &reset); + if (flags & I915_WAIT_LOCKED) + add_wait_queue(&req->i915->gpu_error.wait_queue, &reset); intel_wait_init(&wait, req->fence.seqno); if (intel_engine_add_wait(req->engine, &wait)) @@ -692,9 +692,9 @@ int i915_wait_request(struct drm_i915_gem_request *req, if (intel_wait_complete(&wait)) break; +wakeup: set_current_state(state); -wakeup: /* Carefully check if the request is complete, giving time * for the seqno to be visible following the interrupt. * We also have to check in case we are kicked by the GPU @@ -703,11 +703,32 @@ wakeup: if (__i915_request_irq_complete(req)) break; + /* If the GPU is hung, and we hold the lock, reset the GPU + * and then check for completion. On a full reset, the engine's + * HW seqno will be advanced past us and we are complete. + * If we do a partial reset, we have to wait for the GPU to + * resume and update the breadcrumb. 
+ * + * If we don't hold the mutex, we can just wait for the worker + * to come along and update the breadcrumb (either directly + * itself, or indirectly by recovering the GPU). + */ + if (flags & I915_WAIT_LOCKED && + i915_reset_in_progress(&req->i915->gpu_error)) { + __set_current_state(TASK_RUNNING); + if (!i915_reset(req->i915)) + goto wakeup; + + break; + } + /* Only spin if we know the GPU is processing this request */ if (i915_spin_request(req, state, 2)) break; } - remove_wait_queue(&req->i915->gpu_error.wait_queue, &reset); + + if (flags & I915_WAIT_LOCKED) + remove_wait_queue(&req->i915->gpu_error.wait_queue, &reset); intel_engine_remove_wait(req->engine, &wait); __set_current_state(TASK_RUNNING); diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index 41de4d9aa3fe..49c3c006ec17 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -223,6 +223,7 @@ int i915_wait_request(struct drm_i915_gem_request *req, struct intel_rps_client *rps) __attribute__((nonnull(1))); #define I915_WAIT_INTERRUPTIBLE BIT(0) +#define I915_WAIT_LOCKED BIT(1) /* struct_mutex held, handle GPU reset */ static inline u32 intel_engine_get_seqno(struct intel_engine_cs *engine); @@ -576,7 +577,9 @@ i915_gem_active_wait(const struct i915_gem_active *active, struct mutex *mutex) if (!request) return 0; - return i915_wait_request(request, I915_WAIT_INTERRUPTIBLE, NULL, NULL); + return i915_wait_request(request, + I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED, + NULL, NULL); } /** @@ -639,7 +642,9 @@ i915_gem_active_retire(struct i915_gem_active *active, if (!request) return 0; - ret = i915_wait_request(request, I915_WAIT_INTERRUPTIBLE, NULL, NULL); + ret = i915_wait_request(request, + I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED, + NULL, NULL); if (ret) return ret; diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 3fd9cbdf2adb..0d1fb47f86c5 100644 --- 
a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -2521,7 +2521,9 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv) * pending state and not properly drop locks, resulting in * deadlocks with the reset work. */ + mutex_lock(&dev_priv->drm.struct_mutex); ret = i915_reset(dev_priv); + mutex_unlock(&dev_priv->drm.struct_mutex); intel_finish_reset(dev_priv); diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index a6e6f2c49299..fa4f3731be29 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -2223,14 +2223,12 @@ static int wait_for_space(struct drm_i915_gem_request *req, int bytes) if (WARN_ON(&target->ring_link == &ring->request_list)) return -ENOSPC; - ret = i915_wait_request(target, I915_WAIT_INTERRUPTIBLE, + ret = i915_wait_request(target, + I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED, NULL, NO_WAITBOOST); if (ret) return ret; - if (i915_reset_in_progress(&target->i915->gpu_error)) - return -EAGAIN; - i915_gem_request_retire_upto(target); intel_ring_update_space(ring);

[10/18] drm/i915: Perform a direct reset of the GPU from the waiter

Commit Message

Patch