drm/i915/gt: Be defensive in the face of false CS events
diff mbox series

Message ID 20200710120717.32484-1-chris@chris-wilson.co.uk
State New
Headers show
Series
  • drm/i915/gt: Be defensive in the face of false CS events
Related show

Commit Message

Chris Wilson July 10, 2020, 12:07 p.m. UTC
If the HW throws a curve ball and reports either an event before it is
possible, or just a completely impossible event, we have to grin and
bear it. The first few events, we will likely not notice as we would be
expecting some event, but as soon as we stop expecting an event and yet
they still keep coming, then we enter into undefined state territory.
In which case, bail out, stop processing the events, and reset the
engine and our set of queued requests to recover.

The sporadic hangs and warnings will continue to plague CI, but at least
system stability should not be compromised.

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2045
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_lrc.c | 8 ++++++--
 drivers/gpu/drm/i915/i915_gem.h     | 2 ++
 2 files changed, 8 insertions(+), 2 deletions(-)

Comments

Chris Wilson July 10, 2020, 12:15 p.m. UTC | #1
Quoting Chris Wilson (2020-07-10 13:07:17)
> If the HW throws a curve ball and reports either en event before it is
> possible, or just a completely impossible event, we have to grin and
> bear it. The first few events, we will likely not notice as we would be
> expecting some event, but as soon as we stop expecting an event and yet
> they still keep coming, then we enter into undefined state terrority.
> In which case, bail out, stop processing the events, and reset the
> engine and our set of queued requests to recover.
> 
> The sporadic hangs and warnings will continue to plague CI, but at least
> system stability should not be compromised.
> 
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2045
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/intel_lrc.c | 8 ++++++--
>  drivers/gpu/drm/i915/i915_gem.h     | 2 ++
>  2 files changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index fbcfeaed6441..f22cf8ed47ac 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -2567,6 +2567,7 @@ static void process_csb(struct intel_engine_cs *engine)
>         tail = READ_ONCE(*execlists->csb_write);
>         if (unlikely(head == tail))
>                 return;
> +       execlists->csb_head = tail;
>  
>         /*
>          * Hopefully paired with a wmb() in HW!
> @@ -2613,6 +2614,9 @@ static void process_csb(struct intel_engine_cs *engine)
>                 if (promote) {
>                         struct i915_request * const *old = execlists->active;
>  
> +                       if (GEM_WARN_ON_ONCE(!*execlists->pending))

I wonder if we should just default GEM_WARN_ON to be GEM_WARN_ON_ONCE;
CI reboots after a warning, so the spam is unhelpful.
-Chris

Patch
diff mbox series

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index fbcfeaed6441..f22cf8ed47ac 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2567,6 +2567,7 @@  static void process_csb(struct intel_engine_cs *engine)
 	tail = READ_ONCE(*execlists->csb_write);
 	if (unlikely(head == tail))
 		return;
+	execlists->csb_head = tail;
 
 	/*
 	 * Hopefully paired with a wmb() in HW!
@@ -2613,6 +2614,9 @@  static void process_csb(struct intel_engine_cs *engine)
 		if (promote) {
 			struct i915_request * const *old = execlists->active;
 
+			if (GEM_WARN_ON_ONCE(!*execlists->pending))
+				break;
+
 			ring_set_paused(engine, 0);
 
 			/* Point active to the new ELSP; prevent overwriting */
@@ -2635,7 +2639,8 @@  static void process_csb(struct intel_engine_cs *engine)
 
 			WRITE_ONCE(execlists->pending[0], NULL);
 		} else {
-			GEM_BUG_ON(!*execlists->active);
+			if (GEM_WARN_ON_ONCE(!*execlists->active))
+				break;
 
 			/* port0 completed, advanced to port1 */
 			trace_ports(execlists, "completed", execlists->active);
@@ -2686,7 +2691,6 @@  static void process_csb(struct intel_engine_cs *engine)
 		}
 	} while (head != tail);
 
-	execlists->csb_head = head;
 	set_timeslice(engine);
 
 	/*
diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h
index f333e88a2b6e..c0c689fa3f19 100644
--- a/drivers/gpu/drm/i915/i915_gem.h
+++ b/drivers/gpu/drm/i915/i915_gem.h
@@ -46,6 +46,7 @@  struct drm_i915_private;
 		} \
 	} while(0)
 #define GEM_WARN_ON(expr) WARN_ON(expr)
+#define GEM_WARN_ON_ONCE(expr) WARN_ON_ONCE(expr)
 
 #define GEM_DEBUG_DECL(var) var
 #define GEM_DEBUG_EXEC(expr) expr
@@ -58,6 +59,7 @@  struct drm_i915_private;
 
 #define GEM_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr)
 #define GEM_WARN_ON(expr) ({ unlikely(!!(expr)); })
+#define GEM_WARN_ON_ONCE(expr) ({ unlikely(!!(expr)); })
 
 #define GEM_DEBUG_DECL(var)
 #define GEM_DEBUG_EXEC(expr) do { } while (0)