diff mbox series

[3/4] drm/i915/gt: Apply the CSB w/a for all

Message ID 20200915124150.12045-3-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [1/4] drm/i915/gt: Widen CSB pointer to u64 for the parsers | expand

Commit Message

Chris Wilson Sept. 15, 2020, 12:41 p.m. UTC
Since we expect to inline the csb_parse() routines, the w/a for the
stale CSB data on Tigerlake will be pulled into process_csb(), and so we
might as well simply reuse the logic for all, and so will hopefully
avoid any strange behaviour on Icelake that was not covered by our
previous w/a.

References: d8f505311717 ("drm/i915/icl: Forcibly evict stale csb entries")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Bruce Chang <yu.bruce.chang@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_lrc.c | 79 +++++++++++++++++++----------
 1 file changed, 51 insertions(+), 28 deletions(-)

Comments

Mika Kuoppala Sept. 15, 2020, 1:29 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Since we expect to inline the csb_parse() routines, the w/a for the
> stale CSB data on Tigerlake will be pulled into process_csb(), and so we
> might as well simply reuse the logic for all, and so will hopefully
> avoid any strange behaviour on Icelake that was not covered by our
> previous w/a.
>
> References: d8f505311717 ("drm/i915/icl: Forcibly evict stale csb entries")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Bruce Chang <yu.bruce.chang@intel.com>

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> ---
>  drivers/gpu/drm/i915/gt/intel_lrc.c | 79 +++++++++++++++++++----------
>  1 file changed, 51 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index d75712a503b7..fcb6ec3d55f4 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -2496,25 +2496,11 @@ invalidate_csb_entries(const u64 *first, const u64 *last)
>   *     bits 47-57: sw context id of the lrc the GT switched away from
>   *     bits 58-63: sw counter of the lrc the GT switched away from
>   */
> -static inline bool gen12_csb_parse(const u64 *csb)
> +static inline bool gen12_csb_parse(const u64 csb)
>  {
> -	bool ctx_away_valid;
> -	bool new_queue;
> -	u64 entry;
> -
> -	/* HSD#22011248461 */
> -	entry = READ_ONCE(*csb);
> -	if (unlikely(entry == -1)) {
> -		preempt_disable();
> -		if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50))
> -			GEM_WARN_ON("50us CSB timeout");
> -		preempt_enable();
> -	}
> -	WRITE_ONCE(*(u64 *)csb, -1);
> -
> -	ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry));
> -	new_queue =
> -		lower_32_bits(entry) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
> +	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(csb));
> +	bool new_queue =
> +		lower_32_bits(csb) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
>  
>  	/*
>  	 * The context switch detail is not guaranteed to be 5 when a preemption
> @@ -2524,7 +2510,7 @@ static inline bool gen12_csb_parse(const u64 *csb)
>  	 * would require some extra handling, but we don't support that.
>  	 */
>  	if (!ctx_away_valid || new_queue) {
> -		GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(entry)));
> +		GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(csb)));
>  		return true;
>  	}
>  
> @@ -2533,19 +2519,56 @@ static inline bool gen12_csb_parse(const u64 *csb)
>  	 * context switch on an unsuccessful wait instruction since we always
>  	 * use polling mode.
>  	 */
> -	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(entry)));
> +	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(csb)));
>  	return false;
>  }
>  
> -static inline bool gen8_csb_parse(const u64 *csb)
> +static inline bool gen8_csb_parse(const u64 csb)
> +{
> +	return csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
> +}
> +
> +static noinline u64 wa_csb_read(u64 * const csb)
>  {
> -	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
> +	u64 entry;
> +
> +	preempt_disable();
> +	if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50))
> +		GEM_WARN_ON("50us CSB timeout");
> +	preempt_enable();
> +
> +	return entry;
> +}
> +
> +static inline u64 csb_read(u64 * const csb)
> +{
> +	u64 entry = READ_ONCE(*csb);
> +
> +	/*
> +	 * Unfortunately, the GPU does not always serialise its write
> +	 * of the CSB entries before its write of the CSB pointer, at least
> +	 * from the perspective of the CPU, using what is known as a Global
> +	 * Observation Point. We may read a new CSB tail pointer, but then
> +	 * read the stale CSB entries, causing us to misinterpret the
> +	 * context-switch events, and eventually declare the GPU hung.
> +	 *
> +	 * icl:HSDES#1806554093
> +	 * tgl:HSDES#22011248461
> +	 */
> +	if (unlikely(entry == -1))
> +		entry = wa_csb_read(csb);
> +
> +	/* Consume this entry so that we can spot its future reuse. */
> +	WRITE_ONCE(*csb, -1);
> +
> +	/* ELSP is an implicit wmb() before the GPU wraps and overwrites csb */
> +	return entry;
>  }
>  
>  static void process_csb(struct intel_engine_cs *engine)
>  {
>  	struct intel_engine_execlists * const execlists = &engine->execlists;
> -	const u64 * const buf = execlists->csb_status;
> +	u64 * const buf = execlists->csb_status;
>  	const u8 num_entries = execlists->csb_size;
>  	u8 head, tail;
>  
> @@ -2603,6 +2626,7 @@ static void process_csb(struct intel_engine_cs *engine)
>  	rmb();
>  	do {
>  		bool promote;
> +		u64 csb;
>  
>  		if (++head == num_entries)
>  			head = 0;
> @@ -2625,15 +2649,14 @@ static void process_csb(struct intel_engine_cs *engine)
>  		 * status notifier.
>  		 */
>  
> +		csb = csb_read(buf + head);
>  		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
> -			     head,
> -			     upper_32_bits(buf[head]),
> -			     lower_32_bits(buf[head]));
> +			     head, upper_32_bits(csb), lower_32_bits(csb));
>  
>  		if (INTEL_GEN(engine->i915) >= 12)
> -			promote = gen12_csb_parse(buf + head);
> +			promote = gen12_csb_parse(csb);
>  		else
> -			promote = gen8_csb_parse(buf + head);
> +			promote = gen8_csb_parse(csb);
>  		if (promote) {
>  			struct i915_request * const *old = execlists->active;
>  
> -- 
> 2.20.1
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index d75712a503b7..fcb6ec3d55f4 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2496,25 +2496,11 @@  invalidate_csb_entries(const u64 *first, const u64 *last)
  *     bits 47-57: sw context id of the lrc the GT switched away from
  *     bits 58-63: sw counter of the lrc the GT switched away from
  */
-static inline bool gen12_csb_parse(const u64 *csb)
+static inline bool gen12_csb_parse(const u64 csb)
 {
-	bool ctx_away_valid;
-	bool new_queue;
-	u64 entry;
-
-	/* HSD#22011248461 */
-	entry = READ_ONCE(*csb);
-	if (unlikely(entry == -1)) {
-		preempt_disable();
-		if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50))
-			GEM_WARN_ON("50us CSB timeout");
-		preempt_enable();
-	}
-	WRITE_ONCE(*(u64 *)csb, -1);
-
-	ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry));
-	new_queue =
-		lower_32_bits(entry) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
+	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(csb));
+	bool new_queue =
+		lower_32_bits(csb) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
 
 	/*
 	 * The context switch detail is not guaranteed to be 5 when a preemption
@@ -2524,7 +2510,7 @@  static inline bool gen12_csb_parse(const u64 *csb)
 	 * would require some extra handling, but we don't support that.
 	 */
 	if (!ctx_away_valid || new_queue) {
-		GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(entry)));
+		GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(csb)));
 		return true;
 	}
 
@@ -2533,19 +2519,56 @@  static inline bool gen12_csb_parse(const u64 *csb)
 	 * context switch on an unsuccessful wait instruction since we always
 	 * use polling mode.
 	 */
-	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(entry)));
+	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(csb)));
 	return false;
 }
 
-static inline bool gen8_csb_parse(const u64 *csb)
+static inline bool gen8_csb_parse(const u64 csb)
+{
+	return csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
+}
+
+static noinline u64 wa_csb_read(u64 * const csb)
 {
-	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
+	u64 entry;
+
+	preempt_disable();
+	if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50))
+		GEM_WARN_ON("50us CSB timeout");
+	preempt_enable();
+
+	return entry;
+}
+
+static inline u64 csb_read(u64 * const csb)
+{
+	u64 entry = READ_ONCE(*csb);
+
+	/*
+	 * Unfortunately, the GPU does not always serialise its write
+	 * of the CSB entries before its write of the CSB pointer, at least
+	 * from the perspective of the CPU, using what is known as a Global
+	 * Observation Point. We may read a new CSB tail pointer, but then
+	 * read the stale CSB entries, causing us to misinterpret the
+	 * context-switch events, and eventually declare the GPU hung.
+	 *
+	 * icl:HSDES#1806554093
+	 * tgl:HSDES#22011248461
+	 */
+	if (unlikely(entry == -1))
+		entry = wa_csb_read(csb);
+
+	/* Consume this entry so that we can spot its future reuse. */
+	WRITE_ONCE(*csb, -1);
+
+	/* ELSP is an implicit wmb() before the GPU wraps and overwrites csb */
+	return entry;
 }
 
 static void process_csb(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
-	const u64 * const buf = execlists->csb_status;
+	u64 * const buf = execlists->csb_status;
 	const u8 num_entries = execlists->csb_size;
 	u8 head, tail;
 
@@ -2603,6 +2626,7 @@  static void process_csb(struct intel_engine_cs *engine)
 	rmb();
 	do {
 		bool promote;
+		u64 csb;
 
 		if (++head == num_entries)
 			head = 0;
@@ -2625,15 +2649,14 @@  static void process_csb(struct intel_engine_cs *engine)
 		 * status notifier.
 		 */
 
+		csb = csb_read(buf + head);
 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
-			     head,
-			     upper_32_bits(buf[head]),
-			     lower_32_bits(buf[head]));
+			     head, upper_32_bits(csb), lower_32_bits(csb));
 
 		if (INTEL_GEN(engine->i915) >= 12)
-			promote = gen12_csb_parse(buf + head);
+			promote = gen12_csb_parse(csb);
 		else
-			promote = gen8_csb_parse(buf + head);
+			promote = gen8_csb_parse(csb);
 		if (promote) {
 			struct i915_request * const *old = execlists->active;