diff mbox

[14/15] drm/i915: Disable per-engine reset for Broxton

Message ID 20170717091141.23102-14-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson July 17, 2017, 9:11 a.m. UTC
Triggering a GPU reset for one engine affects another, notably
corrupting the context status buffer (CSB) and so effectively losing
track of in-flight requests.

Adding a few printks:

Comments

Michel Thierry July 17, 2017, 11:57 p.m. UTC | #1
On 17/07/17 02:11, Chris Wilson wrote:
> Triggering a GPU reset for one engine affects another, notably
> corrupting the context status buffer (CSB) effectively losing track of
> inflight requests.
> 
> Adding a few printks:
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index ad41836fa5e5..a969456bc0fa 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1953,6 +1953,7 @@ int i915_reset_engine(struct intel_engine_cs *engine)
>                  goto out;
>          }
> 
> +       pr_err("Resetting %s\n", engine->name);
>          ret = intel_gpu_reset(engine->i915, intel_engine_flag(engine));
>          if (ret) {
>                  /* If we fail here, we expect to fallback to a global reset */
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 716e5c9ea222..a72bc35d0870 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -355,6 +355,7 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>                                  execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
>                          port_set(&port[n], port_pack(rq, count));
>                          desc = execlists_update_context(rq);
> +                       pr_err("%s: in (rq=%x) ctx=%d\n", engine->name, rq->global_seqno, upper_32_bits(desc));
>                          GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
>                  } else {
>                          GEM_BUG_ON(!n);
> @@ -594,9 +595,23 @@ static void intel_lrc_irq_handler(unsigned long data)
>                          if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
>                                  continue;
> 
> +                       pr_err("%s: out CSB (%x head=%d, tail=%d), ctx=%d, rq=%d\n",
> +                                       engine->name,
> +                                       readl(csb_mmio),
> +                                       head, tail,
> +                                       readl(buf+2*head+1),
> +                                       port->context_id);
> +
>                          /* Check the context/desc id for this event matches */
> -                       GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
> -                                        port->context_id);
> +                       if (readl(buf + 2 * head + 1) != port->context_id) {
> +                               pr_err("%s: BUG CSB (%x head=%d, tail=%d), ctx=%d, rq=%d\n",
> +                                               engine->name,
> +                                               readl(csb_mmio),
> +                                               head, tail,
> +                                               readl(buf+2*head+1),
> +                                               port->context_id);
> +                               BUG();
> +                       }
> 
>                          rq = port_unpack(port, &count);
>                          GEM_BUG_ON(count == 0);
> 
> Results in:
> 
> [ 6423.006602] Resetting rcs0
> [ 6423.009080] rcs0: in (rq=fffffe70) ctx=1
> [ 6423.009216] rcs0: in (rq=fffffe6f) ctx=3
> [ 6423.009542] rcs0: out CSB (2 head=1, tail=2), ctx=3, rq=3
> [ 6423.009619] Resetting bcs0
> [ 6423.009980] rcs0: BUG CSB (0 head=1, tail=2), ctx=0, rq=3
> 
> Note that this bug may affect all machines and not just Broxton;
> Broxton is just the first machine on which I have confirmed this bug.

Hopefully this is just broxton being broxton... I think I already sent 
this, but anyway...

Acked-by: Michel Thierry <michel.thierry@intel.com>

> 
> Fixes: 142bc7d99bcf ("drm/i915: Modify error handler for per engine hang recovery")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_pci.c | 1 +
>   1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
> index a1e6b696bcfa..09d97e0990b7 100644
> --- a/drivers/gpu/drm/i915/i915_pci.c
> +++ b/drivers/gpu/drm/i915/i915_pci.c
> @@ -398,6 +398,7 @@ static const struct intel_device_info intel_broxton_info = {
>   	GEN9_LP_FEATURES,
>   	.platform = INTEL_BROXTON,
>   	.ddb_size = 512,
> +	.has_reset_engine = false,
>   };
>   
>   static const struct intel_device_info intel_geminilake_info = {
>
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index ad41836fa5e5..a969456bc0fa 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1953,6 +1953,7 @@  int i915_reset_engine(struct intel_engine_cs *engine)
                goto out;
        }

+       pr_err("Resetting %s\n", engine->name);
        ret = intel_gpu_reset(engine->i915, intel_engine_flag(engine));
        if (ret) {
                /* If we fail here, we expect to fallback to a global reset */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 716e5c9ea222..a72bc35d0870 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -355,6 +355,7 @@  static void execlists_submit_ports(struct intel_engine_cs *engine)
                                execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
                        port_set(&port[n], port_pack(rq, count));
                        desc = execlists_update_context(rq);
+                       pr_err("%s: in (rq=%x) ctx=%d\n", engine->name, rq->global_seqno, upper_32_bits(desc));
                        GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
                } else {
                        GEM_BUG_ON(!n);
@@ -594,9 +595,23 @@  static void intel_lrc_irq_handler(unsigned long data)
                        if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
                                continue;

+                       pr_err("%s: out CSB (%x head=%d, tail=%d), ctx=%d, rq=%d\n",
+                                       engine->name,
+                                       readl(csb_mmio),
+                                       head, tail,
+                                       readl(buf+2*head+1),
+                                       port->context_id);
+
                        /* Check the context/desc id for this event matches */
-                       GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
-                                        port->context_id);
+                       if (readl(buf + 2 * head + 1) != port->context_id) {
+                               pr_err("%s: BUG CSB (%x head=%d, tail=%d), ctx=%d, rq=%d\n",
+                                               engine->name,
+                                               readl(csb_mmio),
+                                               head, tail,
+                                               readl(buf+2*head+1),
+                                               port->context_id);
+                               BUG();
+                       }

                        rq = port_unpack(port, &count);
                        GEM_BUG_ON(count == 0);

Results in:

[ 6423.006602] Resetting rcs0
[ 6423.009080] rcs0: in (rq=fffffe70) ctx=1
[ 6423.009216] rcs0: in (rq=fffffe6f) ctx=3
[ 6423.009542] rcs0: out CSB (2 head=1, tail=2), ctx=3, rq=3
[ 6423.009619] Resetting bcs0
[ 6423.009980] rcs0: BUG CSB (0 head=1, tail=2), ctx=0, rq=3

Note that this bug may affect all machines and not just Broxton;
Broxton is just the first machine on which I have confirmed this bug.

Fixes: 142bc7d99bcf ("drm/i915: Modify error handler for per engine hang recovery")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
---
 drivers/gpu/drm/i915/i915_pci.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
index a1e6b696bcfa..09d97e0990b7 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -398,6 +398,7 @@  static const struct intel_device_info intel_broxton_info = {
 	GEN9_LP_FEATURES,
 	.platform = INTEL_BROXTON,
 	.ddb_size = 512,
+	.has_reset_engine = false,
 };
 
 static const struct intel_device_info intel_geminilake_info = {