Message ID | 20170717091141.23102-14-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 17/07/17 02:11, Chris Wilson wrote: > Triggering a GPU reset for one engine affects another, notably > corrupting the context status buffer (CSB) effectively losing track of > inflight requests. > > Adding a few printks: > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c > index ad41836fa5e5..a969456bc0fa 100644 > --- a/drivers/gpu/drm/i915/i915_drv.c > +++ b/drivers/gpu/drm/i915/i915_drv.c > @@ -1953,6 +1953,7 @@ int i915_reset_engine(struct intel_engine_cs *engine) > goto out; > } > > + pr_err("Resetting %s\n", engine->name); > ret = intel_gpu_reset(engine->i915, intel_engine_flag(engine)); > if (ret) { > /* If we fail here, we expect to fallback to a global reset */ > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c > index 716e5c9ea222..a72bc35d0870 100644 > --- a/drivers/gpu/drm/i915/intel_lrc.c > +++ b/drivers/gpu/drm/i915/intel_lrc.c > @@ -355,6 +355,7 @@ static void execlists_submit_ports(struct intel_engine_cs *engine) > execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); > port_set(&port[n], port_pack(rq, count)); > desc = execlists_update_context(rq); > + pr_err("%s: in (rq=%x) ctx=%d\n", engine->name, rq->global_seqno, upper_32_bits(desc)); > GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc)); > } else { > GEM_BUG_ON(!n); > @@ -594,9 +595,23 @@ static void intel_lrc_irq_handler(unsigned long data) > if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK)) > continue; > > + pr_err("%s: out CSB (%x head=%d, tail=%d), ctx=%d, rq=%d\n", > + engine->name, > + readl(csb_mmio), > + head, tail, > + readl(buf+2*head+1), > + port->context_id); > + > /* Check the context/desc id for this event matches */ > - GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) != > - port->context_id); > + if (readl(buf + 2 * head + 1) != port->context_id) { > + pr_err("%s: BUG CSB (%x head=%d, tail=%d), ctx=%d, rq=%d\n", > + engine->name, > + readl(csb_mmio), > + head, tail, > + readl(buf+2*head+1), > + port->context_id); > + BUG(); > + } > > rq = port_unpack(port, &count); > GEM_BUG_ON(count == 0); > > Results in: > > [ 6423.006602] Resetting rcs0 > [ 6423.009080] rcs0: in (rq=fffffe70) ctx=1 > [ 6423.009216] rcs0: in (rq=fffffe6f) ctx=3 > [ 6423.009542] rcs0: out CSB (2 head=1, tail=2), ctx=3, rq=3 > [ 6423.009619] Resetting bcs0 > [ 6423.009980] rcs0: BUG CSB (0 head=1, tail=2), ctx=0, rq=3 > > Note that this bug may be affect all machines and not just Broxton, > Broxton is just the first machine on which I have confirmed this bug. Hopefully this is just broxton being broxton... I think I already sent this, but anyway... Acked-by: Michel Thierry <michel.thierry@intel.com> > > Fixes: 142bc7d99bcf ("drm/i915: Modify error handler for per engine hang recovery") > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> > Cc: Michel Thierry <michel.thierry@intel.com> > --- > drivers/gpu/drm/i915/i915_pci.c | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c > index a1e6b696bcfa..09d97e0990b7 100644 > --- a/drivers/gpu/drm/i915/i915_pci.c > +++ b/drivers/gpu/drm/i915/i915_pci.c > @@ -398,6 +398,7 @@ static const struct intel_device_info intel_broxton_info = { > GEN9_LP_FEATURES, > .platform = INTEL_BROXTON, > .ddb_size = 512, > + .has_reset_engine = false, > }; > > static const struct intel_device_info intel_geminilake_info = { >
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index ad41836fa5e5..a969456bc0fa 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -1953,6 +1953,7 @@ int i915_reset_engine(struct intel_engine_cs *engine) goto out; } + pr_err("Resetting %s\n", engine->name); ret = intel_gpu_reset(engine->i915, intel_engine_flag(engine)); if (ret) { /* If we fail here, we expect to fallback to a global reset */ diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 716e5c9ea222..a72bc35d0870 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -355,6 +355,7 @@ static void execlists_submit_ports(struct intel_engine_cs *engine) execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); port_set(&port[n], port_pack(rq, count)); desc = execlists_update_context(rq); + pr_err("%s: in (rq=%x) ctx=%d\n", engine->name, rq->global_seqno, upper_32_bits(desc)); GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc)); } else { GEM_BUG_ON(!n); @@ -594,9 +595,23 @@ static void intel_lrc_irq_handler(unsigned long data) if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK)) continue; + pr_err("%s: out CSB (%x head=%d, tail=%d), ctx=%d, rq=%d\n", + engine->name, + readl(csb_mmio), + head, tail, + readl(buf+2*head+1), + port->context_id); + /* Check the context/desc id for this event matches */ - GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) != - port->context_id); + if (readl(buf + 2 * head + 1) != port->context_id) { + pr_err("%s: BUG CSB (%x head=%d, tail=%d), ctx=%d, rq=%d\n", + engine->name, + readl(csb_mmio), + head, tail, + readl(buf+2*head+1), + port->context_id); + BUG(); + } rq = port_unpack(port, &count); GEM_BUG_ON(count == 0); Results in: [ 6423.006602] Resetting rcs0 [ 6423.009080] rcs0: in (rq=fffffe70) ctx=1 [ 6423.009216] rcs0: in (rq=fffffe6f) ctx=3 [ 6423.009542] rcs0: out CSB (2 head=1, tail=2), ctx=3, rq=3 [ 6423.009619] Resetting bcs0 [ 6423.009980] rcs0: BUG CSB (0 head=1, tail=2), ctx=0, rq=3 Note that this bug may be affect all machines and not just Broxton, Broxton is just the first machine on which I have confirmed this bug. Fixes: 142bc7d99bcf ("drm/i915: Modify error handler for per engine hang recovery") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> Cc: Michel Thierry <michel.thierry@intel.com> --- drivers/gpu/drm/i915/i915_pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c index a1e6b696bcfa..09d97e0990b7 100644 --- a/drivers/gpu/drm/i915/i915_pci.c +++ b/drivers/gpu/drm/i915/i915_pci.c @@ -398,6 +398,7 @@ static const struct intel_device_info intel_broxton_info = { GEN9_LP_FEATURES, .platform = INTEL_BROXTON, .ddb_size = 512, + .has_reset_engine = false, }; static const struct intel_device_info intel_geminilake_info = {