diff mbox

[2/2] drm/i915: Flush the RING stop bit after clearing RING_HEAD in reset

Message ID 20180517142442.16979-2-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson May 17, 2018, 2:24 p.m. UTC
Inside the live_hangcheck (reset) selftests, we occasionally see
failures like

<7>[  239.094840] i915_gem_set_wedged rcs0
<7>[  239.094843] i915_gem_set_wedged 	current seqno 19a98, last 19a9a, hangcheck 0 [5158 ms]
<7>[  239.094846] i915_gem_set_wedged 	Reset count: 6239 (global 1)
<7>[  239.094848] i915_gem_set_wedged 	Requests:
<7>[  239.095052] i915_gem_set_wedged 		first  19a99 [e8c:5f] prio=1024 @ 5159ms: (null)
<7>[  239.095056] i915_gem_set_wedged 		last   19a9a [e81:1a] prio=139 @ 5159ms: igt/rcs0[5977]/1
<7>[  239.095059] i915_gem_set_wedged 		active 19a99 [e8c:5f] prio=1024 @ 5159ms: (null)
<7>[  239.095062] i915_gem_set_wedged 		[head 0220, postfix 0280, tail 02a8, batch 0xffffffff_ffffffff]
<7>[  239.100050] i915_gem_set_wedged 		ring->start:  0x00283000
<7>[  239.100053] i915_gem_set_wedged 		ring->head:   0x000001f8
<7>[  239.100055] i915_gem_set_wedged 		ring->tail:   0x000002a8
<7>[  239.100057] i915_gem_set_wedged 		ring->emit:   0x000002a8
<7>[  239.100059] i915_gem_set_wedged 		ring->space:  0x00000f10
<7>[  239.100085] i915_gem_set_wedged 	RING_START: 0x00283000
<7>[  239.100088] i915_gem_set_wedged 	RING_HEAD:  0x00000260
<7>[  239.100091] i915_gem_set_wedged 	RING_TAIL:  0x000002a8
<7>[  239.100094] i915_gem_set_wedged 	RING_CTL:   0x00000001
<7>[  239.100097] i915_gem_set_wedged 	RING_MODE:  0x00000300 [idle]
<7>[  239.100100] i915_gem_set_wedged 	RING_IMR: fffffefe
<7>[  239.100104] i915_gem_set_wedged 	ACTHD:  0x00000000_0000609c
<7>[  239.100108] i915_gem_set_wedged 	BBADDR: 0x00000000_0000609d
<7>[  239.100111] i915_gem_set_wedged 	DMA_FADDR: 0x00000000_00283260
<7>[  239.100114] i915_gem_set_wedged 	IPEIR: 0x00000000
<7>[  239.100117] i915_gem_set_wedged 	IPEHR: 0x02800000
<7>[  239.100120] i915_gem_set_wedged 	Execlist status: 0x00044052 00000002
<7>[  239.100124] i915_gem_set_wedged 	Execlist CSB read 5 [5 cached], write 5 [5 from hws], interrupt posted? no, tasklet queued? no (enabled)
<7>[  239.100128] i915_gem_set_wedged 		ELSP[0] count=1, ring->start=00283000, rq: 19a99 [e8c:5f] prio=1024 @ 5164ms: (null)
<7>[  239.100132] i915_gem_set_wedged 		ELSP[1] count=1, ring->start=00257000, rq: 19a9a [e81:1a] prio=139 @ 5164ms: igt/rcs0[5977]/1
<7>[  239.100135] i915_gem_set_wedged 		HW active? 0x5
<7>[  239.100250] i915_gem_set_wedged 		E 19a99 [e8c:5f] prio=1024 @ 5164ms: (null)
<7>[  239.100338] i915_gem_set_wedged 		E 19a9a [e81:1a] prio=139 @ 5164ms: igt/rcs0[5977]/1
<7>[  239.100340] i915_gem_set_wedged 		Queue priority: 139
<7>[  239.100343] i915_gem_set_wedged 		Q 0 [e98:19] prio=132 @ 5164ms: igt/rcs0[5977]/8
<7>[  239.100346] i915_gem_set_wedged 		Q 0 [e84:19] prio=121 @ 5165ms: igt/rcs0[5977]/2
<7>[  239.100349] i915_gem_set_wedged 		Q 0 [e87:19] prio=82 @ 5165ms: igt/rcs0[5977]/3
<7>[  239.100352] i915_gem_set_wedged 		Q 0 [e84:1a] prio=44 @ 5164ms: igt/rcs0[5977]/2
<7>[  239.100356] i915_gem_set_wedged 		Q 0 [e8b:19] prio=20 @ 5165ms: igt/rcs0[5977]/4
<7>[  239.100362] i915_gem_set_wedged 	drv_selftest [5894] waiting for 19a99

where the GPU saw an arbitration point and idles; AND HAS NOT BEEN RESET!
The RING_MODE indicates that it is idle and has the STOP_RING bit set, so
try clearing it.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_uncore.c | 2 ++
 1 file changed, 2 insertions(+)

Comments

Tvrtko Ursulin May 18, 2018, 9:33 a.m. UTC | #1
On 17/05/2018 15:24, Chris Wilson wrote:
> Inside the live_hangcheck (reset) selftests, we occasionally see
> failures like
> 
> <7>[  239.094840] i915_gem_set_wedged rcs0
> <7>[  239.094843] i915_gem_set_wedged 	current seqno 19a98, last 19a9a, hangcheck 0 [5158 ms]
> <7>[  239.094846] i915_gem_set_wedged 	Reset count: 6239 (global 1)
> <7>[  239.094848] i915_gem_set_wedged 	Requests:
> <7>[  239.095052] i915_gem_set_wedged 		first  19a99 [e8c:5f] prio=1024 @ 5159ms: (null)
> <7>[  239.095056] i915_gem_set_wedged 		last   19a9a [e81:1a] prio=139 @ 5159ms: igt/rcs0[5977]/1
> <7>[  239.095059] i915_gem_set_wedged 		active 19a99 [e8c:5f] prio=1024 @ 5159ms: (null)
> <7>[  239.095062] i915_gem_set_wedged 		[head 0220, postfix 0280, tail 02a8, batch 0xffffffff_ffffffff]
> <7>[  239.100050] i915_gem_set_wedged 		ring->start:  0x00283000
> <7>[  239.100053] i915_gem_set_wedged 		ring->head:   0x000001f8
> <7>[  239.100055] i915_gem_set_wedged 		ring->tail:   0x000002a8
> <7>[  239.100057] i915_gem_set_wedged 		ring->emit:   0x000002a8
> <7>[  239.100059] i915_gem_set_wedged 		ring->space:  0x00000f10
> <7>[  239.100085] i915_gem_set_wedged 	RING_START: 0x00283000
> <7>[  239.100088] i915_gem_set_wedged 	RING_HEAD:  0x00000260
> <7>[  239.100091] i915_gem_set_wedged 	RING_TAIL:  0x000002a8
> <7>[  239.100094] i915_gem_set_wedged 	RING_CTL:   0x00000001
> <7>[  239.100097] i915_gem_set_wedged 	RING_MODE:  0x00000300 [idle]
> <7>[  239.100100] i915_gem_set_wedged 	RING_IMR: fffffefe
> <7>[  239.100104] i915_gem_set_wedged 	ACTHD:  0x00000000_0000609c
> <7>[  239.100108] i915_gem_set_wedged 	BBADDR: 0x00000000_0000609d
> <7>[  239.100111] i915_gem_set_wedged 	DMA_FADDR: 0x00000000_00283260
> <7>[  239.100114] i915_gem_set_wedged 	IPEIR: 0x00000000
> <7>[  239.100117] i915_gem_set_wedged 	IPEHR: 0x02800000
> <7>[  239.100120] i915_gem_set_wedged 	Execlist status: 0x00044052 00000002
> <7>[  239.100124] i915_gem_set_wedged 	Execlist CSB read 5 [5 cached], write 5 [5 from hws], interrupt posted? no, tasklet queued? no (enabled)
> <7>[  239.100128] i915_gem_set_wedged 		ELSP[0] count=1, ring->start=00283000, rq: 19a99 [e8c:5f] prio=1024 @ 5164ms: (null)
> <7>[  239.100132] i915_gem_set_wedged 		ELSP[1] count=1, ring->start=00257000, rq: 19a9a [e81:1a] prio=139 @ 5164ms: igt/rcs0[5977]/1
> <7>[  239.100135] i915_gem_set_wedged 		HW active? 0x5
> <7>[  239.100250] i915_gem_set_wedged 		E 19a99 [e8c:5f] prio=1024 @ 5164ms: (null)
> <7>[  239.100338] i915_gem_set_wedged 		E 19a9a [e81:1a] prio=139 @ 5164ms: igt/rcs0[5977]/1
> <7>[  239.100340] i915_gem_set_wedged 		Queue priority: 139
> <7>[  239.100343] i915_gem_set_wedged 		Q 0 [e98:19] prio=132 @ 5164ms: igt/rcs0[5977]/8
> <7>[  239.100346] i915_gem_set_wedged 		Q 0 [e84:19] prio=121 @ 5165ms: igt/rcs0[5977]/2
> <7>[  239.100349] i915_gem_set_wedged 		Q 0 [e87:19] prio=82 @ 5165ms: igt/rcs0[5977]/3
> <7>[  239.100352] i915_gem_set_wedged 		Q 0 [e84:1a] prio=44 @ 5164ms: igt/rcs0[5977]/2
> <7>[  239.100356] i915_gem_set_wedged 		Q 0 [e8b:19] prio=20 @ 5165ms: igt/rcs0[5977]/4
> <7>[  239.100362] i915_gem_set_wedged 	drv_selftest [5894] waiting for 19a99
> 
> where the GPU saw an arbitration point and idles; AND HAS NOT BEEN RESET!
> The RING_MODE indicates that is idle and has the STOP_RING bit set, so
> try clearing it.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/intel_uncore.c | 2 ++
>   1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index b36a3b5736a0..082b0045ac8c 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1720,6 +1720,8 @@ static void gen3_stop_engine(struct intel_engine_cs *engine)
>   	if (I915_READ_FW(RING_HEAD(base)) != 0)
>   		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
>   				 engine->name);
> +
> +	I915_WRITE_FW(RING_MI_MODE(base), _MASKED_BIT_DISABLE(STOP_RING));
>   }
>   
>   static void i915_stop_engines(struct drm_i915_private *dev_priv,
> 

Right, so the expectation is that after reset STOP_RING will not be set, but it 
sometimes is?

Should we also add a notice or info if it is set in intel_gpu_reset, 
after the reset is called? Could add i915_check_engine_running(..) 
helper or something.

Regards,

Tvrtko
Chris Wilson May 18, 2018, 9:47 a.m. UTC | #2
Quoting Tvrtko Ursulin (2018-05-18 10:33:44)
> 
> On 17/05/2018 15:24, Chris Wilson wrote:
> > Inside the live_hangcheck (reset) selftests, we occasionally see
> > failures like
> > 
> > <7>[  239.094840] i915_gem_set_wedged rcs0
> > <7>[  239.094843] i915_gem_set_wedged         current seqno 19a98, last 19a9a, hangcheck 0 [5158 ms]
> > <7>[  239.094846] i915_gem_set_wedged         Reset count: 6239 (global 1)
> > <7>[  239.094848] i915_gem_set_wedged         Requests:
> > <7>[  239.095052] i915_gem_set_wedged                 first  19a99 [e8c:5f] prio=1024 @ 5159ms: (null)
> > <7>[  239.095056] i915_gem_set_wedged                 last   19a9a [e81:1a] prio=139 @ 5159ms: igt/rcs0[5977]/1
> > <7>[  239.095059] i915_gem_set_wedged                 active 19a99 [e8c:5f] prio=1024 @ 5159ms: (null)
> > <7>[  239.095062] i915_gem_set_wedged                 [head 0220, postfix 0280, tail 02a8, batch 0xffffffff_ffffffff]
> > <7>[  239.100050] i915_gem_set_wedged                 ring->start:  0x00283000
> > <7>[  239.100053] i915_gem_set_wedged                 ring->head:   0x000001f8
> > <7>[  239.100055] i915_gem_set_wedged                 ring->tail:   0x000002a8
> > <7>[  239.100057] i915_gem_set_wedged                 ring->emit:   0x000002a8
> > <7>[  239.100059] i915_gem_set_wedged                 ring->space:  0x00000f10
> > <7>[  239.100085] i915_gem_set_wedged         RING_START: 0x00283000
> > <7>[  239.100088] i915_gem_set_wedged         RING_HEAD:  0x00000260
> > <7>[  239.100091] i915_gem_set_wedged         RING_TAIL:  0x000002a8
> > <7>[  239.100094] i915_gem_set_wedged         RING_CTL:   0x00000001
> > <7>[  239.100097] i915_gem_set_wedged         RING_MODE:  0x00000300 [idle]
> > <7>[  239.100100] i915_gem_set_wedged         RING_IMR: fffffefe
> > <7>[  239.100104] i915_gem_set_wedged         ACTHD:  0x00000000_0000609c
> > <7>[  239.100108] i915_gem_set_wedged         BBADDR: 0x00000000_0000609d
> > <7>[  239.100111] i915_gem_set_wedged         DMA_FADDR: 0x00000000_00283260
> > <7>[  239.100114] i915_gem_set_wedged         IPEIR: 0x00000000
> > <7>[  239.100117] i915_gem_set_wedged         IPEHR: 0x02800000
> > <7>[  239.100120] i915_gem_set_wedged         Execlist status: 0x00044052 00000002
> > <7>[  239.100124] i915_gem_set_wedged         Execlist CSB read 5 [5 cached], write 5 [5 from hws], interrupt posted? no, tasklet queued? no (enabled)
> > <7>[  239.100128] i915_gem_set_wedged                 ELSP[0] count=1, ring->start=00283000, rq: 19a99 [e8c:5f] prio=1024 @ 5164ms: (null)
> > <7>[  239.100132] i915_gem_set_wedged                 ELSP[1] count=1, ring->start=00257000, rq: 19a9a [e81:1a] prio=139 @ 5164ms: igt/rcs0[5977]/1
> > <7>[  239.100135] i915_gem_set_wedged                 HW active? 0x5
> > <7>[  239.100250] i915_gem_set_wedged                 E 19a99 [e8c:5f] prio=1024 @ 5164ms: (null)
> > <7>[  239.100338] i915_gem_set_wedged                 E 19a9a [e81:1a] prio=139 @ 5164ms: igt/rcs0[5977]/1
> > <7>[  239.100340] i915_gem_set_wedged                 Queue priority: 139
> > <7>[  239.100343] i915_gem_set_wedged                 Q 0 [e98:19] prio=132 @ 5164ms: igt/rcs0[5977]/8
> > <7>[  239.100346] i915_gem_set_wedged                 Q 0 [e84:19] prio=121 @ 5165ms: igt/rcs0[5977]/2
> > <7>[  239.100349] i915_gem_set_wedged                 Q 0 [e87:19] prio=82 @ 5165ms: igt/rcs0[5977]/3
> > <7>[  239.100352] i915_gem_set_wedged                 Q 0 [e84:1a] prio=44 @ 5164ms: igt/rcs0[5977]/2
> > <7>[  239.100356] i915_gem_set_wedged                 Q 0 [e8b:19] prio=20 @ 5165ms: igt/rcs0[5977]/4
> > <7>[  239.100362] i915_gem_set_wedged         drv_selftest [5894] waiting for 19a99
> > 
> > where the GPU saw an arbitration point and idles; AND HAS NOT BEEN RESET!
> > The RING_MODE indicates that is idle and has the STOP_RING bit set, so
> > try clearing it.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >   drivers/gpu/drm/i915/intel_uncore.c | 2 ++
> >   1 file changed, 2 insertions(+)
> > 
> > diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> > index b36a3b5736a0..082b0045ac8c 100644
> > --- a/drivers/gpu/drm/i915/intel_uncore.c
> > +++ b/drivers/gpu/drm/i915/intel_uncore.c
> > @@ -1720,6 +1720,8 @@ static void gen3_stop_engine(struct intel_engine_cs *engine)
> >       if (I915_READ_FW(RING_HEAD(base)) != 0)
> >               DRM_DEBUG_DRIVER("%s: ring head not parked\n",
> >                                engine->name);
> > +
> > +     I915_WRITE_FW(RING_MI_MODE(base), _MASKED_BIT_DISABLE(STOP_RING));
> >   }
> >   
> >   static void i915_stop_engines(struct drm_i915_private *dev_priv,
> > 
> 
> Right, so expectation is after reset STOP_RING will not be set, but it 
> sometimes is?

Yes.

> Should we also add a notice or info if it is set in intel_gpu_reset, 
> after the reset is called? Could add i915_check_engine_running(..) 
> helper or something.

Could do, will it be any more useful than the dump we give above? ;)
Might be interesting to do the dump on unusual state on takeover just
for reference in later fails.
-Chris
Tvrtko Ursulin May 18, 2018, 9:53 a.m. UTC | #3
On 18/05/2018 10:47, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-05-18 10:33:44)
>>
>> On 17/05/2018 15:24, Chris Wilson wrote:
>>> Inside the live_hangcheck (reset) selftests, we occasionally see
>>> failures like
>>>
>>> <7>[  239.094840] i915_gem_set_wedged rcs0
>>> <7>[  239.094843] i915_gem_set_wedged         current seqno 19a98, last 19a9a, hangcheck 0 [5158 ms]
>>> <7>[  239.094846] i915_gem_set_wedged         Reset count: 6239 (global 1)
>>> <7>[  239.094848] i915_gem_set_wedged         Requests:
>>> <7>[  239.095052] i915_gem_set_wedged                 first  19a99 [e8c:5f] prio=1024 @ 5159ms: (null)
>>> <7>[  239.095056] i915_gem_set_wedged                 last   19a9a [e81:1a] prio=139 @ 5159ms: igt/rcs0[5977]/1
>>> <7>[  239.095059] i915_gem_set_wedged                 active 19a99 [e8c:5f] prio=1024 @ 5159ms: (null)
>>> <7>[  239.095062] i915_gem_set_wedged                 [head 0220, postfix 0280, tail 02a8, batch 0xffffffff_ffffffff]
>>> <7>[  239.100050] i915_gem_set_wedged                 ring->start:  0x00283000
>>> <7>[  239.100053] i915_gem_set_wedged                 ring->head:   0x000001f8
>>> <7>[  239.100055] i915_gem_set_wedged                 ring->tail:   0x000002a8
>>> <7>[  239.100057] i915_gem_set_wedged                 ring->emit:   0x000002a8
>>> <7>[  239.100059] i915_gem_set_wedged                 ring->space:  0x00000f10
>>> <7>[  239.100085] i915_gem_set_wedged         RING_START: 0x00283000
>>> <7>[  239.100088] i915_gem_set_wedged         RING_HEAD:  0x00000260
>>> <7>[  239.100091] i915_gem_set_wedged         RING_TAIL:  0x000002a8
>>> <7>[  239.100094] i915_gem_set_wedged         RING_CTL:   0x00000001
>>> <7>[  239.100097] i915_gem_set_wedged         RING_MODE:  0x00000300 [idle]
>>> <7>[  239.100100] i915_gem_set_wedged         RING_IMR: fffffefe
>>> <7>[  239.100104] i915_gem_set_wedged         ACTHD:  0x00000000_0000609c
>>> <7>[  239.100108] i915_gem_set_wedged         BBADDR: 0x00000000_0000609d
>>> <7>[  239.100111] i915_gem_set_wedged         DMA_FADDR: 0x00000000_00283260
>>> <7>[  239.100114] i915_gem_set_wedged         IPEIR: 0x00000000
>>> <7>[  239.100117] i915_gem_set_wedged         IPEHR: 0x02800000
>>> <7>[  239.100120] i915_gem_set_wedged         Execlist status: 0x00044052 00000002
>>> <7>[  239.100124] i915_gem_set_wedged         Execlist CSB read 5 [5 cached], write 5 [5 from hws], interrupt posted? no, tasklet queued? no (enabled)
>>> <7>[  239.100128] i915_gem_set_wedged                 ELSP[0] count=1, ring->start=00283000, rq: 19a99 [e8c:5f] prio=1024 @ 5164ms: (null)
>>> <7>[  239.100132] i915_gem_set_wedged                 ELSP[1] count=1, ring->start=00257000, rq: 19a9a [e81:1a] prio=139 @ 5164ms: igt/rcs0[5977]/1
>>> <7>[  239.100135] i915_gem_set_wedged                 HW active? 0x5
>>> <7>[  239.100250] i915_gem_set_wedged                 E 19a99 [e8c:5f] prio=1024 @ 5164ms: (null)
>>> <7>[  239.100338] i915_gem_set_wedged                 E 19a9a [e81:1a] prio=139 @ 5164ms: igt/rcs0[5977]/1
>>> <7>[  239.100340] i915_gem_set_wedged                 Queue priority: 139
>>> <7>[  239.100343] i915_gem_set_wedged                 Q 0 [e98:19] prio=132 @ 5164ms: igt/rcs0[5977]/8
>>> <7>[  239.100346] i915_gem_set_wedged                 Q 0 [e84:19] prio=121 @ 5165ms: igt/rcs0[5977]/2
>>> <7>[  239.100349] i915_gem_set_wedged                 Q 0 [e87:19] prio=82 @ 5165ms: igt/rcs0[5977]/3
>>> <7>[  239.100352] i915_gem_set_wedged                 Q 0 [e84:1a] prio=44 @ 5164ms: igt/rcs0[5977]/2
>>> <7>[  239.100356] i915_gem_set_wedged                 Q 0 [e8b:19] prio=20 @ 5165ms: igt/rcs0[5977]/4
>>> <7>[  239.100362] i915_gem_set_wedged         drv_selftest [5894] waiting for 19a99
>>>
>>> where the GPU saw an arbitration point and idles; AND HAS NOT BEEN RESET!
>>> The RING_MODE indicates that is idle and has the STOP_RING bit set, so
>>> try clearing it.
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> ---
>>>    drivers/gpu/drm/i915/intel_uncore.c | 2 ++
>>>    1 file changed, 2 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
>>> index b36a3b5736a0..082b0045ac8c 100644
>>> --- a/drivers/gpu/drm/i915/intel_uncore.c
>>> +++ b/drivers/gpu/drm/i915/intel_uncore.c
>>> @@ -1720,6 +1720,8 @@ static void gen3_stop_engine(struct intel_engine_cs *engine)
>>>        if (I915_READ_FW(RING_HEAD(base)) != 0)
>>>                DRM_DEBUG_DRIVER("%s: ring head not parked\n",
>>>                                 engine->name);
>>> +
>>> +     I915_WRITE_FW(RING_MI_MODE(base), _MASKED_BIT_DISABLE(STOP_RING));
>>>    }
>>>    
>>>    static void i915_stop_engines(struct drm_i915_private *dev_priv,
>>>
>>
>> Right, so expectation is after reset STOP_RING will not be set, but it
>> sometimes is?
> 
> Yes.
> 
>> Should we also add a notice or info if it is set in intel_gpu_reset,
>> after the reset is called? Could add i915_check_engine_running(..)
>> helper or something.
> 
> Could do, will be any more useful than the dump we give above? ;)

I think so - it would immediately and clearly say reset did not go to 
plan and bad things could follow. (While the hangcheck dump above 
requires one to think and analyse.)

Also, is a stuck STOP_RING bit the only thing which goes wrong or could 
there be more weirdness under the covers?

> Might be interesting to do the dump on unusual state on takeover just
> for reference in later fails.

Takeover as in when initializing the engines?

Regards,

Tvrtko
Tvrtko Ursulin May 18, 2018, 10:02 a.m. UTC | #4
On 18/05/2018 10:47, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-05-18 10:33:44)
>>
>> On 17/05/2018 15:24, Chris Wilson wrote:
>>> Inside the live_hangcheck (reset) selftests, we occasionally see
>>> failures like
>>>
>>> <7>[  239.094840] i915_gem_set_wedged rcs0
>>> <7>[  239.094843] i915_gem_set_wedged         current seqno 19a98, last 19a9a, hangcheck 0 [5158 ms]
>>> <7>[  239.094846] i915_gem_set_wedged         Reset count: 6239 (global 1)
>>> <7>[  239.094848] i915_gem_set_wedged         Requests:
>>> <7>[  239.095052] i915_gem_set_wedged                 first  19a99 [e8c:5f] prio=1024 @ 5159ms: (null)
>>> <7>[  239.095056] i915_gem_set_wedged                 last   19a9a [e81:1a] prio=139 @ 5159ms: igt/rcs0[5977]/1
>>> <7>[  239.095059] i915_gem_set_wedged                 active 19a99 [e8c:5f] prio=1024 @ 5159ms: (null)
>>> <7>[  239.095062] i915_gem_set_wedged                 [head 0220, postfix 0280, tail 02a8, batch 0xffffffff_ffffffff]
>>> <7>[  239.100050] i915_gem_set_wedged                 ring->start:  0x00283000
>>> <7>[  239.100053] i915_gem_set_wedged                 ring->head:   0x000001f8
>>> <7>[  239.100055] i915_gem_set_wedged                 ring->tail:   0x000002a8
>>> <7>[  239.100057] i915_gem_set_wedged                 ring->emit:   0x000002a8
>>> <7>[  239.100059] i915_gem_set_wedged                 ring->space:  0x00000f10
>>> <7>[  239.100085] i915_gem_set_wedged         RING_START: 0x00283000
>>> <7>[  239.100088] i915_gem_set_wedged         RING_HEAD:  0x00000260
>>> <7>[  239.100091] i915_gem_set_wedged         RING_TAIL:  0x000002a8
>>> <7>[  239.100094] i915_gem_set_wedged         RING_CTL:   0x00000001
>>> <7>[  239.100097] i915_gem_set_wedged         RING_MODE:  0x00000300 [idle]
>>> <7>[  239.100100] i915_gem_set_wedged         RING_IMR: fffffefe
>>> <7>[  239.100104] i915_gem_set_wedged         ACTHD:  0x00000000_0000609c
>>> <7>[  239.100108] i915_gem_set_wedged         BBADDR: 0x00000000_0000609d
>>> <7>[  239.100111] i915_gem_set_wedged         DMA_FADDR: 0x00000000_00283260
>>> <7>[  239.100114] i915_gem_set_wedged         IPEIR: 0x00000000
>>> <7>[  239.100117] i915_gem_set_wedged         IPEHR: 0x02800000
>>> <7>[  239.100120] i915_gem_set_wedged         Execlist status: 0x00044052 00000002
>>> <7>[  239.100124] i915_gem_set_wedged         Execlist CSB read 5 [5 cached], write 5 [5 from hws], interrupt posted? no, tasklet queued? no (enabled)
>>> <7>[  239.100128] i915_gem_set_wedged                 ELSP[0] count=1, ring->start=00283000, rq: 19a99 [e8c:5f] prio=1024 @ 5164ms: (null)
>>> <7>[  239.100132] i915_gem_set_wedged                 ELSP[1] count=1, ring->start=00257000, rq: 19a9a [e81:1a] prio=139 @ 5164ms: igt/rcs0[5977]/1
>>> <7>[  239.100135] i915_gem_set_wedged                 HW active? 0x5
>>> <7>[  239.100250] i915_gem_set_wedged                 E 19a99 [e8c:5f] prio=1024 @ 5164ms: (null)
>>> <7>[  239.100338] i915_gem_set_wedged                 E 19a9a [e81:1a] prio=139 @ 5164ms: igt/rcs0[5977]/1
>>> <7>[  239.100340] i915_gem_set_wedged                 Queue priority: 139
>>> <7>[  239.100343] i915_gem_set_wedged                 Q 0 [e98:19] prio=132 @ 5164ms: igt/rcs0[5977]/8
>>> <7>[  239.100346] i915_gem_set_wedged                 Q 0 [e84:19] prio=121 @ 5165ms: igt/rcs0[5977]/2
>>> <7>[  239.100349] i915_gem_set_wedged                 Q 0 [e87:19] prio=82 @ 5165ms: igt/rcs0[5977]/3
>>> <7>[  239.100352] i915_gem_set_wedged                 Q 0 [e84:1a] prio=44 @ 5164ms: igt/rcs0[5977]/2
>>> <7>[  239.100356] i915_gem_set_wedged                 Q 0 [e8b:19] prio=20 @ 5165ms: igt/rcs0[5977]/4
>>> <7>[  239.100362] i915_gem_set_wedged         drv_selftest [5894] waiting for 19a99
>>>
>>> where the GPU saw an arbitration point and idles; AND HAS NOT BEEN RESET!
>>> The RING_MODE indicates that is idle and has the STOP_RING bit set, so
>>> try clearing it.
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> ---
>>>    drivers/gpu/drm/i915/intel_uncore.c | 2 ++
>>>    1 file changed, 2 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
>>> index b36a3b5736a0..082b0045ac8c 100644
>>> --- a/drivers/gpu/drm/i915/intel_uncore.c
>>> +++ b/drivers/gpu/drm/i915/intel_uncore.c
>>> @@ -1720,6 +1720,8 @@ static void gen3_stop_engine(struct intel_engine_cs *engine)
>>>        if (I915_READ_FW(RING_HEAD(base)) != 0)
>>>                DRM_DEBUG_DRIVER("%s: ring head not parked\n",
>>>                                 engine->name);
>>> +
>>> +     I915_WRITE_FW(RING_MI_MODE(base), _MASKED_BIT_DISABLE(STOP_RING));
>>>    }
>>>    
>>>    static void i915_stop_engines(struct drm_i915_private *dev_priv,
>>>
>>
>> Right, so expectation is after reset STOP_RING will not be set, but it
>> sometimes is?
> 
> Yes.

Also there is a comment in there which says the engine must be stopped 
before reset on some platforms. So shouldn't the manual attempt to 
unstick it go after the reset and not in gen3_stop_engine?

Like the suggested i915_check_engine_running:

if (stopped) {
	DRM_NOTICE(Manually starting engine after reset);
	clear_stop_ring;
}

?


>> Should we also add a notice or info if it is set in intel_gpu_reset,
>> after the reset is called? Could add i915_check_engine_running(..)
>> helper or something.
> 
> Could do, will be any more useful than the dump we give above? ;)
> Might be interesting to do the dump on unusual state on takeover just
> for reference in later fails.
> -Chris
>
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index b36a3b5736a0..082b0045ac8c 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1720,6 +1720,8 @@  static void gen3_stop_engine(struct intel_engine_cs *engine)
 	if (I915_READ_FW(RING_HEAD(base)) != 0)
 		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
 				 engine->name);
+
+	I915_WRITE_FW(RING_MI_MODE(base), _MASKED_BIT_DISABLE(STOP_RING));
 }
 
 static void i915_stop_engines(struct drm_i915_private *dev_priv,