diff mbox series

[3/3] drm/i915: Pipeline PDP updates for Braswell

Message ID 20181207090213.14352-3-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [1/3] drm/i915: Push EMIT_INVALIDATE at request start to backends | expand

Commit Message

Chris Wilson Dec. 7, 2018, 9:02 a.m. UTC
Currently we face a severe problem on Braswell that manifests as invalid
ppGTT accesses. The code tries to maintain the PDP (page directory
pointers) inside the context in two ways, direct write into the context
and a pipelined LRI update. The direct write into the context is
fundamentally racy as it is unserialised with any access (read or write)
the GPU is doing. By asserting that Braswell is not used with vGPU
(currently an unsupported platform) we can eliminate the dangerous
direct write into the context image and solely use the pipelined update.

However, the LRI of the PDP fouls up the GPU, causing it to freeze and
take out the machine with "forcewake ack timeouts". This seems possible
to workaround by preventing the GPU from sleeping (via means of
disabling the power-state management interface, i.e. forcing each ring
to remain awake) around the update.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108656
References: https://bugs.freedesktop.org/show_bug.cgi?id=108714
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_gtt.c |   2 -
 drivers/gpu/drm/i915/intel_lrc.c    | 140 ++++++++++++++--------------
 2 files changed, 69 insertions(+), 73 deletions(-)

Comments

Tvrtko Ursulin Dec. 7, 2018, 10:30 a.m. UTC | #1
On 07/12/2018 09:02, Chris Wilson wrote:
> Currently we face a severe problem on Braswell that manifests as invalid
> ppGTT accesses. The code tries to maintain the PDP (page directory
> pointers) inside the context in two ways, direct write into the context
> and a pipelined LRI update. The direct write into the context is
> fundamentally racy as it is unserialised with any access (read or write)
> the GPU is doing. By asserting that Braswell is not used with vGPU
> (currently an unsupported platform) we can eliminate the dangerous
> direct write into the context image and solely use the pipelined update.
> 
> However, the LRI of the PDP fouls up the GPU, causing it to freeze and
> take out the machine with "forcewake ack timeouts". This seems possible
> to workaround by preventing the GPU from sleeping (via means of
> disabling the power-state management interface, i.e. forcing each ring
> to remain awake) around the update.

Changelog is missing.

> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108656
> References: https://bugs.freedesktop.org/show_bug.cgi?id=108714
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem_gtt.c |   2 -
>   drivers/gpu/drm/i915/intel_lrc.c    | 140 ++++++++++++++--------------
>   2 files changed, 69 insertions(+), 73 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index add1fe7aeb93..62bde517d383 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -1423,8 +1423,6 @@ static int gen8_ppgtt_alloc_pdp(struct i915_address_space *vm,
>   			gen8_initialize_pd(vm, pd);
>   			gen8_ppgtt_set_pdpe(vm, pdp, pd, pdpe);
>   			GEM_BUG_ON(pdp->used_pdpes > i915_pdpes_per_pdp(vm));
> -
> -			mark_tlbs_dirty(i915_vm_to_ppgtt(vm));
>   		}
>   
>   		ret = gen8_ppgtt_alloc_pd(vm, pd, start, length);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index b1f5db3442eb..c84bdc21bcce 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -363,31 +363,12 @@ execlists_context_schedule_out(struct i915_request *rq, unsigned long status)
>   	trace_i915_request_out(rq);
>   }
>   
> -static void
> -execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
> -{
> -	ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
> -	ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
> -	ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
> -	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
> -}
> -
>   static u64 execlists_update_context(struct i915_request *rq)
>   {
> -	struct i915_hw_ppgtt *ppgtt = rq->gem_context->ppgtt;
>   	struct intel_context *ce = rq->hw_context;
> -	u32 *reg_state = ce->lrc_reg_state;
>   
> -	reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail);
> -
> -	/*
> -	 * True 32b PPGTT with dynamic page allocation: update PDP
> -	 * registers and point the unallocated PDPs to scratch page.
> -	 * PML4 is allocated during ppgtt init, so this is not needed
> -	 * in 48-bit mode.
> -	 */
> -	if (!i915_vm_is_48bit(&ppgtt->vm))
> -		execlists_update_context_pdps(ppgtt, reg_state);
> +	ce->lrc_reg_state[CTX_RING_TAIL + 1] =
> +		intel_ring_set_tail(rq->ring, rq->tail);
>   
>   	/*
>   	 * Make sure the context image is complete before we submit it to HW.
> @@ -1247,6 +1228,59 @@ execlists_context_pin(struct intel_engine_cs *engine,
>   	return __execlists_context_pin(engine, ctx, ce);
>   }
>   
> +static int emit_pdps(struct i915_request *rq)
> +{
> +	const struct intel_engine_cs * const engine = rq->engine;
> +	struct i915_hw_ppgtt * const ppgtt = rq->gem_context->ppgtt;
> +	int err, i;
> +	u32 *cs;
> +
> +	GEM_BUG_ON(intel_vgpu_active(rq->i915));
> +
> +	/*
> +	 * Beware ye of the dragons, this sequence is magic!
> +	 *
> +	 * Small changes to this sequence can cause anything from
> +	 * GPU hangs to forcewake errors and machine lockups!
> +	 */
> +
> +	/* Flush any residual operations from the context load */
> +	err = engine->emit_flush(rq, EMIT_FLUSH);
> +	if (err)
> +		return err;
> +
> +	/* Magic required to prevent forcewake errors! */
> +	err = engine->emit_flush(rq, EMIT_INVALIDATE);
> +	if (err)
> +		return err;
> +
> +	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
> +	if (IS_ERR(cs))
> +		return PTR_ERR(cs);
> +
> +	/* Ensure the LRI have landed before we invalidate & continue */
> +	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
> +	for (i = GEN8_3LVL_PDPES; i--; ) {
> +		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
> +
> +		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
> +		*cs++ = upper_32_bits(pd_daddr);
> +		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
> +		*cs++ = lower_32_bits(pd_daddr);
> +	}
> +	*cs++ = MI_NOOP;
> +
> +	intel_ring_advance(rq, cs);
> +
> +	/* Be doubly sure the LRI have landed before proceeding */
> +	err = engine->emit_flush(rq, EMIT_FLUSH);
> +	if (err)
> +		return err;
> +
> +	/* Re-invalidate the TLB for luck */
> +	return engine->emit_flush(rq, EMIT_INVALIDATE);
> +}
> +
>   static int execlists_request_alloc(struct i915_request *request)
>   {
>   	int ret;
> @@ -1260,11 +1294,6 @@ static int execlists_request_alloc(struct i915_request *request)
>   	 */
>   	request->reserved_space += EXECLISTS_REQUEST_SIZE;
>   
> -	/* Unconditionally invalidate GPU caches and TLBs. */
> -	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
> -	if (ret)
> -		return ret;
> -
>   	/*
>   	 * Note that after this point, we have committed to using
>   	 * this request as it is being used to both track the
> @@ -1273,6 +1302,14 @@ static int execlists_request_alloc(struct i915_request *request)
>   	 * to cancel/unwind this request now.
>   	 */
>   
> +	/* Unconditionally invalidate GPU caches and TLBs. */
> +	if (i915_vm_is_48bit(&request->gem_context->ppgtt->vm))
> +		ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
> +	else
> +		ret = emit_pdps(request);
> +	if (ret)
> +		return ret;
> +
>   	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
>   	return 0;
>   }
> @@ -1808,56 +1845,11 @@ static void execlists_reset_finish(struct intel_engine_cs *engine)
>   		  atomic_read(&execlists->tasklet.count));
>   }
>   
> -static int intel_logical_ring_emit_pdps(struct i915_request *rq)
> -{
> -	struct i915_hw_ppgtt *ppgtt = rq->gem_context->ppgtt;
> -	struct intel_engine_cs *engine = rq->engine;
> -	const int num_lri_cmds = GEN8_3LVL_PDPES * 2;
> -	u32 *cs;
> -	int i;
> -
> -	cs = intel_ring_begin(rq, num_lri_cmds * 2 + 2);
> -	if (IS_ERR(cs))
> -		return PTR_ERR(cs);
> -
> -	*cs++ = MI_LOAD_REGISTER_IMM(num_lri_cmds);
> -	for (i = GEN8_3LVL_PDPES - 1; i >= 0; i--) {
> -		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
> -
> -		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
> -		*cs++ = upper_32_bits(pd_daddr);
> -		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
> -		*cs++ = lower_32_bits(pd_daddr);
> -	}
> -
> -	*cs++ = MI_NOOP;
> -	intel_ring_advance(rq, cs);
> -
> -	return 0;
> -}
> -
>   static int gen8_emit_bb_start(struct i915_request *rq,
>   			      u64 offset, u32 len,
>   			      const unsigned int flags)
>   {
>   	u32 *cs;
> -	int ret;
> -
> -	/* Don't rely in hw updating PDPs, specially in lite-restore.
> -	 * Ideally, we should set Force PD Restore in ctx descriptor,
> -	 * but we can't. Force Restore would be a second option, but
> -	 * it is unsafe in case of lite-restore (because the ctx is
> -	 * not idle). PML4 is allocated during ppgtt init so this is
> -	 * not needed in 48-bit.*/
> -	if ((intel_engine_flag(rq->engine) & rq->gem_context->ppgtt->pd_dirty_rings) &&
> -	    !i915_vm_is_48bit(&rq->gem_context->ppgtt->vm) &&
> -	    !intel_vgpu_active(rq->i915)) {
> -		ret = intel_logical_ring_emit_pdps(rq);
> -		if (ret)
> -			return ret;
> -
> -		rq->gem_context->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
> -	}
>   
>   	cs = intel_ring_begin(rq, 6);
>   	if (IS_ERR(cs))
> @@ -1890,6 +1882,7 @@ static int gen8_emit_bb_start(struct i915_request *rq,
>   
>   	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
>   	*cs++ = MI_NOOP;
> +
>   	intel_ring_advance(rq, cs);
>   
>   	return 0;
> @@ -2501,6 +2494,11 @@ static void execlists_init_reg_state(u32 *regs,
>   		 * other PDP Descriptors are ignored.
>   		 */
>   		ASSIGN_CTX_PML4(ctx->ppgtt, regs);
> +	} else {
> +		ASSIGN_CTX_PDP(ctx->ppgtt, regs, 3);
> +		ASSIGN_CTX_PDP(ctx->ppgtt, regs, 2);
> +		ASSIGN_CTX_PDP(ctx->ppgtt, regs, 1);
> +		ASSIGN_CTX_PDP(ctx->ppgtt, regs, 0);
>   	}
>   
>   	if (rcs) {
> 

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Is BSW fixed now? Or just a tiny bit better?

Regards,

Tvrtko
Chris Wilson Dec. 7, 2018, 10:39 a.m. UTC | #2
Quoting Tvrtko Ursulin (2018-12-07 10:30:53)
> Is BSW fixed now? Or just a tiny bit better?

For some value of a lot better. gem_concurrent_blit is still getting the
odd false-positive missed-breadcrumb report, the occasional incoherency
(I think is insufficient mb around fence register changes) and a rare
invalid TLB GPU hang (EMIT_INVALIDATE harder?).

On the positive side, the machine hasn't died for a few days (the
LRI forcewake errors kill the machine), and at its worst
gem_concurrent_blit could GPU hang within a few seconds. However, not
once have I been able to reproduce the GPU hangs with gem_ctx_create and
friends as seen by CI, so more than likely we are looking at more than
one problem. Fingers crossed this is sufficient to shut CI up (with the
exception of those rare gem_mmap_gtt incoherency failures).
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index add1fe7aeb93..62bde517d383 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1423,8 +1423,6 @@  static int gen8_ppgtt_alloc_pdp(struct i915_address_space *vm,
 			gen8_initialize_pd(vm, pd);
 			gen8_ppgtt_set_pdpe(vm, pdp, pd, pdpe);
 			GEM_BUG_ON(pdp->used_pdpes > i915_pdpes_per_pdp(vm));
-
-			mark_tlbs_dirty(i915_vm_to_ppgtt(vm));
 		}
 
 		ret = gen8_ppgtt_alloc_pd(vm, pd, start, length);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index b1f5db3442eb..c84bdc21bcce 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -363,31 +363,12 @@  execlists_context_schedule_out(struct i915_request *rq, unsigned long status)
 	trace_i915_request_out(rq);
 }
 
-static void
-execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
-{
-	ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
-	ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
-	ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
-	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
-}
-
 static u64 execlists_update_context(struct i915_request *rq)
 {
-	struct i915_hw_ppgtt *ppgtt = rq->gem_context->ppgtt;
 	struct intel_context *ce = rq->hw_context;
-	u32 *reg_state = ce->lrc_reg_state;
 
-	reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail);
-
-	/*
-	 * True 32b PPGTT with dynamic page allocation: update PDP
-	 * registers and point the unallocated PDPs to scratch page.
-	 * PML4 is allocated during ppgtt init, so this is not needed
-	 * in 48-bit mode.
-	 */
-	if (!i915_vm_is_48bit(&ppgtt->vm))
-		execlists_update_context_pdps(ppgtt, reg_state);
+	ce->lrc_reg_state[CTX_RING_TAIL + 1] =
+		intel_ring_set_tail(rq->ring, rq->tail);
 
 	/*
 	 * Make sure the context image is complete before we submit it to HW.
@@ -1247,6 +1228,59 @@  execlists_context_pin(struct intel_engine_cs *engine,
 	return __execlists_context_pin(engine, ctx, ce);
 }
 
+static int emit_pdps(struct i915_request *rq)
+{
+	const struct intel_engine_cs * const engine = rq->engine;
+	struct i915_hw_ppgtt * const ppgtt = rq->gem_context->ppgtt;
+	int err, i;
+	u32 *cs;
+
+	GEM_BUG_ON(intel_vgpu_active(rq->i915));
+
+	/*
+	 * Beware ye of the dragons, this sequence is magic!
+	 *
+	 * Small changes to this sequence can cause anything from
+	 * GPU hangs to forcewake errors and machine lockups!
+	 */
+
+	/* Flush any residual operations from the context load */
+	err = engine->emit_flush(rq, EMIT_FLUSH);
+	if (err)
+		return err;
+
+	/* Magic required to prevent forcewake errors! */
+	err = engine->emit_flush(rq, EMIT_INVALIDATE);
+	if (err)
+		return err;
+
+	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	/* Ensure the LRI have landed before we invalidate & continue */
+	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
+	for (i = GEN8_3LVL_PDPES; i--; ) {
+		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
+
+		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
+		*cs++ = upper_32_bits(pd_daddr);
+		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
+		*cs++ = lower_32_bits(pd_daddr);
+	}
+	*cs++ = MI_NOOP;
+
+	intel_ring_advance(rq, cs);
+
+	/* Be doubly sure the LRI have landed before proceeding */
+	err = engine->emit_flush(rq, EMIT_FLUSH);
+	if (err)
+		return err;
+
+	/* Re-invalidate the TLB for luck */
+	return engine->emit_flush(rq, EMIT_INVALIDATE);
+}
+
 static int execlists_request_alloc(struct i915_request *request)
 {
 	int ret;
@@ -1260,11 +1294,6 @@  static int execlists_request_alloc(struct i915_request *request)
 	 */
 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
 
-	/* Unconditionally invalidate GPU caches and TLBs. */
-	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
-	if (ret)
-		return ret;
-
 	/*
 	 * Note that after this point, we have committed to using
 	 * this request as it is being used to both track the
@@ -1273,6 +1302,14 @@  static int execlists_request_alloc(struct i915_request *request)
 	 * to cancel/unwind this request now.
 	 */
 
+	/* Unconditionally invalidate GPU caches and TLBs. */
+	if (i915_vm_is_48bit(&request->gem_context->ppgtt->vm))
+		ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
+	else
+		ret = emit_pdps(request);
+	if (ret)
+		return ret;
+
 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
 	return 0;
 }
@@ -1808,56 +1845,11 @@  static void execlists_reset_finish(struct intel_engine_cs *engine)
 		  atomic_read(&execlists->tasklet.count));
 }
 
-static int intel_logical_ring_emit_pdps(struct i915_request *rq)
-{
-	struct i915_hw_ppgtt *ppgtt = rq->gem_context->ppgtt;
-	struct intel_engine_cs *engine = rq->engine;
-	const int num_lri_cmds = GEN8_3LVL_PDPES * 2;
-	u32 *cs;
-	int i;
-
-	cs = intel_ring_begin(rq, num_lri_cmds * 2 + 2);
-	if (IS_ERR(cs))
-		return PTR_ERR(cs);
-
-	*cs++ = MI_LOAD_REGISTER_IMM(num_lri_cmds);
-	for (i = GEN8_3LVL_PDPES - 1; i >= 0; i--) {
-		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
-
-		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
-		*cs++ = upper_32_bits(pd_daddr);
-		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
-		*cs++ = lower_32_bits(pd_daddr);
-	}
-
-	*cs++ = MI_NOOP;
-	intel_ring_advance(rq, cs);
-
-	return 0;
-}
-
 static int gen8_emit_bb_start(struct i915_request *rq,
 			      u64 offset, u32 len,
 			      const unsigned int flags)
 {
 	u32 *cs;
-	int ret;
-
-	/* Don't rely in hw updating PDPs, specially in lite-restore.
-	 * Ideally, we should set Force PD Restore in ctx descriptor,
-	 * but we can't. Force Restore would be a second option, but
-	 * it is unsafe in case of lite-restore (because the ctx is
-	 * not idle). PML4 is allocated during ppgtt init so this is
-	 * not needed in 48-bit.*/
-	if ((intel_engine_flag(rq->engine) & rq->gem_context->ppgtt->pd_dirty_rings) &&
-	    !i915_vm_is_48bit(&rq->gem_context->ppgtt->vm) &&
-	    !intel_vgpu_active(rq->i915)) {
-		ret = intel_logical_ring_emit_pdps(rq);
-		if (ret)
-			return ret;
-
-		rq->gem_context->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
-	}
 
 	cs = intel_ring_begin(rq, 6);
 	if (IS_ERR(cs))
@@ -1890,6 +1882,7 @@  static int gen8_emit_bb_start(struct i915_request *rq,
 
 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
 	*cs++ = MI_NOOP;
+
 	intel_ring_advance(rq, cs);
 
 	return 0;
@@ -2501,6 +2494,11 @@  static void execlists_init_reg_state(u32 *regs,
 		 * other PDP Descriptors are ignored.
 		 */
 		ASSIGN_CTX_PML4(ctx->ppgtt, regs);
+	} else {
+		ASSIGN_CTX_PDP(ctx->ppgtt, regs, 3);
+		ASSIGN_CTX_PDP(ctx->ppgtt, regs, 2);
+		ASSIGN_CTX_PDP(ctx->ppgtt, regs, 1);
+		ASSIGN_CTX_PDP(ctx->ppgtt, regs, 0);
 	}
 
 	if (rcs) {