
[8/8] drm/i915: Pipeline PDP updates for Braswell

Message ID: 20181203113701.12106-8-chris@chris-wilson.co.uk
State: New, archived
Series: [1/8] drm/i915/breadcrumbs: Reduce missed-breadcrumb false positive rate

Commit Message

Chris Wilson Dec. 3, 2018, 11:37 a.m. UTC
Currently we face a severe problem on Braswell that manifests as invalid
ppGTT accesses. The code tries to maintain the PDPs (page directory
pointers) inside the context in two ways: a direct write into the context
image and a pipelined LRI update. The direct write into the context is
fundamentally racy, as it is unserialised with respect to any access (read
or write) the GPU is doing. By asserting that Braswell is not used with
vGPU (currently an unsupported platform), we can eliminate the dangerous
direct write into the context image and rely solely on the pipelined
update.

However, the LRI of the PDPs fouls up the GPU, causing it to freeze and
take out the machine with "forcewake ack timeouts". This seems possible
to work around by preventing the GPU from sleeping (by means of
disabling the power-state management interface, i.e. forcing each ring
to remain awake) around the update.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108656
References: https://bugs.freedesktop.org/show_bug.cgi?id=108714
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_gtt.c     |   2 -
 drivers/gpu/drm/i915/i915_request.c     |   5 -
 drivers/gpu/drm/i915/intel_lrc.c        | 137 +++++++++++-------------
 drivers/gpu/drm/i915/intel_ringbuffer.c |   5 +-
 4 files changed, 68 insertions(+), 81 deletions(-)

Comments

Tvrtko Ursulin Dec. 4, 2018, 11:53 a.m. UTC | #1
On 03/12/2018 11:37, Chris Wilson wrote:
> Currently we face a severe problem on Braswell that manifests as invalid
> ppGTT accesses. The code tries to maintain the PDP (page directory
> pointers) inside the context in two ways, direct write into the context
> and a pipelined LRI update. The direct write into the context is
> fundamentally racy as it is unserialised with any access (read or write)
> the GPU is doing. By asserting that Braswell is not used with vGPU
> (currently an unsupported platform) we can eliminate the dangerous
> direct write into the context image and solely use the pipelined update.
> 
> However, the LRI of the PDP fouls up the GPU, causing it to freeze and
> take out the machine with "forcewake ack timeouts". This seems possible
> to workaround by preventing the GPU from sleeping (via means of
> disabling the power-state management interface, i.e. forcing each ring
> to remain awake) around the update.
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108656
> References: https://bugs.freedesktop.org/show_bug.cgi?id=108714
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem_gtt.c     |   2 -
>   drivers/gpu/drm/i915/i915_request.c     |   5 -
>   drivers/gpu/drm/i915/intel_lrc.c        | 137 +++++++++++-------------
>   drivers/gpu/drm/i915/intel_ringbuffer.c |   5 +-
>   4 files changed, 68 insertions(+), 81 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index add1fe7aeb93..62bde517d383 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -1423,8 +1423,6 @@ static int gen8_ppgtt_alloc_pdp(struct i915_address_space *vm,
>   			gen8_initialize_pd(vm, pd);
>   			gen8_ppgtt_set_pdpe(vm, pdp, pd, pdpe);
>   			GEM_BUG_ON(pdp->used_pdpes > i915_pdpes_per_pdp(vm));
> -
> -			mark_tlbs_dirty(i915_vm_to_ppgtt(vm));
>   		}
>   
>   		ret = gen8_ppgtt_alloc_pd(vm, pd, start, length);
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index ca95ab2f4cfa..8ab8e8e6a086 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -719,11 +719,6 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	 */
>   	rq->head = rq->ring->emit;
>   
> -	/* Unconditionally invalidate GPU caches and TLBs. */
> -	ret = engine->emit_flush(rq, EMIT_INVALIDATE);

It seems this is still always called at least once on the common path, so 
why are you moving it to the backend? Just because it is more of a 
"backendy" type of operation? It makes sense, I guess.

> -	if (ret)
> -		goto err_unwind;
> -
>   	ret = engine->request_alloc(rq);
>   	if (ret)
>   		goto err_unwind;
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index de070dca4033..1ec3f80a4472 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -363,31 +363,12 @@ execlists_context_schedule_out(struct i915_request *rq, unsigned long status)
>   	trace_i915_request_out(rq);
>   }
>   
> -static void
> -execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
> -{
> -	ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
> -	ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
> -	ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
> -	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
> -}
> -
>   static u64 execlists_update_context(struct i915_request *rq)
>   {
> -	struct i915_hw_ppgtt *ppgtt = rq->gem_context->ppgtt;
>   	struct intel_context *ce = rq->hw_context;
> -	u32 *reg_state = ce->lrc_reg_state;
> -
> -	reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail);
>   
> -	/*
> -	 * True 32b PPGTT with dynamic page allocation: update PDP
> -	 * registers and point the unallocated PDPs to scratch page.
> -	 * PML4 is allocated during ppgtt init, so this is not needed
> -	 * in 48-bit mode.
> -	 */
> -	if (!i915_vm_is_48bit(&ppgtt->vm))
> -		execlists_update_context_pdps(ppgtt, reg_state);
> +	ce->lrc_reg_state[CTX_RING_TAIL + 1] =
> +		intel_ring_set_tail(rq->ring, rq->tail);
>   
>   	/*
>   	 * Make sure the context image is complete before we submit it to HW.
> @@ -1240,29 +1221,80 @@ execlists_context_pin(struct intel_engine_cs *engine,
>   	return __execlists_context_pin(engine, ctx, ce);
>   }
>   
> +static int emit_pdps(struct i915_request *rq)
> +{
> +	const struct intel_engine_cs * const engine = rq->engine;
> +	struct i915_hw_ppgtt * const ppgtt = rq->gem_context->ppgtt;
> +	int err, i;
> +	u32 *cs;
> +
> +	err = engine->emit_flush(rq, EMIT_INVALIDATE);
> +	if (err)
> +		return err;
> +
> +	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
> +	if (IS_ERR(cs))
> +		return PTR_ERR(cs);
> +
> +	/*
> +	 * Force the GPU (not just the local engine/powerwell!) to remain awake,
> +	 * or else we may kill the machine with "timed out waiting for
> +	 * forcewake ack request".
> +	 */
> +
> +	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
> +	for (i = GEN8_3LVL_PDPES; i--; ) {
> +		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
> +
> +		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
> +		*cs++ = upper_32_bits(pd_daddr);
> +		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
> +		*cs++ = lower_32_bits(pd_daddr);
> +	}
> +	*cs++ = MI_NOOP;
> +
> +	intel_ring_advance(rq, cs);
> +
> +	err = engine->emit_flush(rq, EMIT_INVALIDATE);
> +	if (err)
> +		return err;
> +
> +	return 0;
> +}
> +
>   static int execlists_request_alloc(struct i915_request *request)
>   {
>   	int ret;
>   
>   	GEM_BUG_ON(!request->hw_context->pin_count);
>   
> -	/* Flush enough space to reduce the likelihood of waiting after
> +	/*
> +	 * Flush enough space to reduce the likelihood of waiting after
>   	 * we start building the request - in which case we will just
>   	 * have to repeat work.
>   	 */
>   	request->reserved_space += EXECLISTS_REQUEST_SIZE;
>   
> -	ret = intel_ring_wait_for_space(request->ring, request->reserved_space);
> -	if (ret)
> -		return ret;

Removing this in favour of what intel_ring_begin will do? But is it the 
same? Could be... just not sure due to all the arithmetic that's happening 
in these areas.

> -
> -	/* Note that after this point, we have committed to using
> +	/*
> +	 * Note that after this point, we have committed to using
>   	 * this request as it is being used to both track the
>   	 * state of engine initialisation and liveness of the
>   	 * golden renderstate above. Think twice before you try
>   	 * to cancel/unwind this request now.
>   	 */
>   
> +	/* Unconditionally invalidate GPU caches and TLBs. */
> +	if (i915_vm_is_48bit(&request->gem_context->ppgtt->vm)) {
> +		ret = request->engine->emit_flush(request, EMIT_INVALIDATE);

Excuse my ignorance, but what is the PDP update mechanism in 48-bit mode?

> +		if (ret)
> +			return ret;
> +	} else {
> +		GEM_BUG_ON(intel_vgpu_active(request->i915));
> +		ret = emit_pdps(request);
> +		if (ret)
> +			return ret;
> +	}
> +

There is always a starting EMIT_INVALIDATE on both of these branches, so 
you could pull it out before the if block.

>   	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
>   	return 0;
>   }
> @@ -1832,56 +1864,11 @@ static void execlists_reset_finish(struct intel_engine_cs *engine)
>   		  atomic_read(&execlists->tasklet.count));
>   }
>   
> -static int intel_logical_ring_emit_pdps(struct i915_request *rq)
> -{
> -	struct i915_hw_ppgtt *ppgtt = rq->gem_context->ppgtt;
> -	struct intel_engine_cs *engine = rq->engine;
> -	const int num_lri_cmds = GEN8_3LVL_PDPES * 2;
> -	u32 *cs;
> -	int i;
> -
> -	cs = intel_ring_begin(rq, num_lri_cmds * 2 + 2);
> -	if (IS_ERR(cs))
> -		return PTR_ERR(cs);
> -
> -	*cs++ = MI_LOAD_REGISTER_IMM(num_lri_cmds);
> -	for (i = GEN8_3LVL_PDPES - 1; i >= 0; i--) {
> -		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
> -
> -		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
> -		*cs++ = upper_32_bits(pd_daddr);
> -		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
> -		*cs++ = lower_32_bits(pd_daddr);
> -	}
> -
> -	*cs++ = MI_NOOP;
> -	intel_ring_advance(rq, cs);
> -
> -	return 0;
> -}
> -
>   static int gen8_emit_bb_start(struct i915_request *rq,
>   			      u64 offset, u32 len,
>   			      const unsigned int flags)
>   {
>   	u32 *cs;
> -	int ret;
> -
> -	/* Don't rely in hw updating PDPs, specially in lite-restore.
> -	 * Ideally, we should set Force PD Restore in ctx descriptor,
> -	 * but we can't. Force Restore would be a second option, but
> -	 * it is unsafe in case of lite-restore (because the ctx is
> -	 * not idle). PML4 is allocated during ppgtt init so this is
> -	 * not needed in 48-bit.*/
> -	if ((intel_engine_flag(rq->engine) & rq->gem_context->ppgtt->pd_dirty_rings) &&
> -	    !i915_vm_is_48bit(&rq->gem_context->ppgtt->vm) &&
> -	    !intel_vgpu_active(rq->i915)) {
> -		ret = intel_logical_ring_emit_pdps(rq);

What is the reason to move from emit_bb_start to request_alloc?

> -		if (ret)
> -			return ret;
> -
> -		rq->gem_context->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
> -	}
>   
>   	cs = intel_ring_begin(rq, 6);
>   	if (IS_ERR(cs))
> @@ -1914,6 +1901,7 @@ static int gen8_emit_bb_start(struct i915_request *rq,
>   
>   	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
>   	*cs++ = MI_NOOP;
> +
>   	intel_ring_advance(rq, cs);
>   
>   	return 0;
> @@ -2544,6 +2532,11 @@ static void execlists_init_reg_state(u32 *regs,
>   		 * other PDP Descriptors are ignored.
>   		 */
>   		ASSIGN_CTX_PML4(ctx->ppgtt, regs);
> +	} else {
> +		ASSIGN_CTX_PDP(ctx->ppgtt, regs, 3);
> +		ASSIGN_CTX_PDP(ctx->ppgtt, regs, 2);
> +		ASSIGN_CTX_PDP(ctx->ppgtt, regs, 1);
> +		ASSIGN_CTX_PDP(ctx->ppgtt, regs, 0);
>   	}
>   
>   	if (rcs) {
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 37bd05cef0e9..4591f568547c 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1833,11 +1833,12 @@ static int ring_request_alloc(struct i915_request *request)
>   	 */
>   	request->reserved_space += LEGACY_REQUEST_SIZE;
>   
> -	ret = intel_ring_wait_for_space(request->ring, request->reserved_space);
> +	ret = switch_context(request);
>   	if (ret)
>   		return ret;
>   
> -	ret = switch_context(request);
> +	/* Unconditionally invalidate GPU caches and TLBs. */
> +	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
>   	if (ret)
>   		return ret;
>   
> 

Regards,

Tvrtko
Chris Wilson Dec. 4, 2018, 12:09 p.m. UTC | #2
Quoting Tvrtko Ursulin (2018-12-04 11:53:22)
> 
> On 03/12/2018 11:37, Chris Wilson wrote:
> > Currently we face a severe problem on Braswell that manifests as invalid
> > ppGTT accesses. The code tries to maintain the PDP (page directory
> > pointers) inside the context in two ways, direct write into the context
> > and a pipelined LRI update. The direct write into the context is
> > fundamentally racy as it is unserialised with any access (read or write)
> > the GPU is doing. By asserting that Braswell is not used with vGPU
> > (currently an unsupported platform) we can eliminate the dangerous
> > direct write into the context image and solely use the pipelined update.
> > 
> > However, the LRI of the PDP fouls up the GPU, causing it to freeze and
> > take out the machine with "forcewake ack timeouts". This seems possible
> > to workaround by preventing the GPU from sleeping (via means of
> > disabling the power-state management interface, i.e. forcing each ring
> > to remain awake) around the update.
> > 
> > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108656
> > References: https://bugs.freedesktop.org/show_bug.cgi?id=108714
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> > diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> > index ca95ab2f4cfa..8ab8e8e6a086 100644
> > --- a/drivers/gpu/drm/i915/i915_request.c
> > +++ b/drivers/gpu/drm/i915/i915_request.c
> > @@ -719,11 +719,6 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
> >        */
> >       rq->head = rq->ring->emit;
> >   
> > -     /* Unconditionally invalidate GPU caches and TLBs. */
> > -     ret = engine->emit_flush(rq, EMIT_INVALIDATE);
> 
> It seems this is still always called at least once on the common path so 
> why are you moving it to the backend? Just because it is more "backendy" 
> type operation? It makes sense I guess.

Yup. Having gone through many iterations, not all quite as symmetrical
as the current incarnation, specialisation of flushes to the backend
made sense. We pulled it into the core because everyone had to do their
invalidate first, but that might not actually be so true (at least, we
probably should do the invalidate after the context load in ringbuffer,
but one bug at a time).

> > +static int emit_pdps(struct i915_request *rq)
> > +{
> > +     const struct intel_engine_cs * const engine = rq->engine;
> > +     struct i915_hw_ppgtt * const ppgtt = rq->gem_context->ppgtt;
> > +     int err, i;
> > +     u32 *cs;
> > +
> > +     err = engine->emit_flush(rq, EMIT_INVALIDATE);
> > +     if (err)
> > +             return err;
> > +
> > +     cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
> > +     if (IS_ERR(cs))
> > +             return PTR_ERR(cs);
> > +
> > +     /*
> > +      * Force the GPU (not just the local engine/powerwell!) to remain awake,
> > +      * or else we may kill the machine with "timed out waiting for
> > +      * forcewake ack request".
> > +      */
> > +
> > +     *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
> > +     for (i = GEN8_3LVL_PDPES; i--; ) {
> > +             const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
> > +
> > +             *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
> > +             *cs++ = upper_32_bits(pd_daddr);
> > +             *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
> > +             *cs++ = lower_32_bits(pd_daddr);
> > +     }
> > +     *cs++ = MI_NOOP;
> > +
> > +     intel_ring_advance(rq, cs);
> > +
> > +     err = engine->emit_flush(rq, EMIT_INVALIDATE);
> > +     if (err)
> > +             return err;
> > +
> > +     return 0;
> > +}
> > +
> >   static int execlists_request_alloc(struct i915_request *request)
> >   {
> >       int ret;
> >   
> >       GEM_BUG_ON(!request->hw_context->pin_count);
> >   
> > -     /* Flush enough space to reduce the likelihood of waiting after
> > +     /*
> > +      * Flush enough space to reduce the likelihood of waiting after
> >        * we start building the request - in which case we will just
> >        * have to repeat work.
> >        */
> >       request->reserved_space += EXECLISTS_REQUEST_SIZE;
> >   
> > -     ret = intel_ring_wait_for_space(request->ring, request->reserved_space);
> > -     if (ret)
> > -             return ret;
> 
> Removing this in favour of what intel_ring_begin will do? But is it the 
> same? Could be.. just not sure due all the arithmetic that's happening 
> in these areas..

It's the same effect. intel_ring_wait_for_space() was just a shorthand
to avoid the secondary effect of preparing the ring for the request.

> > -
> > -     /* Note that after this point, we have committed to using
> > +     /*
> > +      * Note that after this point, we have committed to using
> >        * this request as it is being used to both track the
> >        * state of engine initialisation and liveness of the
> >        * golden renderstate above. Think twice before you try
> >        * to cancel/unwind this request now.
> >        */
> >   
> > +     /* Unconditionally invalidate GPU caches and TLBs. */
> > +     if (i915_vm_is_48bit(&request->gem_context->ppgtt->vm)) {
> > +             ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
> 
> Excuse my ignorance, but what is the PDP update mechanism in 48-bit mode?

The top-most level is fixed (i.e. the address register inside the context
image for the pml4 page), and the TLBs are flushed by EMIT_INVALIDATE. Each
batch then traverses the new page directory tree afresh.

> > +             if (ret)
> > +                     return ret;
> > +     } else {
> > +             GEM_BUG_ON(intel_vgpu_active(request->i915));
> > +             ret = emit_pdps(request);
> > +             if (ret)
> > +                     return ret;
> > +     }
> > +
> 
> There is always a starting EMIT_INVALIDATE on both of these two branches 
> so you could pull it out before the if block.

Still playing. Though at the moment, it's 8 EMIT_FLUSH after the LRI. My
point is that emit_pdps() is decidedly a magic sequence: dropping that
EMIT_INVALIDATE (in favour of just one after) causes "media forcewake
errors". I should probably add a much bigger "here be unknown dragons"
comment.

> >   static int gen8_emit_bb_start(struct i915_request *rq,
> >                             u64 offset, u32 len,
> >                             const unsigned int flags)
> >   {
> >       u32 *cs;
> > -     int ret;
> > -
> > -     /* Don't rely in hw updating PDPs, specially in lite-restore.
> > -      * Ideally, we should set Force PD Restore in ctx descriptor,
> > -      * but we can't. Force Restore would be a second option, but
> > -      * it is unsafe in case of lite-restore (because the ctx is
> > -      * not idle). PML4 is allocated during ppgtt init so this is
> > -      * not needed in 48-bit.*/
> > -     if ((intel_engine_flag(rq->engine) & rq->gem_context->ppgtt->pd_dirty_rings) &&
> > -         !i915_vm_is_48bit(&rq->gem_context->ppgtt->vm) &&
> > -         !intel_vgpu_active(rq->i915)) {
> > -             ret = intel_logical_ring_emit_pdps(rq);
> 
> What is the reason to move from emit_bb_start to request_alloc?

Long-standing pet peeve. Imho, this is not about the BB_START but is an
integral part of the TLB invalidation, so it was an inappropriate misuse of
emit_bb_start.
-Chris