[1/2] drm/i915/execlists: HWS is uncached on !llc platforms

Message ID 1445622212-32604-1-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson Oct. 23, 2015, 5:43 p.m. UTC
As the HWS is mapped into the GPU as uncached, writes into that page do
not automatically snoop the CPU cache and therefore if we want to see
coherent values we need to clear the stale cacheline first. Note that we
already had a workaround for BXT-A for an identical issue, but since we
never enabled snooping, the same issue applies to all non-llc platforms.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_lrc.c | 70 ++++++++++++++++++++--------------------
 1 file changed, 35 insertions(+), 35 deletions(-)
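
For readers outside the driver: the fix hinges on intel_flush_status_page()
pushing the stale cacheline out before the CPU rereads the seqno. A minimal
sketch of what the status-page helpers boil down to (field and helper names
are from memory of this era's driver and may differ slightly):

static inline u32 intel_read_status_page(struct intel_engine_cs *ring, int reg)
{
        /* status_page.page_addr is the CPU mapping of the HWS page */
        return ring->status_page.page_addr[reg];
}

static inline void intel_flush_status_page(struct intel_engine_cs *ring, int reg)
{
        /* clflush the line so the next read refetches from memory; as
         * we also clflush after every write, the line is clean here
         * and this is in practice a pure invalidate. */
        drm_clflush_virt_range(&ring->status_page.page_addr[reg],
                               sizeof(u32));
}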

Comments

Ville Syrjälä Oct. 23, 2015, 5:50 p.m. UTC | #1
On Fri, Oct 23, 2015 at 06:43:31PM +0100, Chris Wilson wrote:
> As the HWS is mapped into the GPU as uncached,

Since when?

> writes into that page do
> not automatically snoop the CPU cache and therefore if we want to see
> coherent values we need to clear the stale cacheline first. Note that we
> already had a workaround for BXT-A for an identical issue, but since we
> never enabled snooping, the same issue applies to all non-llc platforms.
Chris Wilson Oct. 23, 2015, 5:56 p.m. UTC | #2
On Fri, Oct 23, 2015 at 08:50:42PM +0300, Ville Syrjälä wrote:
> On Fri, Oct 23, 2015 at 06:43:31PM +0100, Chris Wilson wrote:
> > As the HWS is mapped into the GPU as uncached,
> 
> Since when?

Since it is embedded into execlists' default context which is allocated
using the system default cache level, i.e. uncached on !llc. See
intel_lr_context_deferred_alloc()
-Chris
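
The default Chris refers to is chosen at object creation; paraphrased from
i915_gem_alloc_object() of this era:

        if (HAS_LLC(dev))
                obj->cache_level = I915_CACHE_LLC;   /* GPU snoops via the LLC */
        else
                obj->cache_level = I915_CACHE_NONE;  /* uncached, no snooping */

Since intel_lr_context_deferred_alloc() never overrides this, the context
object, and the HWS embedded in it, ends up I915_CACHE_NONE on !llc parts.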
Ville Syrjälä Oct. 23, 2015, 6:22 p.m. UTC | #3
On Fri, Oct 23, 2015 at 06:56:41PM +0100, Chris Wilson wrote:
> On Fri, Oct 23, 2015 at 08:50:42PM +0300, Ville Syrjälä wrote:
> > On Fri, Oct 23, 2015 at 06:43:31PM +0100, Chris Wilson wrote:
> > > As the HWS is mapped into the GPU as uncached,
> > 
> > Since when?
> 
> Since it is embedded into execlists' default context which is allocated
> using the system default cache level, i.e. uncached on !llc. See
> intel_lr_context_deferred_alloc()

Oh right. That doesn't actually matter since it's mapped through ggtt
which means it always goes through PAT 0.
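
For reference, the PAT programming behind that remark; a from-memory sketch
of the CHV flavour (chv_setup_private_ppat()), where only the snoop bit has
any meaning:

/* gen8+ GGTT PTEs carry no cache-level bits, so every GGTT access
 * resolves through private PAT entry 0, whatever obj->cache_level says. */
static void chv_setup_private_ppat(struct drm_i915_private *dev_priv)
{
        u64 pat = GEN8_PPAT(0, CHV_PPAT_SNOOP) |  /* PAT 0: snooped */
                  GEN8_PPAT(1, 0) |
                  GEN8_PPAT(2, 0) |
                  GEN8_PPAT(3, 0) |
                  GEN8_PPAT(4, CHV_PPAT_SNOOP) |
                  GEN8_PPAT(5, CHV_PPAT_SNOOP) |
                  GEN8_PPAT(6, CHV_PPAT_SNOOP) |
                  GEN8_PPAT(7, CHV_PPAT_SNOOP);

        I915_WRITE(GEN8_PRIVATE_PAT, pat);
        I915_WRITE(GEN8_PRIVATE_PAT + 4, pat >> 32);
}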
Chris Wilson Oct. 23, 2015, 6:29 p.m. UTC | #4
On Fri, Oct 23, 2015 at 09:22:38PM +0300, Ville Syrjälä wrote:
> On Fri, Oct 23, 2015 at 06:56:41PM +0100, Chris Wilson wrote:
> > On Fri, Oct 23, 2015 at 08:50:42PM +0300, Ville Syrjälä wrote:
> > > On Fri, Oct 23, 2015 at 06:43:31PM +0100, Chris Wilson wrote:
> > > > As the HWS is mapped into the GPU as uncached,
> > > 
> > > Since when?
> > 
> > Since it is embedded into execlists' default context which is allocated
> > using the system default cache level, i.e. uncached on !llc. See
> > intel_lr_context_deferred_alloc()
> 
> Oh right. That doesn't actually matter since it's mapped through ggtt
> which means it always goes through PAT 0.

Oh, that again. Doesn't that mean we broke i915.enable_ppgtt=0?
-Chris
Ville Syrjälä Oct. 23, 2015, 6:41 p.m. UTC | #5
On Fri, Oct 23, 2015 at 07:29:08PM +0100, Chris Wilson wrote:
> On Fri, Oct 23, 2015 at 09:22:38PM +0300, Ville Syrjälä wrote:
> > On Fri, Oct 23, 2015 at 06:56:41PM +0100, Chris Wilson wrote:
> > > On Fri, Oct 23, 2015 at 08:50:42PM +0300, Ville Syrjälä wrote:
> > > > On Fri, Oct 23, 2015 at 06:43:31PM +0100, Chris Wilson wrote:
> > > > > As the HWS is mapped into the GPU as uncached,
> > > > 
> > > > Since when?
> > > 
> > > Since it is embedded into execlists' default context which is allocated
> > > using the system default cache level, i.e. uncached on !llc. See
> > > intel_lr_context_deferred_alloc()
> > 
> > Oh right. That doesn't actually matter since it's mapped through ggtt
> > which means it always goes through PAT 0.
> 
> Oh, that again. Doesn't that mean we broke i915.enable_ppgtt=0?

Just means everything is snooped in that case. As long as the hardware
doesn't get too upset about all the snooping it should work. Can't
really recall if I actually tried it though. I think I did.
Chris Wilson Oct. 23, 2015, 7:08 p.m. UTC | #6
On Fri, Oct 23, 2015 at 09:41:29PM +0300, Ville Syrjälä wrote:
> On Fri, Oct 23, 2015 at 07:29:08PM +0100, Chris Wilson wrote:
> > On Fri, Oct 23, 2015 at 09:22:38PM +0300, Ville Syrjälä wrote:
> > > On Fri, Oct 23, 2015 at 06:56:41PM +0100, Chris Wilson wrote:
> > > > On Fri, Oct 23, 2015 at 08:50:42PM +0300, Ville Syrjälä wrote:
> > > > > On Fri, Oct 23, 2015 at 06:43:31PM +0100, Chris Wilson wrote:
> > > > > > As the HWS is mapped into the GPU as uncached,
> > > > > 
> > > > > Since when?
> > > > 
> > > > Since it is embedded into execlists' default context which is allocated
> > > > using the system default cache level, i.e. uncached on !llc. See
> > > > intel_lr_context_deferred_alloc()
> > > 
> > > Oh right. That doesn't actually matter since it's mapped through ggtt
> > > which means it always goes through PAT 0.
> > 
> > Oh, that again. Doesn't that mean we broke i915.enable_ppgtt=0?
> 
> Just means everything is snooped in that case. As long as the hardware
> doesn't get too upset about all the snooping it should work. Can't
> really recall if I actually tried it though. I think I did.

Just wondering if it means we start getting cacheline dirt on the
scanout. Though since snooping only occurs on flushes, I guess it
actually means it hits the backing storage and then is pushed into the
cpu cache. The other worry is whether we are then generating fsb snoop
traffic on every context switch. Just idle thoughts as I realise I don't
know as much about snooping as I'd like.
-Chris
Ville Syrjälä Oct. 23, 2015, 7:36 p.m. UTC | #7
On Fri, Oct 23, 2015 at 08:08:34PM +0100, Chris Wilson wrote:
> On Fri, Oct 23, 2015 at 09:41:29PM +0300, Ville Syrjälä wrote:
> > On Fri, Oct 23, 2015 at 07:29:08PM +0100, Chris Wilson wrote:
> > > On Fri, Oct 23, 2015 at 09:22:38PM +0300, Ville Syrjälä wrote:
> > > > On Fri, Oct 23, 2015 at 06:56:41PM +0100, Chris Wilson wrote:
> > > > > On Fri, Oct 23, 2015 at 08:50:42PM +0300, Ville Syrjälä wrote:
> > > > > > On Fri, Oct 23, 2015 at 06:43:31PM +0100, Chris Wilson wrote:
> > > > > > > As the HWS is mapped into the GPU as uncached,
> > > > > > 
> > > > > > Since when?
> > > > > 
> > > > > Since it is embedded into execlists' default context which is allocated
> > > > > using the system default cache level, i.e. uncached on !llc. See
> > > > > intel_lr_context_deferred_alloc()
> > > > 
> > > > Oh right. That doesn't actually matter since it's mapped through ggtt
> > > > which means it always goes through PAT 0.
> > > 
> > > Oh, that again. Doesn't that mean we broke i915.enable_ppgtt=0?
> > 
> > Just means everything is snooped in that case. As long as the hardware
> > doesn't get too upset about all the snooping it should work. Can't
> > really recall if I actually tried it though. I think I did.
> 
> Just wondering if it means we start getting cacheline dirt on the
> scanout. Though since snooping only occurs on flushes, I guess it
> actually means it hits the backing storage and then is pushed into the
> cpu cache. The other worry is whether we are then generating fsb snoop
> traffic on every context switch. Just idle thoughts as I realise I don't
> know as much about snooping as I'd like.

TBH I never gave !ppgtt too much thought.

The snoops for the context switches have crossed my mind. I was
thinking that maybe we could map the status page through ppgtt and
the rest of the context through ggtt, and then we could make the
PAT 0 non-snooped. But it looks like the per-process status page still
needs to be mapped through the ggtt.

But maybe we could also map it through the ppgtt and use the ppgtt
mapping for seqno writes? That's assuming we don't need to look at
whatever else gets stored in the status page through the ggtt mapping.

Or I suppose we could take your approach and just make ggtt non-snooped
and take the clflush hit for seqno reads. No idea which is worse. Would
need to gather some numbers I suppose.
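
A quick userspace toy for anyone wanting to gather those numbers; this is
not driver code, just a crude rdtsc comparison of a clflush'd reload against
a cached read of a seqno-sized dword:

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>  /* _mm_clflush, _mm_mfence, __rdtsc */

static volatile uint32_t seqno; /* stand-in for the HWS seqno dword */

int main(void)
{
        enum { N = 1000000 };
        uint64_t flushed = 0, cached = 0;
        uint64_t t0;
        int i;

        for (i = 0; i < N; i++) {
                _mm_clflush((const void *)&seqno);
                _mm_mfence();
                t0 = __rdtsc();
                (void)seqno;            /* this load misses the cache */
                flushed += __rdtsc() - t0;

                t0 = __rdtsc();
                (void)seqno;            /* this one hits */
                cached += __rdtsc() - t0;
        }

        printf("flushed read: ~%.0f cycles, cached read: ~%.0f cycles\n",
               (double)flushed / N, (double)cached / N);
        return 0;
}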

Patch

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 3308337bb6c2..7393dd00d26d 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1751,41 +1751,36 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
 	return 0;
 }
 
-static u32 gen8_get_seqno(struct intel_engine_cs *ring, bool lazy_coherency)
+static u32 llc_get_seqno(struct intel_engine_cs *ring, bool lazy_coherency)
 {
 	return intel_read_status_page(ring, I915_GEM_HWS_INDEX);
 }
 
-static void gen8_set_seqno(struct intel_engine_cs *ring, u32 seqno)
+static void llc_set_seqno(struct intel_engine_cs *ring, u32 seqno)
 {
 	intel_write_status_page(ring, I915_GEM_HWS_INDEX, seqno);
 }
 
-static u32 bxt_a_get_seqno(struct intel_engine_cs *ring, bool lazy_coherency)
+static u32 wc_get_seqno(struct intel_engine_cs *ring, bool lazy_coherency)
 {
-
 	/*
 	 * On BXT A steppings there is a HW coherency issue whereby the
 	 * MI_STORE_DATA_IMM storing the completed request's seqno
-	 * occasionally doesn't invalidate the CPU cache. Work around this by
-	 * clflushing the corresponding cacheline whenever the caller wants
-	 * the coherency to be guaranteed. Note that this cacheline is known
-	 * to be clean at this point, since we only write it in
-	 * bxt_a_set_seqno(), where we also do a clflush after the write. So
-	 * this clflush in practice becomes an invalidate operation.
+	 * occasionally doesn't invalidate the CPU cache. However, as
+	 * we do not mark the context for snoopable access, we have to flush
+	 * the stale cacheline before seeing new values anyway.
 	 */
-
 	if (!lazy_coherency)
 		intel_flush_status_page(ring, I915_GEM_HWS_INDEX);
 
 	return intel_read_status_page(ring, I915_GEM_HWS_INDEX);
 }
 
-static void bxt_a_set_seqno(struct intel_engine_cs *ring, u32 seqno)
+static void wc_set_seqno(struct intel_engine_cs *ring, u32 seqno)
 {
 	intel_write_status_page(ring, I915_GEM_HWS_INDEX, seqno);
 
-	/* See bxt_a_get_seqno() explaining the reason for the clflush. */
+	/* See wc_get_seqno() explaining the reason for the clflush. */
 	intel_flush_status_page(ring, I915_GEM_HWS_INDEX);
 }
 
@@ -1973,12 +1968,12 @@ static int logical_render_ring_init(struct drm_device *dev)
 		ring->init_hw = gen8_init_render_ring;
 	ring->init_context = gen8_init_rcs_context;
 	ring->cleanup = intel_fini_pipe_control;
-	if (IS_BXT_REVID(dev, 0, BXT_REVID_A1)) {
-		ring->get_seqno = bxt_a_get_seqno;
-		ring->set_seqno = bxt_a_set_seqno;
+	if (!HAS_LLC(dev_priv)) {
+		ring->get_seqno = wc_get_seqno;
+		ring->set_seqno = wc_set_seqno;
 	} else {
-		ring->get_seqno = gen8_get_seqno;
-		ring->set_seqno = gen8_set_seqno;
+		ring->get_seqno = llc_get_seqno;
+		ring->set_seqno = llc_set_seqno;
 	}
 	ring->emit_request = gen8_emit_request;
 	ring->emit_flush = gen8_emit_flush_render;
@@ -2025,12 +2020,12 @@ static int logical_bsd_ring_init(struct drm_device *dev)
 		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT;
 
 	ring->init_hw = gen8_init_common_ring;
-	if (IS_BXT_REVID(dev, 0, BXT_REVID_A1)) {
-		ring->get_seqno = bxt_a_get_seqno;
-		ring->set_seqno = bxt_a_set_seqno;
+	if (!HAS_LLC(dev_priv)) {
+		ring->get_seqno = wc_get_seqno;
+		ring->set_seqno = wc_set_seqno;
 	} else {
-		ring->get_seqno = gen8_get_seqno;
-		ring->set_seqno = gen8_set_seqno;
+		ring->get_seqno = llc_get_seqno;
+		ring->set_seqno = llc_set_seqno;
 	}
 	ring->emit_request = gen8_emit_request;
 	ring->emit_flush = gen8_emit_flush;
@@ -2055,8 +2050,13 @@ static int logical_bsd2_ring_init(struct drm_device *dev)
 		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT;
 
 	ring->init_hw = gen8_init_common_ring;
-	ring->get_seqno = gen8_get_seqno;
-	ring->set_seqno = gen8_set_seqno;
+	if (!HAS_LLC(dev_priv)) {
+		ring->get_seqno = wc_get_seqno;
+		ring->set_seqno = wc_set_seqno;
+	} else {
+		ring->get_seqno = llc_get_seqno;
+		ring->set_seqno = llc_set_seqno;
+	}
 	ring->emit_request = gen8_emit_request;
 	ring->emit_flush = gen8_emit_flush;
 	ring->irq_get = gen8_logical_ring_get_irq;
@@ -2080,12 +2080,12 @@ static int logical_blt_ring_init(struct drm_device *dev)
 		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT;
 
 	ring->init_hw = gen8_init_common_ring;
-	if (IS_BXT_REVID(dev, 0, BXT_REVID_A1)) {
-		ring->get_seqno = bxt_a_get_seqno;
-		ring->set_seqno = bxt_a_set_seqno;
+	if (!HAS_LLC(dev_priv)) {
+		ring->get_seqno = wc_get_seqno;
+		ring->set_seqno = wc_set_seqno;
 	} else {
-		ring->get_seqno = gen8_get_seqno;
-		ring->set_seqno = gen8_set_seqno;
+		ring->get_seqno = llc_get_seqno;
+		ring->set_seqno = llc_set_seqno;
 	}
 	ring->emit_request = gen8_emit_request;
 	ring->emit_flush = gen8_emit_flush;
@@ -2110,12 +2110,12 @@ static int logical_vebox_ring_init(struct drm_device *dev)
 		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VECS_IRQ_SHIFT;
 
 	ring->init_hw = gen8_init_common_ring;
-	if (IS_BXT_REVID(dev, 0, BXT_REVID_A1)) {
-		ring->get_seqno = bxt_a_get_seqno;
-		ring->set_seqno = bxt_a_set_seqno;
+	if (!HAS_LLC(dev_priv)) {
+		ring->get_seqno = wc_get_seqno;
+		ring->set_seqno = wc_set_seqno;
 	} else {
-		ring->get_seqno = gen8_get_seqno;
-		ring->set_seqno = gen8_set_seqno;
+		ring->get_seqno = llc_get_seqno;
+		ring->set_seqno = llc_set_seqno;
 	}
 	ring->emit_request = gen8_emit_request;
 	ring->emit_flush = gen8_emit_flush;