diff mbox

[RFC] drm/i915: use PIPE_CONTROL for flushing on gen6+

Message ID 20110812111845.339aab04@jbarnes-desktop (mailing list archive)
State New, archived
Headers show

Commit Message

Jesse Barnes Aug. 12, 2011, 6:18 p.m. UTC
I'm trying to figure out why this doesn't work.  Anyone have ideas?

On gen6+ (well probably since Cantiga actually) we're supposed to use
PIPE_CONTROL rather than MI_FLUSH for flushing the pipeline and
caches.  This patch doesn't cause hangs or crashes in my testing, but
does prevent glxgears from displaying anything.

The other worrying thing about this is that gen6+ has a command length
field of 3, but we're using 2 in the DRI driver, even on gen6.
Changing it doesn't seem to have any effect, but it's still of concern.

Comments

Eric Anholt Aug. 12, 2011, 11:55 p.m. UTC | #1
On Fri, 12 Aug 2011 11:18:45 -0700, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
> I'm trying to figure out why this doesn't work.  Anyone have ideas?
> 
> On gen6+ (well probably since Cantiga actually) we're supposed to use
> PIPE_CONTROL rather than MI_FLUSH for flushing the pipeline and
> caches.  This patch doesn't cause hangs or crashes in my testing, but
> does prevent glxgears from displaying anything.
> 
> The other worrying thing about this is that gen6+ has a command length
> field of 3, but we're using 2 in the DRI driver, even on gen6.
> Changing it doesn't seem to have any effect, but it's still of concern.

From what I've read, the hardware tends to process shortened packets
using some garbage in whatever fields were left out.

I don't see why we'd need a separate object for the pipe control write
destination.  We could just drop it in some unused bit of the status
page, right?

I'd drop the scratch_addr from packets that don't do a post-sync write,
for clarity.

I don't see what's actually going wrong in this patch, though.
Jesse Barnes Aug. 19, 2011, 5:37 p.m. UTC | #2
On Fri, 12 Aug 2011 16:55:47 -0700
Eric Anholt <eric@anholt.net> wrote:

> On Fri, 12 Aug 2011 11:18:45 -0700, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
> > I'm trying to figure out why this doesn't work.  Anyone have ideas?
> > 
> > On gen6+ (well probably since Cantiga actually) we're supposed to use
> > PIPE_CONTROL rather than MI_FLUSH for flushing the pipeline and
> > caches.  This patch doesn't cause hangs or crashes in my testing, but
> > does prevent glxgears from displaying anything.
> > 
> > The other worrying thing about this is that gen6+ has a command length
> > field of 3, but we're using 2 in the DRI driver, even on gen6.
> > Changing it doesn't seem to have any effect, but it's still of concern.
> 
> From what I've read, the hardware tends to process shortened packets
> using some garbage in whatever fields were left out.
> 
> I don't see why we'd need a separate object for the pipe control write
> destination.  We could just drop it in some unused bit of the status
> page, right?

Yeah, that would be fine.  I just re-used the scratch page from ILK
since we already had code for it, but on SNB it should be safe to just
use the HWS.

> I'd drop the scratch_addr from packets that don't do a post-sync write,
> for clarity.
> 
> I don't see what's actually going wrong in this patch, though.

I tried adding the TLB flush, which was missing, but I still see big
missing regions on the login screen.  I guess even blits somehow aren't
landing or getting sequenced correctly... back to the docs to look for
more clues.
Ben Widawsky Aug. 30, 2011, 11:11 p.m. UTC | #3
On Fri, Aug 12, 2011 at 11:18:45AM -0700, Jesse Barnes wrote:
>  	int space = (ring->head & HEAD_ADDR) - (ring->tail + 8);
> @@ -123,6 +133,112 @@ render_ring_flush(struct intel_ring_buffer *ring,
>  	return 0;
>  }
>  
> +/**
> + * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
> + * implementing two workarounds on gen6.  From section 1.4.7.1
> + * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
> + *
> + * [DevSNB-C+{W/A}] Before any depth stall flush (including those
> + * produced by non-pipelined state commands), software needs to first
> + * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
> + * 0.
> + *
> + * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
> + * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
> + *
> + * And the workaround for these two requires this workaround first:
> + *
> + * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
> + * BEFORE the pipe-control with a post-sync op and no write-cache
> + * flushes.
> + *
> + * And this last workaround is tricky because of the requirements on
> + * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
> + * volume 2 part 1:
> + *
> + *     "1 of the following must also be set:
> + *      - Render Target Cache Flush Enable ([12] of DW1)
> + *      - Depth Cache Flush Enable ([0] of DW1)
> + *      - Stall at Pixel Scoreboard ([1] of DW1)
> + *      - Depth Stall ([13] of DW1)
> + *      - Post-Sync Operation ([13] of DW1)
> + *      - Notify Enable ([8] of DW1)"
> + *
> + * The cache flushes require the workaround flush that triggered this
> + * one, so we can't use it.  Depth stall would trigger the same.
> + * Post-sync nonzero is what triggered this second workaround, so we
> + * can't use that one either.  Notify enable is IRQs, which aren't
> + * really our business.  That leaves only stall at scoreboard.
> + */
> +static int
> +intel_emit_post_sync_nonzero_flush(struct intel_ring_buffer *ring)
> +{
> +	struct pipe_control *pc = ring->private;
> +	u32 scratch_addr = pc->gtt_offset + 128;
> +	int ret;
> +
> +
> +	ret = intel_ring_begin(ring, 6);
> +	if (ret)
> +		return ret;
> +
> +	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL_GEN6);
> +	intel_ring_emit(ring, PIPE_CONTROL_CS_STALL |
> +			PIPE_CONTROL_STALL_AT_SCOREBOARD);
> +	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); /* address */
> +	intel_ring_emit(ring, 0); /* low dword */
> +	intel_ring_emit(ring, 0); /* high dword */
> +	intel_ring_emit(ring, MI_NOOP);
> +	intel_ring_advance(ring);
> +
> +	ret = intel_ring_begin(ring, 6);
> +	if (ret)
> +		return ret;
> +
> +	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL_GEN6);
> +	intel_ring_emit(ring, PIPE_CONTROL_QW_WRITE);
> +	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); /* address */
> +	intel_ring_emit(ring, 0);
> +	intel_ring_emit(ring, 0);
> +	intel_ring_emit(ring, MI_NOOP);
> +	intel_ring_advance(ring);
> +
> +	return 0;
> +}
> +
> +static int
> +gen6_render_ring_flush(struct intel_ring_buffer *ring,
> +                         u32 invalidate_domains, u32 flush_domains)
> +{
> +	u32 flags = 0;
> +	struct pipe_control *pc = ring->private;
> +	u32 scratch_addr = pc->gtt_offset + 128;
> +	int ret;
> +
> +	/* Force SNB workarounds for PIPE_CONTROL flushes */
> +	intel_emit_post_sync_nonzero_flush(ring);
> +
> +	/* Just flush everything for now */
> +	flags |= PIPE_CONTROL_WC_FLUSH;
> +	flags |= PIPE_CONTROL_IS_FLUSH;
> +	flags |= PIPE_CONTROL_TC_FLUSH;
> +	flags |= PIPE_CONTROL_DEPTH_FLUSH;
> +
> +	ret = intel_ring_begin(ring, 6);
> +	if (ret)
> +		return ret;
> +
> +	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL_GEN6);
> +	intel_ring_emit(ring, flags);
> +	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
> +	intel_ring_emit(ring, 0); /* lower dword */
> +	intel_ring_emit(ring, 0); /* uppwer dword */
> +	intel_ring_emit(ring, MI_NOOP);
> +	intel_ring_advance(ring);
> +
> +	return 0;
> +}
> +
>  static void ring_write_tail(struct intel_ring_buffer *ring,
>  			    u32 value)
>  {

While I'm not convinced this is broken, I think you either want to
specify a qword write (3 << 14) or use n=2 for the pipe control, ie:

intel_ring_emit(ring, GFX_OP_PIPE_CONTROL_GEN);
intel_ring_emit(ring, flags);
intel_ring_emit(ring, 0); /* ignored if no write */
intel_ring_emit(ring, 0); /* ignored if no write*/
intel_ring_advance(ring);

Ben
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 5baaef4..4a456a6 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -236,6 +236,8 @@ 
 #define   DISPLAY_PLANE_A           (0<<20)
 #define   DISPLAY_PLANE_B           (1<<20)
 #define GFX_OP_PIPE_CONTROL	((0x3<<29)|(0x3<<27)|(0x2<<24)|2)
+#define GFX_OP_PIPE_CONTROL_GEN6 ((0x3<<29)|(0x3<<27)|(0x2<<24)|3)
+#define   PIPE_CONTROL_CS_STALL (1<<20)
 #define   PIPE_CONTROL_QW_WRITE	(1<<14)
 #define   PIPE_CONTROL_DEPTH_STALL (1<<13)
 #define   PIPE_CONTROL_WC_FLUSH	(1<<12)
@@ -244,8 +246,8 @@ 
 #define   PIPE_CONTROL_ISP_DIS	(1<<9)
 #define   PIPE_CONTROL_NOTIFY	(1<<8)
 #define   PIPE_CONTROL_GLOBAL_GTT (1<<2) /* in addr dword */
-#define   PIPE_CONTROL_STALL_EN	(1<<1) /* in addr word, Ironlake+ only */
-
+#define   PIPE_CONTROL_STALL_AT_SCOREBOARD (1<<1) /* in addr word, Ironlake+ only */
+#define   PIPE_CONTROL_DEPTH_FLUSH (1<<0)
 
 /*
  * Reset registers
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 47b9b27..cecc1e1 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -34,6 +34,16 @@ 
 #include "i915_trace.h"
 #include "intel_drv.h"
 
+/*
+ * 965+ support PIPE_CONTROL commands, which provide finer grained control
+ * over cache flushing.
+ */
+struct pipe_control {
+	struct drm_i915_gem_object *obj;
+	volatile u32 *cpu_page;
+	u32 gtt_offset;
+};
+
 static inline int ring_space(struct intel_ring_buffer *ring)
 {
 	int space = (ring->head & HEAD_ADDR) - (ring->tail + 8);
@@ -123,6 +133,112 @@  render_ring_flush(struct intel_ring_buffer *ring,
 	return 0;
 }
 
+/**
+ * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
+ * implementing two workarounds on gen6.  From section 1.4.7.1
+ * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
+ *
+ * [DevSNB-C+{W/A}] Before any depth stall flush (including those
+ * produced by non-pipelined state commands), software needs to first
+ * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
+ * 0.
+ *
+ * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
+ * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
+ *
+ * And the workaround for these two requires this workaround first:
+ *
+ * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
+ * BEFORE the pipe-control with a post-sync op and no write-cache
+ * flushes.
+ *
+ * And this last workaround is tricky because of the requirements on
+ * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
+ * volume 2 part 1:
+ *
+ *     "1 of the following must also be set:
+ *      - Render Target Cache Flush Enable ([12] of DW1)
+ *      - Depth Cache Flush Enable ([0] of DW1)
+ *      - Stall at Pixel Scoreboard ([1] of DW1)
+ *      - Depth Stall ([13] of DW1)
+ *      - Post-Sync Operation ([13] of DW1)
+ *      - Notify Enable ([8] of DW1)"
+ *
+ * The cache flushes require the workaround flush that triggered this
+ * one, so we can't use it.  Depth stall would trigger the same.
+ * Post-sync nonzero is what triggered this second workaround, so we
+ * can't use that one either.  Notify enable is IRQs, which aren't
+ * really our business.  That leaves only stall at scoreboard.
+ */
+static int
+intel_emit_post_sync_nonzero_flush(struct intel_ring_buffer *ring)
+{
+	struct pipe_control *pc = ring->private;
+	u32 scratch_addr = pc->gtt_offset + 128;
+	int ret;
+
+
+	ret = intel_ring_begin(ring, 6);
+	if (ret)
+		return ret;
+
+	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL_GEN6);
+	intel_ring_emit(ring, PIPE_CONTROL_CS_STALL |
+			PIPE_CONTROL_STALL_AT_SCOREBOARD);
+	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); /* address */
+	intel_ring_emit(ring, 0); /* low dword */
+	intel_ring_emit(ring, 0); /* high dword */
+	intel_ring_emit(ring, MI_NOOP);
+	intel_ring_advance(ring);
+
+	ret = intel_ring_begin(ring, 6);
+	if (ret)
+		return ret;
+
+	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL_GEN6);
+	intel_ring_emit(ring, PIPE_CONTROL_QW_WRITE);
+	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); /* address */
+	intel_ring_emit(ring, 0);
+	intel_ring_emit(ring, 0);
+	intel_ring_emit(ring, MI_NOOP);
+	intel_ring_advance(ring);
+
+	return 0;
+}
+
+static int
+gen6_render_ring_flush(struct intel_ring_buffer *ring,
+                         u32 invalidate_domains, u32 flush_domains)
+{
+	u32 flags = 0;
+	struct pipe_control *pc = ring->private;
+	u32 scratch_addr = pc->gtt_offset + 128;
+	int ret;
+
+	/* Force SNB workarounds for PIPE_CONTROL flushes */
+	intel_emit_post_sync_nonzero_flush(ring);
+
+	/* Just flush everything for now */
+	flags |= PIPE_CONTROL_WC_FLUSH;
+	flags |= PIPE_CONTROL_IS_FLUSH;
+	flags |= PIPE_CONTROL_TC_FLUSH;
+	flags |= PIPE_CONTROL_DEPTH_FLUSH;
+
+	ret = intel_ring_begin(ring, 6);
+	if (ret)
+		return ret;
+
+	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL_GEN6);
+	intel_ring_emit(ring, flags);
+	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
+	intel_ring_emit(ring, 0); /* lower dword */
+	intel_ring_emit(ring, 0); /* uppwer dword */
+	intel_ring_emit(ring, MI_NOOP);
+	intel_ring_advance(ring);
+
+	return 0;
+}
+
 static void ring_write_tail(struct intel_ring_buffer *ring,
 			    u32 value)
 {
@@ -206,16 +322,6 @@  static int init_ring_common(struct intel_ring_buffer *ring)
 	return 0;
 }
 
-/*
- * 965+ support PIPE_CONTROL commands, which provide finer grained control
- * over cache flushing.
- */
-struct pipe_control {
-	struct drm_i915_gem_object *obj;
-	volatile u32 *cpu_page;
-	u32 gtt_offset;
-};
-
 static int
 init_pipe_control(struct intel_ring_buffer *ring)
 {
@@ -292,8 +398,7 @@  static int init_render_ring(struct intel_ring_buffer *ring)
 		I915_WRITE(MI_MODE, mode);
 	}
 
-	if (INTEL_INFO(dev)->gen >= 6) {
-	} else if (IS_GEN5(dev)) {
+	if (INTEL_INFO(dev)->gen >= 5) {
 		ret = init_pipe_control(ring);
 		if (ret)
 			return ret;
@@ -1291,6 +1396,7 @@  int intel_init_render_ring_buffer(struct drm_device *dev)
 	*ring = render_ring;
 	if (INTEL_INFO(dev)->gen >= 6) {
 		ring->add_request = gen6_add_request;
+		ring->flush = gen6_render_ring_flush;
 		ring->irq_get = gen6_render_ring_get_irq;
 		ring->irq_put = gen6_render_ring_put_irq;
 	} else if (IS_GEN5(dev)) {