
[06/13] drm/i915/bdw: implement semaphore signal

Message ID 1391025333-31587-7-git-send-email-benjamin.widawsky@intel.com (mailing list archive)
State New, archived

Commit Message

Ben Widawsky Jan. 29, 2014, 7:55 p.m. UTC
Semaphore signalling works similarly to previous GENs, with the exception
that the per-ring mailboxes no longer exist. Instead, you must define your
own space somewhere in the GTT.

The comments in the code define the layout I've opted for, which should be
fairly future-proof, i.e. I tried to define the offsets in abstract terms
(NUM_RINGS, seqno size, etc.).

NOTE: If one wanted to move this to the HWSP, they could. I've decided a
single 4k object is easier to deal with and provides potential wins with
cache locality, but that's all speculative.

v2: Update the macro to not need the other ring's ring->id (Chris)
Update the comment to use the correct formula (Chris)

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h         |   1 +
 drivers/gpu/drm/i915/i915_reg.h         |   5 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c | 199 +++++++++++++++++++++++++-------
 drivers/gpu/drm/i915/intel_ringbuffer.h |  38 +++++-
 4 files changed, 197 insertions(+), 46 deletions(-)
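
[Editor's note] For illustration, a minimal sketch of the offset arithmetic
described above, assuming 8-byte seqno slots in a single 4k GTT object. The
macro names (GEN8_SEMAPHORE_SEQNO_SIZE, GEN8_SIGNAL_OFFSET, GEN8_WAIT_OFFSET)
are invented for this sketch and are not part of the patch:

/* Hypothetical helpers mirroring the f(x, y)/g(x, y) generalization in the
 * patch's layout comment: the semaphore object holds NUM_RINGS * NUM_RINGS
 * slots of seqno_size bytes, indexed by (signaller id, waiter id). Offsets
 * are relative to the semaphore object's base address in the GGTT.
 */
#define GEN8_SEMAPHORE_SEQNO_SIZE	8

/* f(x, y): where ring x writes the seqno that ring y will wait on
 * (the "signal to" table in the patch). */
#define GEN8_SIGNAL_OFFSET(x, y) \
	(((x) * I915_NUM_RINGS + (y)) * GEN8_SEMAPHORE_SEQNO_SIZE)

/* g(x, y): where ring y polls for the seqno written by ring x
 * (the "sync from" table), i.e. the transpose of f(x, y). */
#define GEN8_WAIT_OFFSET(x, y) \
	(((y) * I915_NUM_RINGS + (x)) * GEN8_SEMAPHORE_SEQNO_SIZE)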

Comments

Ville Syrjälä Jan. 30, 2014, 12:38 p.m. UTC | #1
On Wed, Jan 29, 2014 at 11:55:26AM -0800, Ben Widawsky wrote:
> Semaphore signalling works similarly to previous GENs with the exception
> that the per ring mailboxes no longer exist. Instead you must define
> your own space, somewhere in the GTT.
> 
> The comments in the code define the layout I've opted for, which should
> be fairly future proof. Ie. I tried to define offsets in abstract terms
> (NUM_RINGS, seqno size, etc).
> 
> NOTE: If one wanted to move this to the HWSP they could. I've decided
> one 4k object would be easier to deal with, and provide potential wins
> with cache locality, but that's all speculative.
> 
> v2: Update the macro to not need the other ring's ring->id (Chris)
> Update the comment to use the correct formula (Chris)
> 
> Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |   1 +
>  drivers/gpu/drm/i915/i915_reg.h         |   5 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 199 +++++++++++++++++++++++++-------
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  38 +++++-
>  4 files changed, 197 insertions(+), 46 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 3673ba1..f521059 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1380,6 +1380,7 @@ typedef struct drm_i915_private {
>  
>  	struct pci_dev *bridge_dev;
>  	struct intel_ring_buffer ring[I915_NUM_RINGS];
> +	struct drm_i915_gem_object *semaphore_obj;
>  	uint32_t last_seqno, next_seqno;
>  
>  	drm_dma_handle_t *status_page_dmah;
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index cbbaf26..8b745dc 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -216,7 +216,7 @@
>  #define   MI_DISPLAY_FLIP_IVB_SPRITE_B (3 << 19)
>  #define   MI_DISPLAY_FLIP_IVB_PLANE_C  (4 << 19)
>  #define   MI_DISPLAY_FLIP_IVB_SPRITE_C (5 << 19)
> -#define MI_SEMAPHORE_MBOX	MI_INSTR(0x16, 1) /* gen6+ */
> +#define MI_SEMAPHORE_MBOX	MI_INSTR(0x16, 1) /* gen6, gen7 */
>  #define   MI_SEMAPHORE_GLOBAL_GTT    (1<<22)
>  #define   MI_SEMAPHORE_UPDATE	    (1<<21)
>  #define   MI_SEMAPHORE_COMPARE	    (1<<20)
> @@ -241,6 +241,8 @@
>  #define   MI_RESTORE_EXT_STATE_EN	(1<<2)
>  #define   MI_FORCE_RESTORE		(1<<1)
>  #define   MI_RESTORE_INHIBIT		(1<<0)
> +#define MI_SEMAPHORE_SIGNAL	MI_INSTR(0x1b, 0) /* GEN8+ */
> +#define   MI_SEMAPHORE_TARGET(engine)	((engine)<<15)
>  #define MI_STORE_DWORD_IMM	MI_INSTR(0x20, 1)
>  #define   MI_MEM_VIRTUAL	(1 << 22) /* 965+ only */
>  #define MI_STORE_DWORD_INDEX	MI_INSTR(0x21, 1)
> @@ -329,6 +331,7 @@
>  #define   PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE		(1<<10) /* GM45+ only */
>  #define   PIPE_CONTROL_INDIRECT_STATE_DISABLE		(1<<9)
>  #define   PIPE_CONTROL_NOTIFY				(1<<8)
> +#define   PIPE_CONTROL_FLUSH_ENABLE			(1<<7) /* gen7+ */
>  #define   PIPE_CONTROL_VF_CACHE_INVALIDATE		(1<<4)
>  #define   PIPE_CONTROL_CONST_CACHE_INVALIDATE		(1<<3)
>  #define   PIPE_CONTROL_STATE_CACHE_INVALIDATE		(1<<2)
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 37ae2b1..b750835 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -619,6 +619,13 @@ static int init_render_ring(struct intel_ring_buffer *ring)
>  static void render_ring_cleanup(struct intel_ring_buffer *ring)
>  {
>  	struct drm_device *dev = ring->dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +
> +	if (dev_priv->semaphore_obj) {
> +		i915_gem_object_ggtt_unpin(dev_priv->semaphore_obj);
> +		drm_gem_object_unreference(&dev_priv->semaphore_obj->base);
> +		dev_priv->semaphore_obj = NULL;
> +	}
>  
>  	if (ring->scratch.obj == NULL)
>  		return;
> @@ -632,6 +639,86 @@ static void render_ring_cleanup(struct intel_ring_buffer *ring)
>  	ring->scratch.obj = NULL;
>  }
>  
> +static int gen8_rcs_signal(struct intel_ring_buffer *signaller,
> +			   unsigned int num_dwords)
> +{
> +#define MBOX_UPDATE_DWORDS 8
> +	struct drm_device *dev = signaller->dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct intel_ring_buffer *waiter;
> +	int i, ret, num_rings;
> +
> +	num_rings = hweight_long(INTEL_INFO(dev)->ring_mask);
> +	num_dwords = (num_rings-1) * MBOX_UPDATE_DWORDS;

Again num_dwords +=

> +#undef MBOX_UPDATE_DWORDS
> +
> +	/* XXX: + 4 for the caller */
> +	ret = intel_ring_begin(signaller, num_dwords + 4);

and the +4 goes away.

> +	if (ret)
> +		return ret;
> +
> +	for_each_ring(waiter, dev_priv, i) {
> +		u64 gtt_offset = signaller->semaphore.signal_ggtt[i];
> +		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
> +			continue;
> +
> +		intel_ring_emit(signaller, GFX_OP_PIPE_CONTROL(6));
> +		intel_ring_emit(signaller, PIPE_CONTROL_GLOBAL_GTT_IVB |
> +					   PIPE_CONTROL_QW_WRITE |
> +					   PIPE_CONTROL_FLUSH_ENABLE);
> +		intel_ring_emit(signaller, lower_32_bits(gtt_offset));
> +		intel_ring_emit(signaller, upper_32_bits(gtt_offset));
> +		intel_ring_emit(signaller, signaller->outstanding_lazy_seqno);
> +		intel_ring_emit(signaller, 0);
> +		intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
> +					   MI_SEMAPHORE_TARGET(waiter->id));
> +		intel_ring_emit(signaller, 0);
> +	}
> +
> +	WARN_ON(i != num_rings);
> +
> +	return 0;
> +}

<snip>

> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index c69ae10..f1e7a66 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -111,6 +111,39 @@ struct  intel_ring_buffer {
>  #define I915_DISPATCH_PINNED 0x2
>  	void		(*cleanup)(struct intel_ring_buffer *ring);
>  
> +	/* GEN8 signal/wait table
> +	 *	  signal to  signal to    signal to   signal to
> +	 *	    RCS         VCS          BCS        VECS
> +	 *      ------------------------------------------------------
> +	 *  RCS | NOP (0x00) | BCS (0x08) | VCS (0x10) | VECS (0x18) |
> +	 *	|-----------------------------------------------------
> +	 *  VCS | RCS (0x20) | NOP (0x28) | BCS (0x30) | VECS (0x38) |
> +	 *	|-----------------------------------------------------
> +	 *  BCS | RCS (0x40) | VCS (0x48) | NOP (0x50) | VECS (0x58) |
> +	 *	|-----------------------------------------------------
> +	 * VECS | RCS (0x60) | VCS (0x68) | BCS (0x70) |  NOP (0x78) |
> +	 *	|-----------------------------------------------------
> +	 *
> +	 * Generalization:
> +	 *  f(x, y) := (x->id * NUM_RINGS * seqno_size) + (seqno_size * y->id)
> +	 *  ie. transpose of g(x, y)
> +	 *
> +	 *	 sync from   sync from    sync from    sync from
> +	 *	    RCS         VCS          BCS        VECS
> +	 *      ------------------------------------------------------
> +	 *  RCS | NOP (0x00) | BCS (0x20) | VCS (0x40) | VECS (0x60) |
> +	 *	|-----------------------------------------------------
> +	 *  VCS | RCS (0x08) | NOP (0x28) | BCS (0x48) | VECS (0x68) |
> +	 *	|-----------------------------------------------------
> +	 *  BCS | RCS (0x10) | VCS (0x30) | NOP (0x50) | VECS (0x60) |
> +	 *	|-----------------------------------------------------
> +	 * VECS | RCS (0x18) | VCS (0x38) | BCS (0x58) |  NOP (0x78) |
> +	 *	|-----------------------------------------------------
> +	 *
> +	 * Generalization:
> +	 *  g(x, y) := (y->id * NUM_RINGS * seqno_size) + (seqno_size * x->id)
> +	 *  ie. transpose of f(x, y)
> +	 */
>  	struct {
>  		u32	sync_seqno[I915_NUM_RINGS-1];
>  		/* AKA wait() */
> @@ -120,7 +153,10 @@ struct  intel_ring_buffer {
>  		/* our mbox written by others */
>  		u32		mbox[I915_NUM_RINGS];

mbox should also get a u64 friend, right?

>  		/* mboxes this ring signals to */
> -		u32		signal_mbox[I915_NUM_RINGS];
> +		union {
> +			u32		signal_mbox[I915_NUM_RINGS];
> +			u64		signal_ggtt[I915_NUM_RINGS];
> +		};
>  
>  		/* num_dwords is space the caller will need for atomic update */
>  		int		(*signal)(struct intel_ring_buffer *signaller,
> -- 
> 1.8.5.3
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
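
[Editor's note] To make Ville's two suggestions above concrete (accumulate
into num_dwords instead of overwriting it, and drop the "+ 4 for the caller"
fudge), here is a sketch of how gen8_rcs_signal() might look with them folded
in. This is illustrative only, not the actual respin:

static int gen8_rcs_signal(struct intel_ring_buffer *signaller,
			   unsigned int num_dwords)
{
#define MBOX_UPDATE_DWORDS 8
	struct drm_device *dev = signaller->dev;
	struct drm_i915_private *dev_priv = dev->dev_private;
	struct intel_ring_buffer *waiter;
	int i, ret, num_rings;

	num_rings = hweight_long(INTEL_INFO(dev)->ring_mask);
	/* Add our dwords on top of whatever the caller already asked for,
	 * so the separate "+ 4 for the caller" is no longer needed. */
	num_dwords += (num_rings - 1) * MBOX_UPDATE_DWORDS;
#undef MBOX_UPDATE_DWORDS

	ret = intel_ring_begin(signaller, num_dwords);
	if (ret)
		return ret;

	for_each_ring(waiter, dev_priv, i) {
		u64 gtt_offset = signaller->semaphore.signal_ggtt[i];
		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
			continue;

		/* 8 dwords per waiter: a QW post-sync write of the seqno,
		 * then MI_SEMAPHORE_SIGNAL to the target engine. */
		intel_ring_emit(signaller, GFX_OP_PIPE_CONTROL(6));
		intel_ring_emit(signaller, PIPE_CONTROL_GLOBAL_GTT_IVB |
					   PIPE_CONTROL_QW_WRITE |
					   PIPE_CONTROL_FLUSH_ENABLE);
		intel_ring_emit(signaller, lower_32_bits(gtt_offset));
		intel_ring_emit(signaller, upper_32_bits(gtt_offset));
		intel_ring_emit(signaller, signaller->outstanding_lazy_seqno);
		intel_ring_emit(signaller, 0);
		intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
					   MI_SEMAPHORE_TARGET(waiter->id));
		intel_ring_emit(signaller, 0);
	}

	WARN_ON(i != num_rings);

	return 0;
}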
Chris Wilson Jan. 30, 2014, 12:46 p.m. UTC | #2
On Thu, Jan 30, 2014 at 02:38:17PM +0200, Ville Syrjälä wrote:
> On Wed, Jan 29, 2014 at 11:55:26AM -0800, Ben Widawsky wrote:
> > Semaphore signalling works similarly to previous GENs with the exception
> > that the per ring mailboxes no longer exist. Instead you must define
> > your own space, somewhere in the GTT.
> > 
> > The comments in the code define the layout I've opted for, which should
> > be fairly future proof. Ie. I tried to define offsets in abstract terms
> > (NUM_RINGS, seqno size, etc).
> > 
> > NOTE: If one wanted to move this to the HWSP they could. I've decided
> > one 4k object would be easier to deal with, and provide potential wins
> > with cache locality, but that's all speculative.
> > 
> > v2: Update the macro to not need the other ring's ring->id (Chris)
> > Update the comment to use the correct formula (Chris)
> > 
> > Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h         |   1 +
> >  drivers/gpu/drm/i915/i915_reg.h         |   5 +-
> >  drivers/gpu/drm/i915/intel_ringbuffer.c | 199 +++++++++++++++++++++++++-------
> >  drivers/gpu/drm/i915/intel_ringbuffer.h |  38 +++++-
> >  4 files changed, 197 insertions(+), 46 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 3673ba1..f521059 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -1380,6 +1380,7 @@ typedef struct drm_i915_private {
> >  
> >  	struct pci_dev *bridge_dev;
> >  	struct intel_ring_buffer ring[I915_NUM_RINGS];
> > +	struct drm_i915_gem_object *semaphore_obj;
> >  	uint32_t last_seqno, next_seqno;
> >  
> >  	drm_dma_handle_t *status_page_dmah;
> > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> > index cbbaf26..8b745dc 100644
> > --- a/drivers/gpu/drm/i915/i915_reg.h
> > +++ b/drivers/gpu/drm/i915/i915_reg.h
> > @@ -216,7 +216,7 @@
> >  #define   MI_DISPLAY_FLIP_IVB_SPRITE_B (3 << 19)
> >  #define   MI_DISPLAY_FLIP_IVB_PLANE_C  (4 << 19)
> >  #define   MI_DISPLAY_FLIP_IVB_SPRITE_C (5 << 19)
> > -#define MI_SEMAPHORE_MBOX	MI_INSTR(0x16, 1) /* gen6+ */
> > +#define MI_SEMAPHORE_MBOX	MI_INSTR(0x16, 1) /* gen6, gen7 */
> >  #define   MI_SEMAPHORE_GLOBAL_GTT    (1<<22)
> >  #define   MI_SEMAPHORE_UPDATE	    (1<<21)
> >  #define   MI_SEMAPHORE_COMPARE	    (1<<20)
> > @@ -241,6 +241,8 @@
> >  #define   MI_RESTORE_EXT_STATE_EN	(1<<2)
> >  #define   MI_FORCE_RESTORE		(1<<1)
> >  #define   MI_RESTORE_INHIBIT		(1<<0)
> > +#define MI_SEMAPHORE_SIGNAL	MI_INSTR(0x1b, 0) /* GEN8+ */
> > +#define   MI_SEMAPHORE_TARGET(engine)	((engine)<<15)
> >  #define MI_STORE_DWORD_IMM	MI_INSTR(0x20, 1)
> >  #define   MI_MEM_VIRTUAL	(1 << 22) /* 965+ only */
> >  #define MI_STORE_DWORD_INDEX	MI_INSTR(0x21, 1)
> > @@ -329,6 +331,7 @@
> >  #define   PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE		(1<<10) /* GM45+ only */
> >  #define   PIPE_CONTROL_INDIRECT_STATE_DISABLE		(1<<9)
> >  #define   PIPE_CONTROL_NOTIFY				(1<<8)
> > +#define   PIPE_CONTROL_FLUSH_ENABLE			(1<<7) /* gen7+ */

Oh. So they changed how post-sync writes operated - this should be a
separate fix for stable I believe (so that batches are not run before we
have finished invalidating the TLBs required).
-Chris
Daniel Vetter Jan. 30, 2014, 1:18 p.m. UTC | #3
On Thu, Jan 30, 2014 at 1:46 PM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> Oh. So they changed how post-sync writes operated - this should be a
> separate fix for stable I believe (so that batches are not run before we
> have finished invalidating the TLBs required).

We have an igt to exercise tlb invalidation stuff, which runs on all
rings. But it only runs a batch, so only uses the CS tlb. Do we need
to extend this?
-Daniel
Chris Wilson Jan. 30, 2014, 1:25 p.m. UTC | #4
On Thu, Jan 30, 2014 at 02:18:32PM +0100, Daniel Vetter wrote:
> On Thu, Jan 30, 2014 at 1:46 PM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > Oh. So they changed how post-sync writes operated - this should be a
> > separate fix for stable I believe (so that batches are not run before we
> > have finished invalidating the TLBs required).
> 
> We have an igt to exercise tlb invalidation stuff, which runs on all
> rings. But it only runs a batch, so only uses the CS tlb. Do we need
> to extend this?

You could try to catch out the sampler. Or it may be that the
hardware internally serialises TLB invalidation and lookup. Or it may
be such a slim window that it will only ever be hit during a demo and
never by a test case ;)
-Chris
Chris Wilson Jan. 30, 2014, 1:35 p.m. UTC | #5
On Thu, Jan 30, 2014 at 02:18:32PM +0100, Daniel Vetter wrote:
> On Thu, Jan 30, 2014 at 1:46 PM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > Oh. So they changed how post-sync writes operated - this should be a
> > separate fix for stable I believe (so that batches are not run before we
> > have finished invalidating the TLBs required).
> 
> We have an igt to exercise tlb invalidation stuff, which runs on all
> rings. But it only runs a batch, so only uses the CS tlb. Do we need
> to extend this?

So the spec says:

Pipe Control Flush Enable (IVB+)
If ENABLED, the PIPE_CONTROL command will wait until all previous writes
of immediate data from post sync circles are complete before executing
the next command.

Post Sync Operation
This field specifies an optional action to be taken upon completion of
the synchronization operation.

TLB Invalidate
If ENABLED, all TLBs belonging to Render Engine will be invalidated once
the flush operation is complete.

Command Streamer Stall Enable
If ENABLED, the sync operation will not occur until all previous flush
operations pending a completion of those previous flushes will complete,
including the flush produced from this command. This enables the command
to act similar to the legacy MI_FLUSH command.

Going by that, the order is

flush, stall, TLB invalidate / post-sync op, [pipe control flush]

Based on my reading of the above (unless someone has a more definitive
source), without PIPE_CONTROL_FLUSH_ENABLE the CS can continue
operating as soon as the flush is complete - in parallel with the TLB
invalidate. Adding PIPE_CONTROL_FLUSH_ENABLE would then stall the CS
until the post-sync operation completes. That still leaves the
possibility that the TLB invalidate is performed in parallel and itself
provides no CS sync.
-Chris
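
[Editor's note] In code terms, Chris's suggestion would roughly amount to
setting the new bit in the existing gen7 render invalidate path. A hedged
sketch only, assuming the current gen7_render_ring_flush() flag setup;
whether this is sufficient (or even needed) is exactly what is being
discussed above:

	if (invalidate_domains) {
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		/*
		 * TLB invalidate requires a post-sync write. Per the spec
		 * text above, FLUSH_ENABLE should make the CS wait for that
		 * post-sync write (and hence the invalidate) to complete
		 * before executing the next command.
		 */
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;	/* the proposed addition */
		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

		/* Workaround: a pipe_control with the CS-stall bit set must
		 * precede a pipe_control with the state cache invalidate
		 * bit set. */
		gen7_render_ring_cs_stall_wa(ring);
	}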
Ben Widawsky Feb. 11, 2014, 9:48 p.m. UTC | #6
On Thu, Jan 30, 2014 at 01:35:41PM +0000, Chris Wilson wrote:
> On Thu, Jan 30, 2014 at 02:18:32PM +0100, Daniel Vetter wrote:
> > On Thu, Jan 30, 2014 at 1:46 PM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > Oh. So they changed how post-sync writes operated - this should be a
> > > separate fix for stable I believe (so that batches are not run before we
> > > have finished invalidating the TLBs required).
> > 
> > We have an igt to exercise tlb invalidation stuff, which runs on all
> > rings. But it only runs a batch, so only uses the CS tlb. Do we need
> > to extend this?
> 
> So the spec says:
> 
> Pipe Control Flush Enable (IVB+)
> If ENABLED, the PIPE_CONTROL command will wait until all previous writes
> of immediate data from post sync circles are complete before executing
> the next command.
> 
> Post Sync Operation
> This field specifies an optional action to be taken upon completion of
> the synchronization operation.
> 
> TLB Invalidate
> If ENABLED, all TLBs belonging to Render Engine will be invalidated once
> the flush operation is complete.
> 
> Command Streamer Stall Enable
> If ENABLED, the sync operation will not occur until all previous flush
> operations pending a completion of those previous flushes will complete,
> including the flush produced from this command. This enables the command
> to act similar to the legacy MI_FLUSH command.
> 
> Going by that, the order is
> 
> flush, stall, TLB invalidate / post-sync op, [pipe control flush]
> 
> Based on my reading of the above (which unless someone has a more
> definitive source) says that without the CONTROL_FLUSH_ENABLE, the CS
> can continue operations as soon as the flush is complete - in parallel
> to the TLB invalidate. Adding CONTROL_FLUSH_ENABLE would then stall the
> CS until the post-sync operation completes. That still leaves the
> possibility that the TLB invalidate is being performed in parallel and
> is itself provides no CS sync.
> -Chris
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre

So... what's the verdict?
Ben Widawsky Feb. 11, 2014, 10:11 p.m. UTC | #7
On Thu, Jan 30, 2014 at 02:38:17PM +0200, Ville Syrjälä wrote:
> On Wed, Jan 29, 2014 at 11:55:26AM -0800, Ben Widawsky wrote:
> > Semaphore signalling works similarly to previous GENs with the exception
> > that the per ring mailboxes no longer exist. Instead you must define
> > your own space, somewhere in the GTT.
> > 
> > The comments in the code define the layout I've opted for, which should
> > be fairly future proof. Ie. I tried to define offsets in abstract terms
> > (NUM_RINGS, seqno size, etc).
> > 
> > NOTE: If one wanted to move this to the HWSP they could. I've decided
> > one 4k object would be easier to deal with, and provide potential wins
> > with cache locality, but that's all speculative.
> > 
> > v2: Update the macro to not need the other ring's ring->id (Chris)
> > Update the comment to use the correct formula (Chris)
> > 
> > Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h         |   1 +
> >  drivers/gpu/drm/i915/i915_reg.h         |   5 +-
> >  drivers/gpu/drm/i915/intel_ringbuffer.c | 199 +++++++++++++++++++++++++-------
> >  drivers/gpu/drm/i915/intel_ringbuffer.h |  38 +++++-
> >  4 files changed, 197 insertions(+), 46 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 3673ba1..f521059 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -1380,6 +1380,7 @@ typedef struct drm_i915_private {
> >  
> >  	struct pci_dev *bridge_dev;
> >  	struct intel_ring_buffer ring[I915_NUM_RINGS];
> > +	struct drm_i915_gem_object *semaphore_obj;
> >  	uint32_t last_seqno, next_seqno;
> >  
> >  	drm_dma_handle_t *status_page_dmah;
> > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> > index cbbaf26..8b745dc 100644
> > --- a/drivers/gpu/drm/i915/i915_reg.h
> > +++ b/drivers/gpu/drm/i915/i915_reg.h
> > @@ -216,7 +216,7 @@
> >  #define   MI_DISPLAY_FLIP_IVB_SPRITE_B (3 << 19)
> >  #define   MI_DISPLAY_FLIP_IVB_PLANE_C  (4 << 19)
> >  #define   MI_DISPLAY_FLIP_IVB_SPRITE_C (5 << 19)
> > -#define MI_SEMAPHORE_MBOX	MI_INSTR(0x16, 1) /* gen6+ */
> > +#define MI_SEMAPHORE_MBOX	MI_INSTR(0x16, 1) /* gen6, gen7 */
> >  #define   MI_SEMAPHORE_GLOBAL_GTT    (1<<22)
> >  #define   MI_SEMAPHORE_UPDATE	    (1<<21)
> >  #define   MI_SEMAPHORE_COMPARE	    (1<<20)
> > @@ -241,6 +241,8 @@
> >  #define   MI_RESTORE_EXT_STATE_EN	(1<<2)
> >  #define   MI_FORCE_RESTORE		(1<<1)
> >  #define   MI_RESTORE_INHIBIT		(1<<0)
> > +#define MI_SEMAPHORE_SIGNAL	MI_INSTR(0x1b, 0) /* GEN8+ */
> > +#define   MI_SEMAPHORE_TARGET(engine)	((engine)<<15)
> >  #define MI_STORE_DWORD_IMM	MI_INSTR(0x20, 1)
> >  #define   MI_MEM_VIRTUAL	(1 << 22) /* 965+ only */
> >  #define MI_STORE_DWORD_INDEX	MI_INSTR(0x21, 1)
> > @@ -329,6 +331,7 @@
> >  #define   PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE		(1<<10) /* GM45+ only */
> >  #define   PIPE_CONTROL_INDIRECT_STATE_DISABLE		(1<<9)
> >  #define   PIPE_CONTROL_NOTIFY				(1<<8)
> > +#define   PIPE_CONTROL_FLUSH_ENABLE			(1<<7) /* gen7+ */
> >  #define   PIPE_CONTROL_VF_CACHE_INVALIDATE		(1<<4)
> >  #define   PIPE_CONTROL_CONST_CACHE_INVALIDATE		(1<<3)
> >  #define   PIPE_CONTROL_STATE_CACHE_INVALIDATE		(1<<2)
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > index 37ae2b1..b750835 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > @@ -619,6 +619,13 @@ static int init_render_ring(struct intel_ring_buffer *ring)
> >  static void render_ring_cleanup(struct intel_ring_buffer *ring)
> >  {
> >  	struct drm_device *dev = ring->dev;
> > +	struct drm_i915_private *dev_priv = dev->dev_private;
> > +
> > +	if (dev_priv->semaphore_obj) {
> > +		i915_gem_object_ggtt_unpin(dev_priv->semaphore_obj);
> > +		drm_gem_object_unreference(&dev_priv->semaphore_obj->base);
> > +		dev_priv->semaphore_obj = NULL;
> > +	}
> >  
> >  	if (ring->scratch.obj == NULL)
> >  		return;
> > @@ -632,6 +639,86 @@ static void render_ring_cleanup(struct intel_ring_buffer *ring)
> >  	ring->scratch.obj = NULL;
> >  }
> >  
> > +static int gen8_rcs_signal(struct intel_ring_buffer *signaller,
> > +			   unsigned int num_dwords)
> > +{
> > +#define MBOX_UPDATE_DWORDS 8
> > +	struct drm_device *dev = signaller->dev;
> > +	struct drm_i915_private *dev_priv = dev->dev_private;
> > +	struct intel_ring_buffer *waiter;
> > +	int i, ret, num_rings;
> > +
> > +	num_rings = hweight_long(INTEL_INFO(dev)->ring_mask);
> > +	num_dwords = (num_rings-1) * MBOX_UPDATE_DWORDS;
> 
> Again num_dwords +=
> 
> > +#undef MBOX_UPDATE_DWORDS
> > +
> > +	/* XXX: + 4 for the caller */
> > +	ret = intel_ring_begin(signaller, num_dwords + 4);
> 
> and the +4 goes away.
> 
> > +	if (ret)
> > +		return ret;
> > +
> > +	for_each_ring(waiter, dev_priv, i) {
> > +		u64 gtt_offset = signaller->semaphore.signal_ggtt[i];
> > +		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
> > +			continue;
> > +
> > +		intel_ring_emit(signaller, GFX_OP_PIPE_CONTROL(6));
> > +		intel_ring_emit(signaller, PIPE_CONTROL_GLOBAL_GTT_IVB |
> > +					   PIPE_CONTROL_QW_WRITE |
> > +					   PIPE_CONTROL_FLUSH_ENABLE);
> > +		intel_ring_emit(signaller, lower_32_bits(gtt_offset));
> > +		intel_ring_emit(signaller, upper_32_bits(gtt_offset));
> > +		intel_ring_emit(signaller, signaller->outstanding_lazy_seqno);
> > +		intel_ring_emit(signaller, 0);
> > +		intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
> > +					   MI_SEMAPHORE_TARGET(waiter->id));
> > +		intel_ring_emit(signaller, 0);
> > +	}
> > +
> > +	WARN_ON(i != num_rings);
> > +
> > +	return 0;
> > +}
> 
> <snip>

Got those, thanks.

> 
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > index c69ae10..f1e7a66 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > @@ -111,6 +111,39 @@ struct  intel_ring_buffer {
> >  #define I915_DISPATCH_PINNED 0x2
> >  	void		(*cleanup)(struct intel_ring_buffer *ring);
> >  
> > +	/* GEN8 signal/wait table
> > +	 *	  signal to  signal to    signal to   signal to
> > +	 *	    RCS         VCS          BCS        VECS
> > +	 *      ------------------------------------------------------
> > +	 *  RCS | NOP (0x00) | BCS (0x08) | VCS (0x10) | VECS (0x18) |
> > +	 *	|-----------------------------------------------------
> > +	 *  VCS | RCS (0x20) | NOP (0x28) | BCS (0x30) | VECS (0x38) |
> > +	 *	|-----------------------------------------------------
> > +	 *  BCS | RCS (0x40) | VCS (0x48) | NOP (0x50) | VECS (0x58) |
> > +	 *	|-----------------------------------------------------
> > +	 * VECS | RCS (0x60) | VCS (0x68) | BCS (0x70) |  NOP (0x78) |
> > +	 *	|-----------------------------------------------------
> > +	 *
> > +	 * Generalization:
> > +	 *  f(x, y) := (x->id * NUM_RINGS * seqno_size) + (seqno_size * y->id)
> > +	 *  ie. transpose of g(x, y)
> > +	 *
> > +	 *	 sync from   sync from    sync from    sync from
> > +	 *	    RCS         VCS          BCS        VECS
> > +	 *      ------------------------------------------------------
> > +	 *  RCS | NOP (0x00) | BCS (0x20) | VCS (0x40) | VECS (0x60) |
> > +	 *	|-----------------------------------------------------
> > +	 *  VCS | RCS (0x08) | NOP (0x28) | BCS (0x48) | VECS (0x68) |
> > +	 *	|-----------------------------------------------------
> > +	 *  BCS | RCS (0x10) | VCS (0x30) | NOP (0x50) | VECS (0x60) |
> > +	 *	|-----------------------------------------------------
> > +	 * VECS | RCS (0x18) | VCS (0x38) | BCS (0x58) |  NOP (0x78) |
> > +	 *	|-----------------------------------------------------
> > +	 *
> > +	 * Generalization:
> > +	 *  g(x, y) := (y->id * NUM_RINGS * seqno_size) + (seqno_size * x->id)
> > +	 *  ie. transpose of f(x, y)
> > +	 */
> >  	struct {
> >  		u32	sync_seqno[I915_NUM_RINGS-1];
> >  		/* AKA wait() */
> > @@ -120,7 +153,10 @@ struct  intel_ring_buffer {
> >  		/* our mbox written by others */
> >  		u32		mbox[I915_NUM_RINGS];
> 
> mbox should also get a u64 friend, right?

mbox should be gen6 only, given the change to using the GTT on gen8. At
this point in the series, semaphores should be forcibly disabled on
gen8, so the code looks wrong, but the path cannot [should not] be
taken.

I suppose I should kill the initialization of mbox for gen8, or somehow
consolidate with a union to prevent confusion.

> 
> >  		/* mboxes this ring signals to */
> > -		u32		signal_mbox[I915_NUM_RINGS];
> > +		union {
> > +			u32		signal_mbox[I915_NUM_RINGS];
> > +			u64		signal_ggtt[I915_NUM_RINGS];
> > +		};
> >  
> >  		/* num_dwords is space the caller will need for atomic update */
> >  		int		(*signal)(struct intel_ring_buffer *signaller,
> > -- 
> > 1.8.5.3
> > 
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
> -- 
> Ville Syrjälä
> Intel OTC
Ben Widawsky Feb. 11, 2014, 10:22 p.m. UTC | #8
On Tue, Feb 11, 2014 at 02:11:04PM -0800, Ben Widawsky wrote:
> On Thu, Jan 30, 2014 at 02:38:17PM +0200, Ville Syrjälä wrote:
> > On Wed, Jan 29, 2014 at 11:55:26AM -0800, Ben Widawsky wrote:
> > > <snip>
> > > @@ -120,7 +153,10 @@ struct  intel_ring_buffer {
> > >  		/* our mbox written by others */
> > >  		u32		mbox[I915_NUM_RINGS];
> > 
> > mbox should also get a u64 friend, right?
> 
> mbox should be gen6 only, given the change to using the gtt on gen8. In
> this point in the series, semaphores should be forcibly disabled on
> gen8, so the code looks wrong, but the path cannot [should not] be
> taken.
> 
> I suppose I should kill the initialization of mbox for gen8, or somehow
> consolidate with a union to prevent confusion.
> 

Just to clarify, it should be:

gen6:
  signal uses signal_mbox
  wait uses mbox

gen8:
  signal uses signal_ggtt
  wait uses arithmetic to figure out the offset

> > 
> > >  		/* mboxes this ring signals to */
> > > -		u32		signal_mbox[I915_NUM_RINGS];
> > > +		union {
> > > +			u32		signal_mbox[I915_NUM_RINGS];
> > > +			u64		signal_ggtt[I915_NUM_RINGS];
> > > +		};
> > >  
> > >  		/* num_dwords is space the caller will need for atomic update */
> > >  		int		(*signal)(struct intel_ring_buffer *signaller,
> > > -- 
> > > 1.8.5.3
> > > 
> > > _______________________________________________
> > > Intel-gfx mailing list
> > > Intel-gfx@lists.freedesktop.org
> > > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> > 
> > -- 
> > Ville Syrjälä
> > Intel OTC
> 
> -- 
> Ben Widawsky, Intel Open Source Technology Center
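
[Editor's note] One possible shape for the consolidation Ben mentions above
(a sketch only; the field names inside the union are illustrative, and Ben
notes later in the thread that the actual rework lands in the next patch of
the series):

	struct {
		u32	sync_seqno[I915_NUM_RINGS-1];

		/* AKA wait() */
		int	(*sync_to)(struct intel_ring_buffer *ring,
				   struct intel_ring_buffer *to,
				   u32 seqno);

		union {
			/* gen6/gen7: mailbox registers */
			struct {
				/* our mbox written by others */
				u32	wait[I915_NUM_RINGS];
				/* mboxes this ring signals to */
				u32	signal[I915_NUM_RINGS];
			} mbox;
			/* gen8: GGTT addresses this ring signals to; the
			 * wait side derives its offset arithmetically. */
			u64	signal_ggtt[I915_NUM_RINGS];
		};

		/* num_dwords is space the caller will need for atomic update */
		int	(*signal)(struct intel_ring_buffer *signaller,
				  unsigned int num_dwords);
	} semaphore;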
Chris Wilson Feb. 11, 2014, 10:23 p.m. UTC | #9
On Tue, Feb 11, 2014 at 01:48:22PM -0800, Ben Widawsky wrote:
> On Thu, Jan 30, 2014 at 01:35:41PM +0000, Chris Wilson wrote:
> > On Thu, Jan 30, 2014 at 02:18:32PM +0100, Daniel Vetter wrote:
> > > On Thu, Jan 30, 2014 at 1:46 PM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > > Oh. So they changed how post-sync writes operated - this should be a
> > > > separate fix for stable I believe (so that batches are not run before we
> > > > have finished invalidating the TLBs required).
> > > 
> > > We have an igt to exercise tlb invalidation stuff, which runs on all
> > > rings. But it only runs a batch, so only uses the CS tlb. Do we need
> > > to extend this?
> > 
> > So the spec says:
> > 
> > Pipe Control Flush Enable (IVB+)
> > If ENABLED, the PIPE_CONTROL command will wait until all previous writes
> > of immediate data from post sync circles are complete before executing
> > the next command.
> > 
> > Post Sync Operation
> > This field specifies an optional action to be taken upon completion of
> > the synchronization operation.
> > 
> > TLB Invalidate
> > If ENABLED, all TLBs belonging to Render Engine will be invalidated once
> > the flush operation is complete.
> > 
> > Command Streamer Stall Enable
> > If ENABLED, the sync operation will not occur until all previous flush
> > operations pending a completion of those previous flushes will complete,
> > including the flush produced from this command. This enables the command
> > to act similar to the legacy MI_FLUSH command.
> > 
> > Going by that, the order is
> > 
> > flush, stall, TLB invalidate / post-sync op, [pipe control flush]
> > 
> > Based on my reading of the above (which unless someone has a more
> > definitive source) says that without the CONTROL_FLUSH_ENABLE, the CS
> > can continue operations as soon as the flush is complete - in parallel
> > to the TLB invalidate. Adding CONTROL_FLUSH_ENABLE would then stall the
> > CS until the post-sync operation completes. That still leaves the
> > possibility that the TLB invalidate is being performed in parallel and
> > is itself provides no CS sync.
> > -Chris
> > 
> > -- 
> > Chris Wilson, Intel Open Source Technology Centre
> 
> so.... what the verdict?

My gut feeling is that it fixes an issue with the IVB TLB invalidate.
(Not yet sure whether the bug I was looking at happened to be fixed at
the same time as I was testing this.)
So cc stable@
-Chris
Ben Widawsky Feb. 11, 2014, 10:25 p.m. UTC | #10
On Tue, Feb 11, 2014 at 10:23:38PM +0000, Chris Wilson wrote:
> On Tue, Feb 11, 2014 at 01:48:22PM -0800, Ben Widawsky wrote:
> > On Thu, Jan 30, 2014 at 01:35:41PM +0000, Chris Wilson wrote:
> > > On Thu, Jan 30, 2014 at 02:18:32PM +0100, Daniel Vetter wrote:
> > > > On Thu, Jan 30, 2014 at 1:46 PM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > > > Oh. So they changed how post-sync writes operated - this should be a
> > > > > separate fix for stable I believe (so that batches are not run before we
> > > > > have finished invalidating the TLBs required).
> > > > 
> > > > We have an igt to exercise tlb invalidation stuff, which runs on all
> > > > rings. But it only runs a batch, so only uses the CS tlb. Do we need
> > > > to extend this?
> > > 
> > > So the spec says:
> > > 
> > > Pipe Control Flush Enable (IVB+)
> > > If ENABLED, the PIPE_CONTROL command will wait until all previous writes
> > > of immediate data from post sync circles are complete before executing
> > > the next command.
> > > 
> > > Post Sync Operation
> > > This field specifies an optional action to be taken upon completion of
> > > the synchronization operation.
> > > 
> > > TLB Invalidate
> > > If ENABLED, all TLBs belonging to Render Engine will be invalidated once
> > > the flush operation is complete.
> > > 
> > > Command Streamer Stall Enable
> > > If ENABLED, the sync operation will not occur until all previous flush
> > > operations pending a completion of those previous flushes will complete,
> > > including the flush produced from this command. This enables the command
> > > to act similar to the legacy MI_FLUSH command.
> > > 
> > > Going by that, the order is
> > > 
> > > flush, stall, TLB invalidate / post-sync op, [pipe control flush]
> > > 
> > > Based on my reading of the above (which unless someone has a more
> > > definitive source) says that without the CONTROL_FLUSH_ENABLE, the CS
> > > can continue operations as soon as the flush is complete - in parallel
> > > to the TLB invalidate. Adding CONTROL_FLUSH_ENABLE would then stall the
> > > CS until the post-sync operation completes. That still leaves the
> > > possibility that the TLB invalidate is being performed in parallel and
> > > is itself provides no CS sync.
> > > -Chris
> > > 
> > > -- 
> > > Chris Wilson, Intel Open Source Technology Centre
> > 
> > so.... what the verdict?
> 
> Gut feeling is that it fixes an issue with IVB TLB invalidate.
> (Not yet sure if the bug I was looking at was accidentally fixed at the
> same time as testing this.)
> So cc stable@
> -Chris
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre

You still want a separate patch?
Chris Wilson Feb. 11, 2014, 10:28 p.m. UTC | #11
On Tue, Feb 11, 2014 at 02:25:43PM -0800, Ben Widawsky wrote:
> On Tue, Feb 11, 2014 at 10:23:38PM +0000, Chris Wilson wrote:
> > Gut feeling is that it fixes an issue with IVB TLB invalidate.
> > (Not yet sure if the bug I was looking at was accidentally fixed at the
> > same time as testing this.)
> > So cc stable@

> You still want a separate patch?

Actually, bad news for me. The bug I thought had gone was merely
dormant. It reappeared, so I have no known issue that this fixes. :(

I still think we need to add the Pipe Control Flush Enable to the TLB
invalidate sequence, but it's no longer urgent.
-Chris
Ben Widawsky Feb. 11, 2014, 11:01 p.m. UTC | #12
On Tue, Feb 11, 2014 at 02:22:37PM -0800, Ben Widawsky wrote:
> On Tue, Feb 11, 2014 at 02:11:04PM -0800, Ben Widawsky wrote:
> > On Thu, Jan 30, 2014 at 02:38:17PM +0200, Ville Syrjälä wrote:
> > > On Wed, Jan 29, 2014 at 11:55:26AM -0800, Ben Widawsky wrote:
> > > > <snip>
> > > > @@ -120,7 +153,10 @@ struct  intel_ring_buffer {
> > > >  		/* our mbox written by others */
> > > >  		u32		mbox[I915_NUM_RINGS];
> > > 
> > > mbox should also get a u64 friend, right?
> > 
> > mbox should be gen6 only, given the change to using the gtt on gen8. In
> > this point in the series, semaphores should be forcibly disabled on
> > gen8, so the code looks wrong, but the path cannot [should not] be
> > taken.
> > 
> > I suppose I should kill the initialization of mbox for gen8, or somehow
> > consolidate with a union to prevent confusion.
> > 
> 
> Just to clarify it should be
> 
> gen6:
> signal uses signal_mbox for signal
> wait uses mbox
> 
> gen8:
> signal uses signal_ggtt for signal
> wait uses arithmetic to figure out the offset
> 

Ok, I've fixed this up to make things clearer, but it ends up in the
next patch. So look there when I repost.

> > > 
> > > >  		/* mboxes this ring signals to */
> > > > -		u32		signal_mbox[I915_NUM_RINGS];
> > > > +		union {
> > > > +			u32		signal_mbox[I915_NUM_RINGS];
> > > > +			u64		signal_ggtt[I915_NUM_RINGS];
> > > > +		};
> > > >  
> > > >  		/* num_dwords is space the caller will need for atomic update */
> > > >  		int		(*signal)(struct intel_ring_buffer *signaller,
> > > > -- 
> > > > 1.8.5.3
> > > > 
> > > > _______________________________________________
> > > > Intel-gfx mailing list
> > > > Intel-gfx@lists.freedesktop.org
> > > > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> > > 
> > > -- 
> > > Ville Syrjälä
> > > Intel OTC
> > 
> > -- 
> > Ben Widawsky, Intel Open Source Technology Center
> 
> -- 
> Ben Widawsky, Intel Open Source Technology Center
Ville Syrjälä Feb. 12, 2014, 9:29 a.m. UTC | #13
On Tue, Feb 11, 2014 at 03:01:31PM -0800, Ben Widawsky wrote:
> On Tue, Feb 11, 2014 at 02:22:37PM -0800, Ben Widawsky wrote:
> > On Tue, Feb 11, 2014 at 02:11:04PM -0800, Ben Widawsky wrote:
> > > On Thu, Jan 30, 2014 at 02:38:17PM +0200, Ville Syrjälä wrote:
> > > > On Wed, Jan 29, 2014 at 11:55:26AM -0800, Ben Widawsky wrote:
> > > > > Semaphore signalling works similarly to previous GENs with the exception
> > > > > that the per ring mailboxes no longer exist. Instead you must define
> > > > > your own space, somewhere in the GTT.
> > > > > 
> > > > > The comments in the code define the layout I've opted for, which should
> > > > > be fairly future proof. Ie. I tried to define offsets in abstract terms
> > > > > (NUM_RINGS, seqno size, etc).
> > > > > 
> > > > > NOTE: If one wanted to move this to the HWSP they could. I've decided
> > > > > one 4k object would be easier to deal with, and provide potential wins
> > > > > with cache locality, but that's all speculative.
> > > > > 
> > > > > v2: Update the macro to not need the other ring's ring->id (Chris)
> > > > > Update the comment to use the correct formula (Chris)
> > > > > 
> > > > > Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> > > > > ---
> > > > >  drivers/gpu/drm/i915/i915_drv.h         |   1 +
> > > > >  drivers/gpu/drm/i915/i915_reg.h         |   5 +-
> > > > >  drivers/gpu/drm/i915/intel_ringbuffer.c | 199 +++++++++++++++++++++++++-------
> > > > >  drivers/gpu/drm/i915/intel_ringbuffer.h |  38 +++++-
> > > > >  4 files changed, 197 insertions(+), 46 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > > > > index 3673ba1..f521059 100644
> > > > > --- a/drivers/gpu/drm/i915/i915_drv.h
> > > > > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > > > > @@ -1380,6 +1380,7 @@ typedef struct drm_i915_private {
> > > > >  
> > > > >  	struct pci_dev *bridge_dev;
> > > > >  	struct intel_ring_buffer ring[I915_NUM_RINGS];
> > > > > +	struct drm_i915_gem_object *semaphore_obj;
> > > > >  	uint32_t last_seqno, next_seqno;
> > > > >  
> > > > >  	drm_dma_handle_t *status_page_dmah;
> > > > > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> > > > > index cbbaf26..8b745dc 100644
> > > > > --- a/drivers/gpu/drm/i915/i915_reg.h
> > > > > +++ b/drivers/gpu/drm/i915/i915_reg.h
> > > > > @@ -216,7 +216,7 @@
> > > > >  #define   MI_DISPLAY_FLIP_IVB_SPRITE_B (3 << 19)
> > > > >  #define   MI_DISPLAY_FLIP_IVB_PLANE_C  (4 << 19)
> > > > >  #define   MI_DISPLAY_FLIP_IVB_SPRITE_C (5 << 19)
> > > > > -#define MI_SEMAPHORE_MBOX	MI_INSTR(0x16, 1) /* gen6+ */
> > > > > +#define MI_SEMAPHORE_MBOX	MI_INSTR(0x16, 1) /* gen6, gen7 */
> > > > >  #define   MI_SEMAPHORE_GLOBAL_GTT    (1<<22)
> > > > >  #define   MI_SEMAPHORE_UPDATE	    (1<<21)
> > > > >  #define   MI_SEMAPHORE_COMPARE	    (1<<20)
> > > > > @@ -241,6 +241,8 @@
> > > > >  #define   MI_RESTORE_EXT_STATE_EN	(1<<2)
> > > > >  #define   MI_FORCE_RESTORE		(1<<1)
> > > > >  #define   MI_RESTORE_INHIBIT		(1<<0)
> > > > > +#define MI_SEMAPHORE_SIGNAL	MI_INSTR(0x1b, 0) /* GEN8+ */
> > > > > +#define   MI_SEMAPHORE_TARGET(engine)	((engine)<<15)
> > > > >  #define MI_STORE_DWORD_IMM	MI_INSTR(0x20, 1)
> > > > >  #define   MI_MEM_VIRTUAL	(1 << 22) /* 965+ only */
> > > > >  #define MI_STORE_DWORD_INDEX	MI_INSTR(0x21, 1)
> > > > > @@ -329,6 +331,7 @@
> > > > >  #define   PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE		(1<<10) /* GM45+ only */
> > > > >  #define   PIPE_CONTROL_INDIRECT_STATE_DISABLE		(1<<9)
> > > > >  #define   PIPE_CONTROL_NOTIFY				(1<<8)
> > > > > +#define   PIPE_CONTROL_FLUSH_ENABLE			(1<<7) /* gen7+ */
> > > > >  #define   PIPE_CONTROL_VF_CACHE_INVALIDATE		(1<<4)
> > > > >  #define   PIPE_CONTROL_CONST_CACHE_INVALIDATE		(1<<3)
> > > > >  #define   PIPE_CONTROL_STATE_CACHE_INVALIDATE		(1<<2)
> > > > > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > > > index 37ae2b1..b750835 100644
> > > > > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > > > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > > > @@ -619,6 +619,13 @@ static int init_render_ring(struct intel_ring_buffer *ring)
> > > > >  static void render_ring_cleanup(struct intel_ring_buffer *ring)
> > > > >  {
> > > > >  	struct drm_device *dev = ring->dev;
> > > > > +	struct drm_i915_private *dev_priv = dev->dev_private;
> > > > > +
> > > > > +	if (dev_priv->semaphore_obj) {
> > > > > +		i915_gem_object_ggtt_unpin(dev_priv->semaphore_obj);
> > > > > +		drm_gem_object_unreference(&dev_priv->semaphore_obj->base);
> > > > > +		dev_priv->semaphore_obj = NULL;
> > > > > +	}
> > > > >  
> > > > >  	if (ring->scratch.obj == NULL)
> > > > >  		return;
> > > > > @@ -632,6 +639,86 @@ static void render_ring_cleanup(struct intel_ring_buffer *ring)
> > > > >  	ring->scratch.obj = NULL;
> > > > >  }
> > > > >  
> > > > > +static int gen8_rcs_signal(struct intel_ring_buffer *signaller,
> > > > > +			   unsigned int num_dwords)
> > > > > +{
> > > > > +#define MBOX_UPDATE_DWORDS 8
> > > > > +	struct drm_device *dev = signaller->dev;
> > > > > +	struct drm_i915_private *dev_priv = dev->dev_private;
> > > > > +	struct intel_ring_buffer *waiter;
> > > > > +	int i, ret, num_rings;
> > > > > +
> > > > > +	num_rings = hweight_long(INTEL_INFO(dev)->ring_mask);
> > > > > +	num_dwords = (num_rings-1) * MBOX_UPDATE_DWORDS;
> > > > 
> > > > Again num_dwords +=
> > > > 
> > > > > +#undef MBOX_UPDATE_DWORDS
> > > > > +
> > > > > +	/* XXX: + 4 for the caller */
> > > > > +	ret = intel_ring_begin(signaller, num_dwords + 4);
> > > > 
> > > > and the +4 goes away.
> > > > 
> > > > > +	if (ret)
> > > > > +		return ret;
> > > > > +
> > > > > +	for_each_ring(waiter, dev_priv, i) {
> > > > > +		u64 gtt_offset = signaller->semaphore.signal_ggtt[i];
> > > > > +		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
> > > > > +			continue;
> > > > > +
> > > > > +		intel_ring_emit(signaller, GFX_OP_PIPE_CONTROL(6));
> > > > > +		intel_ring_emit(signaller, PIPE_CONTROL_GLOBAL_GTT_IVB |
> > > > > +					   PIPE_CONTROL_QW_WRITE |
> > > > > +					   PIPE_CONTROL_FLUSH_ENABLE);
> > > > > +		intel_ring_emit(signaller, lower_32_bits(gtt_offset));
> > > > > +		intel_ring_emit(signaller, upper_32_bits(gtt_offset));
> > > > > +		intel_ring_emit(signaller, signaller->outstanding_lazy_seqno);
> > > > > +		intel_ring_emit(signaller, 0);
> > > > > +		intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
> > > > > +					   MI_SEMAPHORE_TARGET(waiter->id));
> > > > > +		intel_ring_emit(signaller, 0);
> > > > > +	}
> > > > > +
> > > > > +	WARN_ON(i != num_rings);
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > 
> > > > <snip>
> > > 
> > > Got those, thanks.
> > > 
> > > > 
> > > > > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > > > > index c69ae10..f1e7a66 100644
> > > > > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> > > > > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > > > > @@ -111,6 +111,39 @@ struct  intel_ring_buffer {
> > > > >  #define I915_DISPATCH_PINNED 0x2
> > > > >  	void		(*cleanup)(struct intel_ring_buffer *ring);
> > > > >  
> > > > > +	/* GEN8 signal/wait table
> > > > > +	 *	  signal to  signal to    signal to   signal to
> > > > > +	 *	    RCS         VCS          BCS        VECS
> > > > > +	 *      ------------------------------------------------------
> > > > > +	 *  RCS | NOP (0x00) | BCS (0x08) | VCS (0x10) | VECS (0x18) |
> > > > > +	 *	|-----------------------------------------------------
> > > > > +	 *  VCS | RCS (0x20) | NOP (0x28) | BCS (0x30) | VECS (0x38) |
> > > > > +	 *	|-----------------------------------------------------
> > > > > +	 *  BCS | RCS (0x40) | VCS (0x48) | NOP (0x50) | VECS (0x58) |
> > > > > +	 *	|-----------------------------------------------------
> > > > > +	 * VECS | RCS (0x60) | VCS (0x68) | BCS (0x70) |  NOP (0x78) |
> > > > > +	 *	|-----------------------------------------------------
> > > > > +	 *
> > > > > +	 * Generalization:
> > > > > +	 *  f(x, y) := (x->id * NUM_RINGS * seqno_size) + (seqno_size * y->id)
> > > > > +	 *  ie. transpose of g(x, y)
> > > > > +	 *
> > > > > +	 *	 sync from   sync from    sync from    sync from
> > > > > +	 *	    RCS         VCS          BCS        VECS
> > > > > +	 *      ------------------------------------------------------
> > > > > +	 *  RCS | NOP (0x00) | BCS (0x20) | VCS (0x40) | VECS (0x60) |
> > > > > +	 *	|-----------------------------------------------------
> > > > > +	 *  VCS | RCS (0x08) | NOP (0x28) | BCS (0x48) | VECS (0x68) |
> > > > > +	 *	|-----------------------------------------------------
> > > > > +	 *  BCS | RCS (0x10) | VCS (0x30) | NOP (0x50) | VECS (0x60) |
> > > > > +	 *	|-----------------------------------------------------
> > > > > +	 * VECS | RCS (0x18) | VCS (0x38) | BCS (0x58) |  NOP (0x78) |
> > > > > +	 *	|-----------------------------------------------------
> > > > > +	 *
> > > > > +	 * Generalization:
> > > > > +	 *  g(x, y) := (y->id * NUM_RINGS * seqno_size) + (seqno_size * x->id)
> > > > > +	 *  ie. transpose of f(x, y)
> > > > > +	 */
> > > > >  	struct {
> > > > >  		u32	sync_seqno[I915_NUM_RINGS-1];
> > > > >  		/* AKA wait() */
> > > > > @@ -120,7 +153,10 @@ struct  intel_ring_buffer {
> > > > >  		/* our mbox written by others */
> > > > >  		u32		mbox[I915_NUM_RINGS];
> > > > 
> > > > mbox should also get a u64 friend, right?
> > > 
> > > mbox should be gen6 only, given the change to using the gtt on gen8. At
> > > this point in the series, semaphores should be forcibly disabled on
> > > gen8, so the code looks wrong, but the path cannot [should not] be
> > > taken.
> > > 
> > > I suppose I should kill the initialization of mbox for gen8, or somehow
> > > consolidate with a union to prevent confusion.
> > > 
> > 
> > Just to clarify, it should be:
> > 
> > gen6:
> > signal uses signal_mbox for signal
> > wait uses mbox
> > 
> > gen8:
> > signal uses signal_ggtt for signal
> > wait uses arithmetic to figure out the offset
> > 
> 
> Ok, I've fixed this up to make things clearer, but it ends up in the
> next patch. So look there when I repost.

I was confused by these:
+ ring->semaphore.mbox[RCS] = GEN8_WAIT_OFFSET(ring, RCS);

So you did store the wait offset into mbox, but then you didn't use the
precomputed values and instead recomputed them on the spot in the gen8
ring_sync().
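
For reference, the arithmetic in question is just the g(x, y) formula from
the table comment in intel_ringbuffer.h, i.e. the same value that
GEN8_WAIT_OFFSET() precomputes into mbox[]. A minimal sketch of it, assuming
the 4k semaphore object layout from this patch (illustrative only, not code
from the posted series):

	/* GGTT address that ring 'waiter' polls for a seqno written by
	 * ring 'signaller':
	 *   base + signaller * NUM_RINGS * seqno_size + waiter * seqno_size
	 */
	static u64 gen8_wait_offset(struct drm_i915_private *dev_priv,
				    struct intel_ring_buffer *signaller,
				    struct intel_ring_buffer *waiter)
	{
		return i915_gem_obj_ggtt_offset(dev_priv->semaphore_obj) +
		       signaller->id * I915_NUM_RINGS * sizeof(uint64_t) +
		       waiter->id * sizeof(uint64_t);
	}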


> 
> > > > 
> > > > >  		/* mboxes this ring signals to */
> > > > > -		u32		signal_mbox[I915_NUM_RINGS];
> > > > > +		union {
> > > > > +			u32		signal_mbox[I915_NUM_RINGS];
> > > > > +			u64		signal_ggtt[I915_NUM_RINGS];
> > > > > +		};
> > > > >  
> > > > >  		/* num_dwords is space the caller will need for atomic update */
> > > > >  		int		(*signal)(struct intel_ring_buffer *signaller,
> > > > > -- 
> > > > > 1.8.5.3
> > > > > 
> > > > > _______________________________________________
> > > > > Intel-gfx mailing list
> > > > > Intel-gfx@lists.freedesktop.org
> > > > > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> > > > 
> > > > -- 
> > > > Ville Syrjälä
> > > > Intel OTC
> > > 
> > > -- 
> > > Ben Widawsky, Intel Open Source Technology Center
> > 
> > -- 
> > Ben Widawsky, Intel Open Source Technology Center
> 
> -- 
> Ben Widawsky, Intel Open Source Technology Center
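
For completeness, the ring-space accounting fixup requested above for
gen8_rcs_signal()/gen8_xcs_signal() amounts to accumulating into the
caller-supplied dword count rather than overwriting it; a sketch of the
intended shape (not the reposted code):

	num_dwords += (num_rings - 1) * MBOX_UPDATE_DWORDS;

	/* the caller's dwords are already included, so no extra + 4 */
	ret = intel_ring_begin(signaller, num_dwords);
	if (ret)
		return ret;
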
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 3673ba1..f521059 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1380,6 +1380,7 @@  typedef struct drm_i915_private {
 
 	struct pci_dev *bridge_dev;
 	struct intel_ring_buffer ring[I915_NUM_RINGS];
+	struct drm_i915_gem_object *semaphore_obj;
 	uint32_t last_seqno, next_seqno;
 
 	drm_dma_handle_t *status_page_dmah;
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index cbbaf26..8b745dc 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -216,7 +216,7 @@ 
 #define   MI_DISPLAY_FLIP_IVB_SPRITE_B (3 << 19)
 #define   MI_DISPLAY_FLIP_IVB_PLANE_C  (4 << 19)
 #define   MI_DISPLAY_FLIP_IVB_SPRITE_C (5 << 19)
-#define MI_SEMAPHORE_MBOX	MI_INSTR(0x16, 1) /* gen6+ */
+#define MI_SEMAPHORE_MBOX	MI_INSTR(0x16, 1) /* gen6, gen7 */
 #define   MI_SEMAPHORE_GLOBAL_GTT    (1<<22)
 #define   MI_SEMAPHORE_UPDATE	    (1<<21)
 #define   MI_SEMAPHORE_COMPARE	    (1<<20)
@@ -241,6 +241,8 @@ 
 #define   MI_RESTORE_EXT_STATE_EN	(1<<2)
 #define   MI_FORCE_RESTORE		(1<<1)
 #define   MI_RESTORE_INHIBIT		(1<<0)
+#define MI_SEMAPHORE_SIGNAL	MI_INSTR(0x1b, 0) /* GEN8+ */
+#define   MI_SEMAPHORE_TARGET(engine)	((engine)<<15)
 #define MI_STORE_DWORD_IMM	MI_INSTR(0x20, 1)
 #define   MI_MEM_VIRTUAL	(1 << 22) /* 965+ only */
 #define MI_STORE_DWORD_INDEX	MI_INSTR(0x21, 1)
@@ -329,6 +331,7 @@ 
 #define   PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE		(1<<10) /* GM45+ only */
 #define   PIPE_CONTROL_INDIRECT_STATE_DISABLE		(1<<9)
 #define   PIPE_CONTROL_NOTIFY				(1<<8)
+#define   PIPE_CONTROL_FLUSH_ENABLE			(1<<7) /* gen7+ */
 #define   PIPE_CONTROL_VF_CACHE_INVALIDATE		(1<<4)
 #define   PIPE_CONTROL_CONST_CACHE_INVALIDATE		(1<<3)
 #define   PIPE_CONTROL_STATE_CACHE_INVALIDATE		(1<<2)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 37ae2b1..b750835 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -619,6 +619,13 @@  static int init_render_ring(struct intel_ring_buffer *ring)
 static void render_ring_cleanup(struct intel_ring_buffer *ring)
 {
 	struct drm_device *dev = ring->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	if (dev_priv->semaphore_obj) {
+		i915_gem_object_ggtt_unpin(dev_priv->semaphore_obj);
+		drm_gem_object_unreference(&dev_priv->semaphore_obj->base);
+		dev_priv->semaphore_obj = NULL;
+	}
 
 	if (ring->scratch.obj == NULL)
 		return;
@@ -632,6 +639,86 @@  static void render_ring_cleanup(struct intel_ring_buffer *ring)
 	ring->scratch.obj = NULL;
 }
 
+static int gen8_rcs_signal(struct intel_ring_buffer *signaller,
+			   unsigned int num_dwords)
+{
+#define MBOX_UPDATE_DWORDS 8
+	struct drm_device *dev = signaller->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_ring_buffer *waiter;
+	int i, ret, num_rings;
+
+	num_rings = hweight_long(INTEL_INFO(dev)->ring_mask);
+	num_dwords = (num_rings-1) * MBOX_UPDATE_DWORDS;
+#undef MBOX_UPDATE_DWORDS
+
+	/* XXX: + 4 for the caller */
+	ret = intel_ring_begin(signaller, num_dwords + 4);
+	if (ret)
+		return ret;
+
+	for_each_ring(waiter, dev_priv, i) {
+		u64 gtt_offset = signaller->semaphore.signal_ggtt[i];
+		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
+			continue;
+
+		intel_ring_emit(signaller, GFX_OP_PIPE_CONTROL(6));
+		intel_ring_emit(signaller, PIPE_CONTROL_GLOBAL_GTT_IVB |
+					   PIPE_CONTROL_QW_WRITE |
+					   PIPE_CONTROL_FLUSH_ENABLE);
+		intel_ring_emit(signaller, lower_32_bits(gtt_offset));
+		intel_ring_emit(signaller, upper_32_bits(gtt_offset));
+		intel_ring_emit(signaller, signaller->outstanding_lazy_seqno);
+		intel_ring_emit(signaller, 0);
+		intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
+					   MI_SEMAPHORE_TARGET(waiter->id));
+		intel_ring_emit(signaller, 0);
+	}
+
+	WARN_ON(i != num_rings);
+
+	return 0;
+}
+
+static int gen8_xcs_signal(struct intel_ring_buffer *signaller,
+			   unsigned int num_dwords)
+{
+#define MBOX_UPDATE_DWORDS 6
+	struct drm_device *dev = signaller->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_ring_buffer *waiter;
+	int i, ret, num_rings;
+
+	num_rings = hweight_long(INTEL_INFO(dev)->ring_mask);
+	num_dwords = (num_rings-1) * MBOX_UPDATE_DWORDS;
+#undef MBOX_UPDATE_DWORDS
+
+	/* XXX: + 4 for the caller */
+	ret = intel_ring_begin(signaller, num_dwords + 4);
+	if (ret)
+		return ret;
+
+	for_each_ring(waiter, dev_priv, i) {
+		u64 gtt_offset = signaller->semaphore.signal_ggtt[i];
+		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
+			continue;
+
+		intel_ring_emit(signaller, (MI_FLUSH_DW + 1) |
+					   MI_FLUSH_DW_OP_STOREDW);
+		intel_ring_emit(signaller, lower_32_bits(gtt_offset) |
+					   MI_FLUSH_DW_USE_GTT);
+		intel_ring_emit(signaller, upper_32_bits(gtt_offset));
+		intel_ring_emit(signaller, signaller->outstanding_lazy_seqno);
+		intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
+					   MI_SEMAPHORE_TARGET(waiter->id));
+		intel_ring_emit(signaller, 0);
+	}
+
+	WARN_ON(i != num_rings);
+
+	return 0;
+}
+
 static int gen6_signal(struct intel_ring_buffer *signaller,
 		       unsigned int num_dwords)
 {
@@ -1852,16 +1939,67 @@  static int gen6_ring_flush(struct intel_ring_buffer *ring,
 	return 0;
 }
 
+/* seqno size is actually only a uint32, but since we plan to use MI_FLUSH_DW to
+ * do the writes, and that must have qw aligned offsets, simply pretend it's 8b.
+ */
+#define SEQNO_SIZE sizeof(uint64_t)
+#define GEN8_SIGNAL_OFFSET(to) \
+	(i915_gem_obj_ggtt_offset(dev_priv->semaphore_obj) + \
+	(ring->id * I915_NUM_RINGS * SEQNO_SIZE) + \
+	(SEQNO_SIZE * (to)))
+
+#define GEN8_WAIT_OFFSET(from) \
+	(i915_gem_obj_ggtt_offset(dev_priv->semaphore_obj) + \
+	((from) * I915_NUM_RINGS * SEQNO_SIZE) + \
+	(SEQNO_SIZE * ring->id))
+
+#define GEN8_RING_SEMAPHORE_INIT do { \
+	if (!dev_priv->semaphore_obj) { \
+		break; \
+	} \
+	ring->semaphore.signal_ggtt[RCS] = GEN8_SIGNAL_OFFSET(RCS); \
+	ring->semaphore.signal_ggtt[VCS] = GEN8_SIGNAL_OFFSET(VCS); \
+	ring->semaphore.signal_ggtt[BCS] = GEN8_SIGNAL_OFFSET(BCS); \
+	ring->semaphore.signal_ggtt[VECS] = GEN8_SIGNAL_OFFSET(VECS); \
+	ring->semaphore.mbox[RCS] = GEN8_WAIT_OFFSET(RCS); \
+	ring->semaphore.mbox[VCS] = GEN8_WAIT_OFFSET(VCS); \
+	ring->semaphore.mbox[BCS] = GEN8_WAIT_OFFSET(BCS); \
+	ring->semaphore.mbox[VECS] = GEN8_WAIT_OFFSET(VECS); \
+	ring->semaphore.signal_ggtt[ring->id] = MI_SEMAPHORE_SYNC_INVALID; \
+	ring->semaphore.mbox[ring->id] = GEN6_NOSYNC; \
+	} while(0)
+#undef seqno_size
+
+
+
 int intel_init_render_ring_buffer(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct intel_ring_buffer *ring = &dev_priv->ring[RCS];
+	struct drm_i915_gem_object *obj;
+	int ret;
 
 	ring->name = "render ring";
 	ring->id = RCS;
 	ring->mmio_base = RENDER_RING_BASE;
 
 	if (INTEL_INFO(dev)->gen >= 8) {
+		if (i915_semaphore_is_enabled(dev)) {
+			obj = i915_gem_alloc_object(dev, 4096);
+			if (obj == NULL) {
+				DRM_ERROR("Failed to allocate semaphore bo. Disabling semaphores\n");
+				i915.semaphores = 0;
+			} else {
+				i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
+				ret = i915_gem_obj_ggtt_pin(obj, 0, false, true);
+				if (ret != 0) {
+					drm_gem_object_unreference(&obj->base);
+					DRM_ERROR("Failed to pin semaphore bo. Disabling semaphores\n");
+					i915.semaphores = 0;
+				} else
+					dev_priv->semaphore_obj = obj;
+			}
+		}
 		ring->add_request = gen6_add_request;
 		ring->flush = gen8_render_ring_flush;
 		ring->irq_get = gen8_ring_get_irq;
@@ -1870,17 +2008,11 @@  int intel_init_render_ring_buffer(struct drm_device *dev)
 		ring->get_seqno = gen6_ring_get_seqno;
 		ring->set_seqno = ring_set_seqno;
 		ring->semaphore.sync_to = gen6_ring_sync;
-		if (i915_semaphore_is_enabled(dev))
-			ring->semaphore.signal = gen6_signal;
-		ring->semaphore.signal = gen6_signal;
-		ring->semaphore.mbox[RCS] = MI_SEMAPHORE_SYNC_INVALID;
-		ring->semaphore.mbox[VCS] = MI_SEMAPHORE_SYNC_INVALID;
-		ring->semaphore.mbox[BCS] = MI_SEMAPHORE_SYNC_INVALID;
-		ring->semaphore.mbox[VECS] = MI_SEMAPHORE_SYNC_INVALID;
-		ring->semaphore.signal_mbox[RCS] = GEN6_NOSYNC;
-		ring->semaphore.signal_mbox[VCS] = GEN6_NOSYNC;
-		ring->semaphore.signal_mbox[BCS] = GEN6_NOSYNC;
-		ring->semaphore.signal_mbox[VECS] = GEN6_NOSYNC;
+		if (i915_semaphore_is_enabled(dev)) {
+			BUG_ON(!dev_priv->semaphore_obj);
+			ring->semaphore.signal = gen8_rcs_signal;
+			GEN8_RING_SEMAPHORE_INIT;
+		}
 	} else if (INTEL_INFO(dev)->gen >= 6) {
 		ring->add_request = gen6_add_request;
 		ring->flush = gen7_render_ring_flush;
@@ -1947,9 +2079,6 @@  int intel_init_render_ring_buffer(struct drm_device *dev)
 
 	/* Workaround batchbuffer to combat CS tlb bug. */
 	if (HAS_BROKEN_CS_TLB(dev)) {
-		struct drm_i915_gem_object *obj;
-		int ret;
-
 		obj = i915_gem_alloc_object(dev, I830_BATCH_LIMIT);
 		if (obj == NULL) {
 			DRM_ERROR("Failed to allocate batch bo\n");
@@ -2064,16 +2193,10 @@  int intel_init_bsd_ring_buffer(struct drm_device *dev)
 			ring->dispatch_execbuffer =
 				gen8_ring_dispatch_execbuffer;
 			ring->semaphore.sync_to = gen6_ring_sync;
-			if (i915_semaphore_is_enabled(dev))
-				ring->semaphore.signal = gen6_signal;
-			ring->semaphore.mbox[RCS] = MI_SEMAPHORE_SYNC_INVALID;
-			ring->semaphore.mbox[VCS] = MI_SEMAPHORE_SYNC_INVALID;
-			ring->semaphore.mbox[BCS] = MI_SEMAPHORE_SYNC_INVALID;
-			ring->semaphore.mbox[VECS] = MI_SEMAPHORE_SYNC_INVALID;
-			ring->semaphore.signal_mbox[RCS] = GEN6_NOSYNC;
-			ring->semaphore.signal_mbox[VCS] = GEN6_NOSYNC;
-			ring->semaphore.signal_mbox[BCS] = GEN6_NOSYNC;
-			ring->semaphore.signal_mbox[VECS] = GEN6_NOSYNC;
+			if (i915_semaphore_is_enabled(dev)) {
+				ring->semaphore.signal = gen8_xcs_signal;
+				GEN8_RING_SEMAPHORE_INIT;
+			}
 		} else {
 			ring->irq_enable_mask = GT_BSD_USER_INTERRUPT;
 			ring->irq_get = gen6_ring_get_irq;
@@ -2135,16 +2258,10 @@  int intel_init_blt_ring_buffer(struct drm_device *dev)
 		ring->irq_put = gen8_ring_put_irq;
 		ring->dispatch_execbuffer = gen8_ring_dispatch_execbuffer;
 		ring->semaphore.sync_to = gen6_ring_sync;
-		if (i915_semaphore_is_enabled(dev))
-			ring->semaphore.signal = gen6_signal;
-		ring->semaphore.mbox[RCS] = MI_SEMAPHORE_SYNC_INVALID;
-		ring->semaphore.mbox[VCS] = MI_SEMAPHORE_SYNC_INVALID;
-		ring->semaphore.mbox[BCS] = MI_SEMAPHORE_SYNC_INVALID;
-		ring->semaphore.mbox[VECS] = MI_SEMAPHORE_SYNC_INVALID;
-		ring->semaphore.signal_mbox[RCS] = GEN6_NOSYNC;
-		ring->semaphore.signal_mbox[VCS] = GEN6_NOSYNC;
-		ring->semaphore.signal_mbox[BCS] = GEN6_NOSYNC;
-		ring->semaphore.signal_mbox[VECS] = GEN6_NOSYNC;
+		if (i915_semaphore_is_enabled(dev)) {
+			ring->semaphore.signal = gen8_xcs_signal;
+			GEN8_RING_SEMAPHORE_INIT;
+		}
 	} else {
 		ring->irq_enable_mask = GT_BLT_USER_INTERRUPT;
 		ring->irq_get = gen6_ring_get_irq;
@@ -2190,16 +2307,10 @@  int intel_init_vebox_ring_buffer(struct drm_device *dev)
 		ring->irq_put = gen8_ring_put_irq;
 		ring->dispatch_execbuffer = gen8_ring_dispatch_execbuffer;
 		ring->semaphore.sync_to = gen6_ring_sync;
-		if (i915_semaphore_is_enabled(dev))
-			ring->semaphore.signal = gen6_signal;
-		ring->semaphore.mbox[RCS] = MI_SEMAPHORE_SYNC_INVALID;
-		ring->semaphore.mbox[VCS] = MI_SEMAPHORE_SYNC_INVALID;
-		ring->semaphore.mbox[BCS] = MI_SEMAPHORE_SYNC_INVALID;
-		ring->semaphore.mbox[VECS] = MI_SEMAPHORE_SYNC_INVALID;
-		ring->semaphore.signal_mbox[RCS] = GEN6_NOSYNC;
-		ring->semaphore.signal_mbox[VCS] = GEN6_NOSYNC;
-		ring->semaphore.signal_mbox[BCS] = GEN6_NOSYNC;
-		ring->semaphore.signal_mbox[VECS] = GEN6_NOSYNC;
+		if (i915_semaphore_is_enabled(dev)) {
+			ring->semaphore.signal = gen8_xcs_signal;
+			GEN8_RING_SEMAPHORE_INIT;
+		}
 	} else {
 		ring->irq_enable_mask = PM_VEBOX_USER_INTERRUPT;
 		ring->irq_get = hsw_vebox_get_irq;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index c69ae10..f1e7a66 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -111,6 +111,39 @@  struct  intel_ring_buffer {
 #define I915_DISPATCH_PINNED 0x2
 	void		(*cleanup)(struct intel_ring_buffer *ring);
 
+	/* GEN8 signal/wait table
+	 *	  signal to  signal to    signal to   signal to
+	 *	    RCS         VCS          BCS        VECS
+	 *      ------------------------------------------------------
+	 *  RCS | NOP (0x00) | BCS (0x08) | VCS (0x10) | VECS (0x18) |
+	 *	|-----------------------------------------------------
+	 *  VCS | RCS (0x20) | NOP (0x28) | BCS (0x30) | VECS (0x38) |
+	 *	|-----------------------------------------------------
+	 *  BCS | RCS (0x40) | VCS (0x48) | NOP (0x50) | VECS (0x58) |
+	 *	|-----------------------------------------------------
+	 * VECS | RCS (0x60) | VCS (0x68) | BCS (0x70) |  NOP (0x78) |
+	 *	|-----------------------------------------------------
+	 *
+	 * Generalization:
+	 *  f(x, y) := (x->id * NUM_RINGS * seqno_size) + (seqno_size * y->id)
+	 *  ie. transpose of g(x, y)
+	 *
+	 *	 sync from   sync from    sync from    sync from
+	 *	    RCS         VCS          BCS        VECS
+	 *      ------------------------------------------------------
+	 *  RCS | NOP (0x00) | BCS (0x20) | VCS (0x40) | VECS (0x60) |
+	 *	|-----------------------------------------------------
+	 *  VCS | RCS (0x08) | NOP (0x28) | BCS (0x48) | VECS (0x68) |
+	 *	|-----------------------------------------------------
+	 *  BCS | RCS (0x10) | VCS (0x30) | NOP (0x50) | VECS (0x60) |
+	 *	|-----------------------------------------------------
+	 * VECS | RCS (0x18) | VCS (0x38) | BCS (0x58) |  NOP (0x78) |
+	 *	|-----------------------------------------------------
+	 *
+	 * Generalization:
+	 *  g(x, y) := (y->id * NUM_RINGS * seqno_size) + (seqno_size * x->id)
+	 *  ie. transpose of f(x, y)
+	 */
 	struct {
 		u32	sync_seqno[I915_NUM_RINGS-1];
 		/* AKA wait() */
@@ -120,7 +153,10 @@  struct  intel_ring_buffer {
 		/* our mbox written by others */
 		u32		mbox[I915_NUM_RINGS];
 		/* mboxes this ring signals to */
-		u32		signal_mbox[I915_NUM_RINGS];
+		union {
+			u32		signal_mbox[I915_NUM_RINGS];
+			u64		signal_ggtt[I915_NUM_RINGS];
+		};
 
 		/* num_dwords is space the caller will need for atomic update */
 		int		(*signal)(struct intel_ring_buffer *signaller,
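
As a worked example of the signal/wait table comment above (seqno_size = 8,
NUM_RINGS = 4, using the driver's ring ids RCS=0, VCS=1, BCS=2, VECS=3):
when VCS signals VECS, the signal side writes its seqno into the semaphore
object at f(VCS, VECS) = 1*4*8 + 8*3 = 0x38, and the wait side polls
g(VECS, VCS) = 1*4*8 + 8*3 = 0x38, i.e. the very same qword. The two tables
describe the same 4k object from the signaller's and the waiter's points of
view.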