diff mbox

[26/53] drm/i915/bdw: New logical ring submission mechanism

Message ID 1402673891-14618-27-git-send-email-oscar.mateo@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

oscar.mateo@intel.com June 13, 2014, 3:37 p.m. UTC
From: Oscar Mateo <oscar.mateo@intel.com>

Well, new-ish: if all this code looks familiar, that's because it's
a clone of the existing submission mechanism (with some modifications
here and there to adapt it to LRCs and Execlists).

And why did we do this? Execlists offer several advantages, like
control over when the GPU is done with a given workload, that can
help simplify the submission mechanism, no doubt, but I am interested
in getting Execlists to work first and foremost. As we are creating
a parallel submission mechanism (even if it's just a clone), we can
now start improving it without the fear of breaking old gens.

Signed-off-by: Oscar Mateo <oscar.mateo@intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c | 214 +++++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_lrc.h |  18 ++++
 2 files changed, 232 insertions(+)

Comments

bradley.d.volkin@intel.com June 20, 2014, 9 p.m. UTC | #1
On Fri, Jun 13, 2014 at 08:37:44AM -0700, oscar.mateo@intel.com wrote:
> From: Oscar Mateo <oscar.mateo@intel.com>
> 
> Well, new-ish: if all this code looks familiar, that's because it's
> a clone of the existing submission mechanism (with some modifications
> here and there to adapt it to LRCs and Execlists).
> 
> And why did we do this? Execlists offer several advantages, like
> control over when the GPU is done with a given workload, that can
> help simplify the submission mechanism, no doubt, but I am interested
> in getting Execlists to work first and foremost. As we are creating
> a parallel submission mechanism (even if it's just a clone), we can
> now start improving it without the fear of breaking old gens.
> 
> Signed-off-by: Oscar Mateo <oscar.mateo@intel.com>
> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 214 +++++++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_lrc.h |  18 ++++
>  2 files changed, 232 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 02fc3d0..89aed7a 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -86,6 +86,220 @@ bool intel_enable_execlists(struct drm_device *dev)
>  	return HAS_LOGICAL_RING_CONTEXTS(dev) && USES_PPGTT(dev);
>  }
>  
> +static inline struct intel_ringbuffer *
> +logical_ringbuf_get(struct intel_engine_cs *ring, struct intel_context *ctx)
> +{
> +	return ctx->engine[ring->id].ringbuf;
> +}
> +
> +void intel_logical_ring_advance_and_submit(struct intel_engine_cs *ring,
> +					   struct intel_context *ctx)
> +{
> +	struct intel_ringbuffer *ringbuf = logical_ringbuf_get(ring, ctx);
> +
> +	intel_logical_ring_advance(ringbuf);
> +
> +	if (intel_ring_stopped(ring))
> +		return;
> +
> +	ring->submit_ctx(ring, ctx, ringbuf->tail);
> +}
> +
> +static int logical_ring_alloc_seqno(struct intel_engine_cs *ring,
> +				    struct intel_context *ctx)
> +{
> +	if (ring->outstanding_lazy_seqno)
> +		return 0;
> +
> +	if (ring->preallocated_lazy_request == NULL) {
> +		struct drm_i915_gem_request *request;
> +
> +		request = kmalloc(sizeof(*request), GFP_KERNEL);
> +		if (request == NULL)
> +			return -ENOMEM;
> +
> +		ring->preallocated_lazy_request = request;
> +	}
> +
> +	return i915_gem_get_seqno(ring->dev, &ring->outstanding_lazy_seqno);
> +}
> +
> +static int logical_ring_wait_request(struct intel_engine_cs *ring,
> +				     struct intel_ringbuffer *ringbuf,
> +				     struct intel_context *ctx,
> +				     int bytes)
> +{
> +	struct drm_i915_gem_request *request;
> +	u32 seqno = 0;
> +	int ret;
> +
> +	if (ringbuf->last_retired_head != -1) {
> +		ringbuf->head = ringbuf->last_retired_head;
> +		ringbuf->last_retired_head = -1;
> +
> +		ringbuf->space = intel_ring_space(ringbuf);
> +		if (ringbuf->space >= bytes)
> +			return 0;
> +	}
> +
> +	list_for_each_entry(request, &ring->request_list, list) {
> +		if (__intel_ring_space(request->tail, ringbuf->tail,
> +				ringbuf->size) >= bytes) {
> +			seqno = request->seqno;
> +			break;
> +		}
> +	}
> +
> +	if (seqno == 0)
> +		return -ENOSPC;
> +
> +	ret = i915_wait_seqno(ring, seqno);
> +	if (ret)
> +		return ret;
> +
> +	/* TODO: make sure we update the right ringbuffer's last_retired_head
> +	 * when retiring requests */
> +	i915_gem_retire_requests_ring(ring);
> +	ringbuf->head = ringbuf->last_retired_head;
> +	ringbuf->last_retired_head = -1;
> +
> +	ringbuf->space = intel_ring_space(ringbuf);
> +	return 0;
> +}
> +
> +static int logical_ring_wait_for_space(struct intel_engine_cs *ring,
> +						   struct intel_ringbuffer *ringbuf,
> +						   struct intel_context *ctx,
> +						   int bytes)
> +{
> +	struct drm_device *dev = ring->dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	unsigned long end;
> +	int ret;
> +
> +	ret = logical_ring_wait_request(ring, ringbuf, ctx, bytes);
> +	if (ret != -ENOSPC)
> +		return ret;
> +
> +	/* Force the context submission in case we have been skipping it */
> +	intel_logical_ring_advance_and_submit(ring, ctx);
> +
> +	/* With GEM the hangcheck timer should kick us out of the loop,
> +	 * leaving it early runs the risk of corrupting GEM state (due
> +	 * to running on almost untested codepaths). But on resume
> +	 * timers don't work yet, so prevent a complete hang in that
> +	 * case by choosing an insanely large timeout. */
> +	end = jiffies + 60 * HZ;
> +

In the legacy ringbuffer version, there are tracepoints around the do loop.
Should we keep those? Or add lrc specific equivalents?

> +	do {
> +		ringbuf->head = I915_READ_HEAD(ring);
> +		ringbuf->space = intel_ring_space(ringbuf);
> +		if (ringbuf->space >= bytes) {
> +			ret = 0;
> +			break;
> +		}
> +
> +		if (!drm_core_check_feature(dev, DRIVER_MODESET) &&
> +		    dev->primary->master) {
> +			struct drm_i915_master_private *master_priv = dev->primary->master->driver_priv;
> +			if (master_priv->sarea_priv)
> +				master_priv->sarea_priv->perf_boxes |= I915_BOX_WAIT;
> +		}
> +
> +		msleep(1);
> +
> +		if (dev_priv->mm.interruptible && signal_pending(current)) {
> +			ret = -ERESTARTSYS;
> +			break;
> +		}
> +
> +		ret = i915_gem_check_wedge(&dev_priv->gpu_error,
> +					   dev_priv->mm.interruptible);
> +		if (ret)
> +			break;
> +
> +		if (time_after(jiffies, end)) {
> +			ret = -EBUSY;
> +			break;
> +		}
> +	} while (1);
> +
> +	return ret;
> +}
> +
> +static int logical_ring_wrap_buffer(struct intel_engine_cs *ring,
> +						struct intel_ringbuffer *ringbuf,
> +						struct intel_context *ctx)
> +{
> +	uint32_t __iomem *virt;
> +	int rem = ringbuf->size - ringbuf->tail;
> +
> +	if (ringbuf->space < rem) {
> +		int ret = logical_ring_wait_for_space(ring, ringbuf, ctx, rem);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	virt = ringbuf->virtual_start + ringbuf->tail;
> +	rem /= 4;
> +	while (rem--)
> +		iowrite32(MI_NOOP, virt++);
> +
> +	ringbuf->tail = 0;
> +	ringbuf->space = intel_ring_space(ringbuf);
> +
> +	return 0;
> +}
> +
> +static int logical_ring_prepare(struct intel_engine_cs *ring,
> +				struct intel_ringbuffer *ringbuf,
> +				struct intel_context *ctx,
> +				int bytes)
> +{
> +	int ret;
> +
> +	if (unlikely(ringbuf->tail + bytes > ringbuf->effective_size)) {
> +		ret = logical_ring_wrap_buffer(ring, ringbuf, ctx);
> +		if (unlikely(ret))
> +			return ret;
> +	}
> +
> +	if (unlikely(ringbuf->space < bytes)) {
> +		ret = logical_ring_wait_for_space(ring, ringbuf, ctx, bytes);
> +		if (unlikely(ret))
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +int intel_logical_ring_begin(struct intel_engine_cs *ring,
> +			     struct intel_context *ctx,
> +			     int num_dwords)
> +{
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +	struct intel_ringbuffer *ringbuf = logical_ringbuf_get(ring, ctx);
> +	int ret;
> +
> +	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
> +				   dev_priv->mm.interruptible);
> +	if (ret)
> +		return ret;
> +
> +	ret = logical_ring_prepare(ring, ringbuf, ctx,
> +			num_dwords * sizeof(uint32_t));
> +	if (ret)
> +		return ret;
> +
> +	/* Preallocate the olr before touching the ring */
> +	ret = logical_ring_alloc_seqno(ring, ctx);
> +	if (ret)
> +		return ret;
> +
> +	ringbuf->space -= num_dwords * sizeof(uint32_t);
> +	return 0;
> +}
> +
>  static int gen8_init_common_ring(struct intel_engine_cs *ring)
>  {
>  	struct drm_device *dev = ring->dev;
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index 26b0949..686ebf5 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -5,6 +5,24 @@
>  void intel_logical_ring_cleanup(struct intel_engine_cs *ring);
>  int intel_logical_rings_init(struct drm_device *dev);
>  
> +void intel_logical_ring_advance_and_submit(struct intel_engine_cs *ring,
> +					   struct intel_context *ctx);
> +
> +static inline void intel_logical_ring_advance(struct intel_ringbuffer *ringbuf)
> +{
> +	ringbuf->tail &= ringbuf->size - 1;
> +}
> +
> +static inline void intel_logical_ring_emit(struct intel_ringbuffer *ringbuf, u32 data)
> +{
> +	iowrite32(data, ringbuf->virtual_start + ringbuf->tail);
> +	ringbuf->tail += 4;
> +}
> +
> +int intel_logical_ring_begin(struct intel_engine_cs *ring,
> +			     struct intel_context *ctx,
> +			     int num_dwords);
> +

I think all of these are only used in intel_lrc.c, so don't need to be in
the header and could all be static. Right?

Brad

>  /* Logical Ring Contexts */
>  void intel_lr_context_free(struct intel_context *ctx);
>  int intel_lr_context_deferred_create(struct intel_context *ctx,
> -- 
> 1.9.0
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
oscar.mateo@intel.com June 23, 2014, 1:09 p.m. UTC | #2
> -----Original Message-----

> From: Volkin, Bradley D

> Sent: Friday, June 20, 2014 10:01 PM

> To: Mateo Lozano, Oscar

> Cc: intel-gfx@lists.freedesktop.org

> Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical ring

> submission mechanism

> 

> On Fri, Jun 13, 2014 at 08:37:44AM -0700, oscar.mateo@intel.com wrote:

> > From: Oscar Mateo <oscar.mateo@intel.com>

> >

> > Well, new-ish: if all this code looks familiar, that's because it's a

> > clone of the existing submission mechanism (with some modifications

> > here and there to adapt it to LRCs and Execlists).

> >

> > And why did we do this? Execlists offer several advantages, like

> > control over when the GPU is done with a given workload, that can help

> > simplify the submission mechanism, no doubt, but I am interested in

> > getting Execlists to work first and foremost. As we are creating a

> > parallel submission mechanism (even if it's just a clone), we can now

> > start improving it without the fear of breaking old gens.

> >

> > Signed-off-by: Oscar Mateo <oscar.mateo@intel.com>

> > ---

> >  drivers/gpu/drm/i915/intel_lrc.c | 214

> > +++++++++++++++++++++++++++++++++++++++

> >  drivers/gpu/drm/i915/intel_lrc.h |  18 ++++

> >  2 files changed, 232 insertions(+)

> >

> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c

> > b/drivers/gpu/drm/i915/intel_lrc.c

> > index 02fc3d0..89aed7a 100644

> > --- a/drivers/gpu/drm/i915/intel_lrc.c

> > +++ b/drivers/gpu/drm/i915/intel_lrc.c

> > @@ -86,6 +86,220 @@ bool intel_enable_execlists(struct drm_device

> *dev)

> >  	return HAS_LOGICAL_RING_CONTEXTS(dev) && USES_PPGTT(dev);  }

> >

> > +static inline struct intel_ringbuffer * logical_ringbuf_get(struct

> > +intel_engine_cs *ring, struct intel_context *ctx) {

> > +	return ctx->engine[ring->id].ringbuf; }

> > +

> > +void intel_logical_ring_advance_and_submit(struct intel_engine_cs *ring,

> > +					   struct intel_context *ctx)

> > +{

> > +	struct intel_ringbuffer *ringbuf = logical_ringbuf_get(ring, ctx);

> > +

> > +	intel_logical_ring_advance(ringbuf);

> > +

> > +	if (intel_ring_stopped(ring))

> > +		return;

> > +

> > +	ring->submit_ctx(ring, ctx, ringbuf->tail); }

> > +

> > +static int logical_ring_alloc_seqno(struct intel_engine_cs *ring,

> > +				    struct intel_context *ctx)

> > +{

> > +	if (ring->outstanding_lazy_seqno)

> > +		return 0;

> > +

> > +	if (ring->preallocated_lazy_request == NULL) {

> > +		struct drm_i915_gem_request *request;

> > +

> > +		request = kmalloc(sizeof(*request), GFP_KERNEL);

> > +		if (request == NULL)

> > +			return -ENOMEM;

> > +

> > +		ring->preallocated_lazy_request = request;

> > +	}

> > +

> > +	return i915_gem_get_seqno(ring->dev, &ring-

> >outstanding_lazy_seqno);

> > +}

> > +

> > +static int logical_ring_wait_request(struct intel_engine_cs *ring,

> > +				     struct intel_ringbuffer *ringbuf,

> > +				     struct intel_context *ctx,

> > +				     int bytes)

> > +{

> > +	struct drm_i915_gem_request *request;

> > +	u32 seqno = 0;

> > +	int ret;

> > +

> > +	if (ringbuf->last_retired_head != -1) {

> > +		ringbuf->head = ringbuf->last_retired_head;

> > +		ringbuf->last_retired_head = -1;

> > +

> > +		ringbuf->space = intel_ring_space(ringbuf);

> > +		if (ringbuf->space >= bytes)

> > +			return 0;

> > +	}

> > +

> > +	list_for_each_entry(request, &ring->request_list, list) {

> > +		if (__intel_ring_space(request->tail, ringbuf->tail,

> > +				ringbuf->size) >= bytes) {

> > +			seqno = request->seqno;

> > +			break;

> > +		}

> > +	}

> > +

> > +	if (seqno == 0)

> > +		return -ENOSPC;

> > +

> > +	ret = i915_wait_seqno(ring, seqno);

> > +	if (ret)

> > +		return ret;

> > +

> > +	/* TODO: make sure we update the right ringbuffer's

> last_retired_head

> > +	 * when retiring requests */

> > +	i915_gem_retire_requests_ring(ring);

> > +	ringbuf->head = ringbuf->last_retired_head;

> > +	ringbuf->last_retired_head = -1;

> > +

> > +	ringbuf->space = intel_ring_space(ringbuf);

> > +	return 0;

> > +}

> > +

> > +static int logical_ring_wait_for_space(struct intel_engine_cs *ring,

> > +						   struct intel_ringbuffer

> *ringbuf,

> > +						   struct intel_context *ctx,

> > +						   int bytes)

> > +{

> > +	struct drm_device *dev = ring->dev;

> > +	struct drm_i915_private *dev_priv = dev->dev_private;

> > +	unsigned long end;

> > +	int ret;

> > +

> > +	ret = logical_ring_wait_request(ring, ringbuf, ctx, bytes);

> > +	if (ret != -ENOSPC)

> > +		return ret;

> > +

> > +	/* Force the context submission in case we have been skipping it */

> > +	intel_logical_ring_advance_and_submit(ring, ctx);

> > +

> > +	/* With GEM the hangcheck timer should kick us out of the loop,

> > +	 * leaving it early runs the risk of corrupting GEM state (due

> > +	 * to running on almost untested codepaths). But on resume

> > +	 * timers don't work yet, so prevent a complete hang in that

> > +	 * case by choosing an insanely large timeout. */

> > +	end = jiffies + 60 * HZ;

> > +

> 

> In the legacy ringbuffer version, there are tracepoints around the do loop.

> Should we keep those? Or add lrc specific equivalents?

> 

> > +	do {

> > +		ringbuf->head = I915_READ_HEAD(ring);

> > +		ringbuf->space = intel_ring_space(ringbuf);

> > +		if (ringbuf->space >= bytes) {

> > +			ret = 0;

> > +			break;

> > +		}

> > +

> > +		if (!drm_core_check_feature(dev, DRIVER_MODESET) &&

> > +		    dev->primary->master) {

> > +			struct drm_i915_master_private *master_priv = dev-

> >primary->master->driver_priv;

> > +			if (master_priv->sarea_priv)

> > +				master_priv->sarea_priv->perf_boxes |=

> I915_BOX_WAIT;

> > +		}

> > +

> > +		msleep(1);

> > +

> > +		if (dev_priv->mm.interruptible && signal_pending(current)) {

> > +			ret = -ERESTARTSYS;

> > +			break;

> > +		}

> > +

> > +		ret = i915_gem_check_wedge(&dev_priv->gpu_error,

> > +					   dev_priv->mm.interruptible);

> > +		if (ret)

> > +			break;

> > +

> > +		if (time_after(jiffies, end)) {

> > +			ret = -EBUSY;

> > +			break;

> > +		}

> > +	} while (1);

> > +

> > +	return ret;

> > +}

> > +

> > +static int logical_ring_wrap_buffer(struct intel_engine_cs *ring,

> > +						struct intel_ringbuffer

> *ringbuf,

> > +						struct intel_context *ctx)

> > +{

> > +	uint32_t __iomem *virt;

> > +	int rem = ringbuf->size - ringbuf->tail;

> > +

> > +	if (ringbuf->space < rem) {

> > +		int ret = logical_ring_wait_for_space(ring, ringbuf, ctx, rem);

> > +		if (ret)

> > +			return ret;

> > +	}

> > +

> > +	virt = ringbuf->virtual_start + ringbuf->tail;

> > +	rem /= 4;

> > +	while (rem--)

> > +		iowrite32(MI_NOOP, virt++);

> > +

> > +	ringbuf->tail = 0;

> > +	ringbuf->space = intel_ring_space(ringbuf);

> > +

> > +	return 0;

> > +}

> > +

> > +static int logical_ring_prepare(struct intel_engine_cs *ring,

> > +				struct intel_ringbuffer *ringbuf,

> > +				struct intel_context *ctx,

> > +				int bytes)

> > +{

> > +	int ret;

> > +

> > +	if (unlikely(ringbuf->tail + bytes > ringbuf->effective_size)) {

> > +		ret = logical_ring_wrap_buffer(ring, ringbuf, ctx);

> > +		if (unlikely(ret))

> > +			return ret;

> > +	}

> > +

> > +	if (unlikely(ringbuf->space < bytes)) {

> > +		ret = logical_ring_wait_for_space(ring, ringbuf, ctx, bytes);

> > +		if (unlikely(ret))

> > +			return ret;

> > +	}

> > +

> > +	return 0;

> > +}

> > +

> > +int intel_logical_ring_begin(struct intel_engine_cs *ring,

> > +			     struct intel_context *ctx,

> > +			     int num_dwords)

> > +{

> > +	struct drm_i915_private *dev_priv = ring->dev->dev_private;

> > +	struct intel_ringbuffer *ringbuf = logical_ringbuf_get(ring, ctx);

> > +	int ret;

> > +

> > +	ret = i915_gem_check_wedge(&dev_priv->gpu_error,

> > +				   dev_priv->mm.interruptible);

> > +	if (ret)

> > +		return ret;

> > +

> > +	ret = logical_ring_prepare(ring, ringbuf, ctx,

> > +			num_dwords * sizeof(uint32_t));

> > +	if (ret)

> > +		return ret;

> > +

> > +	/* Preallocate the olr before touching the ring */

> > +	ret = logical_ring_alloc_seqno(ring, ctx);

> > +	if (ret)

> > +		return ret;

> > +

> > +	ringbuf->space -= num_dwords * sizeof(uint32_t);

> > +	return 0;

> > +}

> > +

> >  static int gen8_init_common_ring(struct intel_engine_cs *ring)  {

> >  	struct drm_device *dev = ring->dev;

> > diff --git a/drivers/gpu/drm/i915/intel_lrc.h

> > b/drivers/gpu/drm/i915/intel_lrc.h

> > index 26b0949..686ebf5 100644

> > --- a/drivers/gpu/drm/i915/intel_lrc.h

> > +++ b/drivers/gpu/drm/i915/intel_lrc.h

> > @@ -5,6 +5,24 @@

> >  void intel_logical_ring_cleanup(struct intel_engine_cs *ring);  int

> > intel_logical_rings_init(struct drm_device *dev);

> >

> > +void intel_logical_ring_advance_and_submit(struct intel_engine_cs *ring,

> > +					   struct intel_context *ctx);

> > +

> > +static inline void intel_logical_ring_advance(struct intel_ringbuffer

> > +*ringbuf) {

> > +	ringbuf->tail &= ringbuf->size - 1;

> > +}

> > +

> > +static inline void intel_logical_ring_emit(struct intel_ringbuffer

> > +*ringbuf, u32 data) {

> > +	iowrite32(data, ringbuf->virtual_start + ringbuf->tail);

> > +	ringbuf->tail += 4;

> > +}

> > +

> > +int intel_logical_ring_begin(struct intel_engine_cs *ring,

> > +			     struct intel_context *ctx,

> > +			     int num_dwords);

> > +

> 

> I think all of these are only used in intel_lrc.c, so don't need to be in the

> header and could all be static. Right?

> 

> Brad


So far, yes, but that's only because I artificially made intel_lrc.c self-contained, as Daniel requested. What if we need to execute commands from somewhere else, like in intel_gen7_queue_flip()?

And this takes me to another discussion: this logical ring vs legacy ring split is probably a good idea (time will tell), but we should provide a way of sending commands for execution without knowing if Execlists are enabled or not. In the early series that was easy because we reused the ring_begin, ring_emit & ring_advance functions, but this is not the case anymore. And without this, sooner or later somebody will break legacy or execlists (this already happened last week, when somebody here was implementing native sync without knowing about Execlists).

So, the questions is: how do you feel about a dev_priv.gt vfunc that takes a context, a ring, an array of DWORDS and a BB length and does the intel_(logical)_ring_begin/emit/advance based on i915.enable_execlists?

-- Oscar
Chris Wilson June 23, 2014, 1:13 p.m. UTC | #3
On Mon, Jun 23, 2014 at 01:09:37PM +0000, Mateo Lozano, Oscar wrote:
> So far, yes, but that's only because I artificially made intel_lrc.c self-contained, as Daniel requested. What if we need to execute commands from somewhere else, like in intel_gen7_queue_flip()?
> 
> And this takes me to another discussion: this logical ring vs legacy ring split is probably a good idea (time will tell), but we should provide a way of sending commands for execution without knowing if Execlists are enabled or not. In the early series that was easy because we reused the ring_begin, ring_emit & ring_advance functions, but this is not the case anymore. And without this, sooner or later somebody will break legacy or execlists (this already happened last week, when somebody here was implementing native sync without knowing about Execlists).
> 
> So, the questions is: how do you feel about a dev_priv.gt vfunc that takes a context, a ring, an array of DWORDS and a BB length and does the intel_(logical)_ring_begin/emit/advance based on i915.enable_execlists?

I'm still baffled by the design. intel_ring_begin() and friends should
be able to find their context (logical or legacy) from the ring and
dtrt.
-Chris
oscar.mateo@intel.com June 23, 2014, 1:18 p.m. UTC | #4
> -----Original Message-----
> From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> Sent: Monday, June 23, 2014 2:14 PM
> To: Mateo Lozano, Oscar
> Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical ring
> submission mechanism
> 
> On Mon, Jun 23, 2014 at 01:09:37PM +0000, Mateo Lozano, Oscar wrote:
> > So far, yes, but that's only because I artificially made intel_lrc.c self-
> contained, as Daniel requested. What if we need to execute commands from
> somewhere else, like in intel_gen7_queue_flip()?
> >
> > And this takes me to another discussion: this logical ring vs legacy ring split
> is probably a good idea (time will tell), but we should provide a way of
> sending commands for execution without knowing if Execlists are enabled or
> not. In the early series that was easy because we reused the ring_begin,
> ring_emit & ring_advance functions, but this is not the case anymore. And
> without this, sooner or later somebody will break legacy or execlists (this
> already happened last week, when somebody here was implementing native
> sync without knowing about Execlists).
> >
> > So, the questions is: how do you feel about a dev_priv.gt vfunc that takes a
> context, a ring, an array of DWORDS and a BB length and does the
> intel_(logical)_ring_begin/emit/advance based on i915.enable_execlists?
> 
> I'm still baffled by the design. intel_ring_begin() and friends should be able to
> find their context (logical or legacy) from the ring and dtrt.
> -Chris

Sorry, Chris, I obviously don't have the same experience with 915 you have: how do you propose to extract the right context from the ring?
Chris Wilson June 23, 2014, 1:27 p.m. UTC | #5
On Mon, Jun 23, 2014 at 01:18:35PM +0000, Mateo Lozano, Oscar wrote:
> > -----Original Message-----
> > From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> > Sent: Monday, June 23, 2014 2:14 PM
> > To: Mateo Lozano, Oscar
> > Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> > Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical ring
> > submission mechanism
> > 
> > On Mon, Jun 23, 2014 at 01:09:37PM +0000, Mateo Lozano, Oscar wrote:
> > > So far, yes, but that's only because I artificially made intel_lrc.c self-
> > contained, as Daniel requested. What if we need to execute commands from
> > somewhere else, like in intel_gen7_queue_flip()?
> > >
> > > And this takes me to another discussion: this logical ring vs legacy ring split
> > is probably a good idea (time will tell), but we should provide a way of
> > sending commands for execution without knowing if Execlists are enabled or
> > not. In the early series that was easy because we reused the ring_begin,
> > ring_emit & ring_advance functions, but this is not the case anymore. And
> > without this, sooner or later somebody will break legacy or execlists (this
> > already happened last week, when somebody here was implementing native
> > sync without knowing about Execlists).
> > >
> > > So, the questions is: how do you feel about a dev_priv.gt vfunc that takes a
> > context, a ring, an array of DWORDS and a BB length and does the
> > intel_(logical)_ring_begin/emit/advance based on i915.enable_execlists?
> > 
> > I'm still baffled by the design. intel_ring_begin() and friends should be able to
> > find their context (logical or legacy) from the ring and dtrt.
> > -Chris
> 
> Sorry, Chris, I obviously don't have the same experience with 915 you have: how do you propose to extract the right context from the ring?

The rings are a set of buffers and vfuncs that are associated with a
context. Before you can call intel_ring_begin() you must know what
context you want to operate on and therefore can pick the right
logical/legacy ring and interface for RCS/BCS/VCS/etc
-Chris
oscar.mateo@intel.com June 23, 2014, 1:36 p.m. UTC | #6
> -----Original Message-----
> From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> Sent: Monday, June 23, 2014 2:27 PM
> To: Mateo Lozano, Oscar
> Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical ring
> submission mechanism
> 
> On Mon, Jun 23, 2014 at 01:18:35PM +0000, Mateo Lozano, Oscar wrote:
> > > -----Original Message-----
> > > From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> > > Sent: Monday, June 23, 2014 2:14 PM
> > > To: Mateo Lozano, Oscar
> > > Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> > > Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical
> > > ring submission mechanism
> > >
> > > On Mon, Jun 23, 2014 at 01:09:37PM +0000, Mateo Lozano, Oscar
> wrote:
> > > > So far, yes, but that's only because I artificially made
> > > > intel_lrc.c self-
> > > contained, as Daniel requested. What if we need to execute commands
> > > from somewhere else, like in intel_gen7_queue_flip()?
> > > >
> > > > And this takes me to another discussion: this logical ring vs
> > > > legacy ring split
> > > is probably a good idea (time will tell), but we should provide a
> > > way of sending commands for execution without knowing if Execlists
> > > are enabled or not. In the early series that was easy because we
> > > reused the ring_begin, ring_emit & ring_advance functions, but this
> > > is not the case anymore. And without this, sooner or later somebody
> > > will break legacy or execlists (this already happened last week,
> > > when somebody here was implementing native sync without knowing
> about Execlists).
> > > >
> > > > So, the questions is: how do you feel about a dev_priv.gt vfunc
> > > > that takes a
> > > context, a ring, an array of DWORDS and a BB length and does the
> > > intel_(logical)_ring_begin/emit/advance based on i915.enable_execlists?
> > >
> > > I'm still baffled by the design. intel_ring_begin() and friends
> > > should be able to find their context (logical or legacy) from the ring and
> dtrt.
> > > -Chris
> >
> > Sorry, Chris, I obviously don't have the same experience with 915 you have:
> how do you propose to extract the right context from the ring?
> 
> The rings are a set of buffers and vfuncs that are associated with a context.
> Before you can call intel_ring_begin() you must know what context you want
> to operate on and therefore can pick the right logical/legacy ring and
> interface for RCS/BCS/VCS/etc -Chris

Ok, but then you need to pass some extra stuff down together with the intel_engine_cs, either intel_context or intel_ringbuffer, right? Because that's exactly what I did in previous versions, plumbing intel_context everywhere where it was needed (I could have plumbed intel_ringbuffer instead, it really doesn't matter). This was rejected for being too intrusive and not allowing easy maintenance in the future.

-- Oscar
Chris Wilson June 23, 2014, 1:41 p.m. UTC | #7
On Mon, Jun 23, 2014 at 01:36:07PM +0000, Mateo Lozano, Oscar wrote:
> > -----Original Message-----
> > From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> > Sent: Monday, June 23, 2014 2:27 PM
> > To: Mateo Lozano, Oscar
> > Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> > Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical ring
> > submission mechanism
> > 
> > On Mon, Jun 23, 2014 at 01:18:35PM +0000, Mateo Lozano, Oscar wrote:
> > > > -----Original Message-----
> > > > From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> > > > Sent: Monday, June 23, 2014 2:14 PM
> > > > To: Mateo Lozano, Oscar
> > > > Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> > > > Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical
> > > > ring submission mechanism
> > > >
> > > > On Mon, Jun 23, 2014 at 01:09:37PM +0000, Mateo Lozano, Oscar
> > wrote:
> > > > > So far, yes, but that's only because I artificially made
> > > > > intel_lrc.c self-
> > > > contained, as Daniel requested. What if we need to execute commands
> > > > from somewhere else, like in intel_gen7_queue_flip()?
> > > > >
> > > > > And this takes me to another discussion: this logical ring vs
> > > > > legacy ring split
> > > > is probably a good idea (time will tell), but we should provide a
> > > > way of sending commands for execution without knowing if Execlists
> > > > are enabled or not. In the early series that was easy because we
> > > > reused the ring_begin, ring_emit & ring_advance functions, but this
> > > > is not the case anymore. And without this, sooner or later somebody
> > > > will break legacy or execlists (this already happened last week,
> > > > when somebody here was implementing native sync without knowing
> > about Execlists).
> > > > >
> > > > > So, the questions is: how do you feel about a dev_priv.gt vfunc
> > > > > that takes a
> > > > context, a ring, an array of DWORDS and a BB length and does the
> > > > intel_(logical)_ring_begin/emit/advance based on i915.enable_execlists?
> > > >
> > > > I'm still baffled by the design. intel_ring_begin() and friends
> > > > should be able to find their context (logical or legacy) from the ring and
> > dtrt.
> > > > -Chris
> > >
> > > Sorry, Chris, I obviously don't have the same experience with 915 you have:
> > how do you propose to extract the right context from the ring?
> > 
> > The rings are a set of buffers and vfuncs that are associated with a context.
> > Before you can call intel_ring_begin() you must know what context you want
> > to operate on and therefore can pick the right logical/legacy ring and
> > interface for RCS/BCS/VCS/etc -Chris
> 
> Ok, but then you need to pass some extra stuff down together with the intel_engine_cs, either intel_context or intel_ringbuffer, right? Because that's exactly what I did in previous versions, plumbing intel_context everywhere where it was needed (I could have plumbed intel_ringbuffer instead, it really doesn't matter). This was rejected for being too intrusive and not allowing easy maintenance in the future.

Nope. You didn't redesign the ringbuffers to function as we expected but
tacked on extra information and layering violations.
-Chris
oscar.mateo@intel.com June 23, 2014, 2:35 p.m. UTC | #8
> -----Original Message-----
> From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> Sent: Monday, June 23, 2014 2:42 PM
> To: Mateo Lozano, Oscar
> Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical ring
> submission mechanism
> 
> On Mon, Jun 23, 2014 at 01:36:07PM +0000, Mateo Lozano, Oscar wrote:
> > > -----Original Message-----
> > > From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> > > Sent: Monday, June 23, 2014 2:27 PM
> > > To: Mateo Lozano, Oscar
> > > Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> > > Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical
> > > ring submission mechanism
> > >
> > > On Mon, Jun 23, 2014 at 01:18:35PM +0000, Mateo Lozano, Oscar
> wrote:
> > > > > -----Original Message-----
> > > > > From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> > > > > Sent: Monday, June 23, 2014 2:14 PM
> > > > > To: Mateo Lozano, Oscar
> > > > > Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> > > > > Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical
> > > > > ring submission mechanism
> > > > >
> > > > > On Mon, Jun 23, 2014 at 01:09:37PM +0000, Mateo Lozano, Oscar
> > > wrote:
> > > > > > So far, yes, but that´s only because I artificially made
> > > > > > intel_lrc.c self-
> > > > > contained, as Daniel requested. What if we need to execute
> > > > > commands from somewhere else, like in intel_gen7_queue_flip()?
> > > > > >
> > > > > > And this takes me to another discussion: this logical ring vs
> > > > > > legacy ring split
> > > > > is probably a good idea (time will tell), but we should provide
> > > > > a way of sending commands for execution without knowing if
> > > > > Execlists are enabled or not. In the early series that was easy
> > > > > because we reused the ring_begin, ring_emit & ring_advance
> > > > > functions, but this is not the case anymore. And without this,
> > > > > sooner or later somebody will break legacy or execlists (this
> > > > > already happened last week, when somebody here was implementing
> > > > > native sync without knowing
> > > about Execlists).
> > > > > >
> > > > > > So, the questions is: how do you feel about a dev_priv.gt
> > > > > > vfunc that takes a
> > > > > context, a ring, an array of DWORDS and a BB length and does the
> > > > > intel_(logical)_ring_begin/emit/advance based on
> i915.enable_execlists?
> > > > >
> > > > > I'm still baffled by the design. intel_ring_begin() and friends
> > > > > should be able to find their context (logical or legacy) from
> > > > > the ring and
> > > dtrt.
> > > > > -Chris
> > > >
> > > > Sorry, Chris, I obviously don´t have the same experience with 915 you
> have:
> > > how do you propose to extract the right context from the ring?
> > >
> > > The rings are a set of buffers and vfuncs that are associated with a
> context.
> > > Before you can call intel_ring_begin() you must know what context
> > > you want to operate on and therefore can pick the right
> > > logical/legacy ring and interface for RCS/BCS/VCS/etc -Chris
> >
> > Ok, but then you need to pass some extra stuff down together with the
> intel_engine_cs, either intel_context or intel_ringbuffer, right? Because
> that´s exactly what I did in previous versions, plumbing intel_context
> everywhere where it was needed (I could have plumbed intel_ringbuffer
> instead, it really doesn´t matter). This was rejected for being too intrusive
> and not allowing easy maintenance in the future.
> 
> Nope. You didn't redesign the ringbuffers to function as we expected but
> tacked on extra information and layering violations.
> -Chris

I know it is no excuse, but as I said, I don't know the code as well as you do. Let me explain to you the history of this one and maybe you can help me out discovering where I got it all wrong:

- The original design I inherited from Ben and Jesse created a completely new "struct intel_ring_buffer" per context, and passed that one on instead of passing one of the &dev_priv->ring[i] ones. When it was time to submit a context to the ELSP, they simply took it from ring->context. The problem with that was that creating an unbound number of "struct intel_ring_buffer" meant there were also an unbound number of things like "struct list_head active_list" or "u32 irq_enable_mask", which made little sense.
- What we really needed, I naively thought, is to get rid of the concept of ring: there are no rings, there are engines (like the render engine, the vebox, etc...) and there are ringbuffers (as in circular buffer with a head offset, tail offset, and control information). So I went on and renamed the old "intel_ring_buffer" to "intel_engine_cs", then extracting a few things into a new "intel_ringbuffer" struct. Pre-Execlists, there was a 1:1 relationship between the ringbuffers and the engines. With Execlists, however, this 1:1 relationship is between the ringbuffers and the contexts.
- I remember explaining this problem in a face-to-face meeting in the UK with some OTC people back in February (I think they tried to get you on the phone but they didn't manage. I do remember they got Daniel though). Here, I proposed that an easy solution (easy, but maybe not right) was to plumb the context down: in Execlists mode, we would retrieve the ringbuffer from the context while, in legacy mode, we would get it from the engine. Everybody seemed to agree with this.
- I worked on this premise for several versions that I sent to the mailing list for review (first the internal, then intel-gfx). Daniel only complained last month, when he pointed out that he asked, a long long time ago, for a completely separate execution path for Execlists. And this is where we are now...

And now back to the problem at hand: what do you suggest I do now (other than ritual seppuku, I mean)? I need to know the engine (for a lot of reasons), the ringbuffer (to know where to place commands) and the context (to know what to submit to the ELSP). I can spin it a thousand different ways, but I still need to know those three things. At the same time, I cannot reuse the old intel_ringbuffer code because Daniel won't approve. What would you do?
bradley.d.volkin@intel.com June 23, 2014, 7:10 p.m. UTC | #9
On Mon, Jun 23, 2014 at 07:35:38AM -0700, Mateo Lozano, Oscar wrote:
> > -----Original Message-----
> > From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> > Sent: Monday, June 23, 2014 2:42 PM
> > To: Mateo Lozano, Oscar
> > Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> > Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical ring
> > submission mechanism
> > 
> > On Mon, Jun 23, 2014 at 01:36:07PM +0000, Mateo Lozano, Oscar wrote:
> > > > -----Original Message-----
> > > > From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> > > > Sent: Monday, June 23, 2014 2:27 PM
> > > > To: Mateo Lozano, Oscar
> > > > Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> > > > Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical
> > > > ring submission mechanism
> > > >
> > > > On Mon, Jun 23, 2014 at 01:18:35PM +0000, Mateo Lozano, Oscar
> > wrote:
> > > > > > -----Original Message-----
> > > > > > From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> > > > > > Sent: Monday, June 23, 2014 2:14 PM
> > > > > > To: Mateo Lozano, Oscar
> > > > > > Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> > > > > > Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical
> > > > > > ring submission mechanism
> > > > > >
> > > > > > On Mon, Jun 23, 2014 at 01:09:37PM +0000, Mateo Lozano, Oscar
> > > > wrote:
> > > > > > > So far, yes, but that´s only because I artificially made
> > > > > > > intel_lrc.c self-
> > > > > > contained, as Daniel requested. What if we need to execute
> > > > > > commands from somewhere else, like in intel_gen7_queue_flip()?
> > > > > > >
> > > > > > > And this takes me to another discussion: this logical ring vs
> > > > > > > legacy ring split
> > > > > > is probably a good idea (time will tell), but we should provide
> > > > > > a way of sending commands for execution without knowing if
> > > > > > Execlists are enabled or not. In the early series that was easy
> > > > > > because we reused the ring_begin, ring_emit & ring_advance
> > > > > > functions, but this is not the case anymore. And without this,
> > > > > > sooner or later somebody will break legacy or execlists (this
> > > > > > already happened last week, when somebody here was implementing
> > > > > > native sync without knowing
> > > > about Execlists).
> > > > > > >
> > > > > > > So, the questions is: how do you feel about a dev_priv.gt
> > > > > > > vfunc that takes a
> > > > > > context, a ring, an array of DWORDS and a BB length and does the
> > > > > > intel_(logical)_ring_begin/emit/advance based on
> > i915.enable_execlists?

There are 3 cases of non-execbuffer submissions that I can think of: flips,
render state, and clear-buffer (proposed patches on the list). I wonder if the
right approach might be to use batchbuffers with a small wrapper around the
dispatch_execbuffer/emit_bb_start vfuncs. Basically the rule would be to only
touch a ringbuffer from within the intel_engine_cs vfuncs, which always know
which set of functions to use.

For flips, we could use MMIO flips. Render state already uses the existing
dispatch_execbuffer() and add_request(). The clear code could potentially do
the same. There would obviously be some overhead in using a batch buffer for
what could end up being just a few commands. Perhaps the batch buffer pool
code from the command parser would help though.

> > > > > >
> > > > > > I'm still baffled by the design. intel_ring_begin() and friends
> > > > > > should be able to find their context (logical or legacy) from
> > > > > > the ring and
> > > > dtrt.
> > > > > > -Chris
> > > > >
> > > > > Sorry, Chris, I obviously don´t have the same experience with 915 you
> > have:
> > > > how do you propose to extract the right context from the ring?
> > > >
> > > > The rings are a set of buffers and vfuncs that are associated with a
> > context.
> > > > Before you can call intel_ring_begin() you must know what context
> > > > you want to operate on and therefore can pick the right
> > > > logical/legacy ring and interface for RCS/BCS/VCS/etc -Chris
> > >
> > > Ok, but then you need to pass some extra stuff down together with the
> > intel_engine_cs, either intel_context or intel_ringbuffer, right? Because
> > that´s exactly what I did in previous versions, plumbing intel_context
> > everywhere where it was needed (I could have plumbed intel_ringbuffer
> > instead, it really doesn´t matter). This was rejected for being too intrusive
> > and not allowing easy maintenance in the future.
> > 
> > Nope. You didn't redesign the ringbuffers to function as we expected but
> > tacked on extra information and layering violations.

Not sure what your proposed alternative is here Chris. I'll elaborate on what
I had proposed r.e. plumbing intel_ringbuffer instead of intel_context, which
might be along the lines of what you envision. At this point, we're starting
to go in circles, so I don't know if it's worth revisting beyond that.

The earlier versions of the series modified all of the intel_ring_* functions
to accept (engine, context) as parameters. At or near the bottom of the call
chain (e.g. in the engine vfuncs), we called a new function,
intel_ringbuffer_get(engine, context), to return the appropriate ringbuffer
in both legacy and lrc modes. However, a given struct intel_ringbuffer is
only ever used with a particular engine (for both legacy and lrc) and with
a particular context (for lrc only). So I suggested that we:

- Add a back pointer from struct intel_ringbuffer to intel_context (would only
  be valid for lrc mode)
- Move the intel_ringbuffer_get(engine, context) calls up to the callers
- Pass (engine, ringbuf) instead of (engine, context) to intel_ring_* functions
- Have the vfunc implementations get the context from the ringbuffer where
  needed and ignore it where not

Looking again, we could probably add a back pointer to the intel_engine_cs as
well and then just pass around the ringbuf. In any case, we went back and forth
on this a bit and decided to just stick with passing (engine, context). Which we
then decided was too invasive, and here we are. So is that closer to what you're
thinking of, or did you have something else in mind?

Thanks,
Brad

> > -Chris
> 
> I know it is no excuse, but as I said, I don't know the code as well as you do. Let me explain to you the history of this one and maybe you can help me out discovering where I got it all wrong:
> 
> - The original design I inherited from Ben and Jesse created a completely new "struct intel_ring_buffer" per context, and passed that one on instead of passing one of the &dev_priv->ring[i] ones. When it was time to submit a context to the ELSP, they simply took it from ring->context. The problem with that was that creating an unbound number of "struct intel_ring_buffer" meant there were also an unbound number of things like "struct list_head active_list" or "u32 irq_enable_mask", which made little sense.
> - What we really needed, I naively thought, is to get rid of the concept of ring: there are no rings, there are engines (like the render engine, the vebox, etc...) and there are ringbuffers (as in circular buffer with a head offset, tail offset, and control information). So I went on and renamed the old "intel_ring_buffer" to "intel_engine_cs", then extracting a few things into a new "intel_ringbuffer" struct. Pre-Execlists, there was a 1:1 relationship between the ringbuffers and the engines. With Execlists, however, this 1:1 relationship is between the ringbuffers and the contexts.
> - I remember explaining this problem in a face-to-face meeting in the UK with some OTC people back in February (I think they tried to get you on the phone but they didn't manage. I do remember they got Daniel though). Here, I proposed that an easy solution (easy, but maybe not right) was to plumb the context down: in Execlists mode, we would retrieve the ringbuffer from the context while, in legacy mode, we would get it from the engine. Everybody seemed to agree with this.
> - I worked on this premise for several versions that I sent to the mailing list for review (first the internal, then intel-gfx). Daniel only complained last month, when he pointed out that he asked, a long long time ago, for a completely separate execution path for Execlists. And this is where we are now...
> 
> And now back to the problem at hand: what do you suggest I do now (other than ritual seppuku, I mean)? I need to know the engine (for a lot of reasons), the ringbuffer (to know where to place commands) and the context (to know what to submit to the ELSP). I can spin it a thousand different ways, but I still need to know those three things. At the same time, I cannot reuse the old intel_ringbuffer code because Daniel won't approve. What would you do?
Ben Widawsky June 24, 2014, 12:23 a.m. UTC | #10
On Mon, Jun 23, 2014 at 02:35:38PM +0000, Mateo Lozano, Oscar wrote:
> > -----Original Message-----
> > From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> > Sent: Monday, June 23, 2014 2:42 PM
> > To: Mateo Lozano, Oscar
> > Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> > Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical ring
> > submission mechanism
> > 
> > On Mon, Jun 23, 2014 at 01:36:07PM +0000, Mateo Lozano, Oscar wrote:
> > > > -----Original Message-----
> > > > From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> > > > Sent: Monday, June 23, 2014 2:27 PM
> > > > To: Mateo Lozano, Oscar
> > > > Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> > > > Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical
> > > > ring submission mechanism
> > > >
> > > > On Mon, Jun 23, 2014 at 01:18:35PM +0000, Mateo Lozano, Oscar
> > wrote:
> > > > > > -----Original Message-----
> > > > > > From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> > > > > > Sent: Monday, June 23, 2014 2:14 PM
> > > > > > To: Mateo Lozano, Oscar
> > > > > > Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> > > > > > Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical
> > > > > > ring submission mechanism
> > > > > >
> > > > > > On Mon, Jun 23, 2014 at 01:09:37PM +0000, Mateo Lozano, Oscar
> > > > wrote:
> > > > > > > So far, yes, but that´s only because I artificially made
> > > > > > > intel_lrc.c self-
> > > > > > contained, as Daniel requested. What if we need to execute
> > > > > > commands from somewhere else, like in intel_gen7_queue_flip()?
> > > > > > >
> > > > > > > And this takes me to another discussion: this logical ring vs
> > > > > > > legacy ring split
> > > > > > is probably a good idea (time will tell), but we should provide
> > > > > > a way of sending commands for execution without knowing if
> > > > > > Execlists are enabled or not. In the early series that was easy
> > > > > > because we reused the ring_begin, ring_emit & ring_advance
> > > > > > functions, but this is not the case anymore. And without this,
> > > > > > sooner or later somebody will break legacy or execlists (this
> > > > > > already happened last week, when somebody here was implementing
> > > > > > native sync without knowing
> > > > about Execlists).
> > > > > > >
> > > > > > > So, the questions is: how do you feel about a dev_priv.gt
> > > > > > > vfunc that takes a
> > > > > > context, a ring, an array of DWORDS and a BB length and does the
> > > > > > intel_(logical)_ring_begin/emit/advance based on
> > i915.enable_execlists?
> > > > > >
> > > > > > I'm still baffled by the design. intel_ring_begin() and friends
> > > > > > should be able to find their context (logical or legacy) from
> > > > > > the ring and
> > > > dtrt.
> > > > > > -Chris
> > > > >
> > > > > Sorry, Chris, I obviously don´t have the same experience with 915 you
> > have:
> > > > how do you propose to extract the right context from the ring?
> > > >
> > > > The rings are a set of buffers and vfuncs that are associated with a
> > context.
> > > > Before you can call intel_ring_begin() you must know what context
> > > > you want to operate on and therefore can pick the right
> > > > logical/legacy ring and interface for RCS/BCS/VCS/etc -Chris
> > >
> > > Ok, but then you need to pass some extra stuff down together with the
> > intel_engine_cs, either intel_context or intel_ringbuffer, right? Because
> > that´s exactly what I did in previous versions, plumbing intel_context
> > everywhere where it was needed (I could have plumbed intel_ringbuffer
> > instead, it really doesn´t matter). This was rejected for being too intrusive
> > and not allowing easy maintenance in the future.
> > 
> > Nope. You didn't redesign the ringbuffers to function as we expected but
> > tacked on extra information and layering violations.
> > -Chris
> 
> I know it is no excuse, but as I said, I don't know the code as well as you do. Let me explain to you the history of this one and maybe you can help me out discovering where I got it all wrong:
> 
> - The original design I inherited from Ben and Jesse created a completely new "struct intel_ring_buffer" per context, and passed that one on instead of passing one of the &dev_priv->ring[i] ones. When it was time to submit a context to the ELSP, they simply took it from ring->context. The problem with that was that creating an unbound number of "struct intel_ring_buffer" meant there were also an unbound number of things like "struct list_head active_list" or "u32 irq_enable_mask", which made little sense.
> - What we really needed, I naively thought, is to get rid of the concept of ring: there are no rings, there are engines (like the render engine, the vebox, etc...) and there are ringbuffers (as in circular buffer with a head offset, tail offset, and control information). So I went on and renamed the old "intel_ring_buffer" to "intel_engine_cs", then extracting a few things into a new "intel_ringbuffer" struct. Pre-Execlists, there was a 1:1 relationship between the ringbuffers and the engines. With Execlists, however, this 1:1 relationship is between the ringbuffers and the contexts.
> - I remember explaining this problem in a face-to-face meeting in the UK with some OTC people back in February (I think they tried to get you on the phone but they didn't manage. I do remember they got Daniel though). Here, I proposed that an easy solution (easy, but maybe not right) was to plumb the context down: in Execlists mode, we would retrieve the ringbuffer from the context while, in legacy mode, we would get it from the engine. Everybody seemed to agree with this.
> - I worked on this premise for several versions that I sent to the mailing list for review (first the internal, then intel-gfx). Daniel only complained last month, when he pointed out that he asked, a long long time ago, for a completely separate execution path for Execlists. And this is where we are now...
> 
> And now back to the problem at hand: what do you suggest I do now (other than ritual seppuku, I mean)? I need to know the engine (for a lot of reasons), the ringbuffer (to know where to place commands) and the context (to know what to submit to the ELSP). I can spin it a thousand different ways, but I still need to know those three things. At the same time, I cannot reuse the old intel_ringbuffer code because Daniel won't approve. What would you do?

I think what Chris is trying to say is that all of your operations
should be context driven. The ringbuffer is always a derivative of the
context. If I understand Chris correctly, I agree with him, but he is
being characteristically terse.

There should be two data structures:
intel_engine_cs, formerly intel_ringbuffer - for legacy
intel_context, formerly intel_hw_context - for execlist

You did that.

Then there should be a data structure to represent the ringbuffer within
the execlist context.

You did that, too.

I don't think the fact that there is a separate execbuf path makes any
difference in this conversation, but perhaps I missed some detail.

So at least from what I can tell, the data structures are right. The
problem is that we're mixing and matching intel_engine_cs with the new
[and I wish we could have used a different name] intel_ringbuffer. As
an example from near the top of the patch:
+       intel_logical_ring_advance(ringbuf);
+
+       if (intel_ring_stopped(ring))
+               return;

You're advancing the ringbuf, but checking the ring? That's confusing to
me.

I think the only solution for what Chris is asking for is to implement
this as 1 context per engine, as opposed to 1 context with a context
object per engine. As you correctly stated, I think we all agreed the
latter was fine when we met. Functionally, I see no difference, but it
does allow you to always use a context as the sole mechanism for making
any decisions and performing any operations. Now without writing all the
code, I can't promise it actually will look better, but I think it's
likely going to be a lot cleaner. Before you do any changes though...

On to what I see as the real problem: fundamentally, if Daniel put Chris
in charge of giving the thumbs or down, then you should get Daniel to
agree that he will defer to Chris, and you should do whatever Chris
says. You need not be caught in the middle of Daniel and Chris - it is a
bad place to be (I know from much experience). If Daniel is not okay
with that, then he needs to find a different reviewer.
oscar.mateo@intel.com June 24, 2014, 11:45 a.m. UTC | #11
Ok, let's try to extract something positive out of all this.

OPTION A (Ben's proposal):

> I think the only solution for what Chris is asking for is to implement this as 1

> context per engine, as opposed to 1 context with a context object per

> engine. As you correctly stated, I think we all agreed the latter was fine when

> we met. Functionally, I see no difference, but it does allow you to always use

> a context as the sole mechanism for making any decisions and performing

> any operations. Now without writing all the code, I can't promise it actually

> will look better, but I think it's likely going to be a lot cleaner. Before you do

> any changes though...


We agreed on this early on (v1), yes, but then the idea was frowned upon by Brad and then by Daniel. I cannot recall exactly why anymore, but one big reason was that the idr mechanism makes it difficult to track several contexts with the same id (and userspace only has one context handle) and something about ctx->hang_stats. From v2 on, we agreed to multiplex different engines inside one intel_context (and that's why we renamed i915_hw_context to intel_context).

OPTION B (Brad's proposal):

> So I suggested that we:

> 

> - Add a back pointer from struct intel_rinbuffer to intel_context (would only

>   be valid for lrc mode)

> - Move the intel_ringbuffer_get(engine, context) calls up to the callers

> - Pass (engine, ringbuf) instead of (engine, context) to intel_ring_* functions

> - Have the vfunc implemenations get the context from the ringbuffer where

>   needed and ignore it where not

> 

> Looking again, we could probably add a back pointer to the intel_engine_cs

> as well and then just pass around the ringbuf.


Sounds fine by me: intel_ringbuffer is only related to exactly one intel_engine_cs and one intel_context, so having pointers to those two makes sense.
As before, this could be easily done within the existing code (passing intel_ringbuffer instead of intel_engine_cs), but Daniel wants a code split, so I can only do it for the logical ring functions.
oscar.mateo@intel.com June 24, 2014, 12:29 p.m. UTC | #12
> -----Original Message-----
> From: Volkin, Bradley D
> Sent: Monday, June 23, 2014 8:10 PM
> To: Mateo Lozano, Oscar
> Cc: Chris Wilson; intel-gfx@lists.freedesktop.org
> Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical ring
> submission mechanism
> 
> On Mon, Jun 23, 2014 at 07:35:38AM -0700, Mateo Lozano, Oscar wrote:
> > > -----Original Message-----
> > > From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> > > Sent: Monday, June 23, 2014 2:42 PM
> > > To: Mateo Lozano, Oscar
> > > Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> > > Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical
> > > ring submission mechanism
> > >
> > > On Mon, Jun 23, 2014 at 01:36:07PM +0000, Mateo Lozano, Oscar
> wrote:
> > > > > -----Original Message-----
> > > > > From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> > > > > Sent: Monday, June 23, 2014 2:27 PM
> > > > > To: Mateo Lozano, Oscar
> > > > > Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> > > > > Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical
> > > > > ring submission mechanism
> > > > >
> > > > > On Mon, Jun 23, 2014 at 01:18:35PM +0000, Mateo Lozano, Oscar
> > > wrote:
> > > > > > > -----Original Message-----
> > > > > > > From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> > > > > > > Sent: Monday, June 23, 2014 2:14 PM
> > > > > > > To: Mateo Lozano, Oscar
> > > > > > > Cc: Volkin, Bradley D; intel-gfx@lists.freedesktop.org
> > > > > > > Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New
> > > > > > > logical ring submission mechanism
> > > > > > >
> > > > > > > On Mon, Jun 23, 2014 at 01:09:37PM +0000, Mateo Lozano,
> > > > > > > Oscar
> > > > > wrote:
> > > > > > > > So far, yes, but that´s only because I artificially made
> > > > > > > > intel_lrc.c self-
> > > > > > > contained, as Daniel requested. What if we need to execute
> > > > > > > commands from somewhere else, like in intel_gen7_queue_flip()?
> > > > > > > >
> > > > > > > > And this takes me to another discussion: this logical ring
> > > > > > > > vs legacy ring split
> > > > > > > is probably a good idea (time will tell), but we should
> > > > > > > provide a way of sending commands for execution without
> > > > > > > knowing if Execlists are enabled or not. In the early series
> > > > > > > that was easy because we reused the ring_begin, ring_emit &
> > > > > > > ring_advance functions, but this is not the case anymore.
> > > > > > > And without this, sooner or later somebody will break legacy
> > > > > > > or execlists (this already happened last week, when somebody
> > > > > > > here was implementing native sync without knowing
> > > > > about Execlists).
> > > > > > > >
> > > > > > > > So, the questions is: how do you feel about a dev_priv.gt
> > > > > > > > vfunc that takes a
> > > > > > > context, a ring, an array of DWORDS and a BB length and does
> > > > > > > the intel_(logical)_ring_begin/emit/advance based on
> > > i915.enable_execlists?
> 
> There are 3 cases of non-execbuffer submissions that I can think of: flips,
> render state, and clear-buffer (proposed patches on the list). I wonder if the
> right approach might be to use batchbuffers with a small wrapper around the
> dispatch_execbuffer/emit_bb_start vfuncs. Basically the rule would be to
> only touch a ringbuffer from within the intel_engine_cs vfuncs, which always
> know which set of functions to use.
> 
> For flips, we could use MMIO flips. Render state already uses the existing
> dispatch_execbuffer() and add_request(). The clear code could potentially do
> the same. There would obviously be some overhead in using a batch buffer
> for what could end up being just a few commands. Perhaps the batch buffer
> pool code from the command parser would help though.

This has another positive side-effect: the scheduler guys do not like things inside the ring without a proper batchbuffer & request, because it makes their life more complex.
bradley.d.volkin@intel.com June 24, 2014, 2:41 p.m. UTC | #13
On Tue, Jun 24, 2014 at 04:45:05AM -0700, Mateo Lozano, Oscar wrote:
> Ok, let's try to extract something positive out of all this.
> 
> OPTION A (Ben's proposal):
> 
> > I think the only solution for what Chris is asking for is to implement this as 1
> > context per engine, as opposed to 1 context with a context object per
> > engine. As you correctly stated, I think we all agreed the latter was fine when
> > we met. Functionally, I see no difference, but it does allow you to always use
> > a context as the sole mechanism for making any decisions and performing
> > any operations. Now without writing all the code, I can't promise it actually
> > will look better, but I think it's likely going to be a lot cleaner. Before you do
> > any changes though...
> 
> We agreed on this early on (v1), yes, but then the idea was frowned upon by Brad and then by Daniel. I cannot recall exactly why anymore, but one big reason was that the idr mechanism makes it difficult to track several contexts with the same id (and userspace only has one context handle) and something about ctx->hang_stats. From v2 on, we agreed to multiplex different engines inside one intel_context (and that's why we renamed i915_hw_context to intel_context).

Yeah, at least for me, the reason was that the multiple structs per context id
code felt awkward given that most/all of the fields in a struct intel_context are
logically per-context rather than per-engine (vm, hang_stats, etc). It didn't
seem like the right approach to me at the time.

Brad

> 
> OPTION B (Brad's proposal):
> 
> > So I suggested that we:
> > 
> > - Add a back pointer from struct intel_ringbuffer to intel_context (would only
> >   be valid for lrc mode)
> > - Move the intel_ringbuffer_get(engine, context) calls up to the callers
> > - Pass (engine, ringbuf) instead of (engine, context) to intel_ring_* functions
> > - Have the vfunc implementations get the context from the ringbuffer where
> >   needed and ignore it where not
> > 
> > Looking again, we could probably add a back pointer to the intel_engine_cs
> > as well and then just pass around the ringbuf.
> 
> Sounds fine by me: intel_ringbuffer is only related to exactly one intel_engine_cs and one intel_context, so having pointers to those two makes sense.
> As before, this could be easily done within the existing code (passing intel_ringbuffer instead of intel_engine_cs), but Daniel wants a code split, so I can only do it for the logical ring functions.
Jesse Barnes June 24, 2014, 5:19 p.m. UTC | #14
On Mon, 23 Jun 2014 14:13:55 +0100
Chris Wilson <chris@chris-wilson.co.uk> wrote:

> On Mon, Jun 23, 2014 at 01:09:37PM +0000, Mateo Lozano, Oscar wrote:
> > So far, yes, but that's only because I artificially made
> > intel_lrc.c self-contained, as Daniel requested. What if we need to
> > execute commands from somewhere else, like in
> > intel_gen7_queue_flip()?
> > 
> > And this takes me to another discussion: this logical ring vs
> > legacy ring split is probably a good idea (time will tell), but we
> > should provide a way of sending commands for execution without
> > knowing if Execlists are enabled or not. In the early series that
> > was easy because we reused the ring_begin, ring_emit & ring_advance
> > functions, but this is not the case anymore. And without this,
> > sooner or later somebody will break legacy or execlists (this
> > already happened last week, when somebody here was implementing
> > native sync without knowing about Execlists).
> > 
> > So, the questions is: how do you feel about a dev_priv.gt vfunc
> > that takes a context, a ring, an array of DWORDS and a BB length
> > and does the intel_(logical)_ring_begin/emit/advance based on
> > i915.enable_execlists?
> 
> I'm still baffled by the design. intel_ring_begin() and friends should
> be able to find their context (logical or legacy) from the ring and
> dtrt.

To me, given that the LRC contains the ring, the right thing to do
would be to do what Oscar did in the first place: pass the context
around everywhere.  If there were cases where that wasn't ideal (the
layering violations you mention later?) we should fix them up instead.

But given that this is a standalone file, it's easy to fix up however
we want incrementally, as long as things work well to begin with and
it's reasonably easy to review it...

Jesse
oscar.mateo@intel.com June 26, 2014, 1:28 p.m. UTC | #15
> -----Original Message-----

> From: Jesse Barnes [mailto:jbarnes@virtuousgeek.org]

> Sent: Tuesday, June 24, 2014 6:20 PM

> To: Chris Wilson

> Cc: Mateo Lozano, Oscar; intel-gfx@lists.freedesktop.org

> Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical ring

> submission mechanism

> 

> On Mon, 23 Jun 2014 14:13:55 +0100

> Chris Wilson <chris@chris-wilson.co.uk> wrote:

> 

> > On Mon, Jun 23, 2014 at 01:09:37PM +0000, Mateo Lozano, Oscar wrote:

> > > So far, yes, but that's only because I artificially made intel_lrc.c

> > > self-contained, as Daniel requested. What if we need to execute

> > > commands from somewhere else, like in intel_gen7_queue_flip()?

> > >

> > > And this takes me to another discussion: this logical ring vs legacy

> > > ring split is probably a good idea (time will tell), but we should

> > > provide a way of sending commands for execution without knowing if

> > > Execlists are enabled or not. In the early series that was easy

> > > because we reused the ring_begin, ring_emit & ring_advance

> > > functions, but this is not the case anymore. And without this,

> > > sooner or later somebody will break legacy or execlists (this

> > > already happened last week, when somebody here was implementing

> > > native sync without knowing about Execlists).

> > >

> > > So, the questions is: how do you feel about a dev_priv.gt vfunc that

> > > takes a context, a ring, an array of DWORDS and a BB length and does

> > > the intel_(logical)_ring_begin/emit/advance based on

> > > i915.enable_execlists?

> >

> > I'm still baffled by the design. intel_ring_begin() and friends should

> > be able to find their context (logical or legacy) from the ring and

> > dtrt.

> 

> To me, given that the LRC contains the ring, the right thing to do would be to

> do what Oscar did in the first place: pass the context around everywhere.  If

> there were cases where that wasn't ideal (the layering violations you

> mention later?) we should fix them up instead.

> 

> But given that this is a standalone file, it's easy to fix up however we want

> incrementally, as long as things work well to begin with and it's reasonably

> easy to review it...


Hi Jesse,

I spoke with Chris on IRC, and he told me that he is currently rewriting this stuff to show (and prove) what his approach is.

In the meantime, I'll keep working on the review comments I have received from Brad and Daniel and I'll also send some early patches with prep-work that (hopefully) can be merged without too much fuss.

-- Oscar
Daniel Vetter July 7, 2014, 12:39 p.m. UTC | #16
On Tue, Jun 24, 2014 at 12:29:45PM +0000, Mateo Lozano, Oscar wrote:
> > -----Original Message-----
> > From: Volkin, Bradley D
> > Sent: Monday, June 23, 2014 8:10 PM
> > To: Mateo Lozano, Oscar
> > Cc: Chris Wilson; intel-gfx@lists.freedesktop.org
> > Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical ring
> > submission mechanism
> > There are 3 cases of non-execbuffer submissions that I can think of: flips,
> > render state, and clear-buffer (proposed patches on the list). I wonder if the
> > right approach might be to use batchbuffers with a small wrapper around the
> > dispatch_execbuffer/emit_bb_start vfuncs. Basically the rule would be to
> > only touch a ringbuffer from within the intel_engine_cs vfuncs, which always
> > know which set of functions to use.
> > 
> > For flips, we could use MMIO flips. Render state already uses the existing
> > dispatch_execbuffer() and add_request(). The clear code could potentially do
> > the same. There would obviously be some overhead in using a batch buffer
> > for what could end up being just a few commands. Perhaps the batch buffer
> > pool code from the command parser would help though.
> 
> This has another positive side-effect: the scheduler guys do not like
> things inside the ring without a proper batchbuffer & request, because
> it makes their life more complex.

I'm probably missing all the context here but I've thought this is the
plan forward: We'll use mmio flips with execlists and otherwise we'll
submit everything with the right context/engine whaterever using
execlist-specific functions.

Since the gpu clear code isn't merged yet we can imo ignore it for now and
merge execlists first.
-Daniel
Daniel Vetter July 7, 2014, 12:41 p.m. UTC | #17
On Mon, Jun 23, 2014 at 02:13:55PM +0100, Chris Wilson wrote:
> On Mon, Jun 23, 2014 at 01:09:37PM +0000, Mateo Lozano, Oscar wrote:
> > So far, yes, but that's only because I artificially made intel_lrc.c self-contained, as Daniel requested. What if we need to execute commands from somewhere else, like in intel_gen7_queue_flip()?
> > 
> > And this takes me to another discussion: this logical ring vs legacy ring split is probably a good idea (time will tell), but we should provide a way of sending commands for execution without knowing if Execlists are enabled or not. In the early series that was easy because we reused the ring_begin, ring_emit & ring_advance functions, but this is not the case anymore. And without this, sooner or later somebody will break legacy or execlists (this already happened last week, when somebody here was implementing native sync without knowing about Execlists).
> > 
> > So, the questions is: how do you feel about a dev_priv.gt vfunc that takes a context, a ring, an array of DWORDS and a BB length and does the intel_(logical)_ring_begin/emit/advance based on i915.enable_execlists?
> 
> I'm still baffled by the design. intel_ring_begin() and friends should
> be able to find their context (logical or legacy) from the ring and
> dtrt.

Well I'm opting for a the different approach of presuming that the callers
knows whether we're running with execlists or legacy rings and so will
have a clean (and full) split. If we really need to submit massive amounts
of cs commands from the kernel we should launch that as a batch, which
should be fairly unform for both legacy ring and execlists mode.
-Daniel
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 02fc3d0..89aed7a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -86,6 +86,220 @@  bool intel_enable_execlists(struct drm_device *dev)
 	return HAS_LOGICAL_RING_CONTEXTS(dev) && USES_PPGTT(dev);
 }
 
+/*
+ * Look up the ringbuffer backing @ctx on @ring: with Execlists every
+ * (engine, context) pair owns its own ringbuffer, stored per-engine
+ * inside the context.
+ */
+static inline struct intel_ringbuffer *
+logical_ringbuf_get(struct intel_engine_cs *ring, struct intel_context *ctx)
+{
+	return ctx->engine[ring->id].ringbuf;
+}
+
+/*
+ * Wrap the ringbuffer tail (mask it to the ring size) and hand the
+ * context over to the hardware through the engine's submit_ctx vfunc,
+ * unless the ring has been stopped (e.g. during reset handling).
+ */
+void intel_logical_ring_advance_and_submit(struct intel_engine_cs *ring,
+					   struct intel_context *ctx)
+{
+	struct intel_ringbuffer *ringbuf = logical_ringbuf_get(ring, ctx);
+
+	intel_logical_ring_advance(ringbuf);
+
+	/* Don't submit while the ring is stopped; tail stays recorded */
+	if (intel_ring_stopped(ring))
+		return;
+
+	ring->submit_ctx(ring, ctx, ringbuf->tail);
+}
+
+/*
+ * Lazily reserve a seqno and preallocate the request structure before
+ * any ringbuffer writes happen, so that the later add_request path
+ * cannot fail with -ENOMEM after commands are already emitted.
+ * Returns 0 on success or a negative error code.
+ * NOTE(review): @ctx is currently unused here — clone of the legacy
+ * ring_alloc_seqno; confirm whether per-context state is needed.
+ */
+static int logical_ring_alloc_seqno(struct intel_engine_cs *ring,
+				    struct intel_context *ctx)
+{
+	/* A seqno is already outstanding for this ring: nothing to do */
+	if (ring->outstanding_lazy_seqno)
+		return 0;
+
+	if (ring->preallocated_lazy_request == NULL) {
+		struct drm_i915_gem_request *request;
+
+		request = kmalloc(sizeof(*request), GFP_KERNEL);
+		if (request == NULL)
+			return -ENOMEM;
+
+		ring->preallocated_lazy_request = request;
+	}
+
+	return i915_gem_get_seqno(ring->dev, &ring->outstanding_lazy_seqno);
+}
+
+/*
+ * Try to free @bytes of ringbuffer space by retiring completed requests:
+ * first reuse any space already released by a previous retire
+ * (last_retired_head), then find the oldest outstanding request whose
+ * completion would free enough space and wait for its seqno.
+ * Returns 0 on success, -ENOSPC if no queued request would free enough
+ * space, or the error from i915_wait_seqno.
+ */
+static int logical_ring_wait_request(struct intel_engine_cs *ring,
+				     struct intel_ringbuffer *ringbuf,
+				     struct intel_context *ctx,
+				     int bytes)
+{
+	struct drm_i915_gem_request *request;
+	u32 seqno = 0;
+	int ret;
+
+	/* Consume space already freed by earlier request retirement */
+	if (ringbuf->last_retired_head != -1) {
+		ringbuf->head = ringbuf->last_retired_head;
+		ringbuf->last_retired_head = -1;
+
+		ringbuf->space = intel_ring_space(ringbuf);
+		if (ringbuf->space >= bytes)
+			return 0;
+	}
+
+	/* Oldest-first scan for a request whose tail frees enough room */
+	list_for_each_entry(request, &ring->request_list, list) {
+		if (__intel_ring_space(request->tail, ringbuf->tail,
+				ringbuf->size) >= bytes) {
+			seqno = request->seqno;
+			break;
+		}
+	}
+
+	if (seqno == 0)
+		return -ENOSPC;
+
+	ret = i915_wait_seqno(ring, seqno);
+	if (ret)
+		return ret;
+
+	/* TODO: make sure we update the right ringbuffer's last_retired_head
+	 * when retiring requests */
+	i915_gem_retire_requests_ring(ring);
+	ringbuf->head = ringbuf->last_retired_head;
+	ringbuf->last_retired_head = -1;
+
+	ringbuf->space = intel_ring_space(ringbuf);
+	return 0;
+}
+
+/*
+ * Wait until @bytes of space are available in @ringbuf: first try to
+ * reclaim space by retiring requests; failing that, force-submit the
+ * pending context and busy-poll the hardware HEAD (sleeping 1ms per
+ * iteration) until space appears, a signal arrives, the GPU wedges, or
+ * a 60s timeout expires.
+ * NOTE(review): this reads the ring HEAD register directly; with
+ * Execlists the up-to-date head may live in the context image — confirm.
+ */
+static int logical_ring_wait_for_space(struct intel_engine_cs *ring,
+						   struct intel_ringbuffer *ringbuf,
+						   struct intel_context *ctx,
+						   int bytes)
+{
+	struct drm_device *dev = ring->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	unsigned long end;
+	int ret;
+
+	ret = logical_ring_wait_request(ring, ringbuf, ctx, bytes);
+	if (ret != -ENOSPC)
+		return ret;
+
+	/* Force the context submission in case we have been skipping it */
+	intel_logical_ring_advance_and_submit(ring, ctx);
+
+	/* With GEM the hangcheck timer should kick us out of the loop,
+	 * leaving it early runs the risk of corrupting GEM state (due
+	 * to running on almost untested codepaths). But on resume
+	 * timers don't work yet, so prevent a complete hang in that
+	 * case by choosing an insanely large timeout. */
+	end = jiffies + 60 * HZ;
+
+	do {
+		/* Re-sample the hardware head and recompute free space */
+		ringbuf->head = I915_READ_HEAD(ring);
+		ringbuf->space = intel_ring_space(ringbuf);
+		if (ringbuf->space >= bytes) {
+			ret = 0;
+			break;
+		}
+
+		/* UMS only: report the wait in the sarea perf boxes */
+		if (!drm_core_check_feature(dev, DRIVER_MODESET) &&
+		    dev->primary->master) {
+			struct drm_i915_master_private *master_priv = dev->primary->master->driver_priv;
+			if (master_priv->sarea_priv)
+				master_priv->sarea_priv->perf_boxes |= I915_BOX_WAIT;
+		}
+
+		msleep(1);
+
+		if (dev_priv->mm.interruptible && signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		/* Bail out if the GPU has wedged in the meantime */
+		ret = i915_gem_check_wedge(&dev_priv->gpu_error,
+					   dev_priv->mm.interruptible);
+		if (ret)
+			break;
+
+		if (time_after(jiffies, end)) {
+			ret = -EBUSY;
+			break;
+		}
+	} while (1);
+
+	return ret;
+}
+
+/*
+ * Fill the remainder of the ringbuffer (from tail to the end) with
+ * MI_NOOPs and wrap tail back to 0, waiting for that much space to be
+ * free first. Called when a pending write would not fit before the
+ * physical end of the buffer.
+ * Returns 0 on success or the error from logical_ring_wait_for_space.
+ */
+static int logical_ring_wrap_buffer(struct intel_engine_cs *ring,
+						struct intel_ringbuffer *ringbuf,
+						struct intel_context *ctx)
+{
+	uint32_t __iomem *virt;
+	int rem = ringbuf->size - ringbuf->tail;
+
+	if (ringbuf->space < rem) {
+		int ret = logical_ring_wait_for_space(ring, ringbuf, ctx, rem);
+		if (ret)
+			return ret;
+	}
+
+	/* Pad to the end of the buffer, one dword at a time */
+	virt = ringbuf->virtual_start + ringbuf->tail;
+	rem /= 4;
+	while (rem--)
+		iowrite32(MI_NOOP, virt++);
+
+	ringbuf->tail = 0;
+	ringbuf->space = intel_ring_space(ringbuf);
+
+	return 0;
+}
+
+/*
+ * Make sure @bytes can be written contiguously at the current tail:
+ * wrap the buffer if the write would cross effective_size, then wait
+ * for enough free space. Returns 0 on success or a negative error code.
+ */
+static int logical_ring_prepare(struct intel_engine_cs *ring,
+				struct intel_ringbuffer *ringbuf,
+				struct intel_context *ctx,
+				int bytes)
+{
+	int ret;
+
+	if (unlikely(ringbuf->tail + bytes > ringbuf->effective_size)) {
+		ret = logical_ring_wrap_buffer(ring, ringbuf, ctx);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	if (unlikely(ringbuf->space < bytes)) {
+		ret = logical_ring_wait_for_space(ring, ringbuf, ctx, bytes);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Execlists counterpart of intel_ring_begin: reserve @num_dwords dwords
+ * in the ringbuffer of (@ring, @ctx) so the caller may emit that many
+ * commands. Checks for a wedged GPU, waits/wraps for space, and
+ * preallocates the lazy request/seqno before any dword is written.
+ * Returns 0 on success or a negative error code.
+ */
+int intel_logical_ring_begin(struct intel_engine_cs *ring,
+			     struct intel_context *ctx,
+			     int num_dwords)
+{
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	struct intel_ringbuffer *ringbuf = logical_ringbuf_get(ring, ctx);
+	int ret;
+
+	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
+				   dev_priv->mm.interruptible);
+	if (ret)
+		return ret;
+
+	ret = logical_ring_prepare(ring, ringbuf, ctx,
+			num_dwords * sizeof(uint32_t));
+	if (ret)
+		return ret;
+
+	/* Preallocate the olr before touching the ring */
+	ret = logical_ring_alloc_seqno(ring, ctx);
+	if (ret)
+		return ret;
+
+	/* Account for the reservation; emit will advance tail into it */
+	ringbuf->space -= num_dwords * sizeof(uint32_t);
+	return 0;
+}
+
 static int gen8_init_common_ring(struct intel_engine_cs *ring)
 {
 	struct drm_device *dev = ring->dev;
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 26b0949..686ebf5 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -5,6 +5,24 @@ 
 void intel_logical_ring_cleanup(struct intel_engine_cs *ring);
 int intel_logical_rings_init(struct drm_device *dev);
 
+void intel_logical_ring_advance_and_submit(struct intel_engine_cs *ring,
+					   struct intel_context *ctx);
+
+/*
+ * Mask the already-incremented tail to the ring size (power-of-two
+ * wrap). The tail itself is advanced by intel_logical_ring_emit.
+ */
+static inline void intel_logical_ring_advance(struct intel_ringbuffer *ringbuf)
+{
+	ringbuf->tail &= ringbuf->size - 1;
+}
+
+/*
+ * Write one dword at the current tail of the (io-mapped) ringbuffer
+ * and bump tail by 4 bytes. Space must have been reserved beforehand
+ * with intel_logical_ring_begin.
+ */
+static inline void intel_logical_ring_emit(struct intel_ringbuffer *ringbuf, u32 data)
+{
+	iowrite32(data, ringbuf->virtual_start + ringbuf->tail);
+	ringbuf->tail += 4;
+}
+
+int intel_logical_ring_begin(struct intel_engine_cs *ring,
+			     struct intel_context *ctx,
+			     int num_dwords);
+
 /* Logical Ring Contexts */
 void intel_lr_context_free(struct intel_context *ctx);
 int intel_lr_context_deferred_create(struct intel_context *ctx,