drm/i915: Lazily apply the SNB+ seqno w/a

Message ID	1344506310-18191-1-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces+patchwork-intel-gfx=patchwork.kernel.org@lists.freedesktop.org> From: Chris Wilson <chris@chris-wilson.co.uk> To: intel-gfx@lists.freedesktop.org Date: Thu, 9 Aug 2012 10:58:30 +0100 Message-Id: <1344506310-18191-1-git-send-email-chris@chris-wilson.co.uk> In-Reply-To: <1344503896-18093-1-git-send-email-chris@chris-wilson.co.uk> References: <1344503896-18093-1-git-send-email-chris@chris-wilson.co.uk> Subject: [Intel-gfx] [PATCH] drm/i915: Lazily apply the SNB+ seqno w/a Precedence: list MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Sender: intel-gfx-bounces+patchwork-intel-gfx=patchwork.kernel.org@lists.freedesktop.org Errors-To: intel-gfx-bounces+patchwork-intel-gfx=patchwork.kernel.org@lists.freedesktop.org

Message ID

1344506310-18191-1-git-send-email-chris@chris-wilson.co.uk (mailing list archive)

State

New, archived

Headers

From: Chris Wilson <chris@chris-wilson.co.uk>
To: intel-gfx@lists.freedesktop.org
Date: Thu,  9 Aug 2012 10:58:30 +0100
Message-Id: <1344506310-18191-1-git-send-email-chris@chris-wilson.co.uk>
In-Reply-To: <1344503896-18093-1-git-send-email-chris@chris-wilson.co.uk>
References: <1344503896-18093-1-git-send-email-chris@chris-wilson.co.uk>
Subject: [Intel-gfx] [PATCH] drm/i915: Lazily apply the SNB+ seqno w/a
Precedence: list
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Sender: intel-gfx-bounces+patchwork-intel-gfx=patchwork.kernel.org@lists.freedesktop.org
Errors-To: intel-gfx-bounces+patchwork-intel-gfx=patchwork.kernel.org@lists.freedesktop.org

Commit Message

Chris Wilson Aug. 9, 2012, 9:58 a.m. UTC

Avoid the forcewake overhead when simply retiring requests, as often the
last seen seqno is good enough to satisfy the retirment process and will
be promptly re-run in any case. Only ensure that we force the coherent
seqno read when we are explicitly waiting upon a completion event to be
sure that none go missing, and also for when we are reporting seqno
values in case of error or debugging.

This greatly reduces the load for userspace using the busy-ioctl to
track active buffers, for instance halving the CPU used by X in pushing
the pixels from a software render (flash). The effect will be even more
magnified with userptr and so providing a zero-copy upload path in that
instance, or in similar instances where X is simply compositing DRI
buffers.

v2: Reverse the polarity of the tachyon stream. Daniel suggested that
'force' was too generic for the parameter name and that 'lazy_coherency'
better encapsulated the semantics of it being an optimization and its
purpose. Also notice that gen6_get_seqno() is only used by gen6/7
chipsets and so the test for IS_GEN6 || IS_GEN7 is redundant in that
function.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c     |    2 +-
 drivers/gpu/drm/i915/i915_gem.c         |    6 +++---
 drivers/gpu/drm/i915/i915_irq.c         |    9 +++++----
 drivers/gpu/drm/i915/intel_ringbuffer.c |   10 ++++------
 drivers/gpu/drm/i915/intel_ringbuffer.h |    9 ++++++++-
 5 files changed, 21 insertions(+), 15 deletions(-)

Comments

Daniel Vetter Aug. 9, 2012, 10:29 a.m. UTC | #1

On Thu, Aug 09, 2012 at 10:58:30AM +0100, Chris Wilson wrote:
> Avoid the forcewake overhead when simply retiring requests, as often the
> last seen seqno is good enough to satisfy the retirment process and will
> be promptly re-run in any case. Only ensure that we force the coherent
> seqno read when we are explicitly waiting upon a completion event to be
> sure that none go missing, and also for when we are reporting seqno
> values in case of error or debugging.
> 
> This greatly reduces the load for userspace using the busy-ioctl to
> track active buffers, for instance halving the CPU used by X in pushing
> the pixels from a software render (flash). The effect will be even more
> magnified with userptr and so providing a zero-copy upload path in that
> instance, or in similar instances where X is simply compositing DRI
> buffers.
> 
> v2: Reverse the polarity of the tachyon stream. Daniel suggested that
> 'force' was too generic for the parameter name and that 'lazy_coherency'
> better encapsulated the semantics of it being an optimization and its
> purpose. Also notice that gen6_get_seqno() is only used by gen6/7
> chipsets and so the test for IS_GEN6 || IS_GEN7 is redundant in that
> function.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Yeah, I like the new color.

Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>

I'll muse over this some more before picking it up, just in case I'll
notice a place this could blow up ...
-Daniel

> ---
>  drivers/gpu/drm/i915/i915_debugfs.c     |    2 +-
>  drivers/gpu/drm/i915/i915_gem.c         |    6 +++---
>  drivers/gpu/drm/i915/i915_irq.c         |    9 +++++----
>  drivers/gpu/drm/i915/intel_ringbuffer.c |   10 ++++------
>  drivers/gpu/drm/i915/intel_ringbuffer.h |    9 ++++++++-
>  5 files changed, 21 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 86444fe..544abf5 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -426,7 +426,7 @@ static void i915_ring_seqno_info(struct seq_file *m,
>  {
>  	if (ring->get_seqno) {
>  		seq_printf(m, "Current sequence (%s): %d\n",
> -			   ring->name, ring->get_seqno(ring));
> +			   ring->name, ring->get_seqno(ring, false));
>  	}
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 153c533..0582e22 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1888,7 +1888,7 @@ i915_gem_retire_requests_ring(struct intel_ring_buffer *ring)
>  
>  	WARN_ON(i915_verify_lists(ring->dev));
>  
> -	seqno = ring->get_seqno(ring);
> +	seqno = ring->get_seqno(ring, true);
>  
>  	for (i = 0; i < ARRAY_SIZE(ring->sync_seqno); i++)
>  		if (seqno >= ring->sync_seqno[i])
> @@ -2060,7 +2060,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
>  	bool wait_forever = true;
>  	int ret;
>  
> -	if (i915_seqno_passed(ring->get_seqno(ring), seqno))
> +	if (i915_seqno_passed(ring->get_seqno(ring, true), seqno))
>  		return 0;
>  
>  	trace_i915_gem_request_wait_begin(ring, seqno);
> @@ -2079,7 +2079,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
>  	getrawmonotonic(&before);
>  
>  #define EXIT_COND \
> -	(i915_seqno_passed(ring->get_seqno(ring), seqno) || \
> +	(i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \
>  	atomic_read(&dev_priv->mm.wedged))
>  	do {
>  		if (interruptible)
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 57e4f2b..0ba15e8 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -335,7 +335,7 @@ static void notify_ring(struct drm_device *dev,
>  	if (ring->obj == NULL)
>  		return;
>  
> -	trace_i915_gem_request_complete(ring, ring->get_seqno(ring));
> +	trace_i915_gem_request_complete(ring, ring->get_seqno(ring, false));
>  
>  	wake_up_all(&ring->irq_queue);
>  	if (i915_enable_hangcheck) {
> @@ -1052,7 +1052,7 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
>  	if (!ring->get_seqno)
>  		return NULL;
>  
> -	seqno = ring->get_seqno(ring);
> +	seqno = ring->get_seqno(ring, false);
>  	list_for_each_entry(obj, &dev_priv->mm.active_list, mm_list) {
>  		if (obj->ring != ring)
>  			continue;
> @@ -1106,7 +1106,7 @@ static void i915_record_ring_state(struct drm_device *dev,
>  
>  	error->waiting[ring->id] = waitqueue_active(&ring->irq_queue);
>  	error->instpm[ring->id] = I915_READ(RING_INSTPM(ring->mmio_base));
> -	error->seqno[ring->id] = ring->get_seqno(ring);
> +	error->seqno[ring->id] = ring->get_seqno(ring, false);
>  	error->acthd[ring->id] = intel_ring_get_active_head(ring);
>  	error->head[ring->id] = I915_READ_HEAD(ring);
>  	error->tail[ring->id] = I915_READ_TAIL(ring);
> @@ -1603,7 +1603,8 @@ ring_last_seqno(struct intel_ring_buffer *ring)
>  static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring, bool *err)
>  {
>  	if (list_empty(&ring->request_list) ||
> -	    i915_seqno_passed(ring->get_seqno(ring), ring_last_seqno(ring))) {
> +	    i915_seqno_passed(ring->get_seqno(ring, false),
> +			      ring_last_seqno(ring))) {
>  		/* Issue a wake-up to catch stuck h/w. */
>  		if (waitqueue_active(&ring->irq_queue)) {
>  			DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 6c0f504..c77bcd8 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -618,26 +618,24 @@ pc_render_add_request(struct intel_ring_buffer *ring,
>  }
>  
>  static u32
> -gen6_ring_get_seqno(struct intel_ring_buffer *ring)
> +gen6_ring_get_seqno(struct intel_ring_buffer *ring, bool lazy_coherency)
>  {
> -	struct drm_device *dev = ring->dev;
> -
>  	/* Workaround to force correct ordering between irq and seqno writes on
>  	 * ivb (and maybe also on snb) by reading from a CS register (like
>  	 * ACTHD) before reading the status page. */
> -	if (IS_GEN6(dev) || IS_GEN7(dev))
> +	if (!lazy_coherency)
>  		intel_ring_get_active_head(ring);
>  	return intel_read_status_page(ring, I915_GEM_HWS_INDEX);
>  }
>  
>  static u32
> -ring_get_seqno(struct intel_ring_buffer *ring)
> +ring_get_seqno(struct intel_ring_buffer *ring, bool lazy_coherency)
>  {
>  	return intel_read_status_page(ring, I915_GEM_HWS_INDEX);
>  }
>  
>  static u32
> -pc_render_get_seqno(struct intel_ring_buffer *ring)
> +pc_render_get_seqno(struct intel_ring_buffer *ring, bool lazy_coherency)
>  {
>  	struct pipe_control *pc = ring->private;
>  	return pc->cpu_page[0];
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 8b2b92e..2ea7a31 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -72,7 +72,14 @@ struct  intel_ring_buffer {
>  				  u32	flush_domains);
>  	int		(*add_request)(struct intel_ring_buffer *ring,
>  				       u32 *seqno);
> -	u32		(*get_seqno)(struct intel_ring_buffer *ring);
> +	/* Some chipsets are not quite as coherent as advertised and need
> +	 * an expensive kick to force a true read of the up-to-date seqno.
> +	 * However, the up-to-date seqno is not always required and the last
> +	 * seen value is good enough. Note that the seqno will always be
> +	 * monotonic, even if not coherent.
> +	 */
> +	u32		(*get_seqno)(struct intel_ring_buffer *ring,
> +				     bool lazy_coherency);
>  	int		(*dispatch_execbuffer)(struct intel_ring_buffer *ring,
>  					       u32 offset, u32 length);
>  	void		(*cleanup)(struct intel_ring_buffer *ring);
> -- 
> 1.7.10.4
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Daniel Vetter Aug. 10, 2012, 9:38 a.m. UTC | #2

On Thu, Aug 09, 2012 at 12:29:16PM +0200, Daniel Vetter wrote:
> On Thu, Aug 09, 2012 at 10:58:30AM +0100, Chris Wilson wrote:
> > Avoid the forcewake overhead when simply retiring requests, as often the
> > last seen seqno is good enough to satisfy the retirment process and will
> > be promptly re-run in any case. Only ensure that we force the coherent
> > seqno read when we are explicitly waiting upon a completion event to be
> > sure that none go missing, and also for when we are reporting seqno
> > values in case of error or debugging.
> > 
> > This greatly reduces the load for userspace using the busy-ioctl to
> > track active buffers, for instance halving the CPU used by X in pushing
> > the pixels from a software render (flash). The effect will be even more
> > magnified with userptr and so providing a zero-copy upload path in that
> > instance, or in similar instances where X is simply compositing DRI
> > buffers.
> > 
> > v2: Reverse the polarity of the tachyon stream. Daniel suggested that
> > 'force' was too generic for the parameter name and that 'lazy_coherency'
> > better encapsulated the semantics of it being an optimization and its
> > purpose. Also notice that gen6_get_seqno() is only used by gen6/7
> > chipsets and so the test for IS_GEN6 || IS_GEN7 is redundant in that
> > function.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Yeah, I like the new color.
> 
> Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> 
> I'll muse over this some more before picking it up, just in case I'll
> notice a place this could blow up ...

Ok, slurped into dinq, thanks for the patch.
-Daniel

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 86444fe..544abf5 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -426,7 +426,7 @@  static void i915_ring_seqno_info(struct seq_file *m,
 {
 	if (ring->get_seqno) {
 		seq_printf(m, "Current sequence (%s): %d\n",
-			   ring->name, ring->get_seqno(ring));
+			   ring->name, ring->get_seqno(ring, false));
 	}
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 153c533..0582e22 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1888,7 +1888,7 @@  i915_gem_retire_requests_ring(struct intel_ring_buffer *ring)
 
 	WARN_ON(i915_verify_lists(ring->dev));
 
-	seqno = ring->get_seqno(ring);
+	seqno = ring->get_seqno(ring, true);
 
 	for (i = 0; i < ARRAY_SIZE(ring->sync_seqno); i++)
 		if (seqno >= ring->sync_seqno[i])
@@ -2060,7 +2060,7 @@  static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
 	bool wait_forever = true;
 	int ret;
 
-	if (i915_seqno_passed(ring->get_seqno(ring), seqno))
+	if (i915_seqno_passed(ring->get_seqno(ring, true), seqno))
 		return 0;
 
 	trace_i915_gem_request_wait_begin(ring, seqno);
@@ -2079,7 +2079,7 @@  static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
 	getrawmonotonic(&before);
 
 #define EXIT_COND \
-	(i915_seqno_passed(ring->get_seqno(ring), seqno) || \
+	(i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \
 	atomic_read(&dev_priv->mm.wedged))
 	do {
 		if (interruptible)
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 57e4f2b..0ba15e8 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -335,7 +335,7 @@  static void notify_ring(struct drm_device *dev,
 	if (ring->obj == NULL)
 		return;
 
-	trace_i915_gem_request_complete(ring, ring->get_seqno(ring));
+	trace_i915_gem_request_complete(ring, ring->get_seqno(ring, false));
 
 	wake_up_all(&ring->irq_queue);
 	if (i915_enable_hangcheck) {
@@ -1052,7 +1052,7 @@  i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
 	if (!ring->get_seqno)
 		return NULL;
 
-	seqno = ring->get_seqno(ring);
+	seqno = ring->get_seqno(ring, false);
 	list_for_each_entry(obj, &dev_priv->mm.active_list, mm_list) {
 		if (obj->ring != ring)
 			continue;
@@ -1106,7 +1106,7 @@  static void i915_record_ring_state(struct drm_device *dev,
 
 	error->waiting[ring->id] = waitqueue_active(&ring->irq_queue);
 	error->instpm[ring->id] = I915_READ(RING_INSTPM(ring->mmio_base));
-	error->seqno[ring->id] = ring->get_seqno(ring);
+	error->seqno[ring->id] = ring->get_seqno(ring, false);
 	error->acthd[ring->id] = intel_ring_get_active_head(ring);
 	error->head[ring->id] = I915_READ_HEAD(ring);
 	error->tail[ring->id] = I915_READ_TAIL(ring);
@@ -1603,7 +1603,8 @@  ring_last_seqno(struct intel_ring_buffer *ring)
 static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring, bool *err)
 {
 	if (list_empty(&ring->request_list) ||
-	    i915_seqno_passed(ring->get_seqno(ring), ring_last_seqno(ring))) {
+	    i915_seqno_passed(ring->get_seqno(ring, false),
+			      ring_last_seqno(ring))) {
 		/* Issue a wake-up to catch stuck h/w. */
 		if (waitqueue_active(&ring->irq_queue)) {
 			DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 6c0f504..c77bcd8 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -618,26 +618,24 @@  pc_render_add_request(struct intel_ring_buffer *ring,
 }
 
 static u32
-gen6_ring_get_seqno(struct intel_ring_buffer *ring)
+gen6_ring_get_seqno(struct intel_ring_buffer *ring, bool lazy_coherency)
 {
-	struct drm_device *dev = ring->dev;
-
 	/* Workaround to force correct ordering between irq and seqno writes on
 	 * ivb (and maybe also on snb) by reading from a CS register (like
 	 * ACTHD) before reading the status page. */
-	if (IS_GEN6(dev) || IS_GEN7(dev))
+	if (!lazy_coherency)
 		intel_ring_get_active_head(ring);
 	return intel_read_status_page(ring, I915_GEM_HWS_INDEX);
 }
 
 static u32
-ring_get_seqno(struct intel_ring_buffer *ring)
+ring_get_seqno(struct intel_ring_buffer *ring, bool lazy_coherency)
 {
 	return intel_read_status_page(ring, I915_GEM_HWS_INDEX);
 }
 
 static u32
-pc_render_get_seqno(struct intel_ring_buffer *ring)
+pc_render_get_seqno(struct intel_ring_buffer *ring, bool lazy_coherency)
 {
 	struct pipe_control *pc = ring->private;
 	return pc->cpu_page[0];
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 8b2b92e..2ea7a31 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -72,7 +72,14 @@  struct  intel_ring_buffer {
 				  u32	flush_domains);
 	int		(*add_request)(struct intel_ring_buffer *ring,
 				       u32 *seqno);
-	u32		(*get_seqno)(struct intel_ring_buffer *ring);
+	/* Some chipsets are not quite as coherent as advertised and need
+	 * an expensive kick to force a true read of the up-to-date seqno.
+	 * However, the up-to-date seqno is not always required and the last
+	 * seen value is good enough. Note that the seqno will always be
+	 * monotonic, even if not coherent.
+	 */
+	u32		(*get_seqno)(struct intel_ring_buffer *ring,
+				     bool lazy_coherency);
 	int		(*dispatch_execbuffer)(struct intel_ring_buffer *ring,
 					       u32 offset, u32 length);
 	void		(*cleanup)(struct intel_ring_buffer *ring);

drm/i915: Lazily apply the SNB+ seqno w/a

Commit Message

Comments

Patch