diff mbox

[17/70] drm/i915: Optimistically spin for the request completion

Message ID 1428420094-18352-18-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson April 7, 2015, 3:20 p.m. UTC
This provides a nice boost to mesa in swap bound scenarios (as mesa
throttles itself to the previous frame and given the scenario that will
complete shortly). It will also provide a good boost to systems running
with semaphores disabled and so frequently waiting on the GPU as it
switches rings. In the most favourable of microbenchmarks, this can
increase performance by around 15% - though in practice improvements
will be marginal and rarely noticeable.

v2: Account for user timeouts
v3: Limit the spinning to a single jiffie (~1us) at most. On an
otherwise idle system, there is no scheduler contention and so without a
limit we would spin until the GPU is ready.
v4: Drop forcewake - the lazy coherent access doesn't require it, and we
have no reason to believe that the forcewake itself improves seqno
coherency - it only adds delay.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c | 44 +++++++++++++++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 6 deletions(-)

Comments

Daniel Vetter April 8, 2015, 11:39 a.m. UTC | #1
On Tue, Apr 07, 2015 at 04:20:41PM +0100, Chris Wilson wrote:
> This provides a nice boost to mesa in swap bound scenarios (as mesa
> throttles itself to the previous frame and given the scenario that will
> complete shortly). It will also provide a good boost to systems running
> with semaphores disabled and so frequently waiting on the GPU as it
> switches rings. In the most favourable of microbenchmarks, this can
> increase performance by around 15% - though in practice improvements
> will be marginal and rarely noticeable.
> 
> v2: Account for user timeouts
> v3: Limit the spinning to a single jiffie (~1us) at most. On an
> otherwise idle system, there is no scheduler contention and so without a
> limit we would spin until the GPU is ready.
> v4: Drop forcewake - the lazy coherent access doesn't require it, and we
> have no reason to believe that the forcewake itself improves seqno
> coherency - it only adds delay.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>

Eero/Valtterri, do you have perf data for this one?

Thanks, Daniel

> ---
>  drivers/gpu/drm/i915/i915_gem.c | 44 +++++++++++++++++++++++++++++++++++------
>  1 file changed, 38 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index c7d9ee2f708a..47650327204e 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1181,6 +1181,29 @@ static bool missed_irq(struct drm_i915_private *dev_priv,
>  	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
>  }
>  
> +static int __i915_spin_request(struct drm_i915_gem_request *rq)
> +{
> +	unsigned long timeout;
> +
> +	if (i915_gem_request_get_ring(rq)->irq_refcount)
> +		return -EBUSY;
> +
> +	timeout = jiffies + 1;
> +	while (!need_resched()) {
> +		if (i915_gem_request_completed(rq, true))
> +			return 0;
> +
> +		if (time_after_eq(jiffies, timeout))
> +			break;
> +
> +		cpu_relax_lowlatency();
> +	}
> +	if (i915_gem_request_completed(rq, false))
> +		return 0;
> +
> +	return -EAGAIN;
> +}
> +
>  /**
>   * __i915_wait_request - wait until execution of request has finished
>   * @req: duh!
> @@ -1225,12 +1248,20 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>  	if (INTEL_INFO(dev)->gen >= 6)
>  		gen6_rps_boost(dev_priv, file_priv);
>  
> -	if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring)))
> -		return -ENODEV;
> -
>  	/* Record current time in case interrupted by signal, or wedged */
>  	trace_i915_gem_request_wait_begin(req);
>  	before = ktime_get_raw_ns();
> +
> +	/* Optimistic spin for the next jiffie before touching IRQs */
> +	ret = __i915_spin_request(req);
> +	if (ret == 0)
> +		goto out;
> +
> +	if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring))) {
> +		ret = -ENODEV;
> +		goto out;
> +	}
> +
>  	for (;;) {
>  		struct timer_list timer;
>  
> @@ -1279,14 +1310,15 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>  			destroy_timer_on_stack(&timer);
>  		}
>  	}
> -	now = ktime_get_raw_ns();
> -	trace_i915_gem_request_wait_end(req);
> -
>  	if (!irq_test_in_progress)
>  		ring->irq_put(ring);
>  
>  	finish_wait(&ring->irq_queue, &wait);
>  
> +out:
> +	now = ktime_get_raw_ns();
> +	trace_i915_gem_request_wait_end(req);
> +
>  	if (timeout) {
>  		s64 tres = *timeout - (now - before);
>  
> -- 
> 2.1.4
>
Rantala, Valtteri April 8, 2015, 1:43 p.m. UTC | #2
Hi, 


> -----Original Message-----
> From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of Daniel Vetter
> Sent: Wednesday, April 08, 2015 2:40 PM
> To: Chris Wilson
> Cc: intel-gfx@lists.freedesktop.org; Daniel Vetter; Tvrtko Ursulin; Tamminen,
> Eero T; Rantala, Valtteri
> Subject: Re: [PATCH 17/70] drm/i915: Optimistically spin for the request
> completion
> 
> On Tue, Apr 07, 2015 at 04:20:41PM +0100, Chris Wilson wrote:
> > This provides a nice boost to mesa in swap bound scenarios (as mesa
> > throttles itself to the previous frame and given the scenario that
> > will complete shortly). It will also provide a good boost to systems
> > running with semaphores disabled and so frequently waiting on the GPU
> > as it switches rings. In the most favourable of microbenchmarks, this
> > can increase performance by around 15% - though in practice
> > improvements will be marginal and rarely noticeable.
> >
> > v2: Account for user timeouts
> > v3: Limit the spinning to a single jiffie (~1us) at most. On an
> > otherwise idle system, there is no scheduler contention and so without
> > a limit we would spin until the GPU is ready.
> > v4: Drop forcewake - the lazy coherent access doesn't require it, and
> > we have no reason to believe that the forcewake itself improves seqno
> > coherency - it only adds delay.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> > Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> > Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>
> 
> Eero/Valtterri, do you have perf data for this one?
> 
[Rantala, Valtteri] 
I have issues with applying this patch to latest night, have to check that out.

Can you provide Git/branch that I could use?

--
Valtteri

> Thanks, Daniel
> 
> > ---
> >  drivers/gpu/drm/i915/i915_gem.c | 44
> > +++++++++++++++++++++++++++++++++++------
> >  1 file changed, 38 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c
> > b/drivers/gpu/drm/i915/i915_gem.c index c7d9ee2f708a..47650327204e
> > 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -1181,6 +1181,29 @@ static bool missed_irq(struct drm_i915_private
> *dev_priv,
> >  	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
> >  }
> >
> > +static int __i915_spin_request(struct drm_i915_gem_request *rq) {
> > +	unsigned long timeout;
> > +
> > +	if (i915_gem_request_get_ring(rq)->irq_refcount)
> > +		return -EBUSY;
> > +
> > +	timeout = jiffies + 1;
> > +	while (!need_resched()) {
> > +		if (i915_gem_request_completed(rq, true))
> > +			return 0;
> > +
> > +		if (time_after_eq(jiffies, timeout))
> > +			break;
> > +
> > +		cpu_relax_lowlatency();
> > +	}
> > +	if (i915_gem_request_completed(rq, false))
> > +		return 0;
> > +
> > +	return -EAGAIN;
> > +}
> > +
> >  /**
> >   * __i915_wait_request - wait until execution of request has finished
> >   * @req: duh!
> > @@ -1225,12 +1248,20 @@ int __i915_wait_request(struct
> drm_i915_gem_request *req,
> >  	if (INTEL_INFO(dev)->gen >= 6)
> >  		gen6_rps_boost(dev_priv, file_priv);
> >
> > -	if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring)))
> > -		return -ENODEV;
> > -
> >  	/* Record current time in case interrupted by signal, or wedged
> */
> >  	trace_i915_gem_request_wait_begin(req);
> >  	before = ktime_get_raw_ns();
> > +
> > +	/* Optimistic spin for the next jiffie before touching IRQs */
> > +	ret = __i915_spin_request(req);
> > +	if (ret == 0)
> > +		goto out;
> > +
> > +	if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring))) {
> > +		ret = -ENODEV;
> > +		goto out;
> > +	}
> > +
> >  	for (;;) {
> >  		struct timer_list timer;
> >
> > @@ -1279,14 +1310,15 @@ int __i915_wait_request(struct
> drm_i915_gem_request *req,
> >  			destroy_timer_on_stack(&timer);
> >  		}
> >  	}
> > -	now = ktime_get_raw_ns();
> > -	trace_i915_gem_request_wait_end(req);
> > -
> >  	if (!irq_test_in_progress)
> >  		ring->irq_put(ring);
> >
> >  	finish_wait(&ring->irq_queue, &wait);
> >
> > +out:
> > +	now = ktime_get_raw_ns();
> > +	trace_i915_gem_request_wait_end(req);
> > +
> >  	if (timeout) {
> >  		s64 tres = *timeout - (now - before);
> >
> > --
> > 2.1.4
> >
> 
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
---------------------------------------------------------------------
Intel Finland Oy
Registered Address: PL 281, 00181 Helsinki 
Business Identity Code: 0357606 - 4 
Domiciled in Helsinki 

This e-mail and any attachments may contain confidential material for
the sole use of the intended recipient(s). Any review or distribution
by others is strictly prohibited. If you are not the intended
recipient, please contact the sender and delete all copies.
Daniel Vetter April 8, 2015, 2:15 p.m. UTC | #3
On Wed, Apr 08, 2015 at 01:43:47PM +0000, Rantala, Valtteri wrote:
> Hi, 
> 
> 
> > -----Original Message-----
> > From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of Daniel Vetter
> > Sent: Wednesday, April 08, 2015 2:40 PM
> > To: Chris Wilson
> > Cc: intel-gfx@lists.freedesktop.org; Daniel Vetter; Tvrtko Ursulin; Tamminen,
> > Eero T; Rantala, Valtteri
> > Subject: Re: [PATCH 17/70] drm/i915: Optimistically spin for the request
> > completion
> > 
> > On Tue, Apr 07, 2015 at 04:20:41PM +0100, Chris Wilson wrote:
> > > This provides a nice boost to mesa in swap bound scenarios (as mesa
> > > throttles itself to the previous frame and given the scenario that
> > > will complete shortly). It will also provide a good boost to systems
> > > running with semaphores disabled and so frequently waiting on the GPU
> > > as it switches rings. In the most favourable of microbenchmarks, this
> > > can increase performance by around 15% - though in practice
> > > improvements will be marginal and rarely noticeable.
> > >
> > > v2: Account for user timeouts
> > > v3: Limit the spinning to a single jiffie (~1us) at most. On an
> > > otherwise idle system, there is no scheduler contention and so without
> > > a limit we would spin until the GPU is ready.
> > > v4: Drop forcewake - the lazy coherent access doesn't require it, and
> > > we have no reason to believe that the forcewake itself improves seqno
> > > coherency - it only adds delay.
> > >
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> > > Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> > > Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> > > Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>
> > 
> > Eero/Valtterri, do you have perf data for this one?
> > 
> [Rantala, Valtteri] 
> I have issues with applying this patch to latest night, have to check that out.
> 
> Can you provide Git/branch that I could use?

It applies cleanly here afaict - you double-checked that you updated
drm-intel.git? Just today I merged a pile of patches from Chris' series
here.
-Daniel
Tvrtko Ursulin April 13, 2015, 11:34 a.m. UTC | #4
Hi,

On 04/07/2015 04:20 PM, Chris Wilson wrote:
> This provides a nice boost to mesa in swap bound scenarios (as mesa
> throttles itself to the previous frame and given the scenario that will
> complete shortly). It will also provide a good boost to systems running
> with semaphores disabled and so frequently waiting on the GPU as it
> switches rings. In the most favourable of microbenchmarks, this can
> increase performance by around 15% - though in practice improvements
> will be marginal and rarely noticeable.
>
> v2: Account for user timeouts
> v3: Limit the spinning to a single jiffie (~1us) at most. On an
> otherwise idle system, there is no scheduler contention and so without a
> limit we would spin until the GPU is ready.
> v4: Drop forcewake - the lazy coherent access doesn't require it, and we
> have no reason to believe that the forcewake itself improves seqno
> coherency - it only adds delay.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>

I already said that I already gave my r-b for this one. :)

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
Daniel Vetter April 13, 2015, 12:25 p.m. UTC | #5
On Mon, Apr 13, 2015 at 12:34:19PM +0100, Tvrtko Ursulin wrote:
> 
> Hi,
> 
> On 04/07/2015 04:20 PM, Chris Wilson wrote:
> >This provides a nice boost to mesa in swap bound scenarios (as mesa
> >throttles itself to the previous frame and given the scenario that will
> >complete shortly). It will also provide a good boost to systems running
> >with semaphores disabled and so frequently waiting on the GPU as it
> >switches rings. In the most favourable of microbenchmarks, this can
> >increase performance by around 15% - though in practice improvements
> >will be marginal and rarely noticeable.
> >
> >v2: Account for user timeouts
> >v3: Limit the spinning to a single jiffie (~1us) at most. On an
> >otherwise idle system, there is no scheduler contention and so without a
> >limit we would spin until the GPU is ready.
> >v4: Drop forcewake - the lazy coherent access doesn't require it, and we
> >have no reason to believe that the forcewake itself improves seqno
> >coherency - it only adds delay.
> >
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> >Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> >Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> >Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>
> 
> I already said that I already gave my r-b for this one. :)
> 
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Queued for -next, thanks for the patch.
-Daniel
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c7d9ee2f708a..47650327204e 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1181,6 +1181,29 @@  static bool missed_irq(struct drm_i915_private *dev_priv,
 	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
 }
 
+static int __i915_spin_request(struct drm_i915_gem_request *rq)
+{
+	unsigned long timeout;
+
+	if (i915_gem_request_get_ring(rq)->irq_refcount)
+		return -EBUSY;
+
+	timeout = jiffies + 1;
+	while (!need_resched()) {
+		if (i915_gem_request_completed(rq, true))
+			return 0;
+
+		if (time_after_eq(jiffies, timeout))
+			break;
+
+		cpu_relax_lowlatency();
+	}
+	if (i915_gem_request_completed(rq, false))
+		return 0;
+
+	return -EAGAIN;
+}
+
 /**
  * __i915_wait_request - wait until execution of request has finished
  * @req: duh!
@@ -1225,12 +1248,20 @@  int __i915_wait_request(struct drm_i915_gem_request *req,
 	if (INTEL_INFO(dev)->gen >= 6)
 		gen6_rps_boost(dev_priv, file_priv);
 
-	if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring)))
-		return -ENODEV;
-
 	/* Record current time in case interrupted by signal, or wedged */
 	trace_i915_gem_request_wait_begin(req);
 	before = ktime_get_raw_ns();
+
+	/* Optimistic spin for the next jiffie before touching IRQs */
+	ret = __i915_spin_request(req);
+	if (ret == 0)
+		goto out;
+
+	if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring))) {
+		ret = -ENODEV;
+		goto out;
+	}
+
 	for (;;) {
 		struct timer_list timer;
 
@@ -1279,14 +1310,15 @@  int __i915_wait_request(struct drm_i915_gem_request *req,
 			destroy_timer_on_stack(&timer);
 		}
 	}
-	now = ktime_get_raw_ns();
-	trace_i915_gem_request_wait_end(req);
-
 	if (!irq_test_in_progress)
 		ring->irq_put(ring);
 
 	finish_wait(&ring->irq_queue, &wait);
 
+out:
+	now = ktime_get_raw_ns();
+	trace_i915_gem_request_wait_end(req);
+
 	if (timeout) {
 		s64 tres = *timeout - (now - before);