diff mbox

[v3] drm/i915: Optimistically spin for the request completion

Message ID 20150319151615.GQ10812@nuc-i3427.alporthouse.com (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson March 19, 2015, 3:16 p.m. UTC
On Thu, Mar 12, 2015 at 11:11:17AM +0000, Chris Wilson wrote:
> This provides a nice boost to mesa in swap bound scenarios (as mesa
> throttles itself to the previous frame and given the scenario that will
> complete shortly). It will also provide a good boost to systems running
> with semaphores disabled and so frequently waiting on the GPU as it
> switches rings. In the most favourable of microbenchmarks, this can
> increase performance by around 15% - though in practice improvements
> will be marginal and rarely noticeable.
> 
> v2: Account for user timeouts
> v3: Limit the spinning to a single jiffie (~1ms) at most. On an
> otherwise idle system, there is no scheduler contention and so without a
> limit we would spin until the GPU is ready.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>

Just recording ideas for the future. Replace the busy-spin with
monitor/mwait. This requires Pentium4+, a cooperating GPU with working
cacheline snooping and that we use HWS seqno.

Comments

Daniel Vetter March 20, 2015, 2:54 p.m. UTC | #1
On Thu, Mar 19, 2015 at 03:16:15PM +0000, Chris Wilson wrote:
> On Thu, Mar 12, 2015 at 11:11:17AM +0000, Chris Wilson wrote:
> > This provides a nice boost to mesa in swap bound scenarios (as mesa
> > throttles itself to the previous frame and given the scenario that will
> > complete shortly). It will also provide a good boost to systems running
> > with semaphores disabled and so frequently waiting on the GPU as it
> > switches rings. In the most favourable of microbenchmarks, this can
> > increase performance by around 15% - though in practice improvements
> > will be marginal and rarely noticeable.
> > 
> > v2: Account for user timeouts
> > v3: Limit the spinning to a single jiffie (~1ms) at most. On an
> > otherwise idle system, there is no scheduler contention and so without a
> > limit we would spin until the GPU is ready.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> 
> Just recording ideas for the future. Replace the busy-spin with
> monitor/mwait. This requires Pentium4+, a cooperating GPU with working
> cacheline snooping and that we use HWS seqno.

Just for the record: Did it help with powersaving or was it all in the
noise?
-Daniel

> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 85e71e0e2340..454a38d4caa3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -37,6 +37,7 @@
>  #include <linux/swap.h>
>  #include <linux/pci.h>
>  #include <linux/dma-buf.h>
> +#include <asm/mwait.h>
>  
>  #define RQ_BUG_ON(expr)
>  
> @@ -1187,18 +1188,42 @@ static int __i915_spin_request(struct drm_i915_gem_request *req)
>         unsigned long timeout;
>         int ret = -EBUSY;
>  
> +       if (ring->irq_refcount) /* IRQ is already active, keep using it */
> +               return ret;
> +
>         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
>         timeout = jiffies + 1;
> -       while (!need_resched()) {
> -               if (i915_gem_request_completed(req, true)) {
> -                       ret = 0;
> -                       goto out;
> -               }
> +       if (this_cpu_has(X86_FEATURE_MWAIT)) {
> +               do {
> +                       unsigned long ecx = 1; /* break on interrupt */
> +                       unsigned long eax = 0; /* cstate */
>  
> -               if (time_after_eq(jiffies, timeout))
> -                       break;
> +                       __monitor((void *)&ring->status_page.page_addr[I915_GEM_HWS_INDEX], 0, 0);
> +                       if (need_resched())
> +                               break;
> +
> +                       if (i915_gem_request_completed(req, true)) {
> +                               ret = 0;
> +                               goto out;
> +                       }
>  
> -               cpu_relax_lowlatency();
> +                       if (time_after_eq(jiffies, timeout))
> +                               break;
> +
> +                       __mwait(eax, ecx);
> +               } while (1);
> +       } else {
> +               while (!need_resched()) {
> +                       if (i915_gem_request_completed(req, true)) {
> +                               ret = 0;
> +                               goto out;
> +                       }
> +
> +                       if (time_after_eq(jiffies, timeout))
> +                               break;
> +
> +                       cpu_relax_lowlatency();
> +               }
>         }
>         if (i915_gem_request_completed(req, false))
>                 ret = 0;
> 
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre
Chris Wilson March 20, 2015, 3:27 p.m. UTC | #2
On Fri, Mar 20, 2015 at 03:54:01PM +0100, Daniel Vetter wrote:
> On Thu, Mar 19, 2015 at 03:16:15PM +0000, Chris Wilson wrote:
> > On Thu, Mar 12, 2015 at 11:11:17AM +0000, Chris Wilson wrote:
> > > This provides a nice boost to mesa in swap bound scenarios (as mesa
> > > throttles itself to the previous frame and given the scenario that will
> > > complete shortly). It will also provide a good boost to systems running
> > > with semaphores disabled and so frequently waiting on the GPU as it
> > > switches rings. In the most favourable of microbenchmarks, this can
> > > increase performance by around 15% - though in practice improvements
> > > will be marginal and rarely noticeable.
> > > 
> > > v2: Account for user timeouts
> > > v3: Limit the spinning to a single jiffie (~1ms) at most. On an
> > > otherwise idle system, there is no scheduler contention and so without a
> > > limit we would spin until the GPU is ready.
> > > 
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> > 
> > Just recording ideas for the future. Replace the busy-spin with
> > monitor/mwait. This requires Pentium4+, a cooperating GPU with working
> > cacheline snooping and that we use HWS seqno.
> 
> Just for the record: Did it help with powersaving or was it all in the
> noise?

Unscientifically, I would say mwait(cstate=0) was worse. It gave a
marginally higher peak, but there was clearly worse thermal throttling
than the simple busy-wait. powertop suggests that with the mwait we were
not reaching as low a package cstate as often.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 85e71e0e2340..454a38d4caa3 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -37,6 +37,7 @@ 
 #include <linux/swap.h>
 #include <linux/pci.h>
 #include <linux/dma-buf.h>
+#include <asm/mwait.h>
 
 #define RQ_BUG_ON(expr)
 
@@ -1187,18 +1188,42 @@  static int __i915_spin_request(struct drm_i915_gem_request *req)
        unsigned long timeout;
        int ret = -EBUSY;
 
+       if (ring->irq_refcount) /* IRQ is already active, keep using it */
+               return ret;
+
        intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
        timeout = jiffies + 1;
-       while (!need_resched()) {
-               if (i915_gem_request_completed(req, true)) {
-                       ret = 0;
-                       goto out;
-               }
+       if (this_cpu_has(X86_FEATURE_MWAIT)) {
+               do {
+                       unsigned long ecx = 1; /* break on interrupt */
+                       unsigned long eax = 0; /* cstate */
 
-               if (time_after_eq(jiffies, timeout))
-                       break;
+                       __monitor((void *)&ring->status_page.page_addr[I915_GEM_HWS_INDEX], 0, 0);
+                       if (need_resched())
+                               break;
+
+                       if (i915_gem_request_completed(req, true)) {
+                               ret = 0;
+                               goto out;
+                       }
 
-               cpu_relax_lowlatency();
+                       if (time_after_eq(jiffies, timeout))
+                               break;
+
+                       __mwait(eax, ecx);
+               } while (1);
+       } else {
+               while (!need_resched()) {
+                       if (i915_gem_request_completed(req, true)) {
+                               ret = 0;
+                               goto out;
+                       }
+
+                       if (time_after_eq(jiffies, timeout))
+                               break;
+
+                       cpu_relax_lowlatency();
+               }
        }
        if (i915_gem_request_completed(req, false))
                ret = 0;