diff mbox series

[3/3] drm/i915/execlists: Force preemption

Message ID 20190620070559.30076-3-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [1/3] drm/i915/execlists: Preempt-to-busy | expand

Commit Message

Chris Wilson June 20, 2019, 7:05 a.m. UTC
If the preempted context takes too long to relinquish control, e.g. it
is stuck inside a shader with arbitration disabled, evict that context
with an engine reset. This ensures that preemptions are reasonably
responsive, providing a tighter QoS for the more important context at
the cost of flagging unresponsive contexts more frequently (i.e. instead
of using an ~10s hangcheck, we now evict at ~10ms).  The challenge
lies in picking a timeout that can be reasonably serviced by HW for
typical workloads, balancing the existing clients against the needs for
responsiveness.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/Kconfig.profile | 12 ++++++
 drivers/gpu/drm/i915/gt/intel_lrc.c  | 56 ++++++++++++++++++++++++++--
 2 files changed, 65 insertions(+), 3 deletions(-)

Comments

Mika Kuoppala June 20, 2019, 2 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> If the preempted context takes too long to relinquish control, e.g. it
> is stuck inside a shader with arbitration disabled, evict that context
> with an engine reset. This ensures that preemptions are reasonably
> responsive, providing a tighter QoS for the more important context at
> the cost of flagging unresponsive contexts more frequently (i.e. instead
> of using an ~10s hangcheck, we now evict at ~10ms).  The challenge of
> lies in picking a timeout that can be reasonably serviced by HW for
> typical workloads, balancing the existing clients against the needs for
> responsiveness.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/Kconfig.profile | 12 ++++++
>  drivers/gpu/drm/i915/gt/intel_lrc.c  | 56 ++++++++++++++++++++++++++--
>  2 files changed, 65 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/Kconfig.profile b/drivers/gpu/drm/i915/Kconfig.profile
> index 48df8889a88a..8273d3baafe4 100644
> --- a/drivers/gpu/drm/i915/Kconfig.profile
> +++ b/drivers/gpu/drm/i915/Kconfig.profile
> @@ -25,3 +25,15 @@ config DRM_I915_SPIN_REQUEST
>  	  May be 0 to disable the initial spin. In practice, we estimate
>  	  the cost of enabling the interrupt (if currently disabled) to be
>  	  a few microseconds.
> +
> +config DRM_I915_PREEMPT_TIMEOUT
> +	int "Preempt timeout (ms)"
> +	default 10 # milliseconds
> +	help
> +	  How long to wait (in milliseconds) for a preemption event to occur
> +	  when submitting a new context via execlists. If the current context
> +	  does not hit an arbitration point and yield to HW before the timer
> +	  expires, the HW will be reset to allow the more important context
> +	  to execute.
> +
> +	  May be 0 to disable the timeout.
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index fca79adb4aa3..e8d7deba3e49 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -889,6 +889,15 @@ enable_timeslice(struct intel_engine_cs *engine)
>  	return last && need_timeslice(engine, last);
>  }
>  
> +static unsigned long preempt_expires(void)
> +{
> +	unsigned long timeout =

could be const

> +		msecs_to_jiffies_timeout(CONFIG_DRM_I915_PREEMPT_TIMEOUT);
> +
> +	barrier();

This needs a comment. I fail to connect the dots as jiffies
is volatile by nature.

> +	return jiffies + timeout;
> +}
> +
>  static void execlists_dequeue(struct intel_engine_cs *engine)
>  {
>  	struct intel_engine_execlists * const execlists = &engine->execlists;
> @@ -1220,6 +1229,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  		*port = execlists_schedule_in(last, port - execlists->pending);
>  		memset(port + 1, 0, (last_port - port) * sizeof(*port));
>  		execlists_submit_ports(engine);
> +
> +		if (CONFIG_DRM_I915_PREEMPT_TIMEOUT)
> +			mod_timer(&execlists->timer, preempt_expires());
>  	}
>  }
>  
> @@ -1376,13 +1388,48 @@ static void process_csb(struct intel_engine_cs *engine)
>  	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
>  }
>  
> -static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
> +static bool __execlists_submission_tasklet(struct intel_engine_cs *const engine)
>  {
>  	lockdep_assert_held(&engine->active.lock);
>  
>  	process_csb(engine);
> -	if (!engine->execlists.pending[0])
> +	if (!engine->execlists.pending[0]) {
>  		execlists_dequeue(engine);
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +static void preempt_reset(struct intel_engine_cs *engine)
> +{
> +	const unsigned int bit = I915_RESET_ENGINE + engine->id;
> +	unsigned long *lock = &engine->i915->gpu_error.flags;
> +
> +	if (test_and_set_bit(bit, lock))
> +		return;
> +
> +	tasklet_disable_nosync(&engine->execlists.tasklet);
> +	spin_unlock(&engine->active.lock);
> +

Why do we need to drop the lock?
-Mika

> +	i915_reset_engine(engine, "preemption time out");
> +
> +	spin_lock(&engine->active.lock);
> +	tasklet_enable(&engine->execlists.tasklet);
> +
> +	clear_bit(bit, lock);
> +	wake_up_bit(lock, bit);
> +}
> +
> +static bool preempt_timeout(struct intel_engine_cs *const engine)
> +{
> +	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
> +		return false;
> +
> +	if (!intel_engine_has_preemption(engine))
> +		return false;
> +
> +	return !timer_pending(&engine->execlists.timer);
>  }
>  
>  /*
> @@ -1395,7 +1442,10 @@ static void execlists_submission_tasklet(unsigned long data)
>  	unsigned long flags;
>  
>  	spin_lock_irqsave(&engine->active.lock, flags);
> -	__execlists_submission_tasklet(engine);
> +
> +	if (!__execlists_submission_tasklet(engine) && preempt_timeout(engine))
> +		preempt_reset(engine);
> +
>  	spin_unlock_irqrestore(&engine->active.lock, flags);
>  }
>  
> -- 
> 2.20.1
Chris Wilson June 20, 2019, 2:08 p.m. UTC | #2
Quoting Mika Kuoppala (2019-06-20 15:00:44)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > If the preempted context takes too long to relinquish control, e.g. it
> > is stuck inside a shader with arbitration disabled, evict that context
> > with an engine reset. This ensures that preemptions are reasonably
> > responsive, providing a tighter QoS for the more important context at
> > the cost of flagging unresponsive contexts more frequently (i.e. instead
> > of using an ~10s hangcheck, we now evict at ~10ms).  The challenge of
> > lies in picking a timeout that can be reasonably serviced by HW for
> > typical workloads, balancing the existing clients against the needs for
> > responsiveness.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> >  drivers/gpu/drm/i915/Kconfig.profile | 12 ++++++
> >  drivers/gpu/drm/i915/gt/intel_lrc.c  | 56 ++++++++++++++++++++++++++--
> >  2 files changed, 65 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/Kconfig.profile b/drivers/gpu/drm/i915/Kconfig.profile
> > index 48df8889a88a..8273d3baafe4 100644
> > --- a/drivers/gpu/drm/i915/Kconfig.profile
> > +++ b/drivers/gpu/drm/i915/Kconfig.profile
> > @@ -25,3 +25,15 @@ config DRM_I915_SPIN_REQUEST
> >         May be 0 to disable the initial spin. In practice, we estimate
> >         the cost of enabling the interrupt (if currently disabled) to be
> >         a few microseconds.
> > +
> > +config DRM_I915_PREEMPT_TIMEOUT
> > +     int "Preempt timeout (ms)"
> > +     default 10 # milliseconds
> > +     help
> > +       How long to wait (in milliseconds) for a preemption event to occur
> > +       when submitting a new context via execlists. If the current context
> > +       does not hit an arbitration point and yield to HW before the timer
> > +       expires, the HW will be reset to allow the more important context
> > +       to execute.
> > +
> > +       May be 0 to disable the timeout.
> > diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> > index fca79adb4aa3..e8d7deba3e49 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> > @@ -889,6 +889,15 @@ enable_timeslice(struct intel_engine_cs *engine)
> >       return last && need_timeslice(engine, last);
> >  }
> >  
> > +static unsigned long preempt_expires(void)
> > +{
> > +     unsigned long timeout =
> 
> could be const
> 
> > +             msecs_to_jiffies_timeout(CONFIG_DRM_I915_PREEMPT_TIMEOUT);
> > +
> > +     barrier();
> 
> This needs a comment. I fail to connect the dots as jiffies
> is volatile by nature.

It's just crossing the 't' and dotting the 'i'. What I was thinking was
we don't want the compiler to load jiffies then compute the timeout. So
barrier() there says that timeout is always computed first. Now since it
is likely to be a function call (but I'm trying to find a way to let it
precompute the constant), it will always be precomputed, but who trusts
a compiler.

> > +     return jiffies + timeout;
> > +}
> > +
> >  static void execlists_dequeue(struct intel_engine_cs *engine)
> >  {
> >       struct intel_engine_execlists * const execlists = &engine->execlists;
> > @@ -1220,6 +1229,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> >               *port = execlists_schedule_in(last, port - execlists->pending);
> >               memset(port + 1, 0, (last_port - port) * sizeof(*port));
> >               execlists_submit_ports(engine);
> > +
> > +             if (CONFIG_DRM_I915_PREEMPT_TIMEOUT)
> > +                     mod_timer(&execlists->timer, preempt_expires());
> >       }
> >  }
> >  
> > @@ -1376,13 +1388,48 @@ static void process_csb(struct intel_engine_cs *engine)
> >       invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
> >  }
> >  
> > -static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
> > +static bool __execlists_submission_tasklet(struct intel_engine_cs *const engine)
> >  {
> >       lockdep_assert_held(&engine->active.lock);
> >  
> >       process_csb(engine);
> > -     if (!engine->execlists.pending[0])
> > +     if (!engine->execlists.pending[0]) {
> >               execlists_dequeue(engine);
> > +             return true;
> > +     }
> > +
> > +     return false;
> > +}
> > +
> > +static void preempt_reset(struct intel_engine_cs *engine)
> > +{
> > +     const unsigned int bit = I915_RESET_ENGINE + engine->id;
> > +     unsigned long *lock = &engine->i915->gpu_error.flags;
> > +
> > +     if (test_and_set_bit(bit, lock))
> > +             return;
> > +
> > +     tasklet_disable_nosync(&engine->execlists.tasklet);
> > +     spin_unlock(&engine->active.lock);
> > +
> 
> Why do we need to drop the lock?

We take it again inside the reset, and I am far too lazy to lift it to
the caller :) Disabling the tasklet will prevent other threads from
submitting as we drop the lock.
-Chris
Mika Kuoppala June 20, 2019, 2:19 p.m. UTC | #3
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Mika Kuoppala (2019-06-20 15:00:44)
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> 
>> > If the preempted context takes too long to relinquish control, e.g. it
>> > is stuck inside a shader with arbitration disabled, evict that context
>> > with an engine reset. This ensures that preemptions are reasonably
>> > responsive, providing a tighter QoS for the more important context at
>> > the cost of flagging unresponsive contexts more frequently (i.e. instead
>> > of using an ~10s hangcheck, we now evict at ~10ms).  The challenge of
>> > lies in picking a timeout that can be reasonably serviced by HW for
>> > typical workloads, balancing the existing clients against the needs for
>> > responsiveness.
>> >
>> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> > ---
>> >  drivers/gpu/drm/i915/Kconfig.profile | 12 ++++++
>> >  drivers/gpu/drm/i915/gt/intel_lrc.c  | 56 ++++++++++++++++++++++++++--
>> >  2 files changed, 65 insertions(+), 3 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/i915/Kconfig.profile b/drivers/gpu/drm/i915/Kconfig.profile
>> > index 48df8889a88a..8273d3baafe4 100644
>> > --- a/drivers/gpu/drm/i915/Kconfig.profile
>> > +++ b/drivers/gpu/drm/i915/Kconfig.profile
>> > @@ -25,3 +25,15 @@ config DRM_I915_SPIN_REQUEST
>> >         May be 0 to disable the initial spin. In practice, we estimate
>> >         the cost of enabling the interrupt (if currently disabled) to be
>> >         a few microseconds.
>> > +
>> > +config DRM_I915_PREEMPT_TIMEOUT
>> > +     int "Preempt timeout (ms)"
>> > +     default 10 # milliseconds
>> > +     help
>> > +       How long to wait (in milliseconds) for a preemption event to occur
>> > +       when submitting a new context via execlists. If the current context
>> > +       does not hit an arbitration point and yield to HW before the timer
>> > +       expires, the HW will be reset to allow the more important context
>> > +       to execute.
>> > +
>> > +       May be 0 to disable the timeout.
>> > diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
>> > index fca79adb4aa3..e8d7deba3e49 100644
>> > --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
>> > +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
>> > @@ -889,6 +889,15 @@ enable_timeslice(struct intel_engine_cs *engine)
>> >       return last && need_timeslice(engine, last);
>> >  }
>> >  
>> > +static unsigned long preempt_expires(void)
>> > +{
>> > +     unsigned long timeout =
>> 
>> could be const
>> 
>> > +             msecs_to_jiffies_timeout(CONFIG_DRM_I915_PREEMPT_TIMEOUT);
>> > +
>> > +     barrier();
>> 
>> This needs a comment. I fail to connect the dots as jiffies
>> is volatile by nature.
>
> It's just crossing the 't' and dotting the 'i'. What I was thinking was
> we don't want the compiler to load jiffies then compute the timeout. So
> barrier() there says that timeout is always computed first. Now since it
> is likely to be a function call (but I'm trying to find a way to let it
> precompute the constant), it will always be precomputed, but who trusts
> a compiler.
>
>> > +     return jiffies + timeout;
>> > +}
>> > +
>> >  static void execlists_dequeue(struct intel_engine_cs *engine)
>> >  {
>> >       struct intel_engine_execlists * const execlists = &engine->execlists;
>> > @@ -1220,6 +1229,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>> >               *port = execlists_schedule_in(last, port - execlists->pending);
>> >               memset(port + 1, 0, (last_port - port) * sizeof(*port));
>> >               execlists_submit_ports(engine);
>> > +
>> > +             if (CONFIG_DRM_I915_PREEMPT_TIMEOUT)
>> > +                     mod_timer(&execlists->timer, preempt_expires());
>> >       }
>> >  }
>> >  
>> > @@ -1376,13 +1388,48 @@ static void process_csb(struct intel_engine_cs *engine)
>> >       invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
>> >  }
>> >  
>> > -static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
>> > +static bool __execlists_submission_tasklet(struct intel_engine_cs *const engine)
>> >  {
>> >       lockdep_assert_held(&engine->active.lock);
>> >  
>> >       process_csb(engine);
>> > -     if (!engine->execlists.pending[0])
>> > +     if (!engine->execlists.pending[0]) {
>> >               execlists_dequeue(engine);
>> > +             return true;
>> > +     }
>> > +
>> > +     return false;
>> > +}
>> > +
>> > +static void preempt_reset(struct intel_engine_cs *engine)
>> > +{
>> > +     const unsigned int bit = I915_RESET_ENGINE + engine->id;
>> > +     unsigned long *lock = &engine->i915->gpu_error.flags;
>> > +
>> > +     if (test_and_set_bit(bit, lock))
>> > +             return;
>> > +
>> > +     tasklet_disable_nosync(&engine->execlists.tasklet);
>> > +     spin_unlock(&engine->active.lock);
>> > +
>> 
>> Why do we need to drop the lock?
>
> We take it again inside the reset, and I am far too lazy to lift it to
> the caller :) Disabling the tasklet will prevent other threads from
> submitting as we drop the lock.

With the barrier commented,

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

Ok, the way you got to timeslicing with the last 2 patches was
very elegant, surprisingly so.

Now let's hope I wasn't completely fooled by the first one.
There is at least a somewhat reassuring amount of CI cycles
behind these at this stage.
-Mika
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/Kconfig.profile b/drivers/gpu/drm/i915/Kconfig.profile
index 48df8889a88a..8273d3baafe4 100644
--- a/drivers/gpu/drm/i915/Kconfig.profile
+++ b/drivers/gpu/drm/i915/Kconfig.profile
@@ -25,3 +25,15 @@  config DRM_I915_SPIN_REQUEST
 	  May be 0 to disable the initial spin. In practice, we estimate
 	  the cost of enabling the interrupt (if currently disabled) to be
 	  a few microseconds.
+
+config DRM_I915_PREEMPT_TIMEOUT
+	int "Preempt timeout (ms)"
+	default 10 # milliseconds
+	help
+	  How long to wait (in milliseconds) for a preemption event to occur
+	  when submitting a new context via execlists. If the current context
+	  does not hit an arbitration point and yield to HW before the timer
+	  expires, the HW will be reset to allow the more important context
+	  to execute.
+
+	  May be 0 to disable the timeout.
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index fca79adb4aa3..e8d7deba3e49 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -889,6 +889,15 @@  enable_timeslice(struct intel_engine_cs *engine)
 	return last && need_timeslice(engine, last);
 }
 
+static unsigned long preempt_expires(void)
+{
+	unsigned long timeout =
+		msecs_to_jiffies_timeout(CONFIG_DRM_I915_PREEMPT_TIMEOUT);
+
+	barrier();
+	return jiffies + timeout;
+}
+
 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -1220,6 +1229,9 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 		*port = execlists_schedule_in(last, port - execlists->pending);
 		memset(port + 1, 0, (last_port - port) * sizeof(*port));
 		execlists_submit_ports(engine);
+
+		if (CONFIG_DRM_I915_PREEMPT_TIMEOUT)
+			mod_timer(&execlists->timer, preempt_expires());
 	}
 }
 
@@ -1376,13 +1388,48 @@  static void process_csb(struct intel_engine_cs *engine)
 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
 }
 
-static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
+static bool __execlists_submission_tasklet(struct intel_engine_cs *const engine)
 {
 	lockdep_assert_held(&engine->active.lock);
 
 	process_csb(engine);
-	if (!engine->execlists.pending[0])
+	if (!engine->execlists.pending[0]) {
 		execlists_dequeue(engine);
+		return true;
+	}
+
+	return false;
+}
+
+static void preempt_reset(struct intel_engine_cs *engine)
+{
+	const unsigned int bit = I915_RESET_ENGINE + engine->id;
+	unsigned long *lock = &engine->i915->gpu_error.flags;
+
+	if (test_and_set_bit(bit, lock))
+		return;
+
+	tasklet_disable_nosync(&engine->execlists.tasklet);
+	spin_unlock(&engine->active.lock);
+
+	i915_reset_engine(engine, "preemption time out");
+
+	spin_lock(&engine->active.lock);
+	tasklet_enable(&engine->execlists.tasklet);
+
+	clear_bit(bit, lock);
+	wake_up_bit(lock, bit);
+}
+
+static bool preempt_timeout(struct intel_engine_cs *const engine)
+{
+	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
+		return false;
+
+	if (!intel_engine_has_preemption(engine))
+		return false;
+
+	return !timer_pending(&engine->execlists.timer);
 }
 
 /*
@@ -1395,7 +1442,10 @@  static void execlists_submission_tasklet(unsigned long data)
 	unsigned long flags;
 
 	spin_lock_irqsave(&engine->active.lock, flags);
-	__execlists_submission_tasklet(engine);
+
+	if (!__execlists_submission_tasklet(engine) && preempt_timeout(engine))
+		preempt_reset(engine);
+
 	spin_unlock_irqrestore(&engine->active.lock, flags);
 }