diff mbox series

[06/10] drm/i915/gt: Introduce barrier pulses along engines

Message ID 20191010071434.31195-6-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [01/10] drm/i915: Note the addition of timeslicing to the pretend scheduler | expand

Commit Message

Chris Wilson Oct. 10, 2019, 7:14 a.m. UTC
To flush idle barriers, and even inflight requests, we want to send a
preemptive 'pulse' along an engine. We use a no-op request along the
pinned kernel_context at high priority so that it should run or else
kick off the stuck requests. We can use this to ensure idle barriers are
immediately flushed, as part of a context cancellation mechanism, or as
part of a heartbeat mechanism to detect and reset a stuck GPU.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/Makefile                 |  1 +
 .../gpu/drm/i915/gt/intel_engine_heartbeat.c  | 56 +++++++++++++++++++
 .../gpu/drm/i915/gt/intel_engine_heartbeat.h  | 14 +++++
 drivers/gpu/drm/i915/gt/intel_engine_pm.c     |  2 +-
 drivers/gpu/drm/i915/i915_priolist_types.h    |  1 +
 5 files changed, 73 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
 create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h

Comments

Tvrtko Ursulin Oct. 11, 2019, 9:11 a.m. UTC | #1
On 10/10/2019 08:14, Chris Wilson wrote:
> To flush idle barriers, and even inflight requests, we want to send a
> preemptive 'pulse' along an engine. We use a no-op request along the
> pinned kernel_context at high priority so that it should run or else
> kick off the stuck requests. We can use this to ensure idle barriers are
> immediately flushed, as part of a context cancellation mechanism, or as
> part of a heartbeat mechanism to detect and reset a stuck GPU.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/Makefile                 |  1 +
>   .../gpu/drm/i915/gt/intel_engine_heartbeat.c  | 56 +++++++++++++++++++
>   .../gpu/drm/i915/gt/intel_engine_heartbeat.h  | 14 +++++
>   drivers/gpu/drm/i915/gt/intel_engine_pm.c     |  2 +-
>   drivers/gpu/drm/i915/i915_priolist_types.h    |  1 +
>   5 files changed, 73 insertions(+), 1 deletion(-)
>   create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
>   create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index cd9a10ba2516..cfab7c8585b3 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -78,6 +78,7 @@ gt-y += \
>   	gt/intel_breadcrumbs.o \
>   	gt/intel_context.o \
>   	gt/intel_engine_cs.o \
> +	gt/intel_engine_heartbeat.o \
>   	gt/intel_engine_pm.o \
>   	gt/intel_engine_pool.o \
>   	gt/intel_engine_sysfs.o \
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> new file mode 100644
> index 000000000000..2fc413f9d506
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> @@ -0,0 +1,56 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#include "i915_request.h"
> +
> +#include "intel_context.h"
> +#include "intel_engine_heartbeat.h"
> +#include "intel_engine_pm.h"
> +#include "intel_engine.h"
> +#include "intel_gt.h"
> +
> +static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
> +{
> +	engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
> +	i915_request_add_active_barriers(rq);

Why do you need active barriers with the idle pulse? Just because it is 
a handy point to release the previously pinned contexts? But they may 
get reused as soon as idle pulse finishes, no?

Regards,

Tvrtko

> +}
> +
> +int intel_engine_pulse(struct intel_engine_cs *engine)
> +{
> +	struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
> +	struct intel_context *ce = engine->kernel_context;
> +	struct i915_request *rq;
> +	int err = 0;
> +
> +	if (!intel_engine_has_preemption(engine))
> +		return -ENODEV;
> +
> +	if (!intel_engine_pm_get_if_awake(engine))
> +		return 0;
> +
> +	if (mutex_lock_interruptible(&ce->timeline->mutex))
> +		goto out_rpm;
> +
> +	intel_context_enter(ce);
> +	rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN);
> +	intel_context_exit(ce);
> +	if (IS_ERR(rq)) {
> +		err = PTR_ERR(rq);
> +		goto out_unlock;
> +	}
> +
> +	rq->flags |= I915_REQUEST_SENTINEL;
> +	idle_pulse(engine, rq);
> +
> +	__i915_request_commit(rq);
> +	__i915_request_queue(rq, &attr);
> +
> +out_unlock:
> +	mutex_unlock(&ce->timeline->mutex);
> +out_rpm:
> +	intel_engine_pm_put(engine);
> +	return err;
> +}
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
> new file mode 100644
> index 000000000000..b950451b5998
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
> @@ -0,0 +1,14 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#ifndef INTEL_ENGINE_HEARTBEAT_H
> +#define INTEL_ENGINE_HEARTBEAT_H
> +
> +struct intel_engine_cs;
> +
> +int intel_engine_pulse(struct intel_engine_cs *engine);
> +
> +#endif /* INTEL_ENGINE_HEARTBEAT_H */
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> index 67eb6183648a..7d76611d9df1 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> @@ -111,7 +111,7 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine)
>   	i915_request_add_active_barriers(rq);
>   
>   	/* Install ourselves as a preemption barrier */
> -	rq->sched.attr.priority = I915_PRIORITY_UNPREEMPTABLE;
> +	rq->sched.attr.priority = I915_PRIORITY_BARRIER;
>   	__i915_request_commit(rq);
>   
>   	/* Release our exclusive hold on the engine */
> diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
> index 21037a2e2038..ae8bb3cb627e 100644
> --- a/drivers/gpu/drm/i915/i915_priolist_types.h
> +++ b/drivers/gpu/drm/i915/i915_priolist_types.h
> @@ -39,6 +39,7 @@ enum {
>    * active request.
>    */
>   #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
> +#define I915_PRIORITY_BARRIER INT_MAX
>   
>   #define __NO_PREEMPTION (I915_PRIORITY_WAIT)
>   
>
Chris Wilson Oct. 11, 2019, 9:52 a.m. UTC | #2
Quoting Tvrtko Ursulin (2019-10-11 10:11:58)
> 
> On 10/10/2019 08:14, Chris Wilson wrote:
> > +#include "intel_context.h"
> > +#include "intel_engine_heartbeat.h"
> > +#include "intel_engine_pm.h"
> > +#include "intel_engine.h"
> > +#include "intel_gt.h"
> > +
> > +static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
> > +{
> > +     engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
> > +     i915_request_add_active_barriers(rq);
> 
> Why do you need active barriers with the idle pulse? Just because it is 
> a handy point to release the previously pinned contexts? But they may 
> get reused as soon as idle pulse finishes, no?

Yes. It is a known point in time where the other context has finished,
and when this request runs has completed a context switch.

Remember all that time we were arguing about idle barriers and how we
needed to run them periodically to allow them to be reaped and avoid
having the entire aperture pinned with stale contexts forcing a stall.
And avoiding making the idle barriers themselves a global serialising
barrier. :|

The idea we had was that we would take advantage of any guaranteed
context switches and send regular pulses from the kernel context to pick
up stragglers. So we could use any context switch after the we retire
the old context to unpin it, but to keep the locking and preallocations
of the rbtree simple (you've seen i915_active, simple is anything but),
I left it to the engine->kernel_context to manage.
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index cd9a10ba2516..cfab7c8585b3 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -78,6 +78,7 @@  gt-y += \
 	gt/intel_breadcrumbs.o \
 	gt/intel_context.o \
 	gt/intel_engine_cs.o \
+	gt/intel_engine_heartbeat.o \
 	gt/intel_engine_pm.o \
 	gt/intel_engine_pool.o \
 	gt/intel_engine_sysfs.o \
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
new file mode 100644
index 000000000000..2fc413f9d506
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
@@ -0,0 +1,56 @@ 
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#include "i915_request.h"
+
+#include "intel_context.h"
+#include "intel_engine_heartbeat.h"
+#include "intel_engine_pm.h"
+#include "intel_engine.h"
+#include "intel_gt.h"
+
+static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
+{
+	engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
+	i915_request_add_active_barriers(rq);
+}
+
+int intel_engine_pulse(struct intel_engine_cs *engine)
+{
+	struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
+	struct intel_context *ce = engine->kernel_context;
+	struct i915_request *rq;
+	int err = 0;
+
+	if (!intel_engine_has_preemption(engine))
+		return -ENODEV;
+
+	if (!intel_engine_pm_get_if_awake(engine))
+		return 0;
+
+	if (mutex_lock_interruptible(&ce->timeline->mutex))
+		goto out_rpm;
+
+	intel_context_enter(ce);
+	rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN);
+	intel_context_exit(ce);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto out_unlock;
+	}
+
+	rq->flags |= I915_REQUEST_SENTINEL;
+	idle_pulse(engine, rq);
+
+	__i915_request_commit(rq);
+	__i915_request_queue(rq, &attr);
+
+out_unlock:
+	mutex_unlock(&ce->timeline->mutex);
+out_rpm:
+	intel_engine_pm_put(engine);
+	return err;
+}
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
new file mode 100644
index 000000000000..b950451b5998
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
@@ -0,0 +1,14 @@ 
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#ifndef INTEL_ENGINE_HEARTBEAT_H
+#define INTEL_ENGINE_HEARTBEAT_H
+
+struct intel_engine_cs;
+
+int intel_engine_pulse(struct intel_engine_cs *engine);
+
+#endif /* INTEL_ENGINE_HEARTBEAT_H */
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
index 67eb6183648a..7d76611d9df1 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
@@ -111,7 +111,7 @@  static bool switch_to_kernel_context(struct intel_engine_cs *engine)
 	i915_request_add_active_barriers(rq);
 
 	/* Install ourselves as a preemption barrier */
-	rq->sched.attr.priority = I915_PRIORITY_UNPREEMPTABLE;
+	rq->sched.attr.priority = I915_PRIORITY_BARRIER;
 	__i915_request_commit(rq);
 
 	/* Release our exclusive hold on the engine */
diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
index 21037a2e2038..ae8bb3cb627e 100644
--- a/drivers/gpu/drm/i915/i915_priolist_types.h
+++ b/drivers/gpu/drm/i915/i915_priolist_types.h
@@ -39,6 +39,7 @@  enum {
  * active request.
  */
 #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
+#define I915_PRIORITY_BARRIER INT_MAX
 
 #define __NO_PREEMPTION (I915_PRIORITY_WAIT)