diff mbox series

[CI] drm/i915/gt: Introduce barrier pulses along engines

Message ID 20191021160833.26866-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [CI] drm/i915/gt: Introduce barrier pulses along engines | expand

Commit Message

Chris Wilson Oct. 21, 2019, 4:08 p.m. UTC
To flush idle barriers, and even inflight requests, we want to send a
preemptive 'pulse' along an engine. We use a no-op request along the
pinned kernel_context at high priority so that it should run or else
kick off the stuck requests. We can use this to ensure idle barriers are
immediately flushed, as part of a context cancellation mechanism, or as
part of a heartbeat mechanism to detect and reset a stuck GPU.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/Makefile                 |   3 +-
 .../gpu/drm/i915/gt/intel_engine_heartbeat.c  |  77 +++++++++
 .../gpu/drm/i915/gt/intel_engine_heartbeat.h  |  15 ++
 drivers/gpu/drm/i915/gt/intel_engine_pm.c     |   2 +-
 .../drm/i915/gt/selftest_engine_heartbeat.c   | 150 ++++++++++++++++++
 drivers/gpu/drm/i915/i915_priolist_types.h    |   1 +
 .../drm/i915/selftests/i915_live_selftests.h  |   1 +
 7 files changed, 247 insertions(+), 2 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
 create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
 create mode 100644 drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index a16a2daef977..2fd4bed188e5 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -78,8 +78,9 @@  gt-y += \
 	gt/intel_breadcrumbs.o \
 	gt/intel_context.o \
 	gt/intel_engine_cs.o \
-	gt/intel_engine_pool.o \
+	gt/intel_engine_heartbeat.o \
 	gt/intel_engine_pm.o \
+	gt/intel_engine_pool.o \
 	gt/intel_engine_user.o \
 	gt/intel_gt.o \
 	gt/intel_gt_irq.o \
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
new file mode 100644
index 000000000000..4b9ab7813d54
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
@@ -0,0 +1,77 @@ 
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#include "i915_request.h"
+
+#include "intel_context.h"
+#include "intel_engine_heartbeat.h"
+#include "intel_engine_pm.h"
+#include "intel_engine.h"
+#include "intel_gt.h"
+
+static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
+{
+	engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
+	i915_request_add_active_barriers(rq);
+}
+
+int intel_engine_pulse(struct intel_engine_cs *engine)
+{
+	struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
+	struct intel_context *ce = engine->kernel_context;
+	struct i915_request *rq;
+	int err = 0;
+
+	if (!intel_engine_has_preemption(engine))
+		return -ENODEV;
+
+	if (!intel_engine_pm_get_if_awake(engine))
+		return 0;
+
+	if (mutex_lock_interruptible(&ce->timeline->mutex))
+		goto out_rpm;
+
+	intel_context_enter(ce);
+	rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN);
+	intel_context_exit(ce);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto out_unlock;
+	}
+
+	rq->flags |= I915_REQUEST_SENTINEL;
+	idle_pulse(engine, rq);
+
+	__i915_request_commit(rq);
+	__i915_request_queue(rq, &attr);
+
+out_unlock:
+	mutex_unlock(&ce->timeline->mutex);
+out_rpm:
+	intel_engine_pm_put(engine);
+	return err;
+}
+
+int intel_engine_flush_barriers(struct intel_engine_cs *engine)
+{
+	struct i915_request *rq;
+
+	if (llist_empty(&engine->barrier_tasks))
+		return 0;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	idle_pulse(engine, rq);
+	i915_request_add(rq);
+
+	return 0;
+}
+
+#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+#include "selftest_engine_heartbeat.c"
+#endif
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
new file mode 100644
index 000000000000..b334e5aaf78d
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
@@ -0,0 +1,15 @@ 
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#ifndef INTEL_ENGINE_HEARTBEAT_H
+#define INTEL_ENGINE_HEARTBEAT_H
+
+struct intel_engine_cs;
+
+int intel_engine_pulse(struct intel_engine_cs *engine);
+int intel_engine_flush_barriers(struct intel_engine_cs *engine);
+
+#endif /* INTEL_ENGINE_HEARTBEAT_H */
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
index 67eb6183648a..7d76611d9df1 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
@@ -111,7 +111,7 @@  static bool switch_to_kernel_context(struct intel_engine_cs *engine)
 	i915_request_add_active_barriers(rq);
 
 	/* Install ourselves as a preemption barrier */
-	rq->sched.attr.priority = I915_PRIORITY_UNPREEMPTABLE;
+	rq->sched.attr.priority = I915_PRIORITY_BARRIER;
 	__i915_request_commit(rq);
 
 	/* Release our exclusive hold on the engine */
diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c
new file mode 100644
index 000000000000..ad2f0543cbda
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c
@@ -0,0 +1,150 @@ 
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2018 Intel Corporation
+ */
+
+#include "i915_drv.h"
+
+#include "intel_gt_requests.h"
+#include "i915_selftest.h"
+
+struct pulse {
+	struct i915_active active;
+	struct kref kref;
+};
+
+static int pulse_active(struct i915_active *active)
+{
+	kref_get(&container_of(active, struct pulse, active)->kref);
+	return 0;
+}
+
+static void pulse_free(struct kref *kref)
+{
+	kfree(container_of(kref, struct pulse, kref));
+}
+
+static void pulse_put(struct pulse *p)
+{
+	kref_put(&p->kref, pulse_free);
+}
+
+static void pulse_retire(struct i915_active *active)
+{
+	pulse_put(container_of(active, struct pulse, active));
+}
+
+static struct pulse *pulse_create(void)
+{
+	struct pulse *p;
+
+	p = kmalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return p;
+
+	kref_init(&p->kref);
+	i915_active_init(&p->active, pulse_active, pulse_retire);
+
+	return p;
+}
+
+static int __live_idle_pulse(struct intel_engine_cs *engine,
+			     int (*fn)(struct intel_engine_cs *cs))
+{
+	struct pulse *p;
+	int err;
+
+	p = pulse_create();
+	if (!p)
+		return -ENOMEM;
+
+	err = i915_active_acquire_preallocate_barrier(&p->active, engine);
+	if (err)
+		goto out;
+
+	i915_active_acquire_barrier(&p->active);
+
+	err = fn(engine);
+	if (err)
+		goto out;
+
+	if (intel_gt_retire_requests_timeout(engine->gt, HZ / 5)) {
+		err = -ETIME;
+		goto out;
+	}
+
+	if (atomic_read(&p->active.count)) {
+		pr_err("%s: heartbeat pulse did not flush idle tasks\n",
+		       engine->name);
+		err = -EINVAL;
+		goto out;
+	}
+
+out:
+	pulse_put(p);
+	return err;
+}
+
+static int live_idle_flush(void *arg)
+{
+	struct intel_gt *gt = arg;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	int err = 0;
+
+	/* Check that we can flush the idle barriers */
+
+	for_each_engine(engine, gt, id) {
+		intel_engine_pm_get(engine);
+		err = __live_idle_pulse(engine, intel_engine_flush_barriers);
+		intel_engine_pm_put(engine);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+static int live_idle_pulse(void *arg)
+{
+	struct intel_gt *gt = arg;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	int err = 0;
+
+	/* Check that heartbeat pulses flush the idle barriers */
+
+	for_each_engine(engine, gt, id) {
+		intel_engine_pm_get(engine);
+		err = __live_idle_pulse(engine, intel_engine_pulse);
+		intel_engine_pm_put(engine);
+		if (err && err != -ENODEV)
+			break;
+
+		err = 0;
+	}
+
+	return err;
+}
+
+int intel_heartbeat_live_selftests(struct drm_i915_private *i915)
+{
+	static const struct i915_subtest tests[] = {
+		SUBTEST(live_idle_flush),
+		SUBTEST(live_idle_pulse),
+	};
+	int saved_hangcheck;
+	int err;
+
+	if (intel_gt_is_wedged(&i915->gt))
+		return 0;
+
+	saved_hangcheck = i915_modparams.enable_hangcheck;
+	i915_modparams.enable_hangcheck = INT_MAX;
+
+	err =  intel_gt_live_subtests(tests, &i915->gt);
+
+	i915_modparams.enable_hangcheck = saved_hangcheck;
+	return err;
+}
diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
index 21037a2e2038..ae8bb3cb627e 100644
--- a/drivers/gpu/drm/i915/i915_priolist_types.h
+++ b/drivers/gpu/drm/i915/i915_priolist_types.h
@@ -39,6 +39,7 @@  enum {
  * active request.
  */
 #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
+#define I915_PRIORITY_BARRIER INT_MAX
 
 #define __NO_PREEMPTION (I915_PRIORITY_WAIT)
 
diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
index 6daf6599ec79..00a063730bc3 100644
--- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
+++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
@@ -17,6 +17,7 @@  selftest(gt_timelines, intel_timeline_live_selftests)
 selftest(gt_contexts, intel_context_live_selftests)
 selftest(gt_lrc, intel_lrc_live_selftests)
 selftest(gt_pm, intel_gt_pm_live_selftests)
+selftest(gt_heartbeat, intel_heartbeat_live_selftests)
 selftest(requests, i915_request_live_selftests)
 selftest(active, i915_active_live_selftests)
 selftest(objects, i915_gem_object_live_selftests)