diff mbox series

drm/i915/gt: Stop poisoning the idle kernel context alone when waking up

Message ID 20240724141745.17345-3-krzysztof.niemiec@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915/gt: Stop poisoning the idle kernel context alone when waking up | expand

Commit Message

Krzysztof Niemiec July 24, 2024, 2:17 p.m. UTC
From: Chris Wilson <chris.p.wilson@linux.intel.com>

The kernel context was poisoned on wakeup to simulate how the driver
would cope with bad HW that caused corruption of any context that was
still resident during power loss, see commit 1d0e2c9359fe ("drm/i915/gt:
Always poison the kernel_context image before unparking"). However,
clearing the entire context image also poisons the ppHWSP, which may
contain pertinent information (such as the breadcrumb and context
switches) that we want to preserve. We could restrict the poisoning to
exclude the ppHWSP, or more simply recognise that we have poisoned the
HW enough and can leave the verification to after suspend/resume.

References: 1d0e2c9359fe ("drm/i915/gt: Always poison the kernel_context image before unparking")
Signed-off-by: Chris Wilson <chris.p.wilson@linux.intel.com>
Signed-off-by: Krzysztof Niemiec <krzysztof.niemiec@intel.com>
---
 .../gpu/drm/i915/gt/intel_engine_heartbeat.c  |  4 ++++
 drivers/gpu/drm/i915/gt/intel_engine_pm.c     | 24 -------------------
 drivers/gpu/drm/i915/gt/intel_lrc.c           | 12 ++++++----
 3 files changed, 12 insertions(+), 28 deletions(-)
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
index 8d4bb95f8424..7d69bc496283 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
@@ -127,6 +127,10 @@  reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
 		 */
 		intel_guc_find_hung_context(engine);
 
+	/* If the heartbeat failed to resume after reset, declare an emergency. */
+	if (xchg(&rq->fence.error, -ENODEV) == -ENODEV)
+		intel_gt_set_wedged(engine->gt);
+
 	intel_gt_handle_error(engine->gt, engine->mask,
 			      I915_ERROR_CAPTURE,
 			      "stopped heartbeat on %s",
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
index fb7bff27b45a..a5c42b784168 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
@@ -59,35 +59,11 @@  static int __engine_unpark(struct intel_wakeref *wf)
 {
 	struct intel_engine_cs *engine =
 		container_of(wf, typeof(*engine), wakeref);
-	struct intel_context *ce;
 
 	ENGINE_TRACE(engine, "\n");
 
 	engine->wakeref_track = intel_gt_pm_get(engine->gt);
 
-	/* Discard stale context state from across idling */
-	ce = engine->kernel_context;
-	if (ce) {
-		GEM_BUG_ON(test_bit(CONTEXT_VALID_BIT, &ce->flags));
-
-		/* Flush all pending HW writes before we touch the context */
-		while (unlikely(intel_context_inflight(ce)))
-			intel_engine_flush_submission(engine);
-
-		/* First poison the image to verify we never fully trust it */
-		dbg_poison_ce(ce);
-
-		/* Scrub the context image after our loss of control */
-		ce->ops->reset(ce);
-
-		CE_TRACE(ce, "reset { seqno:%x, *hwsp:%x, ring:%x }\n",
-			 ce->timeline->seqno,
-			 READ_ONCE(*ce->timeline->hwsp_seqno),
-			 ce->ring->emit);
-		GEM_BUG_ON(ce->timeline->seqno !=
-			   READ_ONCE(*ce->timeline->hwsp_seqno));
-	}
-
 	if (engine->unpark)
 		engine->unpark(engine);
 
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 7bd5d2c29056..f742707b703e 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1017,15 +1017,19 @@  void lrc_init_state(struct intel_context *ce,
 
 	set_redzone(state, engine);
 
+	/* Clear the ppHWSP (inc. per-context counters) */
+	if (!test_bit(CONTEXT_VALID_BIT, &ce->flags))
+		memset(state, 0, LRC_STATE_OFFSET);
+
 	if (ce->default_state) {
-		shmem_read(ce->default_state, 0, state, engine->context_size);
+		shmem_read(ce->default_state, /* exclude ppHWSP */
+			   LRC_STATE_OFFSET,
+			   state + LRC_STATE_OFFSET,
+			   engine->context_size - LRC_STATE_OFFSET);
 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
 		inhibit = false;
 	}
 
-	/* Clear the ppHWSP (inc. per-context counters) */
-	memset(state, 0, PAGE_SIZE);
-
 	/* Clear the indirect wa and storage */
 	if (ce->wa_bb_page)
 		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);