diff mbox series

drm/i915/guc: Set wedged if enable guc communication failed

Message ID 20230426161133.1009519-1-zhanjun.dong@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915/guc: Set wedged if enable guc communication failed | expand

Commit Message

Dong, Zhanjun April 26, 2023, 4:11 p.m. UTC
Check the error code returned by guc_enable_communication() on the resume path. If resume fails, the GPU can no longer be used, so mark it as wedged.

Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_gt_pm.c |  7 ++++++-
 drivers/gpu/drm/i915/gt/intel_reset.c | 19 ++++++++++++++++---
 drivers/gpu/drm/i915/gt/intel_reset.h |  1 +
 drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++--
 4 files changed, 30 insertions(+), 6 deletions(-)
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index e02cb90723ae..775ce511f810 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -373,8 +373,13 @@  int intel_gt_runtime_resume(struct intel_gt *gt)
 	intel_ggtt_restore_fences(gt->ggtt);
 
 	ret = intel_uc_runtime_resume(&gt->uc);
-	if (ret)
+	if (ret && intel_uc_uses_guc_submission(&gt->uc)) {
+		/* GuC submission could not be resumed, so the GPU can no longer be used;
+		 * mark it as wedged.
+		 */
+		intel_gt_set_wedged_flag(gt);
 		return ret;
+	}
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index 195ff72d7a14..05142761770a 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -962,6 +962,20 @@  static void nop_submit_request(struct i915_request *request)
 	}
 }
 
+void intel_gt_set_wedged_flag(struct intel_gt *gt)
+{
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	if (test_bit(I915_WEDGED, &gt->reset.flags))
+		return;
+
+	for_each_engine(engine, gt, id)
+		engine->submit_request = nop_submit_request;
+
+	set_bit(I915_WEDGED, &gt->reset.flags);
+}
+
 static void __intel_gt_set_wedged(struct intel_gt *gt)
 {
 	struct intel_engine_cs *engine;
@@ -984,8 +998,8 @@  static void __intel_gt_set_wedged(struct intel_gt *gt)
 	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
 		__intel_gt_reset(gt, ALL_ENGINES);
 
-	for_each_engine(engine, gt, id)
-		engine->submit_request = nop_submit_request;
+
+	intel_gt_set_wedged_flag(gt);
 
 	/*
 	 * Make sure no request can slip through without getting completed by
@@ -993,7 +1007,6 @@  static void __intel_gt_set_wedged(struct intel_gt *gt)
 	 * in nop_submit_request.
 	 */
 	synchronize_rcu_expedited();
-	set_bit(I915_WEDGED, &gt->reset.flags);
 
 	/* Mark all executing requests as skipped */
 	local_bh_disable();
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h b/drivers/gpu/drm/i915/gt/intel_reset.h
index 25c975b6e8fc..3796b8d877b7 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.h
+++ b/drivers/gpu/drm/i915/gt/intel_reset.h
@@ -42,6 +42,7 @@  int __must_check intel_gt_reset_trylock(struct intel_gt *gt, int *srcu);
 int __must_check intel_gt_reset_lock_interruptible(struct intel_gt *gt, int *srcu);
 void intel_gt_reset_unlock(struct intel_gt *gt, int tag);
 
+void intel_gt_set_wedged_flag(struct intel_gt *gt);
 void intel_gt_set_wedged(struct intel_gt *gt);
 bool intel_gt_unset_wedged(struct intel_gt *gt);
 int intel_gt_terminally_wedged(struct intel_gt *gt);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
index 4ccb4be4c9cb..62c5a953991c 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -700,8 +700,13 @@  static int __uc_resume(struct intel_uc *uc, bool enable_communication)
 	/* Make sure we enable communication if and only if it's disabled */
 	GEM_BUG_ON(enable_communication == intel_guc_ct_enabled(&guc->ct));
 
-	if (enable_communication)
-		guc_enable_communication(guc);
+	if (enable_communication) {
+		err = guc_enable_communication(guc);
+		if (err) {
+			DRM_DEBUG_DRIVER("Failed to enable communication, %pe", ERR_PTR(err));
+			return err;
+		}
+	}
 
 	/* If we are only resuming GuC communication but not reloading
 	 * GuC, we need to ensure the ARAT timer interrupt is enabled