@@ -237,6 +237,9 @@ __printf(3, 4)
void intel_engine_dump(struct intel_engine_cs *engine,
struct drm_printer *m,
const char *header, ...);
+void intel_engine_dump_active_requests(struct list_head *requests,
+ struct i915_request *hung_rq,
+ struct drm_printer *m);
ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine,
ktime_t *now);
@@ -1680,6 +1680,97 @@ static void print_properties(struct intel_engine_cs *engine,
read_ul(&engine->defaults, p->offset));
}
+static void engine_dump_request(struct i915_request *rq, struct drm_printer *m, const char *msg)
+{
+ struct intel_timeline *tl = get_timeline(rq);
+
+ i915_request_show(m, rq, msg, 0);
+
+ drm_printf(m, "\t\tring->start: 0x%08x\n",
+ i915_ggtt_offset(rq->ring->vma));
+ drm_printf(m, "\t\tring->head: 0x%08x\n",
+ rq->ring->head);
+ drm_printf(m, "\t\tring->tail: 0x%08x\n",
+ rq->ring->tail);
+ drm_printf(m, "\t\tring->emit: 0x%08x\n",
+ rq->ring->emit);
+ drm_printf(m, "\t\tring->space: 0x%08x\n",
+ rq->ring->space);
+
+ if (tl) {
+ drm_printf(m, "\t\tring->hwsp: 0x%08x\n",
+ tl->hwsp_offset);
+ intel_timeline_put(tl);
+ }
+
+ print_request_ring(m, rq);
+
+ if (rq->context->lrc_reg_state) {
+ drm_printf(m, "Logical Ring Context:\n");
+ hexdump(m, rq->context->lrc_reg_state, PAGE_SIZE);
+ }
+}
+
+void intel_engine_dump_active_requests(struct list_head *requests,
+ struct i915_request *hung_rq,
+ struct drm_printer *m)
+{
+ struct i915_request *rq;
+ const char *msg;
+ enum i915_request_state state;
+
+ list_for_each_entry(rq, requests, sched.link) {
+ if (rq == hung_rq)
+ continue;
+
+ state = i915_test_request_state(rq);
+ if (state < I915_REQUEST_QUEUED)
+ continue;
+
+ if (state == I915_REQUEST_ACTIVE)
+ msg = "\t\tactive on engine";
+ else
+ msg = "\t\tactive in queue";
+
+ engine_dump_request(rq, m, msg);
+ }
+}
+
+static void engine_dump_active_requests(struct intel_engine_cs *engine, struct drm_printer *m)
+{
+ struct i915_request *hung_rq = NULL;
+ struct intel_context *ce;
+ bool guc;
+
+ /*
+ * No need for an engine->irq_seqno_barrier() before the seqno reads.
+ * The GPU is still running so requests are still executing and any
+ * hardware reads will be out of date by the time they are reported.
+ * But the intention here is just to report an instantaneous snapshot
+ * so that's fine.
+ */
+ lockdep_assert_held(&engine->sched_engine->lock);
+
+ drm_printf(m, "\tRequests:\n");
+
+ guc = intel_uc_uses_guc_submission(&engine->gt->uc);
+ if (guc) {
+ ce = intel_engine_get_hung_context(engine);
+ if (ce)
+ hung_rq = intel_context_find_active_request(ce);
+ } else
+ hung_rq = intel_engine_execlist_find_hung_request(engine);
+
+ if (hung_rq)
+ engine_dump_request(hung_rq, m, "\t\thung");
+
+ if (guc)
+ intel_guc_dump_active_requests(engine, hung_rq, m);
+ else
+ intel_engine_dump_active_requests(&engine->sched_engine->requests,
+ hung_rq, m);
+}
+
void intel_engine_dump(struct intel_engine_cs *engine,
struct drm_printer *m,
const char *header, ...)
@@ -1724,39 +1815,9 @@ void intel_engine_dump(struct intel_engine_cs *engine,
i915_reset_count(error));
print_properties(engine, m);
- drm_printf(m, "\tRequests:\n");
-
spin_lock_irqsave(&engine->sched_engine->lock, flags);
- rq = intel_engine_execlist_find_hung_request(engine);
- if (rq) {
- struct intel_timeline *tl = get_timeline(rq);
-
- i915_request_show(m, rq, "\t\tactive ", 0);
-
- drm_printf(m, "\t\tring->start: 0x%08x\n",
- i915_ggtt_offset(rq->ring->vma));
- drm_printf(m, "\t\tring->head: 0x%08x\n",
- rq->ring->head);
- drm_printf(m, "\t\tring->tail: 0x%08x\n",
- rq->ring->tail);
- drm_printf(m, "\t\tring->emit: 0x%08x\n",
- rq->ring->emit);
- drm_printf(m, "\t\tring->space: 0x%08x\n",
- rq->ring->space);
-
- if (tl) {
- drm_printf(m, "\t\tring->hwsp: 0x%08x\n",
- tl->hwsp_offset);
- intel_timeline_put(tl);
- }
-
- print_request_ring(m, rq);
+ engine_dump_active_requests(engine, m);
- if (rq->context->lrc_reg_state) {
- drm_printf(m, "Logical Ring Context:\n");
- hexdump(m, rq->context->lrc_reg_state, PAGE_SIZE);
- }
- }
drm_printf(m, "\tOn hold?: %lu\n",
list_count(&engine->sched_engine->hold));
spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
@@ -1830,13 +1891,6 @@ intel_engine_create_virtual(struct intel_engine_cs **siblings,
return siblings[0]->cops->create_virtual(siblings, count);
}
-static bool match_ring(struct i915_request *rq)
-{
- u32 ring = ENGINE_READ(rq->engine, RING_START);
-
- return ring == i915_ggtt_offset(rq->ring->vma);
-}
-
struct i915_request *
intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine)
{
@@ -1880,14 +1934,7 @@ intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine)
list_for_each_entry(request, &engine->sched_engine->requests,
sched.link) {
- if (__i915_request_is_complete(request))
- continue;
-
- if (!__i915_request_has_started(request))
- continue;
-
- /* More than one preemptible request may match! */
- if (!match_ring(request))
+ if (i915_test_request_state(request) != I915_REQUEST_ACTIVE)
continue;
active = request;
@@ -90,6 +90,14 @@ reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
show_heartbeat(rq, engine);
+ if (intel_engine_uses_guc(engine))
+ /*
+ * GuC itself is toast or GuC's hang detection
+ * is disabled. Either way, need to find the
+ * hang culprit manually.
+ */
+ intel_guc_find_hung_context(engine);
+
intel_gt_handle_error(engine->gt, engine->mask,
I915_ERROR_CAPTURE,
"stopped heartbeat on %s",
@@ -156,7 +156,7 @@ void __i915_request_reset(struct i915_request *rq, bool guilty)
if (guilty) {
i915_request_set_error_once(rq, -EIO);
__i915_request_skip(rq);
- if (mark_guilty(rq))
+ if (mark_guilty(rq) && !intel_engine_uses_guc(rq->engine))
skip_context(rq);
} else {
i915_request_set_error_once(rq, -EAGAIN);
@@ -269,6 +269,8 @@ int intel_guc_context_reset_process_msg(struct intel_guc *guc,
int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
const u32 *msg, u32 len);
+void intel_guc_find_hung_context(struct intel_engine_cs *engine);
+
void intel_guc_submission_reset_prepare(struct intel_guc *guc);
void intel_guc_submission_reset(struct intel_guc *guc, bool stalled);
void intel_guc_submission_reset_finish(struct intel_guc *guc);
@@ -2267,6 +2267,73 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
return 0;
}
+void intel_guc_find_hung_context(struct intel_engine_cs *engine)
+{
+ struct intel_guc *guc = &engine->gt->uc.guc;
+ struct intel_context *ce;
+ struct i915_request *rq;
+ unsigned long index;
+
+ /* Reset called during driver load? GuC not yet initialised! */
+ if (unlikely(!guc_submission_initialized(guc)))
+ return;
+
+ xa_for_each(&guc->context_lookup, index, ce) {
+ if (!intel_context_is_pinned(ce))
+ continue;
+
+ if (intel_engine_is_virtual(ce->engine)) {
+ if (!(ce->engine->mask & engine->mask))
+ continue;
+ } else {
+ if (ce->engine != engine)
+ continue;
+ }
+
+ list_for_each_entry(rq, &ce->guc_active.requests, sched.link) {
+ if (i915_test_request_state(rq) != I915_REQUEST_ACTIVE)
+ continue;
+
+ intel_engine_set_hung_context(engine, ce);
+
+ /* Can only cope with one hang at a time... */
+ return;
+ }
+ }
+}
+
+void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
+ struct i915_request *hung_rq,
+ struct drm_printer *m)
+{
+ struct intel_guc *guc = &engine->gt->uc.guc;
+ struct intel_context *ce;
+ unsigned long index;
+ unsigned long flags;
+
+ /* Reset called during driver load? GuC not yet initialised! */
+ if (unlikely(!guc_submission_initialized(guc)))
+ return;
+
+ xa_for_each(&guc->context_lookup, index, ce) {
+ if (!intel_context_is_pinned(ce))
+ continue;
+
+ if (intel_engine_is_virtual(ce->engine)) {
+ if (!(ce->engine->mask & engine->mask))
+ continue;
+ } else {
+ if (ce->engine != engine)
+ continue;
+ }
+
+ spin_lock_irqsave(&ce->guc_active.lock, flags);
+ intel_engine_dump_active_requests(&ce->guc_active.requests,
+ hung_rq, m);
+ spin_unlock_irqrestore(&ce->guc_active.lock, flags);
+ }
+}
+
void intel_guc_log_submission_info(struct intel_guc *guc,
struct drm_printer *p)
{
@@ -24,6 +24,9 @@ int intel_guc_submission_setup(struct intel_engine_cs *engine);
void intel_guc_log_submission_info(struct intel_guc *guc,
struct drm_printer *p);
void intel_guc_log_context_info(struct intel_guc *guc, struct drm_printer *p);
+void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
+ struct i915_request *hung_rq,
+ struct drm_printer *m);
bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
@@ -2076,6 +2076,47 @@ void i915_request_show(struct drm_printer *m,
name);
}
+static bool engine_match_ring(struct intel_engine_cs *engine, struct i915_request *rq)
+{
+ u32 ring = ENGINE_READ(engine, RING_START);
+
+ return ring == i915_ggtt_offset(rq->ring->vma);
+}
+
+static bool match_ring(struct i915_request *rq)
+{
+ struct intel_engine_cs *engine;
+ bool found;
+ int i;
+
+ if (!intel_engine_is_virtual(rq->engine))
+ return engine_match_ring(rq->engine, rq);
+
+ found = false;
+ i = 0;
+ while ((engine = intel_engine_get_sibling(rq->engine, i++))) {
+ found = engine_match_ring(engine, rq);
+ if (found)
+ break;
+ }
+
+ return found;
+}
+
+enum i915_request_state i915_test_request_state(struct i915_request *rq)
+{
+ if (i915_request_completed(rq))
+ return I915_REQUEST_COMPLETE;
+
+ if (!i915_request_started(rq))
+ return I915_REQUEST_PENDING;
+
+ if (match_ring(rq))
+ return I915_REQUEST_ACTIVE;
+
+ return I915_REQUEST_QUEUED;
+}
+
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftests/mock_request.c"
#include "selftests/i915_request.c"
@@ -651,4 +651,15 @@ i915_request_active_engine(struct i915_request *rq,
void i915_request_notify_execute_cb_imm(struct i915_request *rq);
+enum i915_request_state
+{
+ I915_REQUEST_UNKNOWN = 0,
+ I915_REQUEST_COMPLETE,
+ I915_REQUEST_PENDING,
+ I915_REQUEST_QUEUED,
+ I915_REQUEST_ACTIVE,
+};
+
+enum i915_request_state i915_test_request_state(struct i915_request *rq);
+
#endif /* I915_REQUEST_H */