[12/20] drm/i915: Only include active engines in the capture state

Message ID	20190718070024.21781-12-chris@chris-wilson.co.uk (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces@lists.freedesktop.org> From: Chris Wilson <chris@chris-wilson.co.uk> To: intel-gfx@lists.freedesktop.org Date: Thu, 18 Jul 2019 08:00:16 +0100 Message-Id: <20190718070024.21781-12-chris@chris-wilson.co.uk> In-Reply-To: <20190718070024.21781-1-chris@chris-wilson.co.uk> References: <20190718070024.21781-1-chris@chris-wilson.co.uk> MIME-Version: 1.0 Subject: [Intel-gfx] [PATCH 12/20] drm/i915: Only include active engines in the capture state Precedence: list Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>
Series	[01/20] drm/i915: Move aliasing_ppgtt underneath its i915_ggtt \| expand [01/20] drm/i915: Move aliasing_ppgtt underneath its i915_ggtt [02/20] drm/i915/gt: Hook up intel_context_fini() [03/20] drm/i915/gt: Provde a local intel_context.vm [04/20] drm/i915/execlists: Cancel breadcrumb on preempting the virtual engine [05/20] drm/i915: Hide unshrinkable context objects from the shrinker [06/20] drm/i915: Remove obsolete engine cleanup [07/20] drm/i915/gt: Move the [class][inst] lookup for engines onto the GT [08/20] drm/i915: Introduce for_each_user_engine() [09/20] drm/i915: Use intel_engine_lookup_user for probing HAS_BSD etc [10/20] drm/i915: Isolate i915_getparam_ioctl() [11/20] drm/i915: Rely on spinlock protection for GPU error capture [12/20] drm/i915: Only include active engines in the capture state [13/20] drm/i915: Teach execbuffer to take the engine wakeref not GT [14/20] drm/i915/gt: Track timeline activeness in enter/exit [15/20] drm/i915/gt: Convert timeline tracking to spinlock [16/20] drm/i915/gt: Guard timeline pinning with its own mutex [17/20] drm/i915/gt: Add to timeline requires the timeline mutex [18/20] drm/i915: Protect request retirement with timeline->mutex [19/20] drm/i915: Replace struct_mutex for batch pool serialisation [20/20] drm/i915/gt: Mark context->active_count as protected by timeline->mutex

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 131a52c428de..badbfe8dc7e6 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -49,27 +49,6 @@ #define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN) #define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN) -static inline const struct intel_engine_cs * -engine_lookup(const struct drm_i915_private *i915, unsigned int id) -{ - if (id >= I915_NUM_ENGINES) - return NULL; - - return i915->engine[id]; -} - -static inline const char * -__engine_name(const struct intel_engine_cs *engine) -{ - return engine ? engine->name : ""; -} - -static const char * -engine_name(const struct drm_i915_private *i915, unsigned int id) -{ - return __engine_name(engine_lookup(i915, id)); -} - static void __sg_set_buf(struct scatterlist *sg, void *addr, unsigned int len, loff_t it) { @@ -459,7 +438,7 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m, err_printf(m, " INSTDONE: 0x%08x\n", ee->instdone.instdone); - if (ee->engine_id != RCS0 || INTEL_GEN(m->i915) <= 3) + if (ee->engine->class != RENDER_CLASS || INTEL_GEN(m->i915) <= 3) return; err_printf(m, " SC_INSTDONE: 0x%08x\n", @@ -513,8 +492,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, { int n; - err_printf(m, "%s command stream:\n", - engine_name(m->i915, ee->engine_id)); + err_printf(m, "%s command stream:\n", ee->engine->name); err_printf(m, " IDLE?: %s\n", yesno(ee->idle)); err_printf(m, " START: 0x%08x\n", ee->start); err_printf(m, " HEAD: 0x%08x [0x%08x]\n", ee->head, ee->rq_head); @@ -590,9 +568,9 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) } static void print_error_obj(struct drm_i915_error_state_buf *m, - struct intel_engine_cs *engine, + const struct intel_engine_cs *engine, const char *name, - struct drm_i915_error_object *obj) + const struct drm_i915_error_object *obj) { char out[ASCII85_BUFSZ]; int page; @@ -689,7 +667,7 @@ static void err_free_sgl(struct scatterlist *sgl) static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, struct i915_gpu_state *error) { - struct drm_i915_error_object *obj; + const struct drm_i915_error_engine *ee; struct timespec64 ts; int i, j; @@ -713,15 +691,12 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, jiffies_to_msecs(jiffies - error->capture), jiffies_to_msecs(error->capture - error->epoch)); - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - if (!error->engine[i].context.pid) - continue; - + for (ee = error->engine; ee; ee = ee->next) err_printf(m, "Active process (on ring %s): %s [%d]\n", - engine_name(m->i915, i), - error->engine[i].context.comm, - error->engine[i].context.pid); - } + ee->engine->name, + ee->context.comm, + ee->context.pid); + err_printf(m, "Reset count: %u\n", error->reset_count); err_printf(m, "Suspend count: %u\n", error->suspend_count); err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform)); @@ -770,17 +745,15 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, if (IS_GEN(m->i915, 7)) err_printf(m, "ERR_INT: 0x%08x\n", error->err_int); - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - if (error->engine[i].engine_id != -1) - error_print_engine(m, &error->engine[i], error->epoch); - } + for (ee = error->engine; ee; ee = ee->next) + error_print_engine(m, ee, error->epoch); - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - const struct drm_i915_error_engine *ee = &error->engine[i]; + for (ee = error->engine; ee; ee = ee->next) { + const struct drm_i915_error_object *obj; obj = ee->batchbuffer; if (obj) { - err_puts(m, m->i915->engine[i]->name); + err_puts(m, ee->engine->name); if (ee->context.pid) err_printf(m, " (submitted by %s [%d])", ee->context.comm, @@ -788,16 +761,15 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, err_printf(m, " --- gtt_offset = 0x%08x %08x\n", upper_32_bits(obj->gtt_offset), lower_32_bits(obj->gtt_offset)); - print_error_obj(m, m->i915->engine[i], NULL, obj); + print_error_obj(m, ee->engine, NULL, obj); } for (j = 0; j < ee->user_bo_count; j++) - print_error_obj(m, m->i915->engine[i], - "user", ee->user_bo[j]); + print_error_obj(m, ee->engine, "user", ee->user_bo[j]); if (ee->num_requests) { err_printf(m, "%s --- %d requests\n", - m->i915->engine[i]->name, + ee->engine->name, ee->num_requests); for (j = 0; j < ee->num_requests; j++) error_print_request(m, " ", @@ -805,22 +777,13 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, error->epoch); } - print_error_obj(m, m->i915->engine[i], - "ringbuffer", ee->ringbuffer); - - print_error_obj(m, m->i915->engine[i], - "HW Status", ee->hws_page); - - print_error_obj(m, m->i915->engine[i], - "HW context", ee->ctx); - - print_error_obj(m, m->i915->engine[i], - "WA context", ee->wa_ctx); - - print_error_obj(m, m->i915->engine[i], + print_error_obj(m, ee->engine, "ringbuffer", ee->ringbuffer); + print_error_obj(m, ee->engine, "HW Status", ee->hws_page); + print_error_obj(m, ee->engine, "HW context", ee->ctx); + print_error_obj(m, ee->engine, "WA context", ee->wa_ctx); + print_error_obj(m, ee->engine, "WA batchbuffer", ee->wa_batchbuffer); - - print_error_obj(m, m->i915->engine[i], + print_error_obj(m, ee->engine, "NULL context", ee->default_state); } @@ -969,13 +932,15 @@ void __i915_gpu_state_free(struct kref *error_ref) { struct i915_gpu_state *error = container_of(error_ref, typeof(*error), ref); - long i, j; + long i; - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - struct drm_i915_error_engine *ee = &error->engine[i]; + while (error->engine) { + struct drm_i915_error_engine *ee = error->engine; - for (j = 0; j < ee->user_bo_count; j++) - i915_error_object_free(ee->user_bo[j]); + error->engine = ee->next; + + for (i = 0; i < ee->user_bo_count; i++) + i915_error_object_free(ee->user_bo[i]); kfree(ee->user_bo); i915_error_object_free(ee->batchbuffer); @@ -986,6 +951,7 @@ void __i915_gpu_state_free(struct kref *error_ref) i915_error_object_free(ee->wa_ctx); kfree(ee->requests); + kfree(ee); } kfree(error->overlay); @@ -1065,23 +1031,17 @@ i915_error_object_create(struct drm_i915_private *i915, * * It's only a small step better than a random number in its current form. */ -static u32 i915_error_generate_code(struct i915_gpu_state *error, - intel_engine_mask_t engine_mask) +static u32 i915_error_generate_code(struct i915_gpu_state *error) { + const struct drm_i915_error_engine *ee = error->engine; + /* * IPEHR would be an ideal way to detect errors, as it's the gross * measure of "the command that hung." However, has some very common * synchronization commands which almost always appear in the case * strictly a client bug. Use instdone to differentiate those some. */ - if (engine_mask) { - struct drm_i915_error_engine *ee = - &error->engine[ffs(engine_mask)]; - - return ee->ipehr ^ ee->instdone.instdone; - } - - return 0; + return ee ? ee->ipehr ^ ee->instdone.instdone : 0; } static void gem_record_fences(struct i915_gpu_state *error) @@ -1374,74 +1334,70 @@ static void gem_record_rings(struct i915_gpu_state *error, struct compress *compress) { struct drm_i915_private *i915 = error->i915; - int i; + struct intel_engine_cs *engine; + struct drm_i915_error_engine *ee; + + ee = kzalloc(sizeof(*ee), GFP_KERNEL); + if (!ee) + return; - for (i = 0; i < I915_NUM_ENGINES; i++) { - struct intel_engine_cs *engine = i915->engine[i]; - struct drm_i915_error_engine *ee = &error->engine[i]; + for_each_user_engine(engine, i915) { struct i915_request *request; unsigned long flags; - ee->engine_id = -1; - - if (!engine) - continue; - - ee->engine_id = i; - /* Refill our page pool before entering atomic section */ pool_refill(&compress->pool, ALLOW_FAIL); - error_record_engine_registers(error, engine, ee); - error_record_engine_execlists(engine, ee); - spin_lock_irqsave(&engine->active.lock, flags); request = intel_engine_find_active_request(engine); - if (request) { - struct i915_gem_context *ctx = request->gem_context; - struct intel_ring *ring = request->ring; - - record_context(&ee->context, ctx); + if (!request) { + spin_unlock_irqrestore(&engine->active.lock, flags); + continue; + } - /* We need to copy these to an anonymous buffer - * as the simplest method to avoid being overwritten - * by userspace. - */ - ee->batchbuffer = - i915_error_object_create(i915, - request->batch, - compress); + record_context(&ee->context, request->gem_context); + error->simulated |= + i915_gem_context_no_error_capture(request->gem_context); - if (HAS_BROKEN_CS_TLB(i915)) - ee->wa_batchbuffer = - i915_error_object_create(i915, - engine->gt->scratch, - compress); - request_record_user_bo(request, ee, compress); + /* + * We need to copy these to an anonymous buffer + * as the simplest method to avoid being overwritten + * by userspace. + */ + ee->batchbuffer = + i915_error_object_create(i915, + request->batch, + compress); - ee->ctx = + if (HAS_BROKEN_CS_TLB(i915)) + ee->wa_batchbuffer = i915_error_object_create(i915, - request->hw_context->state, + engine->gt->scratch, compress); + request_record_user_bo(request, ee, compress); - error->simulated |= - i915_gem_context_no_error_capture(ctx); + ee->ctx = + i915_error_object_create(i915, + request->hw_context->state, + compress); - ee->rq_head = request->head; - ee->rq_post = request->postfix; - ee->rq_tail = request->tail; + ee->rq_head = request->head; + ee->rq_post = request->postfix; + ee->rq_tail = request->tail; - ee->cpu_ring_head = ring->head; - ee->cpu_ring_tail = ring->tail; - ee->ringbuffer = - i915_error_object_create(i915, - ring->vma, - compress); + ee->cpu_ring_head = request->ring->head; + ee->cpu_ring_tail = request->ring->tail; + ee->ringbuffer = + i915_error_object_create(i915, + request->ring->vma, + compress); - engine_record_requests(engine, request, ee); - } + engine_record_requests(engine, request, ee); spin_unlock_irqrestore(&engine->active.lock, flags); + error_record_engine_registers(error, engine, ee); + error_record_engine_execlists(engine, ee); + ee->hws_page = i915_error_object_create(i915, engine->status_page.vma, @@ -1454,7 +1410,18 @@ gem_record_rings(struct i915_gpu_state *error, struct compress *compress) ee->default_state = capture_object(i915, engine->default_state, compress); + + ee->engine = engine; + + ee->next = error->engine; + error->engine = ee; + + ee = kzalloc(sizeof(*ee), GFP_KERNEL); + if (!ee) + return; } + + kfree(ee); } static void @@ -1583,24 +1550,18 @@ error_msg(struct i915_gpu_state *error, intel_engine_mask_t engines, const char *msg) { int len; - int i; - - for (i = 0; i < ARRAY_SIZE(error->engine); i++) - if (!error->engine[i].context.pid) - engines &= ~BIT(i); len = scnprintf(error->error_msg, sizeof(error->error_msg), "GPU HANG: ecode %d:%x:0x%08x", INTEL_GEN(error->i915), engines, - i915_error_generate_code(error, engines)); - if (engines) { + i915_error_generate_code(error)); + if (error->engine) { /* Just show the first executing process, more is confusing */ - i = __ffs(engines); len += scnprintf(error->error_msg + len, sizeof(error->error_msg) - len, ", in %s [%d]", - error->engine[i].context.comm, - error->engine[i].context.pid); + error->engine->context.comm, + error->engine->context.pid); } if (msg) len += scnprintf(error->error_msg + len, @@ -1641,12 +1602,10 @@ static void capture_params(struct i915_gpu_state *error) static unsigned long capture_find_epoch(const struct i915_gpu_state *error) { + const struct drm_i915_error_engine *ee; unsigned long epoch = error->capture; - int i; - - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - const struct drm_i915_error_engine *ee = &error->engine[i]; + for (ee = error->engine; ee; ee = ee->next) { if (ee->hangcheck_timestamp && time_before(ee->hangcheck_timestamp, epoch)) epoch = ee->hangcheck_timestamp; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index a24c35107d16..df9f57766626 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -81,7 +81,8 @@ struct i915_gpu_state { struct intel_display_error_state *display; struct drm_i915_error_engine { - int engine_id; + const struct intel_engine_cs *engine; + /* Software tracked state */ bool idle; unsigned long hangcheck_timestamp; @@ -158,7 +159,9 @@ struct i915_gpu_state { u32 pp_dir_base; }; } vm_info; - } engine[I915_NUM_ENGINES]; + + struct drm_i915_error_engine *next; + } *engine; struct scatterlist *sgl, *fit; };

[12/20] drm/i915: Only include active engines in the capture state

Commit Message

Patch