[5/7] drm/i915/gem: Cancel contexts when hangchecking is disabled

Message ID	20191015122250.28884-5-chris@chris-wilson.co.uk (mailing list archive)
State	New, archived
Headers	show Return-Path: <SRS0=HTTV=YI=lists.freedesktop.org=intel-gfx-bounces@kernel.org> DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org 1BC072067B From: Chris Wilson <chris@chris-wilson.co.uk> To: intel-gfx@lists.freedesktop.org Date: Tue, 15 Oct 2019 13:22:48 +0100 Message-Id: <20191015122250.28884-5-chris@chris-wilson.co.uk> In-Reply-To: <20191015122250.28884-1-chris@chris-wilson.co.uk> References: <20191015122250.28884-1-chris@chris-wilson.co.uk> MIME-Version: 1.0 Subject: [Intel-gfx] [PATCH 5/7] drm/i915/gem: Cancel contexts when hangchecking is disabled Precedence: list Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>
Series	[1/7] drm/i915: Expose engine properties via sysfs \| expand [1/7] drm/i915: Expose engine properties via sysfs [2/7] drm/i915/execlists: Force preemption [3/7] drm/i915/gt: Introduce barrier pulses along engines [4/7] drm/i915/execlists: Cancel banned contexts on schedule-out [5/7] drm/i915/gem: Cancel contexts when hangchecking is disabled [6/7] drm/i915: Replace hangcheck by heartbeats [7/7] drm/i915/gem: Make context persistence optional

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index 5d8221c7ba83..1f58d0b654d8 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -70,6 +70,7 @@ #include <drm/i915_drm.h> #include "gt/intel_lrc_reg.h" +#include "gt/intel_engine_heartbeat.h" #include "gt/intel_engine_user.h" #include "i915_gem_context.h" @@ -269,6 +270,135 @@ void i915_gem_context_release(struct kref *ref) schedule_work(&gc->free_work); } +static inline struct i915_gem_engines * +__context_engines_static(const struct i915_gem_context *ctx) +{ + return rcu_dereference_protected(ctx->engines, true); +} + +static bool __reset_engine(struct intel_engine_cs *engine) +{ + struct intel_gt *gt = engine->gt; + bool success = false; + + if (!intel_has_reset_engine(gt)) + return false; + + if (!test_and_set_bit(I915_RESET_ENGINE + engine->id, + &gt->reset.flags)) { + success = intel_engine_reset(engine, NULL) == 0; + clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id, + &gt->reset.flags); + } + + return success; +} + +static void __reset_context(struct i915_gem_context *ctx, + struct intel_engine_cs *engine) +{ + intel_gt_handle_error(engine->gt, engine->mask, 0, + "context closure in %s", ctx->name); +} + +static bool __cancel_engine(struct intel_engine_cs *engine) +{ + /* + * Send a "high priority pulse" down the engine to cause the + * current request to be momentarily preempted. (If it fails to + * be preempted, it will be reset). As we have marked our context + * as banned, any incomplete request, including any running, will + * be skipped following the preemption. + * + * If there is no hangchecking (one of the reasons why we try to + * cancel the context) and no forced preemption, there may be no + * means by which we reset the GPU and evict the persistent hog. + * Ergo if we are unable to inject a preemptive pulse that can + * kill the banned context, we fallback to doing a local reset + * instead. + */ + if (CONFIG_DRM_I915_PREEMPT_TIMEOUT && !intel_engine_pulse(engine)) + return true; + + /* If we are unable to send a pulse, try resetting this engine. */ + return __reset_engine(engine); +} + +static struct intel_engine_cs * +active_engine(struct dma_fence *fence, struct intel_context *ce) +{ + struct i915_request *rq = to_request(fence); + struct intel_engine_cs *engine, *locked; + + /* + * Serialise with __i915_request_submit() so that it sees + * is-banned?, or we know the request is already inflight. + */ + locked = READ_ONCE(rq->engine); + spin_lock_irq(&locked->active.lock); + while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) { + spin_unlock(&locked->active.lock); + spin_lock(&engine->active.lock); + locked = engine; + } + + engine = NULL; + if (i915_request_is_active(rq) && !rq->fence.error) + engine = rq->engine; + + spin_unlock_irq(&locked->active.lock); + + return engine; +} + +static void kill_context(struct i915_gem_context *ctx) +{ + struct i915_gem_engines_iter it; + struct intel_context *ce; + + /* + * If we are already banned, it was due to a guilty request causing + * a reset and the entire context being evicted from the GPU. + */ + if (i915_gem_context_is_banned(ctx)) + return; + + i915_gem_context_set_banned(ctx); + + /* + * Map the user's engine back to the actual engines; one virtual + * engine will be mapped to multiple engines, and using ctx->engine[] + * the same engine may be have multiple instances in the user's map. + * However, we only care about pending requests, so only include + * engines on which there are incomplete requests. + */ + for_each_gem_engine(ce, __context_engines_static(ctx), it) { + struct intel_engine_cs *engine; + struct dma_fence *fence; + + if (!ce->timeline) + continue; + + fence = i915_active_fence_get(&ce->timeline->last_request); + if (!fence) + continue; + + /* Check with the backend if the request is still inflight */ + engine = active_engine(fence, ce); + + /* First attempt to gracefully cancel the context */ + if (engine && !__cancel_engine(engine)) + /* + * If we are unable to send a preemptive pulse to bump + * the context from the GPU, we have to resort to a full + * reset. We hope the collateral damage is worth it. + */ + __reset_context(ctx, engine); + + dma_fence_put(fence); + } +} + static void context_close(struct i915_gem_context *ctx) { struct i915_address_space *vm; @@ -291,6 +421,17 @@ static void context_close(struct i915_gem_context *ctx) lut_close(ctx); mutex_unlock(&ctx->mutex); + + /* + * If the user has disabled hangchecking, we can not be sure that + * the batches will ever complete after the context is closed, + * keeping the context and all resources pinned forever. So in this + * case we opt to forcibly kill off all remaining requests on + * context close. + */ + if (!i915_modparams.enable_hangcheck) + kill_context(ctx); + i915_gem_context_put(ctx); }

[5/7] drm/i915/gem: Cancel contexts when hangchecking is disabled

Commit Message

Patch