[10/12] drm/i915: Cancel non-persistent contexts on close

Message ID	20191006165002.30312-10-chris@chris-wilson.co.uk (mailing list archive)
State	New, archived
Headers	show Return-Path: <SRS0=Zxrt=X7=lists.freedesktop.org=intel-gfx-bounces@kernel.org> DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org 5EED320835 From: Chris Wilson <chris@chris-wilson.co.uk> To: intel-gfx@lists.freedesktop.org Date: Sun, 6 Oct 2019 17:50:00 +0100 Message-Id: <20191006165002.30312-10-chris@chris-wilson.co.uk> In-Reply-To: <20191006165002.30312-1-chris@chris-wilson.co.uk> References: <20191006165002.30312-1-chris@chris-wilson.co.uk> MIME-Version: 1.0 Subject: [Intel-gfx] [PATCH 10/12] drm/i915: Cancel non-persistent contexts on close Precedence: list Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>
Series	[01/12] drm/i915/execlists: Fix annotation for decoupling virtual request \| expand [01/12] drm/i915/execlists: Fix annotation for decoupling virtual request [02/12] drm/i915/selftests: Appease lockdep [03/12] drm/i915/gt: Restore dropped 'interruptible' flag [04/12] drm/i915/gt: Treat a busy timeline as 'active' while waiting [05/12] drm/i915: Expose engine properties via sysfs [06/12] drm/i915/execlists: Force preemption [07/12] drm/i915: Mark up "sentinel" requests [08/12] drm/i915/gt: Introduce barrier pulses along engines [09/12] drm/i915/execlists: Cancel banned contexts on schedule-out [10/12] drm/i915: Cancel non-persistent contexts on close [11/12] drm/i915: Replace hangcheck by heartbeats [12/12] drm/i915: Flush idle barriers when waiting

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index 5d8221c7ba83..46e5b3b53288 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -70,6 +70,7 @@ #include <drm/i915_drm.h> #include "gt/intel_lrc_reg.h" +#include "gt/intel_engine_heartbeat.h" #include "gt/intel_engine_user.h" #include "i915_gem_context.h" @@ -269,6 +270,78 @@ void i915_gem_context_release(struct kref *ref) schedule_work(&gc->free_work); } +static inline struct i915_gem_engines * +__context_engines_static(struct i915_gem_context *ctx) +{ + return rcu_dereference_protected(ctx->engines, true); +} + +static void kill_context(struct i915_gem_context *ctx) +{ + intel_engine_mask_t tmp, active, reset; + struct intel_gt *gt = &ctx->i915->gt; + struct i915_gem_engines_iter it; + struct intel_engine_cs *engine; + struct intel_context *ce; + + /* + * If we are already banned, it was due to a guilty request causing + * a reset and the entire context being evicted from the GPU. + */ + if (i915_gem_context_is_banned(ctx)) + return; + + i915_gem_context_set_banned(ctx); + + /* + * Map the user's engine back to the actual engines; one virtual + * engine will be mapped to multiple engines, and using ctx->engine[] + * the same engine may be have multiple instances in the user's map. + * However, we only care about pending requests, so only include + * engines on which there are incomplete requests. + */ + active = 0; + for_each_gem_engine(ce, __context_engines_static(ctx), it) { + struct dma_fence *fence; + + if (!ce->timeline) + continue; + + fence = i915_active_fence_get(&ce->timeline->last_request); + if (!fence) + continue; + + engine = to_request(fence)->engine; + if (HAS_EXECLISTS(gt->i915)) + engine = intel_context_inflight(ce); + if (engine) + active |= engine->mask; + + dma_fence_put(fence); + } + + /* + * Send a "high priority pulse" down the engine to cause the + * current request to be momentarily preempted. (If it fails to + * be preempted, it will be reset). As we have marked our context + * as banned, any incomplete request, including any running, will + * be skipped following the preemption. + */ + reset = 0; + for_each_engine_masked(engine, gt->i915, active, tmp) + if (intel_engine_pulse(engine)) + reset |= engine->mask; + + /* + * If we are unable to send a preemptive pulse to bump + * the context from the GPU, we have to resort to a full + * reset. We hope the collateral damage is worth it. + */ + if (reset) + intel_gt_handle_error(gt, reset, 0, + "context closure in %s", ctx->name); +} + static void context_close(struct i915_gem_context *ctx) { struct i915_address_space *vm; @@ -291,9 +364,47 @@ static void context_close(struct i915_gem_context *ctx) lut_close(ctx); mutex_unlock(&ctx->mutex); + + /* + * If the user has disabled hangchecking, we can not be sure that + * the batches will ever complete after the context is closed, + * keep the context and all resources pinned forever. So in this + * case we opt to forcibly kill off all remaining requests on + * context close. + */ + if (!i915_gem_context_is_persistent(ctx) || + !i915_modparams.enable_hangcheck) + kill_context(ctx); + i915_gem_context_put(ctx); } +static int __context_set_persistence(struct i915_gem_context *ctx, bool state) +{ + if (i915_gem_context_is_persistent(ctx) == state) + return 0; + + if (state) { + /* + * Only contexts that are short-lived [that will expire or be + * reset] are allowed to survive past termination. We require + * hangcheck to ensure that the persistent requests are healthy. + */ + if (!i915_modparams.enable_hangcheck) + return -EINVAL; + + i915_gem_context_set_persistence(ctx); + } else { + /* To cancel a context we use "preempt-to-idle" */ + if (!(ctx->i915->caps.scheduler & I915_SCHEDULER_CAP_PREEMPTION)) + return -ENODEV; + + i915_gem_context_clear_persistence(ctx); + } + + return 0; +} + static struct i915_gem_context * __create_context(struct drm_i915_private *i915) { @@ -328,6 +439,7 @@ __create_context(struct drm_i915_private *i915) i915_gem_context_set_bannable(ctx); i915_gem_context_set_recoverable(ctx); + __context_set_persistence(ctx, true /* cgroup hook? */); for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++) ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES; @@ -484,6 +596,7 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio) return ctx; i915_gem_context_clear_bannable(ctx); + i915_gem_context_set_persistence(ctx); ctx->sched.priority = I915_USER_PRIORITY(prio); GEM_BUG_ON(!i915_gem_context_is_kernel(ctx)); @@ -1594,6 +1707,16 @@ get_engines(struct i915_gem_context *ctx, return err; } +static int +set_persistence(struct i915_gem_context *ctx, + const struct drm_i915_gem_context_param *args) +{ + if (args->size) + return -EINVAL; + + return __context_set_persistence(ctx, args->value); +} + static int ctx_setparam(struct drm_i915_file_private *fpriv, struct i915_gem_context *ctx, struct drm_i915_gem_context_param *args) @@ -1671,6 +1794,10 @@ static int ctx_setparam(struct drm_i915_file_private *fpriv, ret = set_engines(ctx, args); break; + case I915_CONTEXT_PARAM_PERSISTENCE: + ret = set_persistence(ctx, args); + break; + case I915_CONTEXT_PARAM_BAN_PERIOD: default: ret = -EINVAL; @@ -2123,6 +2250,11 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, ret = get_engines(ctx, args); break; + case I915_CONTEXT_PARAM_PERSISTENCE: + args->size = 0; + args->value = i915_gem_context_is_persistent(ctx); + break; + case I915_CONTEXT_PARAM_BAN_PERIOD: default: ret = -EINVAL; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h b/drivers/gpu/drm/i915/gem/i915_gem_context.h index 9234586830d1..2eec035382a2 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h @@ -76,6 +76,21 @@ static inline void i915_gem_context_clear_recoverable(struct i915_gem_context *c clear_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags); } +static inline bool i915_gem_context_is_persistent(const struct i915_gem_context *ctx) +{ + return test_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags); +} + +static inline void i915_gem_context_set_persistence(struct i915_gem_context *ctx) +{ + set_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags); +} + +static inline void i915_gem_context_clear_persistence(struct i915_gem_context *ctx) +{ + clear_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags); +} + static inline bool i915_gem_context_is_banned(const struct i915_gem_context *ctx) { return test_bit(CONTEXT_BANNED, &ctx->flags); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h index ab8e1367dfc8..a3ecd19f2303 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h @@ -137,6 +137,7 @@ struct i915_gem_context { #define UCONTEXT_NO_ERROR_CAPTURE 1 #define UCONTEXT_BANNABLE 2 #define UCONTEXT_RECOVERABLE 3 +#define UCONTEXT_PERSISTENCE 4 /** * @flags: small set of booleans diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_context.c b/drivers/gpu/drm/i915/gem/selftests/mock_context.c index 74ddd682c9cd..29b8984f0e47 100644 --- a/drivers/gpu/drm/i915/gem/selftests/mock_context.c +++ b/drivers/gpu/drm/i915/gem/selftests/mock_context.c @@ -22,6 +22,8 @@ mock_context(struct drm_i915_private *i915, INIT_LIST_HEAD(&ctx->link); ctx->i915 = i915; + i915_gem_context_set_persistence(ctx); + mutex_init(&ctx->engines_mutex); e = default_engines(ctx); if (IS_ERR(e)) diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 30c542144016..eb9e704d717a 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1565,6 +1565,21 @@ struct drm_i915_gem_context_param { * i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND) */ #define I915_CONTEXT_PARAM_ENGINES 0xa + +/* + * I915_CONTEXT_PARAM_PERSISTENCE: + * + * Allow the context and active rendering to survive the process until + * completion. Persistence allows fire-and-forget clients to queue up a + * bunch of work, hand the output over to a display server and the quit. + * If the context is not marked as persistent, upon closing (either via + * an explicit DRM_I915_GEM_CONTEXT_DESTROY or implicitly from file closure + * or process termination), the context and any outstanding requests will be + * cancelled (and exported fences for cancelled requests marked as -EIO). + * + * By default, new contexts allow persistence. + */ +#define I915_CONTEXT_PARAM_PERSISTENCE 0xb /* Must be kept compact -- no holes and well documented */ __u64 value;

[10/12] drm/i915: Cancel non-persistent contexts on close

Commit Message

Patch