[19/22] drm/i915: Load balancing across a virtual engine

Message ID	20190318095204.9913-19-chris@chris-wilson.co.uk (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces@lists.freedesktop.org> From: Chris Wilson <chris@chris-wilson.co.uk> To: intel-gfx@lists.freedesktop.org Date: Mon, 18 Mar 2019 09:52:01 +0000 Message-Id: <20190318095204.9913-19-chris@chris-wilson.co.uk> In-Reply-To: <20190318095204.9913-1-chris@chris-wilson.co.uk> References: <20190318095204.9913-1-chris@chris-wilson.co.uk> MIME-Version: 1.0 Subject: [Intel-gfx] [PATCH 19/22] drm/i915: Load balancing across a virtual engine Precedence: list Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>
Series	[01/22] drm/i915: Flush pages on acquisition \| expand [01/22] drm/i915: Flush pages on acquisition [02/22] drm/i915: Move intel_engine_mask_t around for use by i915_request_types.h [03/22] drm/i915: Sanity check mmap length against object size [04/22] drm/i915: Hold a ref to the ring while retiring [05/22] drm/i915: Lock the gem_context->active_list while dropping the link [06/22] drm/i915: Hold a reference to the active HW context [07/22] drm/i915: Stop needlessly acquiring wakeref for debugfs/drop_caches_set [08/22] drm/i915/selftests: Provide stub reset functions [09/22] drm/i915: Switch to use HWS indices rather than addresses [10/22] drm/i915: Separate GEM context construction and registration to userspace [11/22] drm/i915: Introduce a mutex for file_priv->context_idr [12/22] drm/i915: Introduce the i915_user_extension_method [13/22] drm/i915: Create/destroy VM (ppGTT) for use with contexts [14/22] drm/i915: Extend CONTEXT_CREATE to set parameters upon construction [15/22] drm/i915: Allow contexts to share a single timeline across all engines [16/22] drm/i915: Allow userspace to clone contexts on creation [17/22] drm/i915: Allow a context to define its set of engines [18/22] drm/i915: Extend I915_CONTEXT_PARAM_SSEU to support local ctx->engine[] [19/22] drm/i915: Load balancing across a virtual engine [20/22] drm/i915: Extend execution fence to support a callback [21/22] drm/i915/execlists: Virtual engine bonding [22/22] drm/i915: Allow specification of parallel execbuf

diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h index 5c073fe73664..3ca855505715 100644 --- a/drivers/gpu/drm/i915/i915_gem.h +++ b/drivers/gpu/drm/i915/i915_gem.h @@ -96,4 +96,9 @@ static inline bool __tasklet_enable(struct tasklet_struct *t) return atomic_dec_and_test(&t->count); } +static inline bool __tasklet_is_scheduled(struct tasklet_struct *t) +{ + return test_bit(TASKLET_STATE_SCHED, &t->state); +} + #endif /* __I915_GEM_H__ */ diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 313a18e37071..b387b71e2cb5 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -86,6 +86,7 @@ */ #include <linux/log2.h> +#include <linux/nospec.h> #include <drm/i915_drm.h> @@ -94,6 +95,7 @@ #include "i915_trace.h" #include "i915_user_extensions.h" #include "intel_lrc_reg.h" +#include "intel_lrc.h" #include "intel_workarounds.h" #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1 @@ -241,6 +243,20 @@ static void release_hw_id(struct i915_gem_context *ctx) mutex_unlock(&i915->contexts.mutex); } +static void free_engines(struct intel_engine_cs **engines, int count) +{ + int i; + + if (ZERO_OR_NULL_PTR(engines)) + return; + + /* We own the veng we created; regular engines are ignored */ + for (i = 0; i < count; i++) + intel_virtual_engine_destroy(engines[i]); + + kfree(engines); +} + static void i915_gem_context_free(struct i915_gem_context *ctx) { struct intel_context *it, *n; @@ -251,8 +267,7 @@ static void i915_gem_context_free(struct i915_gem_context *ctx) release_hw_id(ctx); i915_ppgtt_put(ctx->ppgtt); - - kfree(ctx->engines); + free_engines(ctx->engines, ctx->num_engines); rbtree_postorder_for_each_entry_safe(it, n, &ctx->hw_contexts, node) intel_context_put(it); @@ -1252,7 +1267,6 @@ __i915_gem_context_reconfigure_sseu(struct i915_gem_context *ctx, int ret = 0; GEM_BUG_ON(INTEL_GEN(ctx->i915) < 8); - GEM_BUG_ON(engine->id != RCS0); ce = intel_context_pin_lock(ctx, engine); if (IS_ERR(ce)) @@ -1446,7 +1460,80 @@ struct set_engines { unsigned int num_engines; }; +static int +set_engines__load_balance(struct i915_user_extension __user *base, void *data) +{ + struct i915_context_engines_load_balance __user *ext = + container_of_user(base, typeof(*ext), base); + const struct set_engines *set = data; + struct intel_engine_cs *ve; + unsigned int n; + u64 mask; + u16 idx; + int err; + + if (!HAS_EXECLISTS(set->ctx->i915)) + return -ENODEV; + + if (USES_GUC_SUBMISSION(set->ctx->i915)) + return -ENODEV; /* not implement yet */ + + if (get_user(idx, &ext->engine_index)) + return -EFAULT; + + if (idx >= set->num_engines) + return -EINVAL; + + idx = array_index_nospec(idx, set->num_engines); + if (set->engines[idx]) + return -EEXIST; + + err = check_user_mbz(&ext->mbz16); + if (err) + return err; + + err = check_user_mbz(&ext->flags); + if (err) + return err; + + for (n = 0; n < ARRAY_SIZE(ext->mbz64); n++) { + err = check_user_mbz(&ext->mbz64[n]); + if (err) + return err; + } + + if (get_user(mask, &ext->engines_mask)) + return -EFAULT; + + mask &= GENMASK_ULL(set->num_engines - 1, 0) & ~BIT_ULL(idx); + if (!mask) + return -EINVAL; + + if (is_power_of_2(mask)) { + ve = set->engines[__ffs64(mask)]; + } else { + struct intel_engine_cs *stack[64]; + int bit; + + n = 0; + for_each_set_bit(bit, (unsigned long *)&mask, set->num_engines) + stack[n++] = set->engines[bit]; + + ve = intel_execlists_create_virtual(set->ctx, stack, n); + } + if (IS_ERR(ve)) + return PTR_ERR(ve); + + if (cmpxchg(&set->engines[idx], NULL, ve)) { + intel_virtual_engine_destroy(ve); + return -EEXIST; + } + + return 0; +} + static const i915_user_extension_fn set_engines__extensions[] = { + [I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_engines__load_balance, }; static int @@ -1510,13 +1597,13 @@ set_engines(struct i915_gem_context *ctx, ARRAY_SIZE(set_engines__extensions), &set); if (err) { - kfree(set.engines); + free_engines(set.engines, set.num_engines); return err; } out: mutex_lock(&ctx->i915->drm.struct_mutex); - kfree(ctx->engines); + free_engines(ctx->engines, ctx->num_engines); ctx->engines = set.engines; ctx->num_engines = set.num_engines; mutex_unlock(&ctx->i915->drm.struct_mutex); @@ -1705,7 +1792,7 @@ static int clone_engines(struct i915_gem_context *dst, struct i915_gem_context *src) { struct intel_engine_cs **engines; - unsigned int num_engines; + unsigned int num_engines, i; mutex_lock(&src->i915->drm.struct_mutex); /* serialise src->engine[] */ @@ -1720,11 +1807,36 @@ static int clone_engines(struct i915_gem_context *dst, mutex_unlock(&src->i915->drm.struct_mutex); return -ENOMEM; } + + /* + * Virtual engines are singletons; they can only exist + * inside a single context, because they embed their + * HW context... As each virtual context implies a single + * timeline (each engine can only dequeue a single request + * at any time), it would be surprising for two contexts + * to use the same engine. So let's create a copy of + * the virtual engine instead. + */ + for (i = 0; i < num_engines; i++) { + struct intel_engine_cs *engine = engines[i]; + + if (!engine || !intel_engine_is_virtual(engine)) + continue; + + engine = intel_execlists_clone_virtual(dst, engine); + if (IS_ERR(engine)) { + free_engines(engines, i); + mutex_unlock(&src->i915->drm.struct_mutex); + return PTR_ERR(engine); + } + + engines[i] = engine; + } } mutex_unlock(&src->i915->drm.struct_mutex); - kfree(dst->engines); + free_engines(dst->engines, dst->num_engines); dst->engines = engines; dst->num_engines = num_engines; return 0; diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c index e0f609d01564..8cff4f6d6158 100644 --- a/drivers/gpu/drm/i915/i915_scheduler.c +++ b/drivers/gpu/drm/i915/i915_scheduler.c @@ -247,17 +247,26 @@ sched_lock_engine(const struct i915_sched_node *node, struct intel_engine_cs *locked, struct sched_cache *cache) { - struct intel_engine_cs *engine = node_to_request(node)->engine; + const struct i915_request *rq = node_to_request(node); + struct intel_engine_cs *engine; GEM_BUG_ON(!locked); - if (engine != locked) { + /* + * Virtual engines complicate acquiring the engine timeline lock, + * as their rq->engine pointer is not stable until under that + * engine lock. The simple ploy we use is to take the lock then + * check that the rq still belongs to the newly locked engine. + */ + while (locked != (engine = READ_ONCE(rq->engine))) { spin_unlock(&locked->timeline.lock); memset(cache, 0, sizeof(*cache)); spin_lock(&engine->timeline.lock); + locked = engine; } - return engine; + GEM_BUG_ON(locked != engine); + return locked; } static bool inflight(const struct i915_request *rq, @@ -370,8 +379,11 @@ static void __i915_schedule(struct i915_request *rq, if (prio <= node->attr.priority || node_signaled(node)) continue; + GEM_BUG_ON(node_to_request(node)->engine != engine); + node->attr.priority = prio; if (!list_empty(&node->link)) { + GEM_BUG_ON(intel_engine_is_virtual(engine)); if (!cache.priolist) cache.priolist = i915_sched_lookup_priolist(engine, diff --git a/drivers/gpu/drm/i915/i915_timeline_types.h b/drivers/gpu/drm/i915/i915_timeline_types.h index d42053544d7c..17b5314fb847 100644 --- a/drivers/gpu/drm/i915/i915_timeline_types.h +++ b/drivers/gpu/drm/i915/i915_timeline_types.h @@ -26,6 +26,7 @@ struct i915_timeline { spinlock_t lock; #define TIMELINE_CLIENT 0 /* default subclass */ #define TIMELINE_ENGINE 1 +#define TIMELINE_VIRTUAL 2 struct mutex mutex; /* protects the flow of requests */ unsigned int pin_count; diff --git a/drivers/gpu/drm/i915/intel_engine_types.h b/drivers/gpu/drm/i915/intel_engine_types.h index 549fdfca17aa..66ab1deeb7f5 100644 --- a/drivers/gpu/drm/i915/intel_engine_types.h +++ b/drivers/gpu/drm/i915/intel_engine_types.h @@ -225,6 +225,7 @@ struct intel_engine_execlists { * @queue: queue of requests, in priority lists */ struct rb_root_cached queue; + struct rb_root_cached virtual; /** * @csb_write: control register for Context Switch buffer @@ -430,6 +431,7 @@ struct intel_engine_cs { #define I915_ENGINE_SUPPORTS_STATS BIT(1) #define I915_ENGINE_HAS_PREEMPTION BIT(2) #define I915_ENGINE_HAS_SEMAPHORES BIT(3) +#define I915_ENGINE_IS_VIRTUAL BIT(4) unsigned int flags; /* @@ -513,6 +515,12 @@ intel_engine_has_semaphores(const struct intel_engine_cs *engine) return engine->flags & I915_ENGINE_HAS_SEMAPHORES; } +static inline bool +intel_engine_is_virtual(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_IS_VIRTUAL; +} + #define instdone_slice_mask(dev_priv__) \ (IS_GEN(dev_priv__, 7) ? \ 1 : RUNTIME_INFO(dev_priv__)->sseu.slice_mask) diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index ae89174fd08f..926ef07f5c38 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -166,6 +166,41 @@ #define ACTIVE_PRIORITY (I915_PRIORITY_NEWCLIENT | I915_PRIORITY_NOSEMAPHORE) +struct virtual_engine { + struct intel_engine_cs base; + struct intel_context context; + + /* + * We allow only a single request through the virtual engine at a time + * (each request in the timeline waits for the completion fence of + * the previous before being submitted). By restricting ourselves to + * only submitting a single request, each request is placed on to a + * physical to maximise load spreading (by virtue of the late greedy + * scheduling -- each real engine takes the next available request + * upon idling). + */ + struct i915_request *request; + + /* + * We keep a rbtree of available virtual engines inside each physical + * engine, sorted by priority. Here we preallocate the nodes we need + * for the virtual engine, indexed by physical_engine->id. + */ + struct ve_node { + struct rb_node rb; + int prio; + } nodes[I915_NUM_ENGINES]; + + /* And finally, which physical engines this virtual engine maps onto. */ + unsigned int count; + struct intel_engine_cs *siblings[0]; +}; + +static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) +{ + return container_of(engine, struct virtual_engine, base); +} + static int execlists_context_deferred_alloc(struct intel_context *ce, struct intel_engine_cs *engine); static void execlists_init_reg_state(u32 *reg_state, @@ -229,7 +264,8 @@ static int queue_prio(const struct intel_engine_execlists *execlists) } static inline bool need_preempt(const struct intel_engine_cs *engine, - const struct i915_request *rq) + const struct i915_request *rq, + struct rb_node *rb) { int last_prio; @@ -264,6 +300,22 @@ static inline bool need_preempt(const struct intel_engine_cs *engine, rq_prio(list_next_entry(rq, link)) > last_prio) return true; + if (rb) { /* XXX virtual precedence */ + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + bool preempt = false; + + if (engine == ve->siblings[0]) { /* only preempt one sibling */ + spin_lock(&ve->base.timeline.lock); + if (ve->request) + preempt = rq_prio(ve->request) > last_prio; + spin_unlock(&ve->base.timeline.lock); + } + + if (preempt) + return preempt; + } + /* * If the inflight context did not trigger the preemption, then maybe * it was the set of queued requests? Pick the highest priority in @@ -382,6 +434,8 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine) list_for_each_entry_safe_reverse(rq, rn, &engine->timeline.requests, link) { + struct intel_engine_cs *owner; + if (i915_request_completed(rq)) break; @@ -390,14 +444,30 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine) GEM_BUG_ON(rq->hw_context->active); - GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); - if (rq_prio(rq) != prio) { - prio = rq_prio(rq); - pl = i915_sched_lookup_priolist(engine, prio); - } - GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); + /* + * Push the request back into the queue for later resubmission. + * If this request is not native to this physical engine (i.e. + * it came from a virtual source), push it back onto the virtual + * engine so that it can be moved across onto another physical + * engine as load dictates. + */ + owner = rq->hw_context->engine; + if (likely(owner == engine)) { + GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); + if (rq_prio(rq) != prio) { + prio = rq_prio(rq); + pl = i915_sched_lookup_priolist(engine, prio); + } + GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); - list_add(&rq->sched.link, pl); + list_add(&rq->sched.link, pl); + } else { + if (__i915_request_has_started(rq)) + rq->sched.attr.priority |= ACTIVE_PRIORITY; + + rq->engine = owner; + owner->submit_request(rq); + } active = rq; } @@ -659,6 +729,50 @@ static void complete_preempt_context(struct intel_engine_execlists *execlists) execlists)); } +static void virtual_update_register_offsets(u32 *regs, + struct intel_engine_cs *engine) +{ + u32 base = engine->mmio_base; + + regs[CTX_CONTEXT_CONTROL] = + i915_mmio_reg_offset(RING_CONTEXT_CONTROL(engine)); + regs[CTX_RING_HEAD] = i915_mmio_reg_offset(RING_HEAD(base)); + regs[CTX_RING_TAIL] = i915_mmio_reg_offset(RING_TAIL(base)); + regs[CTX_RING_BUFFER_START] = i915_mmio_reg_offset(RING_START(base)); + regs[CTX_RING_BUFFER_CONTROL] = i915_mmio_reg_offset(RING_CTL(base)); + + regs[CTX_BB_HEAD_U] = i915_mmio_reg_offset(RING_BBADDR_UDW(base)); + regs[CTX_BB_HEAD_L] = i915_mmio_reg_offset(RING_BBADDR(base)); + regs[CTX_BB_STATE] = i915_mmio_reg_offset(RING_BBSTATE(base)); + regs[CTX_SECOND_BB_HEAD_U] = + i915_mmio_reg_offset(RING_SBBADDR_UDW(base)); + regs[CTX_SECOND_BB_HEAD_L] = i915_mmio_reg_offset(RING_SBBADDR(base)); + regs[CTX_SECOND_BB_STATE] = i915_mmio_reg_offset(RING_SBBSTATE(base)); + + regs[CTX_CTX_TIMESTAMP] = + i915_mmio_reg_offset(RING_CTX_TIMESTAMP(base)); + regs[CTX_PDP3_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 3)); + regs[CTX_PDP3_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 3)); + regs[CTX_PDP2_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 2)); + regs[CTX_PDP2_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 2)); + regs[CTX_PDP1_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 1)); + regs[CTX_PDP1_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 1)); + regs[CTX_PDP0_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 0)); + regs[CTX_PDP0_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 0)); + + if (engine->class == RENDER_CLASS) { + regs[CTX_RCS_INDIRECT_CTX] = + i915_mmio_reg_offset(RING_INDIRECT_CTX(base)); + regs[CTX_RCS_INDIRECT_CTX_OFFSET] = + i915_mmio_reg_offset(RING_INDIRECT_CTX_OFFSET(base)); + regs[CTX_BB_PER_CTX_PTR] = + i915_mmio_reg_offset(RING_BB_PER_CTX_PTR(base)); + + regs[CTX_R_PWR_CLK_STATE] = + i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE); + } +} + static void execlists_dequeue(struct intel_engine_cs *engine) { struct intel_engine_execlists * const execlists = &engine->execlists; @@ -691,6 +805,37 @@ static void execlists_dequeue(struct intel_engine_cs *engine) * and context switches) submission. */ + for (rb = rb_first_cached(&execlists->virtual); rb; ) { + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + struct i915_request *rq = READ_ONCE(ve->request); + struct intel_engine_cs *active; + + if (!rq) { /* lazily cleanup after another engine handled rq */ + rb_erase_cached(rb, &execlists->virtual); + RB_CLEAR_NODE(rb); + rb = rb_first_cached(&execlists->virtual); + continue; + } + + /* + * We track when the HW has completed saving the context image + * (i.e. when we have seen the final CS event switching out of + * the context) and must not overwrite the context image before + * then. This restricts us to only using the active engine + * while the previous virtualized request is inflight (so + * we reuse the register offsets). This is a very small + * hystersis on the greedy seelction algorithm. + */ + active = READ_ONCE(ve->context.active); + if (active && active != engine) { + rb = rb_next(rb); + continue; + } + + break; + } + if (last) { /* * Don't resubmit or switch until all outstanding @@ -712,7 +857,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK)) return; - if (need_preempt(engine, last)) { + if (need_preempt(engine, last, rb)) { inject_preempt_context(engine); return; } @@ -752,6 +897,72 @@ static void execlists_dequeue(struct intel_engine_cs *engine) last->tail = last->wa_tail; } + while (rb) { /* XXX virtual is always taking precedence */ + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + struct i915_request *rq; + + spin_lock(&ve->base.timeline.lock); + + rq = ve->request; + if (unlikely(!rq)) { /* lost the race to a sibling */ + spin_unlock(&ve->base.timeline.lock); + rb_erase_cached(rb, &execlists->virtual); + RB_CLEAR_NODE(rb); + rb = rb_first_cached(&execlists->virtual); + continue; + } + + if (rq_prio(rq) >= queue_prio(execlists)) { + if (last && !can_merge_rq(last, rq)) { + spin_unlock(&ve->base.timeline.lock); + return; /* leave this rq for another engine */ + } + + GEM_BUG_ON(rq->engine != &ve->base); + ve->request = NULL; + ve->base.execlists.queue_priority_hint = INT_MIN; + rb_erase_cached(rb, &execlists->virtual); + RB_CLEAR_NODE(rb); + + GEM_BUG_ON(rq->hw_context != &ve->context); + rq->engine = engine; + + if (engine != ve->siblings[0]) { + u32 *regs = ve->context.lrc_reg_state; + unsigned int n; + + GEM_BUG_ON(READ_ONCE(ve->context.active)); + virtual_update_register_offsets(regs, engine); + + /* + * Move the bound engine to the top of the list + * for future execution. We then kick this + * tasklet first before checking others, so that + * we preferentially reuse this set of bound + * registers. + */ + for (n = 1; n < ve->count; n++) { + if (ve->siblings[n] == engine) { + swap(ve->siblings[n], + ve->siblings[0]); + break; + } + } + + GEM_BUG_ON(ve->siblings[0] != engine); + } + + __i915_request_submit(rq); + trace_i915_request_in(rq, port_index(port, execlists)); + submit = true; + last = rq; + } + + spin_unlock(&ve->base.timeline.lock); + break; + } + while ((rb = rb_first_cached(&execlists->queue))) { struct i915_priolist *p = to_priolist(rb); struct i915_request *rq, *rn; @@ -971,6 +1182,24 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine) i915_priolist_free(p); } + /* Cancel all attached virtual engines */ + while ((rb = rb_first_cached(&execlists->virtual))) { + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + + rb_erase_cached(rb, &execlists->virtual); + RB_CLEAR_NODE(rb); + + spin_lock(&ve->base.timeline.lock); + if (ve->request) { + __i915_request_submit(ve->request); + dma_fence_set_error(&ve->request->fence, -EIO); + i915_request_mark_complete(ve->request); + ve->request = NULL; + } + spin_unlock(&ve->base.timeline.lock); + } + /* Remaining _unready_ requests will be nop'ed when submitted */ execlists->queue_priority_hint = INT_MIN; @@ -2897,6 +3126,306 @@ void intel_lr_context_resume(struct drm_i915_private *i915) } } +static void virtual_context_destroy(struct kref *kref) +{ + struct virtual_engine *ve = + container_of(kref, typeof(*ve), context.ref); + unsigned int n; + + GEM_BUG_ON(ve->request); + GEM_BUG_ON(ve->context.active); + + for (n = 0; n < ve->count; n++) { + struct intel_engine_cs *sibling = ve->siblings[n]; + struct rb_node *node = &ve->nodes[sibling->id].rb; + + if (RB_EMPTY_NODE(node)) + continue; + + spin_lock_irq(&sibling->timeline.lock); + + if (!RB_EMPTY_NODE(node)) + rb_erase_cached(node, &sibling->execlists.virtual); + + spin_unlock_irq(&sibling->timeline.lock); + } + GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); + + if (ve->context.state) + __execlists_context_fini(&ve->context); + + i915_timeline_fini(&ve->base.timeline); + kfree(ve); +} + +static void virtual_engine_initial_hint(struct virtual_engine *ve) +{ + int swp; + + /* + * Pick a random sibling on starting to help spread the load around. + * + * New contexts are typically created with exactly the same order + * of siblings, and often started in batches. Due to the way we iterate + * the array of sibling when submitting requests, sibling[0] is + * prioritised for dequeuing. If we make sure that sibling[0] is fairly + * randomised across the system, we also help spread the load by the + * first engine we inspect being different each time. + * + * NB This does not force us to execute on this engine, it will just + * typically be the first we inspect for submission. + */ + swp = prandom_u32_max(ve->count); + if (!swp) + return; + + swap(ve->siblings[swp], ve->siblings[0]); + virtual_update_register_offsets(ve->context.lrc_reg_state, + ve->siblings[0]); +} + +static int virtual_context_pin(struct intel_context *ce) +{ + struct virtual_engine *ve = container_of(ce, typeof(*ve), context); + int err; + + /* Note: we must use a real engine class for setting up reg state */ + err = __execlists_context_pin(ce, ve->siblings[0]); + if (err) + return err; + + virtual_engine_initial_hint(ve); + return 0; +} + +static const struct intel_context_ops virtual_context_ops = { + .pin = virtual_context_pin, + .unpin = execlists_context_unpin, + + .destroy = virtual_context_destroy, +}; + +static void virtual_submission_tasklet(unsigned long data) +{ + struct virtual_engine * const ve = (struct virtual_engine *)data; + unsigned int n; + int prio; + + prio = READ_ONCE(ve->base.execlists.queue_priority_hint); + if (prio == INT_MIN) + return; + + local_irq_disable(); + for (n = 0; READ_ONCE(ve->request) && n < ve->count; n++) { + struct intel_engine_cs *sibling = ve->siblings[n]; + struct ve_node * const node = &ve->nodes[sibling->id]; + struct rb_node **parent, *rb; + bool first; + + spin_lock(&sibling->timeline.lock); + + if (!RB_EMPTY_NODE(&node->rb)) { + /* + * Cheat and avoid rebalancing the tree if we can + * reuse this node in situ. + */ + first = rb_first_cached(&sibling->execlists.virtual) == + &node->rb; + if (prio == node->prio || (prio > node->prio && first)) + goto submit_engine; + + rb_erase_cached(&node->rb, &sibling->execlists.virtual); + } + + rb = NULL; + first = true; + parent = &sibling->execlists.virtual.rb_root.rb_node; + while (*parent) { + struct ve_node *other; + + rb = *parent; + other = rb_entry(rb, typeof(*other), rb); + if (prio > other->prio) { + parent = &rb->rb_left; + } else { + parent = &rb->rb_right; + first = false; + } + } + + rb_link_node(&node->rb, rb, parent); + rb_insert_color_cached(&node->rb, + &sibling->execlists.virtual, + first); + +submit_engine: + GEM_BUG_ON(RB_EMPTY_NODE(&node->rb)); + node->prio = prio; + if (first && prio > sibling->execlists.queue_priority_hint) { + sibling->execlists.queue_priority_hint = prio; + tasklet_hi_schedule(&sibling->execlists.tasklet); + } + + spin_unlock(&sibling->timeline.lock); + } + local_irq_enable(); +} + +static void virtual_submit_request(struct i915_request *request) +{ + struct virtual_engine *ve = to_virtual_engine(request->engine); + + GEM_BUG_ON(ve->base.submit_request != virtual_submit_request); + + GEM_BUG_ON(ve->request); + ve->base.execlists.queue_priority_hint = rq_prio(request); + WRITE_ONCE(ve->request, request); + + tasklet_schedule(&ve->base.execlists.tasklet); +} + +struct intel_engine_cs * +intel_execlists_create_virtual(struct i915_gem_context *ctx, + struct intel_engine_cs **siblings, + unsigned int count) +{ + struct virtual_engine *ve; + unsigned int n; + int err; + + if (!count) + return ERR_PTR(-EINVAL); + + ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL); + if (!ve) + return ERR_PTR(-ENOMEM); + + ve->base.i915 = ctx->i915; + ve->base.id = -1; + ve->base.class = OTHER_CLASS; + ve->base.uabi_class = I915_ENGINE_CLASS_INVALID; + ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; + ve->base.flags = I915_ENGINE_IS_VIRTUAL; + + snprintf(ve->base.name, sizeof(ve->base.name), "virtual"); + + err = i915_timeline_init(ctx->i915, + &ve->base.timeline, + ve->base.name, + NULL); + if (err) + goto err_put; + i915_timeline_set_subclass(&ve->base.timeline, TIMELINE_VIRTUAL); + + ve->base.cops = &virtual_context_ops; + ve->base.request_alloc = execlists_request_alloc; + + ve->base.schedule = i915_schedule; + ve->base.submit_request = virtual_submit_request; + + ve->base.execlists.queue_priority_hint = INT_MIN; + tasklet_init(&ve->base.execlists.tasklet, + virtual_submission_tasklet, + (unsigned long)ve); + + intel_context_init(&ve->context, ctx, &ve->base); + + for (n = 0; n < count; n++) { + struct intel_engine_cs *sibling = siblings[n]; + + GEM_BUG_ON(!is_power_of_2(sibling->mask)); + if (sibling->mask & ve->base.mask) + continue; + + /* + * The virtual engine implementation is tightly coupled to + * the execlists backend -- we push out request directly + * into a tree inside each physical engine. We could support + * layering if we handling cloning of the requests and + * submitting a copy into each backend. + */ + if (sibling->execlists.tasklet.func != + execlists_submission_tasklet) { + err = -ENODEV; + goto err_put; + } + + GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb)); + RB_CLEAR_NODE(&ve->nodes[sibling->id].rb); + + ve->siblings[ve->count++] = sibling; + ve->base.mask |= sibling->mask; + + /* + * All physical engines must be compatible for their emission + * functions (as we build the instructions during request + * construction and do not alter them before submission + * on the physical engine). We use the engine class as a guide + * here, although that could be refined. + */ + if (ve->base.class != OTHER_CLASS) { + if (ve->base.class != sibling->class) { + err = -EINVAL; + goto err_put; + } + continue; + } + + ve->base.class = sibling->class; + snprintf(ve->base.name, sizeof(ve->base.name), + "v%dx%d", ve->base.class, count); + ve->base.context_size = sibling->context_size; + + ve->base.emit_bb_start = sibling->emit_bb_start; + ve->base.emit_flush = sibling->emit_flush; + ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb; + ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb; + ve->base.emit_fini_breadcrumb_dw = + sibling->emit_fini_breadcrumb_dw; + } + + /* gracefully replace a degenerate virtual engine */ + if (ve->count == 1) { + struct intel_engine_cs *actual = ve->siblings[0]; + intel_context_put(&ve->context); + return actual; + } + + __intel_context_insert(ctx, &ve->base, &ve->context); + return &ve->base; + +err_put: + intel_context_put(&ve->context); + return ERR_PTR(err); +} + +struct intel_engine_cs * +intel_execlists_clone_virtual(struct i915_gem_context *ctx, + struct intel_engine_cs *src) +{ + struct virtual_engine *se = to_virtual_engine(src); + struct intel_engine_cs *dst; + + dst = intel_execlists_create_virtual(ctx, + se->siblings, + se->count); + if (IS_ERR(dst)) + return dst; + + return dst; +} + +void intel_virtual_engine_destroy(struct intel_engine_cs *engine) +{ + struct virtual_engine *ve = to_virtual_engine(engine); + + if (!engine || !intel_engine_is_virtual(engine)) + return; + + __intel_context_remove(&ve->context); + intel_context_put(&ve->context); +} + void intel_execlists_show_requests(struct intel_engine_cs *engine, struct drm_printer *m, void (*show_request)(struct drm_printer *m, @@ -2954,6 +3483,29 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine, show_request(m, last, "\t\tQ "); } + last = NULL; + count = 0; + for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) { + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + struct i915_request *rq = READ_ONCE(ve->request); + + if (rq) { + if (count++ < max - 1) + show_request(m, rq, "\t\tV "); + else + last = rq; + } + } + if (last) { + if (count > max) { + drm_printf(m, + "\t\t...skipping %d virtual requests...\n", + count - max); + } + show_request(m, last, "\t\tV "); + } + spin_unlock_irqrestore(&engine->timeline.lock, flags); } diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h index f1aec8a6986f..9d90dc68e02b 100644 --- a/drivers/gpu/drm/i915/intel_lrc.h +++ b/drivers/gpu/drm/i915/intel_lrc.h @@ -112,6 +112,17 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine, const char *prefix), unsigned int max); +struct intel_engine_cs * +intel_execlists_create_virtual(struct i915_gem_context *ctx, + struct intel_engine_cs **siblings, + unsigned int count); + +struct intel_engine_cs * +intel_execlists_clone_virtual(struct i915_gem_context *ctx, + struct intel_engine_cs *src); + +void intel_virtual_engine_destroy(struct intel_engine_cs *engine); + u32 gen8_make_rpcs(struct drm_i915_private *i915, struct intel_sseu *ctx_sseu); #endif /* _INTEL_LRC_H_ */ diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c index 9e871eb0bfb1..6df033960350 100644 --- a/drivers/gpu/drm/i915/selftests/intel_lrc.c +++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c @@ -10,6 +10,7 @@ #include "../i915_selftest.h" #include "igt_flush_test.h" +#include "igt_live_test.h" #include "igt_spinner.h" #include "i915_random.h" @@ -1057,6 +1058,169 @@ static int live_preempt_smoke(void *arg) return err; } +static int nop_virtual_engine(struct drm_i915_private *i915, + struct intel_engine_cs **siblings, + unsigned int nsibling, + unsigned int nctx, + unsigned int flags) +#define CHAIN BIT(0) +{ + IGT_TIMEOUT(end_time); + struct i915_request *request[16]; + struct i915_gem_context *ctx[16]; + struct intel_engine_cs *ve[16]; + unsigned long n, prime, nc; + struct igt_live_test t; + ktime_t times[2] = {}; + int err; + + GEM_BUG_ON(!nctx || nctx > ARRAY_SIZE(ctx)); + + for (n = 0; n < nctx; n++) { + ctx[n] = kernel_context(i915); + if (!ctx[n]) + return -ENOMEM; + + ve[n] = intel_execlists_create_virtual(ctx[n], + siblings, nsibling); + if (IS_ERR(ve[n])) + return PTR_ERR(ve[n]); + } + + err = igt_live_test_begin(&t, i915, __func__, ve[0]->name); + if (err) + goto out; + + for_each_prime_number_from(prime, 1, 8192) { + times[1] = ktime_get_raw(); + + if (flags & CHAIN) { + for (nc = 0; nc < nctx; nc++) { + for (n = 0; n < prime; n++) { + request[nc] = + i915_request_alloc(ve[nc], ctx[nc]); + if (IS_ERR(request[nc])) { + err = PTR_ERR(request[nc]); + goto out; + } + + i915_request_add(request[nc]); + } + } + } else { + for (n = 0; n < prime; n++) { + for (nc = 0; nc < nctx; nc++) { + request[nc] = + i915_request_alloc(ve[nc], ctx[nc]); + if (IS_ERR(request[nc])) { + err = PTR_ERR(request[nc]); + goto out; + } + + i915_request_add(request[nc]); + } + } + } + + for (nc = 0; nc < nctx; nc++) { + if (i915_request_wait(request[nc], + I915_WAIT_LOCKED, + HZ / 10) < 0) { + pr_err("%s(%s): wait for %llx:%lld timed out\n", + __func__, ve[0]->name, + request[nc]->fence.context, + request[nc]->fence.seqno); + + GEM_TRACE("%s(%s) failed at request %llx:%lld\n", + __func__, ve[0]->name, + request[nc]->fence.context, + request[nc]->fence.seqno); + GEM_TRACE_DUMP(); + i915_gem_set_wedged(i915); + break; + } + } + + times[1] = ktime_sub(ktime_get_raw(), times[1]); + if (prime == 1) + times[0] = times[1]; + + if (__igt_timeout(end_time, NULL)) + break; + } + + err = igt_live_test_end(&t); + if (err) + goto out; + + pr_info("Requestx%d latencies on %s: 1 = %lluns, %lu = %lluns\n", + nctx, ve[0]->name, ktime_to_ns(times[0]), + prime, div64_u64(ktime_to_ns(times[1]), prime)); + +out: + if (igt_flush_test(i915, I915_WAIT_LOCKED)) + err = -EIO; + + for (nc = 0; nc < nctx; nc++) { + intel_virtual_engine_destroy(ve[nc]); + kernel_context_close(ctx[nc]); + } + return err; +} + +static int live_virtual_engine(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; + struct intel_engine_cs *engine; + enum intel_engine_id id; + unsigned int class, inst; + int err = -ENODEV; + + if (USES_GUC_SUBMISSION(i915)) + return 0; + + mutex_lock(&i915->drm.struct_mutex); + + for_each_engine(engine, i915, id) { + err = nop_virtual_engine(i915, &engine, 1, 1, 0); + if (err) { + pr_err("Failed to wrap engine %s: err=%d\n", + engine->name, err); + goto out_unlock; + } + } + + for (class = 0; class <= MAX_ENGINE_CLASS; class++) { + int nsibling, n; + + nsibling = 0; + for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) { + if (!i915->engine_class[class][inst]) + break; + + siblings[nsibling++] = i915->engine_class[class][inst]; + } + if (nsibling < 2) + continue; + + for (n = 1; n <= nsibling + 1; n++) { + err = nop_virtual_engine(i915, siblings, nsibling, + n, 0); + if (err) + goto out_unlock; + } + + err = nop_virtual_engine(i915, siblings, nsibling, n, CHAIN); + if (err) + goto out_unlock; + } + +out_unlock: + mutex_unlock(&i915->drm.struct_mutex); + return err; +} + int intel_execlists_live_selftests(struct drm_i915_private *i915) { static const struct i915_subtest tests[] = { @@ -1068,6 +1232,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915) SUBTEST(live_chain_preempt), SUBTEST(live_preempt_hang), SUBTEST(live_preempt_smoke), + SUBTEST(live_virtual_engine), }; if (!HAS_EXECLISTS(i915)) diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 8ef6d60929c6..9c94c037d13b 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -127,6 +127,7 @@ enum drm_i915_gem_engine_class { }; #define I915_ENGINE_CLASS_INVALID_NONE -1 +#define I915_ENGINE_CLASS_INVALID_VIRTUAL 0 /** * DOC: perf_events exposed by i915 through /sys/bus/event_sources/drivers/i915 @@ -1598,8 +1599,37 @@ struct drm_i915_gem_context_param_sseu { __u32 rsvd; }; +/* + * i915_context_engines_load_balance: + * + * Enable load balancing across this set of engines. + * + * Into the I915_EXEC_DEFAULT slot [0], a virtual engine is created that when + * used will proxy the execbuffer request onto one of the set of engines + * in such a way as to distribute the load evenly across the set. + * + * The set of engines must be compatible (e.g. the same HW class) as they + * will share the same logical GPU context and ring. + * + * To intermix rendering with the virtual engine and direct rendering onto + * the backing engines (bypassing the load balancing proxy), the context must + * be defined to use a single timeline for all engines. + */ +struct i915_context_engines_load_balance { + struct i915_user_extension base; + + __u16 engine_index; + __u16 mbz16; /* reserved for future use; must be zero */ + __u32 flags; /* all undefined flags must be zero */ + + __u64 engines_mask; /* selection mask of engines[] */ + + __u64 mbz64[4]; /* reserved for future use; must be zero */ +}; + struct i915_context_param_engines { __u64 extensions; /* linked chain of extension blocks, 0 terminates */ +#define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 struct { __u16 engine_class; /* see enum drm_i915_gem_engine_class */

[19/22] drm/i915: Load balancing across a virtual engine

Commit Message

Patch