Message ID | 20190620070559.30076-2-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [1/3] drm/i915/execlists: Preempt-to-busy | expand |
Chris Wilson <chris@chris-wilson.co.uk> writes: > If we have multiple contexts of equal priority pending execution, > activate a timer to demote the currently executing context in favour of > the next in the queue when that timeslice expires. This enforces > fairness between contexts (so long as they allow preemption -- forced > preemption, in the future, will kick those who do not obey) and allows > us to avoid userspace blocking forward progress with e.g. unbounded > MI_SEMAPHORE_WAIT. > > For the starting point here, we use the jiffie as our timeslice so that > we should be reasonably efficient wrt frequent CPU wakeups. > > Testcase: igt/gem_exec_scheduler/semaphore-resolve > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > --- > drivers/gpu/drm/i915/gt/intel_engine_types.h | 6 + > drivers/gpu/drm/i915/gt/intel_lrc.c | 111 +++++++++ > drivers/gpu/drm/i915/gt/selftest_lrc.c | 223 +++++++++++++++++++ > drivers/gpu/drm/i915/i915_scheduler.c | 1 + > drivers/gpu/drm/i915/i915_scheduler_types.h | 1 + > 5 files changed, 342 insertions(+) > > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h > index 411b7a807b99..6591d6bd2692 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h > +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h > @@ -12,6 +12,7 @@ > #include <linux/kref.h> > #include <linux/list.h> > #include <linux/llist.h> > +#include <linux/timer.h> > #include <linux/types.h> > > #include "i915_gem.h" > @@ -149,6 +150,11 @@ struct intel_engine_execlists { > */ > struct tasklet_struct tasklet; > > + /** > + * @timer: kick the current context if its timeslice expires > + */ > + struct timer_list timer; > + > /** > * @default_priolist: priority list for I915_PRIORITY_NORMAL > */ > diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c > index dc077b536fa5..fca79adb4aa3 100644 > --- a/drivers/gpu/drm/i915/gt/intel_lrc.c > +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c > @@ 
-255,6 +255,7 @@ static int effective_prio(const struct i915_request *rq) > prio |= I915_PRIORITY_NOSEMAPHORE; > > /* Restrict mere WAIT boosts from triggering preemption */ > + BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */ > return prio | __NO_PREEMPTION; > } > > @@ -813,6 +814,81 @@ last_active(const struct intel_engine_execlists *execlists) > return *last; > } > > +static void > +defer_request(struct i915_request * const rq, struct list_head * const pl) > +{ > + struct i915_dependency *p; > + > + /* > + * We want to move the interrupted request to the back of > + * the round-robin list (i.e. its priority level), but > + * in doing so, we must then move all requests that were in > + * flight and were waiting for the interrupted request to > + * be run after it again. > + */ > + list_move_tail(&rq->sched.link, pl); > + > + list_for_each_entry(p, &rq->sched.waiters_list, wait_link) { > + struct i915_request *w = > + container_of(p->waiter, typeof(*w), sched); > + > + /* Leave semaphores spinning on the other engines */ > + if (w->engine != rq->engine) > + continue; > + > + /* No waiter should start before the active request completed */ > + GEM_BUG_ON(i915_request_started(w)); > + > + GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); > + if (rq_prio(w) < rq_prio(rq)) > + continue; > + > + if (list_empty(&w->sched.link)) > + continue; /* Not yet submitted; unready */ > + > + /* > + * This should be very shallow as it is limited by the > + * number of requests that can fit in a ring (<64) and s/and/or ? > + * the number of contexts that can be in flight on this > + * engine. > + */ > + defer_request(w, pl); So the stack frame will be 64*(3*8 + preample/return) at worst case? 
can be over 2k > + } > +} > + > +static void defer_active(struct intel_engine_cs *engine) > +{ > + struct i915_request *rq; > + > + rq = __unwind_incomplete_requests(engine); > + if (!rq) > + return; > + > + defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq))); > +} > + > +static bool > +need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq) > +{ > + int hint; > + > + if (list_is_last(&rq->sched.link, &engine->active.requests)) > + return false; > + > + hint = max(rq_prio(list_next_entry(rq, sched.link)), > + engine->execlists.queue_priority_hint); > + > + return hint >= rq_prio(rq); > +} > + > +static bool > +enable_timeslice(struct intel_engine_cs *engine) > +{ > + struct i915_request *last = last_active(&engine->execlists); > + > + return last && need_timeslice(engine, last); > +} > + > static void execlists_dequeue(struct intel_engine_cs *engine) > { > struct intel_engine_execlists * const execlists = &engine->execlists; > @@ -906,6 +982,27 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > */ > last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE; > last = NULL; > + } else if (need_timeslice(engine, last) && > + !timer_pending(&engine->execlists.timer)) { > + GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n", > + engine->name, > + last->fence.context, > + last->fence.seqno, > + last->sched.attr.priority, > + execlists->queue_priority_hint); > + > + ring_pause(engine) = 1; > + defer_active(engine); > + > + /* > + * Unlike for preemption, if we rewind and continue > + * executing the same context as previously active, > + * the order of execution will remain the same and > + * the tail will only advance. We do not need to > + * force a full context restore, as a lite-restore > + * is sufficient to resample the monotonic TAIL. > + */ I would have asked about the force preemption without this fine comment. But this is a similar as the other kind of preemption. 
So what happens when the contexts are not the same? -Mika > + last = NULL; > } else { > /* > * Otherwise if we already have a request pending > @@ -1229,6 +1326,9 @@ static void process_csb(struct intel_engine_cs *engine) > sizeof(*execlists->pending)); > execlists->pending[0] = NULL; > > + if (enable_timeslice(engine)) > + mod_timer(&execlists->timer, jiffies + 1); > + > if (!inject_preempt_hang(execlists)) > ring_pause(engine) = 0; > } else if (status & GEN8_CTX_STATUS_PREEMPTED) { > @@ -1299,6 +1399,15 @@ static void execlists_submission_tasklet(unsigned long data) > spin_unlock_irqrestore(&engine->active.lock, flags); > } > > +static void execlists_submission_timer(struct timer_list *timer) > +{ > + struct intel_engine_cs *engine = > + from_timer(engine, timer, execlists.timer); > + > + /* Kick the tasklet for some interrupt coalescing and reset handling */ > + tasklet_hi_schedule(&engine->execlists.tasklet); > +} > + > static void queue_request(struct intel_engine_cs *engine, > struct i915_sched_node *node, > int prio) > @@ -2524,6 +2633,7 @@ static int gen8_init_rcs_context(struct i915_request *rq) > > static void execlists_park(struct intel_engine_cs *engine) > { > + del_timer_sync(&engine->execlists.timer); > intel_engine_park(engine); > } > > @@ -2621,6 +2731,7 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine) > > tasklet_init(&engine->execlists.tasklet, > execlists_submission_tasklet, (unsigned long)engine); > + timer_setup(&engine->execlists.timer, execlists_submission_timer, 0); > > logical_ring_default_vfuncs(engine); > logical_ring_default_irqs(engine); > diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c > index 401e8b539297..0c97f953e908 100644 > --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c > +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c > @@ -79,6 +79,225 @@ static int live_sanitycheck(void *arg) > return err; > } > > +static int > +emit_semaphore_chain(struct i915_request *rq, struct 
i915_vma *vma, int idx) > +{ > + u32 *cs; > + > + cs = intel_ring_begin(rq, 10); > + if (IS_ERR(cs)) > + return PTR_ERR(cs); > + > + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; > + > + *cs++ = MI_SEMAPHORE_WAIT | > + MI_SEMAPHORE_GLOBAL_GTT | > + MI_SEMAPHORE_POLL | > + MI_SEMAPHORE_SAD_NEQ_SDD; > + *cs++ = 0; > + *cs++ = i915_ggtt_offset(vma) + 4 * idx; > + *cs++ = 0; > + > + if (idx > 0) { > + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; > + *cs++ = i915_ggtt_offset(vma) + 4 * (idx - 1); > + *cs++ = 0; > + *cs++ = 1; > + } else { > + *cs++ = MI_NOOP; > + *cs++ = MI_NOOP; > + *cs++ = MI_NOOP; > + *cs++ = MI_NOOP; > + } > + > + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; > + > + intel_ring_advance(rq, cs); > + return 0; > +} > + > +static struct i915_request * > +semaphore_queue(struct intel_engine_cs *engine, struct i915_vma *vma, int idx) > +{ > + struct i915_gem_context *ctx; > + struct i915_request *rq; > + int err; > + > + ctx = kernel_context(engine->i915); > + if (!ctx) > + return ERR_PTR(-ENOMEM); > + > + rq = igt_request_alloc(ctx, engine); > + if (IS_ERR(rq)) > + goto out_ctx; > + > + err = emit_semaphore_chain(rq, vma, idx); > + i915_request_add(rq); > + if (err) > + rq = ERR_PTR(err); > + > +out_ctx: > + kernel_context_close(ctx); > + return rq; > +} > + > +static int > +release_queue(struct intel_engine_cs *engine, > + struct i915_vma *vma, > + int idx) > +{ > + struct i915_sched_attr attr = { > + .priority = I915_USER_PRIORITY(I915_PRIORITY_MAX), > + }; > + struct i915_request *rq; > + u32 *cs; > + > + rq = i915_request_create(engine->kernel_context); > + if (IS_ERR(rq)) > + return PTR_ERR(rq); > + > + cs = intel_ring_begin(rq, 4); > + if (IS_ERR(cs)) { > + i915_request_add(rq); > + return PTR_ERR(cs); > + } > + > + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; > + *cs++ = i915_ggtt_offset(vma) + 4 * (idx - 1); > + *cs++ = 0; > + *cs++ = 1; > + > + intel_ring_advance(rq, cs); > + i915_request_add(rq); > + > + engine->schedule(rq, &attr); > + > + return 0; > 
+} > + > +static int > +slice_semaphore_queue(struct intel_engine_cs *outer, > + struct i915_vma *vma, > + int count) > +{ > + struct intel_engine_cs *engine; > + struct i915_request *head; > + enum intel_engine_id id; > + int err, i, n = 0; > + > + head = semaphore_queue(outer, vma, n++); > + if (IS_ERR(head)) > + return PTR_ERR(head); > + > + i915_request_get(head); > + for_each_engine(engine, outer->i915, id) { > + for (i = 0; i < count; i++) { > + struct i915_request *rq; > + > + rq = semaphore_queue(engine, vma, n++); > + if (IS_ERR(rq)) { > + err = PTR_ERR(rq); > + goto out; > + } > + } > + } > + > + err = release_queue(outer, vma, n); > + if (err) > + goto out; > + > + if (i915_request_wait(head, > + I915_WAIT_LOCKED, > + 2 * RUNTIME_INFO(outer->i915)->num_engines * (count + 2) * (count + 3)) < 0) { > + pr_err("Failed to slice along semaphore chain of length (%d, %d)!\n", > + count, n); > + GEM_TRACE_DUMP(); > + i915_gem_set_wedged(outer->i915); > + err = -EIO; > + } > + > +out: > + i915_request_put(head); > + return err; > +} > + > +static int live_timeslice_preempt(void *arg) > +{ > + struct drm_i915_private *i915 = arg; > + struct drm_i915_gem_object *obj; > + intel_wakeref_t wakeref; > + struct i915_vma *vma; > + void *vaddr; > + int err = 0; > + int count; > + > + /* > + * If a request takes too long, we would like to give other users > + * a fair go on the GPU. In particular, users may create batches > + * that wait upon external input, where that input may even be > + * supplied by another GPU job. To avoid blocking forever, we > + * need to preempt the current task and replace it with another > + * ready task. 
> + */ > + > + mutex_lock(&i915->drm.struct_mutex); > + wakeref = intel_runtime_pm_get(&i915->runtime_pm); > + > + obj = i915_gem_object_create_internal(i915, PAGE_SIZE); > + if (IS_ERR(obj)) { > + err = PTR_ERR(obj); > + goto err_unlock; > + } > + > + vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL); > + if (IS_ERR(vma)) { > + err = PTR_ERR(vma); > + goto err_obj; > + } > + > + vaddr = i915_gem_object_pin_map(obj, I915_MAP_WC); > + if (IS_ERR(vaddr)) { > + err = PTR_ERR(vaddr); > + goto err_obj; > + } > + > + err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL); > + if (err) > + goto err_map; > + > + for_each_prime_number_from(count, 1, 16) { > + struct intel_engine_cs *engine; > + enum intel_engine_id id; > + > + for_each_engine(engine, i915, id) { > + memset(vaddr, 0, PAGE_SIZE); > + > + err = slice_semaphore_queue(engine, vma, count); > + if (err) > + goto err_pin; > + > + if (igt_flush_test(i915, I915_WAIT_LOCKED)) { > + err = -EIO; > + goto err_pin; > + } > + } > + } > + > +err_pin: > + i915_vma_unpin(vma); > +err_map: > + i915_gem_object_unpin_map(obj); > +err_obj: > + i915_gem_object_put(obj); > +err_unlock: > + if (igt_flush_test(i915, I915_WAIT_LOCKED)) > + err = -EIO; > + intel_runtime_pm_put(&i915->runtime_pm, wakeref); > + mutex_unlock(&i915->drm.struct_mutex); > + > + return err; > +} > + > static int live_busywait_preempt(void *arg) > { > struct drm_i915_private *i915 = arg; > @@ -398,6 +617,9 @@ static int live_late_preempt(void *arg) > if (!ctx_lo) > goto err_ctx_hi; > > + /* Make sure ctx_lo stays before ctx_hi until we trigger preemption. 
*/ > + ctx_lo->sched.priority = I915_USER_PRIORITY(1); > + > for_each_engine(engine, i915, id) { > struct igt_live_test t; > struct i915_request *rq; > @@ -1812,6 +2034,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915) > { > static const struct i915_subtest tests[] = { > SUBTEST(live_sanitycheck), > + SUBTEST(live_timeslice_preempt), > SUBTEST(live_busywait_preempt), > SUBTEST(live_preempt), > SUBTEST(live_late_preempt), > diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c > index b1ba3e65cd52..0bd452e851d8 100644 > --- a/drivers/gpu/drm/i915/i915_scheduler.c > +++ b/drivers/gpu/drm/i915/i915_scheduler.c > @@ -394,6 +394,7 @@ bool __i915_sched_node_add_dependency(struct i915_sched_node *node, > list_add(&dep->wait_link, &signal->waiters_list); > list_add(&dep->signal_link, &node->signalers_list); > dep->signaler = signal; > + dep->waiter = node; > dep->flags = flags; > > /* Keep track of whether anyone on this chain has a semaphore */ > diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h > index 3e309631bd0b..aad81acba9dc 100644 > --- a/drivers/gpu/drm/i915/i915_scheduler_types.h > +++ b/drivers/gpu/drm/i915/i915_scheduler_types.h > @@ -62,6 +62,7 @@ struct i915_sched_node { > > struct i915_dependency { > struct i915_sched_node *signaler; > + struct i915_sched_node *waiter; > struct list_head signal_link; > struct list_head wait_link; > struct list_head dfs_link; > -- > 2.20.1
Quoting Mika Kuoppala (2019-06-20 14:51:24) > > +static void > > +defer_request(struct i915_request * const rq, struct list_head * const pl) > > +{ > > + struct i915_dependency *p; > > + > > + /* > > + * We want to move the interrupted request to the back of > > + * the round-robin list (i.e. its priority level), but > > + * in doing so, we must then move all requests that were in > > + * flight and were waiting for the interrupted request to > > + * be run after it again. > > + */ > > + list_move_tail(&rq->sched.link, pl); > > + > > + list_for_each_entry(p, &rq->sched.waiters_list, wait_link) { > > + struct i915_request *w = > > + container_of(p->waiter, typeof(*w), sched); > > + > > + /* Leave semaphores spinning on the other engines */ > > + if (w->engine != rq->engine) > > + continue; > > + > > + /* No waiter should start before the active request completed */ > > + GEM_BUG_ON(i915_request_started(w)); > > + > > + GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); > > + if (rq_prio(w) < rq_prio(rq)) > > + continue; > > + > > + if (list_empty(&w->sched.link)) > > + continue; /* Not yet submitted; unready */ > > + > > + /* > > + * This should be very shallow as it is limited by the > > + * number of requests that can fit in a ring (<64) and > > s/and/or ? I think "and" works better as each context has their own ring, so it's a multiplicative effect. > > + * the number of contexts that can be in flight on this > > + * engine. > > + */ > > + defer_request(w, pl); > > So the stack frame will be 64*(3*8 + preample/return) at worst case? > can be over 2k Ok, that makes it sound scary -- but we are well within the 8k irq limit. (Even interrupts now have 2 pages iirc, but even at 4k we are well within bounds.) 
> > @@ -906,6 +982,27 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > > */ > > last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE; > > last = NULL; > > + } else if (need_timeslice(engine, last) && > > + !timer_pending(&engine->execlists.timer)) { > > + GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n", > > + engine->name, > > + last->fence.context, > > + last->fence.seqno, > > + last->sched.attr.priority, > > + execlists->queue_priority_hint); > > + > > + ring_pause(engine) = 1; > > + defer_active(engine); > > + > > + /* > > + * Unlike for preemption, if we rewind and continue > > + * executing the same context as previously active, > > + * the order of execution will remain the same and > > + * the tail will only advance. We do not need to > > + * force a full context restore, as a lite-restore > > + * is sufficient to resample the monotonic TAIL. > > + */ > > I would have asked about the force preemption without this fine comment. > > But this is a similar as the other kind of preemption. So what happens > when the contexts are not the same? It's just a normal preemption event. The old ring regs are saved and we don't try and scribble over them. Any future use of the old context will have the same RING_TAIL as before or later (new request) so we will never try to program a backwards step. -Chris
Chris Wilson <chris@chris-wilson.co.uk> writes: > Quoting Mika Kuoppala (2019-06-20 14:51:24) >> > +static void >> > +defer_request(struct i915_request * const rq, struct list_head * const pl) >> > +{ >> > + struct i915_dependency *p; >> > + >> > + /* >> > + * We want to move the interrupted request to the back of >> > + * the round-robin list (i.e. its priority level), but >> > + * in doing so, we must then move all requests that were in >> > + * flight and were waiting for the interrupted request to >> > + * be run after it again. >> > + */ >> > + list_move_tail(&rq->sched.link, pl); >> > + >> > + list_for_each_entry(p, &rq->sched.waiters_list, wait_link) { >> > + struct i915_request *w = >> > + container_of(p->waiter, typeof(*w), sched); >> > + >> > + /* Leave semaphores spinning on the other engines */ >> > + if (w->engine != rq->engine) >> > + continue; >> > + >> > + /* No waiter should start before the active request completed */ >> > + GEM_BUG_ON(i915_request_started(w)); >> > + >> > + GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); >> > + if (rq_prio(w) < rq_prio(rq)) >> > + continue; >> > + >> > + if (list_empty(&w->sched.link)) >> > + continue; /* Not yet submitted; unready */ >> > + >> > + /* >> > + * This should be very shallow as it is limited by the >> > + * number of requests that can fit in a ring (<64) and >> >> s/and/or ? > > I think "and" works better as each context has their own ring, so it's a > multiplicative effect. > I jumped. But got clarity on irc that these are the contexts in flight. >> > + * the number of contexts that can be in flight on this >> > + * engine. >> > + */ >> > + defer_request(w, pl); >> >> So the stack frame will be 64*(3*8 + preample/return) at worst case? >> can be over 2k > > Ok, that makes it sound scary -- but we are well within the 8k irq > limit. (Even interrupts now have 2 pages iirc, but even at 4k we are > well within bounds.) > Should be safe. 
>> > @@ -906,6 +982,27 @@ static void execlists_dequeue(struct intel_engine_cs *engine) >> > */ >> > last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE; >> > last = NULL; >> > + } else if (need_timeslice(engine, last) && >> > + !timer_pending(&engine->execlists.timer)) { >> > + GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n", >> > + engine->name, >> > + last->fence.context, >> > + last->fence.seqno, >> > + last->sched.attr.priority, >> > + execlists->queue_priority_hint); >> > + >> > + ring_pause(engine) = 1; >> > + defer_active(engine); >> > + >> > + /* >> > + * Unlike for preemption, if we rewind and continue >> > + * executing the same context as previously active, >> > + * the order of execution will remain the same and >> > + * the tail will only advance. We do not need to >> > + * force a full context restore, as a lite-restore >> > + * is sufficient to resample the monotonic TAIL. >> > + */ >> >> I would have asked about the force preemption without this fine comment. >> >> But this is a similar as the other kind of preemption. So what happens >> when the contexts are not the same? > > It's just a normal preemption event. The old ring regs are saved and we > don't try and scribble over them. Any future use of the old context will > have the same RING_TAIL as before or later (new request) so we will > never try to program a backwards step. Ok, Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> > -Chris
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index 411b7a807b99..6591d6bd2692 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -12,6 +12,7 @@ #include <linux/kref.h> #include <linux/list.h> #include <linux/llist.h> +#include <linux/timer.h> #include <linux/types.h> #include "i915_gem.h" @@ -149,6 +150,11 @@ struct intel_engine_execlists { */ struct tasklet_struct tasklet; + /** + * @timer: kick the current context if its timeslice expires + */ + struct timer_list timer; + /** * @default_priolist: priority list for I915_PRIORITY_NORMAL */ diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index dc077b536fa5..fca79adb4aa3 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -255,6 +255,7 @@ static int effective_prio(const struct i915_request *rq) prio |= I915_PRIORITY_NOSEMAPHORE; /* Restrict mere WAIT boosts from triggering preemption */ + BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */ return prio | __NO_PREEMPTION; } @@ -813,6 +814,81 @@ last_active(const struct intel_engine_execlists *execlists) return *last; } +static void +defer_request(struct i915_request * const rq, struct list_head * const pl) +{ + struct i915_dependency *p; + + /* + * We want to move the interrupted request to the back of + * the round-robin list (i.e. its priority level), but + * in doing so, we must then move all requests that were in + * flight and were waiting for the interrupted request to + * be run after it again. 
+ */ + list_move_tail(&rq->sched.link, pl); + + list_for_each_entry(p, &rq->sched.waiters_list, wait_link) { + struct i915_request *w = + container_of(p->waiter, typeof(*w), sched); + + /* Leave semaphores spinning on the other engines */ + if (w->engine != rq->engine) + continue; + + /* No waiter should start before the active request completed */ + GEM_BUG_ON(i915_request_started(w)); + + GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); + if (rq_prio(w) < rq_prio(rq)) + continue; + + if (list_empty(&w->sched.link)) + continue; /* Not yet submitted; unready */ + + /* + * This should be very shallow as it is limited by the + * number of requests that can fit in a ring (<64) and + * the number of contexts that can be in flight on this + * engine. + */ + defer_request(w, pl); + } +} + +static void defer_active(struct intel_engine_cs *engine) +{ + struct i915_request *rq; + + rq = __unwind_incomplete_requests(engine); + if (!rq) + return; + + defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq))); +} + +static bool +need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq) +{ + int hint; + + if (list_is_last(&rq->sched.link, &engine->active.requests)) + return false; + + hint = max(rq_prio(list_next_entry(rq, sched.link)), + engine->execlists.queue_priority_hint); + + return hint >= rq_prio(rq); +} + +static bool +enable_timeslice(struct intel_engine_cs *engine) +{ + struct i915_request *last = last_active(&engine->execlists); + + return last && need_timeslice(engine, last); +} + static void execlists_dequeue(struct intel_engine_cs *engine) { struct intel_engine_execlists * const execlists = &engine->execlists; @@ -906,6 +982,27 @@ static void execlists_dequeue(struct intel_engine_cs *engine) */ last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE; last = NULL; + } else if (need_timeslice(engine, last) && + !timer_pending(&engine->execlists.timer)) { + GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n", + engine->name, + 
last->fence.context, + last->fence.seqno, + last->sched.attr.priority, + execlists->queue_priority_hint); + + ring_pause(engine) = 1; + defer_active(engine); + + /* + * Unlike for preemption, if we rewind and continue + * executing the same context as previously active, + * the order of execution will remain the same and + * the tail will only advance. We do not need to + * force a full context restore, as a lite-restore + * is sufficient to resample the monotonic TAIL. + */ + last = NULL; } else { /* * Otherwise if we already have a request pending @@ -1229,6 +1326,9 @@ static void process_csb(struct intel_engine_cs *engine) sizeof(*execlists->pending)); execlists->pending[0] = NULL; + if (enable_timeslice(engine)) + mod_timer(&execlists->timer, jiffies + 1); + if (!inject_preempt_hang(execlists)) ring_pause(engine) = 0; } else if (status & GEN8_CTX_STATUS_PREEMPTED) { @@ -1299,6 +1399,15 @@ static void execlists_submission_tasklet(unsigned long data) spin_unlock_irqrestore(&engine->active.lock, flags); } +static void execlists_submission_timer(struct timer_list *timer) +{ + struct intel_engine_cs *engine = + from_timer(engine, timer, execlists.timer); + + /* Kick the tasklet for some interrupt coalescing and reset handling */ + tasklet_hi_schedule(&engine->execlists.tasklet); +} + static void queue_request(struct intel_engine_cs *engine, struct i915_sched_node *node, int prio) @@ -2524,6 +2633,7 @@ static int gen8_init_rcs_context(struct i915_request *rq) static void execlists_park(struct intel_engine_cs *engine) { + del_timer_sync(&engine->execlists.timer); intel_engine_park(engine); } @@ -2621,6 +2731,7 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine) tasklet_init(&engine->execlists.tasklet, execlists_submission_tasklet, (unsigned long)engine); + timer_setup(&engine->execlists.timer, execlists_submission_timer, 0); logical_ring_default_vfuncs(engine); logical_ring_default_irqs(engine); diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c 
b/drivers/gpu/drm/i915/gt/selftest_lrc.c index 401e8b539297..0c97f953e908 100644 --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c @@ -79,6 +79,225 @@ static int live_sanitycheck(void *arg) return err; } +static int +emit_semaphore_chain(struct i915_request *rq, struct i915_vma *vma, int idx) +{ + u32 *cs; + + cs = intel_ring_begin(rq, 10); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + + *cs++ = MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_NEQ_SDD; + *cs++ = 0; + *cs++ = i915_ggtt_offset(vma) + 4 * idx; + *cs++ = 0; + + if (idx > 0) { + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = i915_ggtt_offset(vma) + 4 * (idx - 1); + *cs++ = 0; + *cs++ = 1; + } else { + *cs++ = MI_NOOP; + *cs++ = MI_NOOP; + *cs++ = MI_NOOP; + *cs++ = MI_NOOP; + } + + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + + intel_ring_advance(rq, cs); + return 0; +} + +static struct i915_request * +semaphore_queue(struct intel_engine_cs *engine, struct i915_vma *vma, int idx) +{ + struct i915_gem_context *ctx; + struct i915_request *rq; + int err; + + ctx = kernel_context(engine->i915); + if (!ctx) + return ERR_PTR(-ENOMEM); + + rq = igt_request_alloc(ctx, engine); + if (IS_ERR(rq)) + goto out_ctx; + + err = emit_semaphore_chain(rq, vma, idx); + i915_request_add(rq); + if (err) + rq = ERR_PTR(err); + +out_ctx: + kernel_context_close(ctx); + return rq; +} + +static int +release_queue(struct intel_engine_cs *engine, + struct i915_vma *vma, + int idx) +{ + struct i915_sched_attr attr = { + .priority = I915_USER_PRIORITY(I915_PRIORITY_MAX), + }; + struct i915_request *rq; + u32 *cs; + + rq = i915_request_create(engine->kernel_context); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) { + i915_request_add(rq); + return PTR_ERR(cs); + } + + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = i915_ggtt_offset(vma) + 4 * (idx - 1); + 
*cs++ = 0; + *cs++ = 1; + + intel_ring_advance(rq, cs); + i915_request_add(rq); + + engine->schedule(rq, &attr); + + return 0; +} + +static int +slice_semaphore_queue(struct intel_engine_cs *outer, + struct i915_vma *vma, + int count) +{ + struct intel_engine_cs *engine; + struct i915_request *head; + enum intel_engine_id id; + int err, i, n = 0; + + head = semaphore_queue(outer, vma, n++); + if (IS_ERR(head)) + return PTR_ERR(head); + + i915_request_get(head); + for_each_engine(engine, outer->i915, id) { + for (i = 0; i < count; i++) { + struct i915_request *rq; + + rq = semaphore_queue(engine, vma, n++); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + } + } + + err = release_queue(outer, vma, n); + if (err) + goto out; + + if (i915_request_wait(head, + I915_WAIT_LOCKED, + 2 * RUNTIME_INFO(outer->i915)->num_engines * (count + 2) * (count + 3)) < 0) { + pr_err("Failed to slice along semaphore chain of length (%d, %d)!\n", + count, n); + GEM_TRACE_DUMP(); + i915_gem_set_wedged(outer->i915); + err = -EIO; + } + +out: + i915_request_put(head); + return err; +} + +static int live_timeslice_preempt(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct drm_i915_gem_object *obj; + intel_wakeref_t wakeref; + struct i915_vma *vma; + void *vaddr; + int err = 0; + int count; + + /* + * If a request takes too long, we would like to give other users + * a fair go on the GPU. In particular, users may create batches + * that wait upon external input, where that input may even be + * supplied by another GPU job. To avoid blocking forever, we + * need to preempt the current task and replace it with another + * ready task. 
+ */ + + mutex_lock(&i915->drm.struct_mutex); + wakeref = intel_runtime_pm_get(&i915->runtime_pm); + + obj = i915_gem_object_create_internal(i915, PAGE_SIZE); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + goto err_unlock; + } + + vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_obj; + } + + vaddr = i915_gem_object_pin_map(obj, I915_MAP_WC); + if (IS_ERR(vaddr)) { + err = PTR_ERR(vaddr); + goto err_obj; + } + + err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL); + if (err) + goto err_map; + + for_each_prime_number_from(count, 1, 16) { + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, i915, id) { + memset(vaddr, 0, PAGE_SIZE); + + err = slice_semaphore_queue(engine, vma, count); + if (err) + goto err_pin; + + if (igt_flush_test(i915, I915_WAIT_LOCKED)) { + err = -EIO; + goto err_pin; + } + } + } + +err_pin: + i915_vma_unpin(vma); +err_map: + i915_gem_object_unpin_map(obj); +err_obj: + i915_gem_object_put(obj); +err_unlock: + if (igt_flush_test(i915, I915_WAIT_LOCKED)) + err = -EIO; + intel_runtime_pm_put(&i915->runtime_pm, wakeref); + mutex_unlock(&i915->drm.struct_mutex); + + return err; +} + static int live_busywait_preempt(void *arg) { struct drm_i915_private *i915 = arg; @@ -398,6 +617,9 @@ static int live_late_preempt(void *arg) if (!ctx_lo) goto err_ctx_hi; + /* Make sure ctx_lo stays before ctx_hi until we trigger preemption. 
*/ + ctx_lo->sched.priority = I915_USER_PRIORITY(1); + for_each_engine(engine, i915, id) { struct igt_live_test t; struct i915_request *rq; @@ -1812,6 +2034,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915) { static const struct i915_subtest tests[] = { SUBTEST(live_sanitycheck), + SUBTEST(live_timeslice_preempt), SUBTEST(live_busywait_preempt), SUBTEST(live_preempt), SUBTEST(live_late_preempt), diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c index b1ba3e65cd52..0bd452e851d8 100644 --- a/drivers/gpu/drm/i915/i915_scheduler.c +++ b/drivers/gpu/drm/i915/i915_scheduler.c @@ -394,6 +394,7 @@ bool __i915_sched_node_add_dependency(struct i915_sched_node *node, list_add(&dep->wait_link, &signal->waiters_list); list_add(&dep->signal_link, &node->signalers_list); dep->signaler = signal; + dep->waiter = node; dep->flags = flags; /* Keep track of whether anyone on this chain has a semaphore */ diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h index 3e309631bd0b..aad81acba9dc 100644 --- a/drivers/gpu/drm/i915/i915_scheduler_types.h +++ b/drivers/gpu/drm/i915/i915_scheduler_types.h @@ -62,6 +62,7 @@ struct i915_sched_node { struct i915_dependency { struct i915_sched_node *signaler; + struct i915_sched_node *waiter; struct list_head signal_link; struct list_head wait_link; struct list_head dfs_link;
If we have multiple contexts of equal priority pending execution, activate a timer to demote the currently executing context in favour of the next in the queue when that timeslice expires. This enforces fairness between contexts (so long as they allow preemption -- forced preemption, in the future, will kick those who do not obey) and allows us to avoid userspace blocking forward progress with e.g. unbounded MI_SEMAPHORE_WAIT. For the starting point here, we use the jiffie as our timeslice so that we should be reasonably efficient wrt frequent CPU wakeups. Testcase: igt/gem_exec_scheduler/semaphore-resolve Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> --- drivers/gpu/drm/i915/gt/intel_engine_types.h | 6 + drivers/gpu/drm/i915/gt/intel_lrc.c | 111 +++++++++ drivers/gpu/drm/i915/gt/selftest_lrc.c | 223 +++++++++++++++++++ drivers/gpu/drm/i915/i915_scheduler.c | 1 + drivers/gpu/drm/i915/i915_scheduler_types.h | 1 + 5 files changed, 342 insertions(+)