[27/28] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+

Message ID	20190128010245.20148-27-chris@chris-wilson.co.uk (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces@lists.freedesktop.org> From: Chris Wilson <chris@chris-wilson.co.uk> To: intel-gfx@lists.freedesktop.org Date: Mon, 28 Jan 2019 01:02:44 +0000 Message-Id: <20190128010245.20148-27-chris@chris-wilson.co.uk> In-Reply-To: <20190128010245.20148-1-chris@chris-wilson.co.uk> References: <20190128010245.20148-1-chris@chris-wilson.co.uk> MIME-Version: 1.0 Subject: [Intel-gfx] [PATCH 27/28] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+ Precedence: list Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>
Series	[01/28] drm/i915: Wait for a moment before forcibly resetting the device \| expand [01/28] drm/i915: Wait for a moment before forcibly resetting the device [02/28] drm/i915: Rename execlists->queue_priority to preempt_priority_hint [03/28] drm/i915/execlists: Suppress preempting self [04/28] drm/i915/execlists: Suppress redundant preemption [05/28] drm/i915/selftests: Exercise some AB...BA preemption chains [06/28] drm/i915: Stop tracking MRU activity on VMA [07/28] drm/i915: Pull VM lists under the VM mutex. [08/28] drm/i915: Move vma lookup to its own lock [09/28] drm/i915: Always allocate an object/vma for the HWSP [10/28] drm/i915: Add timeline barrier support [11/28] drm/i915: Move list of timelines under its own lock [12/28] drm/i915: Introduce concept of per-timeline (context) HWSP [13/28] drm/i915: Enlarge vma->pin_count [14/28] drm/i915: Allocate a status page for each timeline [15/28] drm/i915: Share per-timeline HWSP using a slab suballocator [16/28] drm/i915: Track the context's seqno in its own timeline HWSP [17/28] drm/i915: Track active timelines [18/28] drm/i915: Identify active requests [19/28] drm/i915: Remove the intel_engine_notify tracepoint [20/28] drm/i915: Replace global breadcrumbs with per-context interrupt tracking [21/28] drm/i915: Drop fake breadcrumb irq [22/28] drm/i915: Generalise GPU activity tracking [23/28] drm/i915: Allocate active tracking nodes from a slabcache [24/28] drm/i915: Pull i915_gem_active into the i915_active family [25/28] drm/i915: Keep timeline HWSP allocated until the system is idle [26/28] drm/i915/execlists: Refactor out can_merge_rq() [27/28] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+ [28/28] drm/i915: Prioritise non-busywait semaphore workloads

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 07e4c3c68ecd..6d825cd28ae6 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -22,8 +22,9 @@ * */ -#include <linux/prefetch.h> #include <linux/dma-fence-array.h> +#include <linux/irq_work.h> +#include <linux/prefetch.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/sched/signal.h> @@ -326,6 +327,76 @@ void i915_request_retire_upto(struct i915_request *rq) } while (tmp != rq); } +struct execute_cb { + struct list_head link; + struct irq_work work; + struct i915_sw_fence *fence; +}; + +static void irq_execute_cb(struct irq_work *wrk) +{ + struct execute_cb *cb = container_of(wrk, typeof(*cb), work); + + i915_sw_fence_complete(cb->fence); + kfree(cb); +} + +static void __notify_execute_cb(struct i915_request *rq) +{ + struct execute_cb *cb; + + lockdep_assert_held(&rq->lock); + + if (list_empty(&rq->execute_cb)) + return; + + list_for_each_entry(cb, &rq->execute_cb, link) + irq_work_queue(&cb->work); + + /* + * XXX Rollback on __i915_request_unsubmit() + * + * In the future, perhaps when we have an active time-slicing scheduler, + * it will be interesting to unsubmit parallel execution and remove + * busywaits from the GPU until their master is restarted. This is + * quite hairy, we have to carefully rollback the fence and do a + * preempt-to-idle cycle on the target engine, all the while the + * master execute_cb may refire. 
+ */ + INIT_LIST_HEAD(&rq->execute_cb); +} + +static int +i915_request_await_execution(struct i915_request *rq, + struct i915_request *signal, + gfp_t gfp) +{ + struct execute_cb *cb; + unsigned long flags; + + if (test_bit(I915_FENCE_FLAG_ACTIVE, &signal->fence.flags)) + return 0; + + cb = kmalloc(sizeof(*cb), gfp); + if (!cb) + return -ENOMEM; + + cb->fence = &rq->submit; + i915_sw_fence_await(cb->fence); + init_irq_work(&cb->work, irq_execute_cb); + + spin_lock_irqsave(&signal->lock, flags); + if (test_bit(I915_FENCE_FLAG_ACTIVE, &signal->fence.flags)) { + i915_sw_fence_complete(cb->fence); + kfree(cb); + } else { + list_add_tail(&cb->link, &signal->execute_cb); + } + spin_unlock_irqrestore(&signal->lock, flags); + + return 0; +} + static void move_to_timeline(struct i915_request *request, struct i915_timeline *timeline) { @@ -373,6 +444,7 @@ void __i915_request_submit(struct i915_request *request) if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) && !i915_request_enable_breadcrumb(request)) intel_engine_queue_breadcrumbs(engine); + __notify_execute_cb(request); spin_unlock(&request->lock); engine->emit_fini_breadcrumb(request, @@ -613,6 +685,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx) } INIT_LIST_HEAD(&rq->active_list); + INIT_LIST_HEAD(&rq->execute_cb); tl = ce->ring->timeline; ret = i915_timeline_get_seqno(tl, rq, &seqno); @@ -700,6 +773,81 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx) return ERR_PTR(ret); } +static int +emit_semaphore_wait(struct i915_request *to, + struct i915_request *from, + gfp_t gfp) +{ + u32 *cs; + int err; + + GEM_BUG_ON(!from->timeline->has_initial_breadcrumb); + + err = i915_timeline_read_lock(from->timeline, to); + if (err) + return err; + + /* + * If we know our signaling request has started, we know that it + * must, at least, have passed its initial breadcrumb and that its + * seqno can only increase, therefore any change in its 
breadcrumb + * must indicate completion. By using a "not equal to start" compare + * we avoid the murky issue of how to handle seqno wraparound in an + * async environment (short answer, we must stop the world whenever + * any context wraps!) as the likelihood of missing one request then + * seeing the same start value for a new request is 1 in 2^31, and + * even then we know that the new request has started and is in + * progress, so we are sure it will complete soon enough (not to + * worry about). + */ + if (i915_request_started(from)) { + cs = intel_ring_begin(to, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_NEQ_SDD; + *cs++ = from->fence.seqno - 1; + *cs++ = from->timeline->hwsp_offset; + *cs++ = 0; + + intel_ring_advance(to, cs); + } else { + int err; + + err = i915_request_await_execution(to, from, gfp); + if (err) + return err; + + cs = intel_ring_begin(to, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* + * Using greater-than-or-equal here means we have to worry + * about seqno wraparound. To side step that issue, we swap + * the timeline HWSP upon wrapping, so that everyone listening + * for the old (pre-wrap) values do not see the much smaller + * (post-wrap) values than they were expecting (and so wait + * forever). 
+ */ + *cs++ = MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_GTE_SDD; + *cs++ = from->fence.seqno; + *cs++ = from->timeline->hwsp_offset; + *cs++ = 0; + + intel_ring_advance(to, cs); + } + + to->sched.semaphore = true; + return 0; +} + static int i915_request_await_request(struct i915_request *to, struct i915_request *from) { @@ -723,6 +871,9 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from) ret = i915_sw_fence_await_sw_fence_gfp(&to->submit, &from->submit, I915_FENCE_GFP); + } else if (HAS_EXECLISTS(to->i915) && + to->gem_context->sched.priority >= I915_PRIORITY_NORMAL) { + ret = emit_semaphore_wait(to, from, I915_FENCE_GFP); } else { ret = i915_sw_fence_await_dma_fence(&to->submit, &from->fence, 0, diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index 40f3e8dcbdd5..66a374ee177a 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -127,6 +127,7 @@ struct i915_request { */ struct i915_sw_fence submit; wait_queue_entry_t submitq; + struct list_head execute_cb; /* * A list of everyone we wait upon, and everyone who waits upon us. 
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c index 8ae68d2fc134..7f9c4018530b 100644 --- a/drivers/gpu/drm/i915/i915_scheduler.c +++ b/drivers/gpu/drm/i915/i915_scheduler.c @@ -29,6 +29,7 @@ void i915_sched_node_init(struct i915_sched_node *node) INIT_LIST_HEAD(&node->waiters_list); INIT_LIST_HEAD(&node->link); node->attr.priority = I915_PRIORITY_INVALID; + node->semaphore = false; } static struct i915_dependency * diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h index dbe9cb7ecd82..d764cf10536f 100644 --- a/drivers/gpu/drm/i915/i915_scheduler.h +++ b/drivers/gpu/drm/i915/i915_scheduler.h @@ -72,6 +72,7 @@ struct i915_sched_node { struct list_head waiters_list; /* those after us, they depend upon us */ struct list_head link; struct i915_sched_attr attr; + bool semaphore; }; struct i915_dependency { diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c index 7c58b049ecb5..8d1400d378d7 100644 --- a/drivers/gpu/drm/i915/i915_sw_fence.c +++ b/drivers/gpu/drm/i915/i915_sw_fence.c @@ -192,7 +192,7 @@ static void __i915_sw_fence_complete(struct i915_sw_fence *fence, __i915_sw_fence_notify(fence, FENCE_FREE); } -static void i915_sw_fence_complete(struct i915_sw_fence *fence) +void i915_sw_fence_complete(struct i915_sw_fence *fence) { debug_fence_assert(fence); @@ -202,7 +202,7 @@ static void i915_sw_fence_complete(struct i915_sw_fence *fence) __i915_sw_fence_complete(fence, NULL); } -static void i915_sw_fence_await(struct i915_sw_fence *fence) +void i915_sw_fence_await(struct i915_sw_fence *fence) { debug_fence_assert(fence); WARN_ON(atomic_inc_return(&fence->pending) <= 1); diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h index 0e055ea0179f..6dec9e1d1102 100644 --- a/drivers/gpu/drm/i915/i915_sw_fence.h +++ b/drivers/gpu/drm/i915/i915_sw_fence.h @@ -79,6 +79,9 @@ int i915_sw_fence_await_reservation(struct 
i915_sw_fence *fence, unsigned long timeout, gfp_t gfp); +void i915_sw_fence_await(struct i915_sw_fence *fence); +void i915_sw_fence_complete(struct i915_sw_fence *fence); + static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence) { return atomic_read(&fence->pending) <= 0; diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h b/drivers/gpu/drm/i915/intel_gpu_commands.h index b96a31bc1080..0efaadd3bc32 100644 --- a/drivers/gpu/drm/i915/intel_gpu_commands.h +++ b/drivers/gpu/drm/i915/intel_gpu_commands.h @@ -106,7 +106,12 @@ #define MI_SEMAPHORE_TARGET(engine) ((engine)<<15) #define MI_SEMAPHORE_WAIT MI_INSTR(0x1c, 2) /* GEN8+ */ #define MI_SEMAPHORE_POLL (1<<15) +#define MI_SEMAPHORE_SAD_GT_SDD (0<<12) #define MI_SEMAPHORE_SAD_GTE_SDD (1<<12) +#define MI_SEMAPHORE_SAD_LT_SDD (2<<12) +#define MI_SEMAPHORE_SAD_LTE_SDD (3<<12) +#define MI_SEMAPHORE_SAD_EQ_SDD (4<<12) +#define MI_SEMAPHORE_SAD_NEQ_SDD (5<<12) #define MI_STORE_DWORD_IMM MI_INSTR(0x20, 1) #define MI_STORE_DWORD_IMM_GEN4 MI_INSTR(0x20, 2) #define MI_MEM_VIRTUAL (1 << 22) /* 945,g33,965 */ diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 9b330cb17c76..f7f16b8d3422 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -415,7 +415,8 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine) * stream, so give it the equivalent small priority bump to prevent * it being gazumped a second time by another peer. */ - if ((prio & ACTIVE_PRIORITY) != ACTIVE_PRIORITY) { + if ((prio & ACTIVE_PRIORITY) != ACTIVE_PRIORITY && + i915_request_started(active)) { prio |= ACTIVE_PRIORITY; active->sched.attr.priority = prio; list_move_tail(&active->sched.link, @@ -599,6 +600,17 @@ static bool can_merge_rq(const struct i915_request *prev, { GEM_BUG_ON(!assert_priority_queue(prev, next)); + /* + * To avoid AB-BA deadlocks, we simply restrict ourselves to only + * submitting one semaphore (think HW spinlock) to HW at a time. 
This + * prevents the execution callback on a later semaphore from being + * queued on another engine, so no cycle can be formed. Preemption + * rules should mean that if this semaphore is preempted, its + * dependency chain is preserved and suitably promoted via PI. + */ + if (prev->sched.semaphore && !i915_request_started(prev)) + return false; + if (!can_merge_ctx(prev->hw_context, next->hw_context)) return false;

[27/28] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+

Commit Message

Patch