[19/25] drm/i915/guc: Implement no mid batch preemption for multi-lrc

Message ID	20211014172005.27155-20-matthew.brost@intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <SRS0=d1l9=PC=lists.freedesktop.org=intel-gfx-bounces@kernel.org> DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 4410A604DC From: Matthew Brost <matthew.brost@intel.com> To: <intel-gfx@lists.freedesktop.org>, <dri-devel@lists.freedesktop.org> Cc: <john.c.harrison@intel.com> Date: Thu, 14 Oct 2021 10:19:59 -0700 Message-Id: <20211014172005.27155-20-matthew.brost@intel.com> In-Reply-To: <20211014172005.27155-1-matthew.brost@intel.com> References: <20211014172005.27155-1-matthew.brost@intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [Intel-gfx] [PATCH 19/25] drm/i915/guc: Implement no mid batch preemption for multi-lrc Precedence: list Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>
Series	Parallel submission aka multi-bb execbuf \| expand [00/25] Parallel submission aka multi-bb execbuf [01/25] drm/i915/guc: Move GuC guc_id allocation under submission state sub-struct [02/25] drm/i915/guc: Take GT PM ref when deregistering context [03/25] drm/i915/guc: Take engine PM when a context is pinned with GuC submission [04/25] drm/i915/guc: Don't call switch_to_kernel_context with GuC submission [05/25] drm/i915: Add logical engine mapping [06/25] drm/i915: Expose logical engine instance to user [07/25] drm/i915/guc: Introduce context parent-child relationship [08/25] drm/i915/guc: Add multi-lrc context registration [09/25] drm/i915/guc: Ensure GuC schedule operations do not operate on child contexts [10/25] drm/i915/guc: Assign contexts in parent-child relationship consecutive guc_ids [11/25] drm/i915/guc: Implement parallel context pin / unpin functions [12/25] drm/i915/guc: Implement multi-lrc submission [13/25] drm/i915/guc: Insert submit fences between requests in parent-child relationship [14/25] drm/i915/guc: Implement multi-lrc reset [15/25] drm/i915/guc: Update debugfs for GuC multi-lrc [16/25] drm/i915/guc: Connect UAPI to GuC multi-lrc interface [17/25] drm/i915/doc: Update parallel submit doc to point to i915_drm.h [18/25] drm/i915/guc: Add basic GuC multi-lrc selftest [19/25] drm/i915/guc: Implement no mid batch preemption for multi-lrc [20/25] drm/i915: Multi-BB execbuf [21/25] drm/i915/guc: Handle errors in multi-lrc requests [22/25] drm/i915: Make request conflict tracking understand parallel submits [23/25] drm/i915: Update I915_GEM_BUSY IOCTL to understand composite fences [24/25] drm/i915: Enable multi-bb execbuf [25/25] drm/i915/execlists: Weak parallel submission support for execlists

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index 6aab60584ee5..5634d14052bc 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -570,7 +570,7 @@ void intel_context_bind_parent_child(struct intel_context *parent, GEM_BUG_ON(intel_context_is_child(child)); GEM_BUG_ON(intel_context_is_parent(child)); - parent->parallel.number_children++; + parent->parallel.child_index = parent->parallel.number_children++; list_add_tail(&child->parallel.child_link, &parent->parallel.child_list); child->parallel.parent = parent; diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index 1d880303a7e4..95a5b94b4ece 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -250,6 +250,8 @@ struct intel_context { struct i915_request *last_rq; /** @number_children: number of children if parent */ u8 number_children; + /** @child_index: index into child_list if child */ + u8 child_index; /** @guc: GuC specific members for parallel submission */ struct { /** @wqi_head: head pointer in work queue */ diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h index 18da67cfcd92..722933e26347 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h @@ -186,7 +186,7 @@ struct guc_process_desc { u32 wq_status; u32 engine_presence; u32 priority; - u32 reserved[30]; + u32 reserved[36]; } __packed; #define CONTEXT_REGISTRATION_FLAG_KMD BIT(0) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 57eb5f8bc8bb..50f0f4eba03b 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -11,6 +11,7 @@ #include "gt/intel_context.h" #include "gt/intel_engine_pm.h" #include "gt/intel_engine_heartbeat.h" +#include "gt/intel_gpu_commands.h" #include "gt/intel_gt.h" #include "gt/intel_gt_irq.h" #include "gt/intel_gt_pm.h" @@ -368,11 +369,16 @@ static inline struct i915_priolist *to_priolist(struct rb_node *rb) /* * When using multi-lrc submission a scratch memory area is reserved in the - * parent's context state for the process descriptor and work queue. Currently - * the scratch area is sized to a page. + * parent's context state for the process descriptor, work queue, and handshake + * between the parent + children contexts to insert safe preemption points + * between each of the BBs. Currently the scratch area is sized to a page. * * The layout of this scratch area is below: * 0 guc_process_desc + * + sizeof(struct guc_process_desc) child go + * + CACHELINE_BYTES child join[0] + * ... + * + CACHELINE_BYTES child join[n - 1] * ... unused * PARENT_SCRATCH_SIZE / 2 work queue start * ... work queue @@ -380,7 +386,25 @@ static inline struct i915_priolist *to_priolist(struct rb_node *rb) */ #define WQ_SIZE (PARENT_SCRATCH_SIZE / 2) #define WQ_OFFSET (PARENT_SCRATCH_SIZE - WQ_SIZE) -static u32 __get_process_desc_offset(struct intel_context *ce) + +struct sync_semaphore { + u32 semaphore; + u8 unused[CACHELINE_BYTES - sizeof(u32)]; +}; + +struct parent_scratch { + struct guc_process_desc pdesc; + + struct sync_semaphore go; + struct sync_semaphore join[MAX_ENGINE_INSTANCE + 1]; + + u8 unused[WQ_OFFSET - sizeof(struct guc_process_desc) - + sizeof(struct sync_semaphore) * (MAX_ENGINE_INSTANCE + 2)]; + + u32 wq[WQ_SIZE / sizeof(u32)]; +}; + +static u32 __get_parent_scratch_offset(struct intel_context *ce) { GEM_BUG_ON(!ce->parallel.guc.parent_page); @@ -389,23 +413,36 @@ static u32 __get_process_desc_offset(struct intel_context *ce) static u32 __get_wq_offset(struct intel_context *ce) { - return __get_process_desc_offset(ce) + WQ_OFFSET; + BUILD_BUG_ON(offsetof(struct parent_scratch, wq) != WQ_OFFSET); + + return __get_parent_scratch_offset(ce) + WQ_OFFSET; } -static struct guc_process_desc * -__get_process_desc(struct intel_context *ce) +static struct parent_scratch * +__get_parent_scratch(struct intel_context *ce) { + BUILD_BUG_ON(sizeof(struct parent_scratch) != PARENT_SCRATCH_SIZE); + BUILD_BUG_ON(sizeof(struct sync_semaphore) != CACHELINE_BYTES); + /* * Need to subtract LRC_STATE_OFFSET here as the * parallel.guc.parent_page is the offset into ce->state while * ce->lrc_reg_reg is ce->state + LRC_STATE_OFFSET. */ - return (struct guc_process_desc *) + return (struct parent_scratch *) (ce->lrc_reg_state + - ((__get_process_desc_offset(ce) - + ((__get_parent_scratch_offset(ce) - LRC_STATE_OFFSET) / sizeof(u32))); } +static struct guc_process_desc * +__get_process_desc(struct intel_context *ce) +{ + struct parent_scratch *ps = __get_parent_scratch(ce); + + return &ps->pdesc; +} + static u32 *get_wq_pointer(struct guc_process_desc *desc, struct intel_context *ce, u32 wqi_size) @@ -425,8 +462,7 @@ static u32 *get_wq_pointer(struct guc_process_desc *desc, } #undef AVAILABLE_SPACE - return ((u32 *)__get_process_desc(ce)) + - ((WQ_OFFSET + ce->parallel.guc.wqi_tail) / sizeof(u32)); + return &__get_parent_scratch(ce)->wq[ce->parallel.guc.wqi_tail / sizeof(u32)]; } static struct guc_lrc_desc *__get_lrc_desc(struct intel_guc *guc, u32 index) @@ -1827,6 +1863,27 @@ static int deregister_context(struct intel_context *ce, u32 guc_id) return __guc_action_deregister_context(guc, guc_id); } +static inline void clear_children_join_go_memory(struct intel_context *ce) +{ + struct parent_scratch *ps = __get_parent_scratch(ce); + int i; + + ps->go.semaphore = 0; + for (i = 0; i < ce->parallel.number_children + 1; ++i) + ps->join[i].semaphore = 0; +} + +static inline u32 get_children_go_value(struct intel_context *ce) +{ + return __get_parent_scratch(ce)->go.semaphore; +} + +static inline u32 get_children_join_value(struct intel_context *ce, + u8 child_index) +{ + return __get_parent_scratch(ce)->join[child_index].semaphore; +} + static void guc_context_policy_init(struct intel_engine_cs *engine, struct guc_lrc_desc *desc) { @@ -1886,7 +1943,7 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) ce->parallel.guc.wqi_head = 0; desc->process_desc = i915_ggtt_offset(ce->state) + - __get_process_desc_offset(ce); + __get_parent_scratch_offset(ce); desc->wq_addr = i915_ggtt_offset(ce->state) + __get_wq_offset(ce); desc->wq_size = WQ_SIZE; @@ -1908,6 +1965,8 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD; guc_context_policy_init(engine, desc); } + + clear_children_join_go_memory(ce); } /* @@ -2974,6 +3033,31 @@ static const struct intel_context_ops virtual_child_context_ops = { .get_sibling = guc_virtual_get_sibling, }; +/* + * The below override of the breadcrumbs is enabled when the user configures a + * context for parallel submission (multi-lrc, parent-child). + * + * The overridden breadcrumbs implements an algorithm which allows the GuC to + * safely preempt all the hw contexts configured for parallel submission + * between each BB. The contract between the i915 and GuC is if the parent + * context can be preempted, all the children can be preempted, and the GuC will + * always try to preempt the parent before the children. A handshake between the + * parent / children breadcrumbs ensures the i915 holds up its end of the deal + * creating a window to preempt between each set of BBs. + */ +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags); +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags); +static u32 * +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs); +static u32 * +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs); + static struct intel_context * guc_create_parallel(struct intel_engine_cs **engines, unsigned int num_siblings, @@ -3009,6 +3093,20 @@ guc_create_parallel(struct intel_engine_cs **engines, } } + parent->engine->emit_bb_start = + emit_bb_start_parent_no_preempt_mid_batch; + parent->engine->emit_fini_breadcrumb = + emit_fini_breadcrumb_parent_no_preempt_mid_batch; + parent->engine->emit_fini_breadcrumb_dw = + 12 + 4 * parent->parallel.number_children; + for_each_child(parent, ce) { + ce->engine->emit_bb_start = + emit_bb_start_child_no_preempt_mid_batch; + ce->engine->emit_fini_breadcrumb = + emit_fini_breadcrumb_child_no_preempt_mid_batch; + ce->engine->emit_fini_breadcrumb_dw = 16; + } + kfree(siblings); return parent; @@ -3837,6 +3935,17 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc, drm_printf(p, "\t\tWQI Status: %u\n\n", READ_ONCE(desc->wq_status)); + if (ce->engine->emit_bb_start == + emit_bb_start_parent_no_preempt_mid_batch) { + u8 i; + + drm_printf(p, "\t\tChildren Go: %u\n\n", + get_children_go_value(ce)); + for (i = 0; i < ce->parallel.number_children; ++i) + drm_printf(p, "\t\tChildren Join: %u\n", + get_children_join_value(ce, i)); + } + for_each_child(ce, child) guc_log_context(p, child); } @@ -3844,6 +3953,208 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc, xa_unlock_irqrestore(&guc->context_lookup, flags); } +static inline u32 get_children_go_addr(struct intel_context *ce) +{ + GEM_BUG_ON(!intel_context_is_parent(ce)); + + return i915_ggtt_offset(ce->state) + + __get_parent_scratch_offset(ce) + + offsetof(struct parent_scratch, go.semaphore); +} + +static inline u32 get_children_join_addr(struct intel_context *ce, + u8 child_index) +{ + GEM_BUG_ON(!intel_context_is_parent(ce)); + + return i915_ggtt_offset(ce->state) + + __get_parent_scratch_offset(ce) + + offsetof(struct parent_scratch, join[child_index].semaphore); +} + +#define PARENT_GO_BB 1 +#define PARENT_GO_FINI_BREADCRUMB 0 +#define CHILD_GO_BB 1 +#define CHILD_GO_FINI_BREADCRUMB 0 +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags) +{ + struct intel_context *ce = rq->context; + u32 *cs; + u8 i; + + GEM_BUG_ON(!intel_context_is_parent(ce)); + + cs = intel_ring_begin(rq, 10 + 4 * ce->parallel.number_children); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* Wait on children */ + for (i = 0; i < ce->parallel.number_children; ++i) { + *cs++ = (MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD); + *cs++ = PARENT_GO_BB; + *cs++ = get_children_join_addr(ce, i); + *cs++ = 0; + } + + /* Turn off preemption */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + *cs++ = MI_NOOP; + + /* Tell children go */ + cs = gen8_emit_ggtt_write(cs, + CHILD_GO_BB, + get_children_go_addr(ce), + 0); + + /* Jump to batch */ + *cs++ = MI_BATCH_BUFFER_START_GEN8 | + (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + + return 0; +} + +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags) +{ + struct intel_context *ce = rq->context; + struct intel_context *parent = intel_context_to_parent(ce); + u32 *cs; + + GEM_BUG_ON(!intel_context_is_child(ce)); + + cs = intel_ring_begin(rq, 12); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* Signal parent */ + cs = gen8_emit_ggtt_write(cs, + PARENT_GO_BB, + get_children_join_addr(parent, + ce->parallel.child_index), + 0); + + /* Wait on parent for go */ + *cs++ = (MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD); + *cs++ = CHILD_GO_BB; + *cs++ = get_children_go_addr(parent); + *cs++ = 0; + + /* Turn off preemption */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + + /* Jump to batch */ + *cs++ = MI_BATCH_BUFFER_START_GEN8 | + (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + + intel_ring_advance(rq, cs); + + return 0; +} + +static u32 * +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs) +{ + struct intel_context *ce = rq->context; + u8 i; + + GEM_BUG_ON(!intel_context_is_parent(ce)); + + /* Wait on children */ + for (i = 0; i < ce->parallel.number_children; ++i) { + *cs++ = (MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD); + *cs++ = PARENT_GO_FINI_BREADCRUMB; + *cs++ = get_children_join_addr(ce, i); + *cs++ = 0; + } + + /* Turn on preemption */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + *cs++ = MI_NOOP; + + /* Tell children go */ + cs = gen8_emit_ggtt_write(cs, + CHILD_GO_FINI_BREADCRUMB, + get_children_go_addr(ce), + 0); + + /* Emit fini breadcrumb */ + cs = gen8_emit_ggtt_write(cs, + rq->fence.seqno, + i915_request_active_timeline(rq)->hwsp_offset, + 0); + + /* User interrupt */ + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + + return cs; +} + +static u32 * +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, u32 *cs) +{ + struct intel_context *ce = rq->context; + struct intel_context *parent = intel_context_to_parent(ce); + + GEM_BUG_ON(!intel_context_is_child(ce)); + + /* Turn on preemption */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + *cs++ = MI_NOOP; + + /* Signal parent */ + cs = gen8_emit_ggtt_write(cs, + PARENT_GO_FINI_BREADCRUMB, + get_children_join_addr(parent, + ce->parallel.child_index), + 0); + + /* Wait parent on for go */ + *cs++ = (MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD); + *cs++ = CHILD_GO_FINI_BREADCRUMB; + *cs++ = get_children_go_addr(parent); + *cs++ = 0; + + /* Emit fini breadcrumb */ + cs = gen8_emit_ggtt_write(cs, + rq->fence.seqno, + i915_request_active_timeline(rq)->hwsp_offset, + 0); + + /* User interrupt */ + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + + return cs; +} + static struct intel_context * guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count, unsigned long flags)

[19/25] drm/i915/guc: Implement no mid batch preemption for multi-lrc

Commit Message

Patch