diff mbox series

[23/27] drm/i915/guc: Implement no mid batch preemption for multi-lrc

Message ID 20210820224446.30620-24-matthew.brost@intel.com (mailing list archive)
State New, archived
Headers show
Series Parallel submission aka multi-bb execbuf | expand

Commit Message

Matthew Brost Aug. 20, 2021, 10:44 p.m. UTC
For some users of multi-lrc, e.g. split frame, it isn't safe to preempt
mid BB. To safely enable preemption at the BB boundary, a handshake
between the parent and child is needed. This is implemented via custom
emit_bb_start & emit_fini_breadcrumb functions and enabled by
default if a context is configured via the set parallel extension.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_context.c       |   2 +-
 drivers/gpu/drm/i915/gt/intel_context_types.h |   3 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   2 +-
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 283 +++++++++++++++++-
 4 files changed, 287 insertions(+), 3 deletions(-)

Comments

Tvrtko Ursulin Sept. 10, 2021, 11:25 a.m. UTC | #1
On 20/08/2021 23:44, Matthew Brost wrote:
> For some users of multi-lrc, e.g. split frame, it isn't safe to preempt
> mid BB. To safely enable preemption at the BB boundary, a handshake
> between to parent and child is needed. This is implemented via custom
> emit_bb_start & emit_fini_breadcrumb functions and enabled via by
> default if a context is configured by set parallel extension.

FWIW I think it's wrong to hardcode the requirements of a particular 
hardware generation fixed media pipeline into the uapi. IMO better 
solution was when concept of parallel submission was decoupled from the 
no preemption mid batch preambles. Otherwise might as well call the 
extension I915_CONTEXT_ENGINES_EXT_MEDIA_SPLIT_FRAME_SUBMIT or something.

Regards,

Tvrtko
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_context.c       |   2 +-
>   drivers/gpu/drm/i915/gt/intel_context_types.h |   3 +
>   drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   2 +-
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 283 +++++++++++++++++-
>   4 files changed, 287 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> index 5615be32879c..2de62649e275 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.c
> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> @@ -561,7 +561,7 @@ void intel_context_bind_parent_child(struct intel_context *parent,
>   	GEM_BUG_ON(intel_context_is_child(child));
>   	GEM_BUG_ON(intel_context_is_parent(child));
>   
> -	parent->guc_number_children++;
> +	child->guc_child_index = parent->guc_number_children++;
>   	list_add_tail(&child->guc_child_link,
>   		      &parent->guc_child_list);
>   	child->parent = parent;
> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> index 713d85b0b364..727f91e7f7c2 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> @@ -246,6 +246,9 @@ struct intel_context {
>   		/** @guc_number_children: number of children if parent */
>   		u8 guc_number_children;
>   
> +		/** @guc_child_index: index into guc_child_list if child */
> +		u8 guc_child_index;
> +
>   		/**
>   		 * @parent_page: page in context used by parent for work queue,
>   		 * work queue descriptor
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> index 6cd26dc060d1..9f61cfa5566a 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> @@ -188,7 +188,7 @@ struct guc_process_desc {
>   	u32 wq_status;
>   	u32 engine_presence;
>   	u32 priority;
> -	u32 reserved[30];
> +	u32 reserved[36];
>   } __packed;
>   
>   #define CONTEXT_REGISTRATION_FLAG_KMD	BIT(0)
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 91330525330d..1a18f99bf12a 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -11,6 +11,7 @@
>   #include "gt/intel_context.h"
>   #include "gt/intel_engine_pm.h"
>   #include "gt/intel_engine_heartbeat.h"
> +#include "gt/intel_gpu_commands.h"
>   #include "gt/intel_gt.h"
>   #include "gt/intel_gt_irq.h"
>   #include "gt/intel_gt_pm.h"
> @@ -366,10 +367,14 @@ static struct i915_priolist *to_priolist(struct rb_node *rb)
>   
>   /*
>    * When using multi-lrc submission an extra page in the context state is
> - * reserved for the process descriptor and work queue.
> + * reserved for the process descriptor, work queue, and preempt BB boundary
> + * handshake between the parent + childlren contexts.
>    *
>    * The layout of this page is below:
>    * 0						guc_process_desc
> + * + sizeof(struct guc_process_desc)		child go
> + * + CACHELINE_BYTES				child join ...
> + * + CACHELINE_BYTES ...
>    * ...						unused
>    * PAGE_SIZE / 2				work queue start
>    * ...						work queue
> @@ -1785,6 +1790,30 @@ static int deregister_context(struct intel_context *ce, u32 guc_id, bool loop)
>   	return __guc_action_deregister_context(guc, guc_id, loop);
>   }
>   
> +static inline void clear_children_join_go_memory(struct intel_context *ce)
> +{
> +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
> +	u8 i;
> +
> +	for (i = 0; i < ce->guc_number_children + 1; ++i)
> +		mem[i * (CACHELINE_BYTES / sizeof(u32))] = 0;
> +}
> +
> +static inline u32 get_children_go_value(struct intel_context *ce)
> +{
> +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
> +
> +	return mem[0];
> +}
> +
> +static inline u32 get_children_join_value(struct intel_context *ce,
> +					  u8 child_index)
> +{
> +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
> +
> +	return mem[(child_index + 1) * (CACHELINE_BYTES / sizeof(u32))];
> +}
> +
>   static void guc_context_policy_init(struct intel_engine_cs *engine,
>   				    struct guc_lrc_desc *desc)
>   {
> @@ -1867,6 +1896,8 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop)
>   			desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD;
>   			guc_context_policy_init(engine, desc);
>   		}
> +
> +		clear_children_join_go_memory(ce);
>   	}
>   
>   	/*
> @@ -2943,6 +2974,31 @@ static const struct intel_context_ops virtual_child_context_ops = {
>   	.get_sibling = guc_virtual_get_sibling,
>   };
>   
> +/*
> + * The below override of the breadcrumbs is enabled when the user configures a
> + * context for parallel submission (multi-lrc, parent-child).
> + *
> + * The overridden breadcrumbs implements an algorithm which allows the GuC to
> + * safely preempt all the hw contexts configured for parallel submission
> + * between each BB. The contract between the i915 and GuC is if the parent
> + * context can be preempted, all the children can be preempted, and the GuC will
> + * always try to preempt the parent before the children. A handshake between the
> + * parent / children breadcrumbs ensures the i915 holds up its end of the deal
> + * creating a window to preempt between each set of BBs.
> + */
> +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,
> +						     u64 offset, u32 len,
> +						     const unsigned int flags);
> +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,
> +						    u64 offset, u32 len,
> +						    const unsigned int flags);
> +static u32 *
> +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,
> +						 u32 *cs);
> +static u32 *
> +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq,
> +						u32 *cs);
> +
>   static struct intel_context *
>   guc_create_parallel(struct intel_engine_cs **engines,
>   		    unsigned int num_siblings,
> @@ -2978,6 +3034,20 @@ guc_create_parallel(struct intel_engine_cs **engines,
>   		}
>   	}
>   
> +	parent->engine->emit_bb_start =
> +		emit_bb_start_parent_no_preempt_mid_batch;
> +	parent->engine->emit_fini_breadcrumb =
> +		emit_fini_breadcrumb_parent_no_preempt_mid_batch;
> +	parent->engine->emit_fini_breadcrumb_dw =
> +		12 + 4 * parent->guc_number_children;
> +	for_each_child(parent, ce) {
> +		ce->engine->emit_bb_start =
> +			emit_bb_start_child_no_preempt_mid_batch;
> +		ce->engine->emit_fini_breadcrumb =
> +			emit_fini_breadcrumb_child_no_preempt_mid_batch;
> +		ce->engine->emit_fini_breadcrumb_dw = 16;
> +	}
> +
>   	kfree(siblings);
>   	return parent;
>   
> @@ -3362,6 +3432,204 @@ void intel_guc_submission_init_early(struct intel_guc *guc)
>   	guc->submission_selected = __guc_submission_selected(guc);
>   }
>   
> +static inline u32 get_children_go_addr(struct intel_context *ce)
> +{
> +	GEM_BUG_ON(!intel_context_is_parent(ce));
> +
> +	return i915_ggtt_offset(ce->state) +
> +		__get_process_desc_offset(ce) +
> +		sizeof(struct guc_process_desc);
> +}
> +
> +static inline u32 get_children_join_addr(struct intel_context *ce,
> +					 u8 child_index)
> +{
> +	GEM_BUG_ON(!intel_context_is_parent(ce));
> +
> +	return get_children_go_addr(ce) + (child_index + 1) * CACHELINE_BYTES;
> +}
> +
> +#define PARENT_GO_BB			1
> +#define PARENT_GO_FINI_BREADCRUMB	0
> +#define CHILD_GO_BB			1
> +#define CHILD_GO_FINI_BREADCRUMB	0
> +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,
> +						     u64 offset, u32 len,
> +						     const unsigned int flags)
> +{
> +	struct intel_context *ce = rq->context;
> +	u32 *cs;
> +	u8 i;
> +
> +	GEM_BUG_ON(!intel_context_is_parent(ce));
> +
> +	cs = intel_ring_begin(rq, 10 + 4 * ce->guc_number_children);
> +	if (IS_ERR(cs))
> +		return PTR_ERR(cs);
> +
> +	/* Wait on chidlren */
> +	for (i = 0; i < ce->guc_number_children; ++i) {
> +		*cs++ = (MI_SEMAPHORE_WAIT |
> +			 MI_SEMAPHORE_GLOBAL_GTT |
> +			 MI_SEMAPHORE_POLL |
> +			 MI_SEMAPHORE_SAD_EQ_SDD);
> +		*cs++ = PARENT_GO_BB;
> +		*cs++ = get_children_join_addr(ce, i);
> +		*cs++ = 0;
> +	}
> +
> +	/* Turn off preemption */
> +	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
> +	*cs++ = MI_NOOP;
> +
> +	/* Tell children go */
> +	cs = gen8_emit_ggtt_write(cs,
> +				  CHILD_GO_BB,
> +				  get_children_go_addr(ce),
> +				  0);
> +
> +	/* Jump to batch */
> +	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
> +		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
> +	*cs++ = lower_32_bits(offset);
> +	*cs++ = upper_32_bits(offset);
> +	*cs++ = MI_NOOP;
> +
> +	intel_ring_advance(rq, cs);
> +
> +	return 0;
> +}
> +
> +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,
> +						    u64 offset, u32 len,
> +						    const unsigned int flags)
> +{
> +	struct intel_context *ce = rq->context;
> +	u32 *cs;
> +
> +	GEM_BUG_ON(!intel_context_is_child(ce));
> +
> +	cs = intel_ring_begin(rq, 12);
> +	if (IS_ERR(cs))
> +		return PTR_ERR(cs);
> +
> +	/* Signal parent */
> +	cs = gen8_emit_ggtt_write(cs,
> +				  PARENT_GO_BB,
> +				  get_children_join_addr(ce->parent,
> +							 ce->guc_child_index),
> +				  0);
> +
> +	/* Wait parent on for go */
> +	*cs++ = (MI_SEMAPHORE_WAIT |
> +		 MI_SEMAPHORE_GLOBAL_GTT |
> +		 MI_SEMAPHORE_POLL |
> +		 MI_SEMAPHORE_SAD_EQ_SDD);
> +	*cs++ = CHILD_GO_BB;
> +	*cs++ = get_children_go_addr(ce->parent);
> +	*cs++ = 0;
> +
> +	/* Turn off preemption */
> +	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
> +
> +	/* Jump to batch */
> +	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
> +		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
> +	*cs++ = lower_32_bits(offset);
> +	*cs++ = upper_32_bits(offset);
> +
> +	intel_ring_advance(rq, cs);
> +
> +	return 0;
> +}
> +
> +static u32 *
> +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,
> +						 u32 *cs)
> +{
> +	struct intel_context *ce = rq->context;
> +	u8 i;
> +
> +	GEM_BUG_ON(!intel_context_is_parent(ce));
> +
> +	/* Wait on children */
> +	for (i = 0; i < ce->guc_number_children; ++i) {
> +		*cs++ = (MI_SEMAPHORE_WAIT |
> +			 MI_SEMAPHORE_GLOBAL_GTT |
> +			 MI_SEMAPHORE_POLL |
> +			 MI_SEMAPHORE_SAD_EQ_SDD);
> +		*cs++ = PARENT_GO_FINI_BREADCRUMB;
> +		*cs++ = get_children_join_addr(ce, i);
> +		*cs++ = 0;
> +	}
> +
> +	/* Turn on preemption */
> +	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> +	*cs++ = MI_NOOP;
> +
> +	/* Tell children go */
> +	cs = gen8_emit_ggtt_write(cs,
> +				  CHILD_GO_FINI_BREADCRUMB,
> +				  get_children_go_addr(ce),
> +				  0);
> +
> +	/* Emit fini breadcrumb */
> +	cs = gen8_emit_ggtt_write(cs,
> +				  rq->fence.seqno,
> +				  i915_request_active_timeline(rq)->hwsp_offset,
> +				  0);
> +
> +	/* User interrupt */
> +	*cs++ = MI_USER_INTERRUPT;
> +	*cs++ = MI_NOOP;
> +
> +	rq->tail = intel_ring_offset(rq, cs);
> +
> +	return cs;
> +}
> +
> +static u32 *
> +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, u32 *cs)
> +{
> +	struct intel_context *ce = rq->context;
> +
> +	GEM_BUG_ON(!intel_context_is_child(ce));
> +
> +	/* Turn on preemption */
> +	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> +	*cs++ = MI_NOOP;
> +
> +	/* Signal parent */
> +	cs = gen8_emit_ggtt_write(cs,
> +				  PARENT_GO_FINI_BREADCRUMB,
> +				  get_children_join_addr(ce->parent,
> +							 ce->guc_child_index),
> +				  0);
> +
> +	/* Wait parent on for go */
> +	*cs++ = (MI_SEMAPHORE_WAIT |
> +		 MI_SEMAPHORE_GLOBAL_GTT |
> +		 MI_SEMAPHORE_POLL |
> +		 MI_SEMAPHORE_SAD_EQ_SDD);
> +	*cs++ = CHILD_GO_FINI_BREADCRUMB;
> +	*cs++ = get_children_go_addr(ce->parent);
> +	*cs++ = 0;
> +
> +	/* Emit fini breadcrumb */
> +	cs = gen8_emit_ggtt_write(cs,
> +				  rq->fence.seqno,
> +				  i915_request_active_timeline(rq)->hwsp_offset,
> +				  0);
> +
> +	/* User interrupt */
> +	*cs++ = MI_USER_INTERRUPT;
> +	*cs++ = MI_NOOP;
> +
> +	rq->tail = intel_ring_offset(rq, cs);
> +
> +	return cs;
> +}
> +
>   static struct intel_context *
>   g2h_context_lookup(struct intel_guc *guc, u32 desc_idx)
>   {
> @@ -3807,6 +4075,19 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
>   			drm_printf(p, "\t\tWQI Status: %u\n\n",
>   				   READ_ONCE(desc->wq_status));
>   
> +			drm_printf(p, "\t\tNumber Children: %u\n\n",
> +				   ce->guc_number_children);
> +			if (ce->engine->emit_bb_start ==
> +			    emit_bb_start_parent_no_preempt_mid_batch) {
> +				u8 i;
> +
> +				drm_printf(p, "\t\tChildren Go: %u\n\n",
> +					   get_children_go_value(ce));
> +				for (i = 0; i < ce->guc_number_children; ++i)
> +					drm_printf(p, "\t\tChildren Join: %u\n",
> +						   get_children_join_value(ce, i));
> +			}
> +
>   			for_each_child(ce, child)
>   				guc_log_context(p, child);
>   		}
>
Matthew Brost Sept. 10, 2021, 8:49 p.m. UTC | #2
On Fri, Sep 10, 2021 at 12:25:43PM +0100, Tvrtko Ursulin wrote:
> 
> On 20/08/2021 23:44, Matthew Brost wrote:
> > For some users of multi-lrc, e.g. split frame, it isn't safe to preempt
> > mid BB. To safely enable preemption at the BB boundary, a handshake
> > between to parent and child is needed. This is implemented via custom
> > emit_bb_start & emit_fini_breadcrumb functions and enabled via by
> > default if a context is configured by set parallel extension.
> 
> FWIW I think it's wrong to hardcode the requirements of a particular
> hardware generation fixed media pipeline into the uapi. IMO better solution
> was when concept of parallel submission was decoupled from the no preemption
> mid batch preambles. Otherwise might as well call the extension
> I915_CONTEXT_ENGINES_EXT_MEDIA_SPLIT_FRAME_SUBMIT or something.
> 

I don't disagree but this is where we landed per Daniel Vetter's feedback -
default to what our current hardware supports and extend it later to
newer hardware / requirements as needed.

Matt

> Regards,
> 
> Tvrtko
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/intel_context.c       |   2 +-
> >   drivers/gpu/drm/i915/gt/intel_context_types.h |   3 +
> >   drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   2 +-
> >   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 283 +++++++++++++++++-
> >   4 files changed, 287 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> > index 5615be32879c..2de62649e275 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_context.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> > @@ -561,7 +561,7 @@ void intel_context_bind_parent_child(struct intel_context *parent,
> >   	GEM_BUG_ON(intel_context_is_child(child));
> >   	GEM_BUG_ON(intel_context_is_parent(child));
> > -	parent->guc_number_children++;
> > +	child->guc_child_index = parent->guc_number_children++;
> >   	list_add_tail(&child->guc_child_link,
> >   		      &parent->guc_child_list);
> >   	child->parent = parent;
> > diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > index 713d85b0b364..727f91e7f7c2 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > @@ -246,6 +246,9 @@ struct intel_context {
> >   		/** @guc_number_children: number of children if parent */
> >   		u8 guc_number_children;
> > +		/** @guc_child_index: index into guc_child_list if child */
> > +		u8 guc_child_index;
> > +
> >   		/**
> >   		 * @parent_page: page in context used by parent for work queue,
> >   		 * work queue descriptor
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> > index 6cd26dc060d1..9f61cfa5566a 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> > @@ -188,7 +188,7 @@ struct guc_process_desc {
> >   	u32 wq_status;
> >   	u32 engine_presence;
> >   	u32 priority;
> > -	u32 reserved[30];
> > +	u32 reserved[36];
> >   } __packed;
> >   #define CONTEXT_REGISTRATION_FLAG_KMD	BIT(0)
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > index 91330525330d..1a18f99bf12a 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > @@ -11,6 +11,7 @@
> >   #include "gt/intel_context.h"
> >   #include "gt/intel_engine_pm.h"
> >   #include "gt/intel_engine_heartbeat.h"
> > +#include "gt/intel_gpu_commands.h"
> >   #include "gt/intel_gt.h"
> >   #include "gt/intel_gt_irq.h"
> >   #include "gt/intel_gt_pm.h"
> > @@ -366,10 +367,14 @@ static struct i915_priolist *to_priolist(struct rb_node *rb)
> >   /*
> >    * When using multi-lrc submission an extra page in the context state is
> > - * reserved for the process descriptor and work queue.
> > + * reserved for the process descriptor, work queue, and preempt BB boundary
> > + * handshake between the parent + childlren contexts.
> >    *
> >    * The layout of this page is below:
> >    * 0						guc_process_desc
> > + * + sizeof(struct guc_process_desc)		child go
> > + * + CACHELINE_BYTES				child join ...
> > + * + CACHELINE_BYTES ...
> >    * ...						unused
> >    * PAGE_SIZE / 2				work queue start
> >    * ...						work queue
> > @@ -1785,6 +1790,30 @@ static int deregister_context(struct intel_context *ce, u32 guc_id, bool loop)
> >   	return __guc_action_deregister_context(guc, guc_id, loop);
> >   }
> > +static inline void clear_children_join_go_memory(struct intel_context *ce)
> > +{
> > +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
> > +	u8 i;
> > +
> > +	for (i = 0; i < ce->guc_number_children + 1; ++i)
> > +		mem[i * (CACHELINE_BYTES / sizeof(u32))] = 0;
> > +}
> > +
> > +static inline u32 get_children_go_value(struct intel_context *ce)
> > +{
> > +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
> > +
> > +	return mem[0];
> > +}
> > +
> > +static inline u32 get_children_join_value(struct intel_context *ce,
> > +					  u8 child_index)
> > +{
> > +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
> > +
> > +	return mem[(child_index + 1) * (CACHELINE_BYTES / sizeof(u32))];
> > +}
> > +
> >   static void guc_context_policy_init(struct intel_engine_cs *engine,
> >   				    struct guc_lrc_desc *desc)
> >   {
> > @@ -1867,6 +1896,8 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop)
> >   			desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD;
> >   			guc_context_policy_init(engine, desc);
> >   		}
> > +
> > +		clear_children_join_go_memory(ce);
> >   	}
> >   	/*
> > @@ -2943,6 +2974,31 @@ static const struct intel_context_ops virtual_child_context_ops = {
> >   	.get_sibling = guc_virtual_get_sibling,
> >   };
> > +/*
> > + * The below override of the breadcrumbs is enabled when the user configures a
> > + * context for parallel submission (multi-lrc, parent-child).
> > + *
> > + * The overridden breadcrumbs implements an algorithm which allows the GuC to
> > + * safely preempt all the hw contexts configured for parallel submission
> > + * between each BB. The contract between the i915 and GuC is if the parent
> > + * context can be preempted, all the children can be preempted, and the GuC will
> > + * always try to preempt the parent before the children. A handshake between the
> > + * parent / children breadcrumbs ensures the i915 holds up its end of the deal
> > + * creating a window to preempt between each set of BBs.
> > + */
> > +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,
> > +						     u64 offset, u32 len,
> > +						     const unsigned int flags);
> > +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,
> > +						    u64 offset, u32 len,
> > +						    const unsigned int flags);
> > +static u32 *
> > +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,
> > +						 u32 *cs);
> > +static u32 *
> > +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq,
> > +						u32 *cs);
> > +
> >   static struct intel_context *
> >   guc_create_parallel(struct intel_engine_cs **engines,
> >   		    unsigned int num_siblings,
> > @@ -2978,6 +3034,20 @@ guc_create_parallel(struct intel_engine_cs **engines,
> >   		}
> >   	}
> > +	parent->engine->emit_bb_start =
> > +		emit_bb_start_parent_no_preempt_mid_batch;
> > +	parent->engine->emit_fini_breadcrumb =
> > +		emit_fini_breadcrumb_parent_no_preempt_mid_batch;
> > +	parent->engine->emit_fini_breadcrumb_dw =
> > +		12 + 4 * parent->guc_number_children;
> > +	for_each_child(parent, ce) {
> > +		ce->engine->emit_bb_start =
> > +			emit_bb_start_child_no_preempt_mid_batch;
> > +		ce->engine->emit_fini_breadcrumb =
> > +			emit_fini_breadcrumb_child_no_preempt_mid_batch;
> > +		ce->engine->emit_fini_breadcrumb_dw = 16;
> > +	}
> > +
> >   	kfree(siblings);
> >   	return parent;
> > @@ -3362,6 +3432,204 @@ void intel_guc_submission_init_early(struct intel_guc *guc)
> >   	guc->submission_selected = __guc_submission_selected(guc);
> >   }
> > +static inline u32 get_children_go_addr(struct intel_context *ce)
> > +{
> > +	GEM_BUG_ON(!intel_context_is_parent(ce));
> > +
> > +	return i915_ggtt_offset(ce->state) +
> > +		__get_process_desc_offset(ce) +
> > +		sizeof(struct guc_process_desc);
> > +}
> > +
> > +static inline u32 get_children_join_addr(struct intel_context *ce,
> > +					 u8 child_index)
> > +{
> > +	GEM_BUG_ON(!intel_context_is_parent(ce));
> > +
> > +	return get_children_go_addr(ce) + (child_index + 1) * CACHELINE_BYTES;
> > +}
> > +
> > +#define PARENT_GO_BB			1
> > +#define PARENT_GO_FINI_BREADCRUMB	0
> > +#define CHILD_GO_BB			1
> > +#define CHILD_GO_FINI_BREADCRUMB	0
> > +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,
> > +						     u64 offset, u32 len,
> > +						     const unsigned int flags)
> > +{
> > +	struct intel_context *ce = rq->context;
> > +	u32 *cs;
> > +	u8 i;
> > +
> > +	GEM_BUG_ON(!intel_context_is_parent(ce));
> > +
> > +	cs = intel_ring_begin(rq, 10 + 4 * ce->guc_number_children);
> > +	if (IS_ERR(cs))
> > +		return PTR_ERR(cs);
> > +
> > +	/* Wait on chidlren */
> > +	for (i = 0; i < ce->guc_number_children; ++i) {
> > +		*cs++ = (MI_SEMAPHORE_WAIT |
> > +			 MI_SEMAPHORE_GLOBAL_GTT |
> > +			 MI_SEMAPHORE_POLL |
> > +			 MI_SEMAPHORE_SAD_EQ_SDD);
> > +		*cs++ = PARENT_GO_BB;
> > +		*cs++ = get_children_join_addr(ce, i);
> > +		*cs++ = 0;
> > +	}
> > +
> > +	/* Turn off preemption */
> > +	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
> > +	*cs++ = MI_NOOP;
> > +
> > +	/* Tell children go */
> > +	cs = gen8_emit_ggtt_write(cs,
> > +				  CHILD_GO_BB,
> > +				  get_children_go_addr(ce),
> > +				  0);
> > +
> > +	/* Jump to batch */
> > +	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
> > +		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
> > +	*cs++ = lower_32_bits(offset);
> > +	*cs++ = upper_32_bits(offset);
> > +	*cs++ = MI_NOOP;
> > +
> > +	intel_ring_advance(rq, cs);
> > +
> > +	return 0;
> > +}
> > +
> > +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,
> > +						    u64 offset, u32 len,
> > +						    const unsigned int flags)
> > +{
> > +	struct intel_context *ce = rq->context;
> > +	u32 *cs;
> > +
> > +	GEM_BUG_ON(!intel_context_is_child(ce));
> > +
> > +	cs = intel_ring_begin(rq, 12);
> > +	if (IS_ERR(cs))
> > +		return PTR_ERR(cs);
> > +
> > +	/* Signal parent */
> > +	cs = gen8_emit_ggtt_write(cs,
> > +				  PARENT_GO_BB,
> > +				  get_children_join_addr(ce->parent,
> > +							 ce->guc_child_index),
> > +				  0);
> > +
> > +	/* Wait parent on for go */
> > +	*cs++ = (MI_SEMAPHORE_WAIT |
> > +		 MI_SEMAPHORE_GLOBAL_GTT |
> > +		 MI_SEMAPHORE_POLL |
> > +		 MI_SEMAPHORE_SAD_EQ_SDD);
> > +	*cs++ = CHILD_GO_BB;
> > +	*cs++ = get_children_go_addr(ce->parent);
> > +	*cs++ = 0;
> > +
> > +	/* Turn off preemption */
> > +	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
> > +
> > +	/* Jump to batch */
> > +	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
> > +		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
> > +	*cs++ = lower_32_bits(offset);
> > +	*cs++ = upper_32_bits(offset);
> > +
> > +	intel_ring_advance(rq, cs);
> > +
> > +	return 0;
> > +}
> > +
> > +static u32 *
> > +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,
> > +						 u32 *cs)
> > +{
> > +	struct intel_context *ce = rq->context;
> > +	u8 i;
> > +
> > +	GEM_BUG_ON(!intel_context_is_parent(ce));
> > +
> > +	/* Wait on children */
> > +	for (i = 0; i < ce->guc_number_children; ++i) {
> > +		*cs++ = (MI_SEMAPHORE_WAIT |
> > +			 MI_SEMAPHORE_GLOBAL_GTT |
> > +			 MI_SEMAPHORE_POLL |
> > +			 MI_SEMAPHORE_SAD_EQ_SDD);
> > +		*cs++ = PARENT_GO_FINI_BREADCRUMB;
> > +		*cs++ = get_children_join_addr(ce, i);
> > +		*cs++ = 0;
> > +	}
> > +
> > +	/* Turn on preemption */
> > +	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> > +	*cs++ = MI_NOOP;
> > +
> > +	/* Tell children go */
> > +	cs = gen8_emit_ggtt_write(cs,
> > +				  CHILD_GO_FINI_BREADCRUMB,
> > +				  get_children_go_addr(ce),
> > +				  0);
> > +
> > +	/* Emit fini breadcrumb */
> > +	cs = gen8_emit_ggtt_write(cs,
> > +				  rq->fence.seqno,
> > +				  i915_request_active_timeline(rq)->hwsp_offset,
> > +				  0);
> > +
> > +	/* User interrupt */
> > +	*cs++ = MI_USER_INTERRUPT;
> > +	*cs++ = MI_NOOP;
> > +
> > +	rq->tail = intel_ring_offset(rq, cs);
> > +
> > +	return cs;
> > +}
> > +
> > +static u32 *
> > +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, u32 *cs)
> > +{
> > +	struct intel_context *ce = rq->context;
> > +
> > +	GEM_BUG_ON(!intel_context_is_child(ce));
> > +
> > +	/* Turn on preemption */
> > +	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> > +	*cs++ = MI_NOOP;
> > +
> > +	/* Signal parent */
> > +	cs = gen8_emit_ggtt_write(cs,
> > +				  PARENT_GO_FINI_BREADCRUMB,
> > +				  get_children_join_addr(ce->parent,
> > +							 ce->guc_child_index),
> > +				  0);
> > +
> > +	/* Wait parent on for go */
> > +	*cs++ = (MI_SEMAPHORE_WAIT |
> > +		 MI_SEMAPHORE_GLOBAL_GTT |
> > +		 MI_SEMAPHORE_POLL |
> > +		 MI_SEMAPHORE_SAD_EQ_SDD);
> > +	*cs++ = CHILD_GO_FINI_BREADCRUMB;
> > +	*cs++ = get_children_go_addr(ce->parent);
> > +	*cs++ = 0;
> > +
> > +	/* Emit fini breadcrumb */
> > +	cs = gen8_emit_ggtt_write(cs,
> > +				  rq->fence.seqno,
> > +				  i915_request_active_timeline(rq)->hwsp_offset,
> > +				  0);
> > +
> > +	/* User interrupt */
> > +	*cs++ = MI_USER_INTERRUPT;
> > +	*cs++ = MI_NOOP;
> > +
> > +	rq->tail = intel_ring_offset(rq, cs);
> > +
> > +	return cs;
> > +}
> > +
> >   static struct intel_context *
> >   g2h_context_lookup(struct intel_guc *guc, u32 desc_idx)
> >   {
> > @@ -3807,6 +4075,19 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
> >   			drm_printf(p, "\t\tWQI Status: %u\n\n",
> >   				   READ_ONCE(desc->wq_status));
> > +			drm_printf(p, "\t\tNumber Children: %u\n\n",
> > +				   ce->guc_number_children);
> > +			if (ce->engine->emit_bb_start ==
> > +			    emit_bb_start_parent_no_preempt_mid_batch) {
> > +				u8 i;
> > +
> > +				drm_printf(p, "\t\tChildren Go: %u\n\n",
> > +					   get_children_go_value(ce));
> > +				for (i = 0; i < ce->guc_number_children; ++i)
> > +					drm_printf(p, "\t\tChildren Join: %u\n",
> > +						   get_children_join_value(ce, i));
> > +			}
> > +
> >   			for_each_child(ce, child)
> >   				guc_log_context(p, child);
> >   		}
> >
Tvrtko Ursulin Sept. 13, 2021, 10:52 a.m. UTC | #3
On 10/09/2021 21:49, Matthew Brost wrote:
> On Fri, Sep 10, 2021 at 12:25:43PM +0100, Tvrtko Ursulin wrote:
>>
>> On 20/08/2021 23:44, Matthew Brost wrote:
>>> For some users of multi-lrc, e.g. split frame, it isn't safe to preempt
>>> mid BB. To safely enable preemption at the BB boundary, a handshake
>>> between to parent and child is needed. This is implemented via custom
>>> emit_bb_start & emit_fini_breadcrumb functions and enabled via by
>>> default if a context is configured by set parallel extension.
>>
>> FWIW I think it's wrong to hardcode the requirements of a particular
>> hardware generation fixed media pipeline into the uapi. IMO better solution
>> was when concept of parallel submission was decoupled from the no preemption
>> mid batch preambles. Otherwise might as well call the extension
>> I915_CONTEXT_ENGINES_EXT_MEDIA_SPLIT_FRAME_SUBMIT or something.
>>
> 
> I don't disagree but this where we landed per Daniel Vetter's feedback -
> default to what our current hardware supports and extend it later to
> newer hardware / requirements as needed.

I think this only re-affirms my argument - no point giving the extension 
a generic name if it is so tightly coupled to a specific use case. But I 
wrote FWIW so whatever.

It will be mighty awkward if compute multi-lrc will need to specify a 
flag to allow preemption, when turning off preemption is otherwise not 
something we allow unprivileged clients to do. So it will be kind of 
opt-out from unfriendly/dangerous default behaviour instead of explicit 
requesting it.

Regards,

Tvrtko
John Harrison Sept. 28, 2021, 10:20 p.m. UTC | #4
On 8/20/2021 15:44, Matthew Brost wrote:
> For some users of multi-lrc, e.g. split frame, it isn't safe to preempt
> mid BB. To safely enable preemption at the BB boundary, a handshake
> between to parent and child is needed. This is implemented via custom
> emit_bb_start & emit_fini_breadcrumb functions and enabled via by
via by -> by

> default if a context is configured by set parallel extension.
I tend to agree with Tvrtko that this should probably be an opt in 
change. Is there a flags word passed in when creating the context?

Also, it's not just a change in pre-emption behaviour but a change in 
synchronisation too, right? Previously, if you had a whole bunch of back 
to back submissions then one child could run ahead of another and/or the 
parent. After this change, there is a forced regroup at the end of each 
batch. So while one could end sooner/later than the others, they can't 
ever get an entire batch (or more) ahead or behind. Or was that 
synchronisation already in there through other means anyway?

>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_context.c       |   2 +-
>   drivers/gpu/drm/i915/gt/intel_context_types.h |   3 +
>   drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   2 +-
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 283 +++++++++++++++++-
>   4 files changed, 287 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> index 5615be32879c..2de62649e275 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.c
> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> @@ -561,7 +561,7 @@ void intel_context_bind_parent_child(struct intel_context *parent,
>   	GEM_BUG_ON(intel_context_is_child(child));
>   	GEM_BUG_ON(intel_context_is_parent(child));
>   
> -	parent->guc_number_children++;
> +	child->guc_child_index = parent->guc_number_children++;
>   	list_add_tail(&child->guc_child_link,
>   		      &parent->guc_child_list);
>   	child->parent = parent;
> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> index 713d85b0b364..727f91e7f7c2 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> @@ -246,6 +246,9 @@ struct intel_context {
>   		/** @guc_number_children: number of children if parent */
>   		u8 guc_number_children;
>   
> +		/** @guc_child_index: index into guc_child_list if child */
> +		u8 guc_child_index;
> +
>   		/**
>   		 * @parent_page: page in context used by parent for work queue,
>   		 * work queue descriptor
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> index 6cd26dc060d1..9f61cfa5566a 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> @@ -188,7 +188,7 @@ struct guc_process_desc {
>   	u32 wq_status;
>   	u32 engine_presence;
>   	u32 priority;
> -	u32 reserved[30];
> +	u32 reserved[36];
What is this extra space for? All the extra storage is grabbed from 
after the end of this structure, isn't it?

>   } __packed;
>   
>   #define CONTEXT_REGISTRATION_FLAG_KMD	BIT(0)
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 91330525330d..1a18f99bf12a 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -11,6 +11,7 @@
>   #include "gt/intel_context.h"
>   #include "gt/intel_engine_pm.h"
>   #include "gt/intel_engine_heartbeat.h"
> +#include "gt/intel_gpu_commands.h"
>   #include "gt/intel_gt.h"
>   #include "gt/intel_gt_irq.h"
>   #include "gt/intel_gt_pm.h"
> @@ -366,10 +367,14 @@ static struct i915_priolist *to_priolist(struct rb_node *rb)
>   
>   /*
>    * When using multi-lrc submission an extra page in the context state is
> - * reserved for the process descriptor and work queue.
> + * reserved for the process descriptor, work queue, and preempt BB boundary
> + * handshake between the parent + childlren contexts.
>    *
>    * The layout of this page is below:
>    * 0						guc_process_desc
> + * + sizeof(struct guc_process_desc)		child go
> + * + CACHELINE_BYTES				child join ...
> + * + CACHELINE_BYTES ...
Would be better written as '[num_children]' instead of '...' to make it 
clear it is a per child array.

Also, maybe create a struct for this to get rid of the magic '+1's and 
'BYTES / sizeof' constructs in the functions below.

>    * ...						unused
>    * PAGE_SIZE / 2				work queue start
>    * ...						work queue
> @@ -1785,6 +1790,30 @@ static int deregister_context(struct intel_context *ce, u32 guc_id, bool loop)
>   	return __guc_action_deregister_context(guc, guc_id, loop);
>   }
>   
> +static inline void clear_children_join_go_memory(struct intel_context *ce)
> +{
> +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
> +	u8 i;
> +
> +	for (i = 0; i < ce->guc_number_children + 1; ++i)
> +		mem[i * (CACHELINE_BYTES / sizeof(u32))] = 0;
> +}
> +
> +static inline u32 get_children_go_value(struct intel_context *ce)
> +{
> +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
> +
> +	return mem[0];
> +}
> +
> +static inline u32 get_children_join_value(struct intel_context *ce,
> +					  u8 child_index)
> +{
> +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
> +
> +	return mem[(child_index + 1) * (CACHELINE_BYTES / sizeof(u32))];
> +}
> +
>   static void guc_context_policy_init(struct intel_engine_cs *engine,
>   				    struct guc_lrc_desc *desc)
>   {
> @@ -1867,6 +1896,8 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop)
>   			desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD;
>   			guc_context_policy_init(engine, desc);
>   		}
> +
> +		clear_children_join_go_memory(ce);
>   	}
>   
>   	/*
> @@ -2943,6 +2974,31 @@ static const struct intel_context_ops virtual_child_context_ops = {
>   	.get_sibling = guc_virtual_get_sibling,
>   };
>   
> +/*
> + * The below override of the breadcrumbs is enabled when the user configures a
> + * context for parallel submission (multi-lrc, parent-child).
> + *
> + * The overridden breadcrumbs implements an algorithm which allows the GuC to
> + * safely preempt all the hw contexts configured for parallel submission
> + * between each BB. The contract between the i915 and GuC is if the parent
> + * context can be preempted, all the children can be preempted, and the GuC will
> + * always try to preempt the parent before the children. A handshake between the
> + * parent / children breadcrumbs ensures the i915 holds up its end of the deal
> + * creating a window to preempt between each set of BBs.
> + */
> +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,
> +						     u64 offset, u32 len,
> +						     const unsigned int flags);
> +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,
> +						    u64 offset, u32 len,
> +						    const unsigned int flags);
> +static u32 *
> +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,
> +						 u32 *cs);
> +static u32 *
> +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq,
> +						u32 *cs);
> +
>   static struct intel_context *
>   guc_create_parallel(struct intel_engine_cs **engines,
>   		    unsigned int num_siblings,
> @@ -2978,6 +3034,20 @@ guc_create_parallel(struct intel_engine_cs **engines,
>   		}
>   	}
>   
> +	parent->engine->emit_bb_start =
> +		emit_bb_start_parent_no_preempt_mid_batch;
> +	parent->engine->emit_fini_breadcrumb =
> +		emit_fini_breadcrumb_parent_no_preempt_mid_batch;
> +	parent->engine->emit_fini_breadcrumb_dw =
> +		12 + 4 * parent->guc_number_children;
> +	for_each_child(parent, ce) {
> +		ce->engine->emit_bb_start =
> +			emit_bb_start_child_no_preempt_mid_batch;
> +		ce->engine->emit_fini_breadcrumb =
> +			emit_fini_breadcrumb_child_no_preempt_mid_batch;
> +		ce->engine->emit_fini_breadcrumb_dw = 16;
> +	}
> +
>   	kfree(siblings);
>   	return parent;
>   
> @@ -3362,6 +3432,204 @@ void intel_guc_submission_init_early(struct intel_guc *guc)
>   	guc->submission_selected = __guc_submission_selected(guc);
>   }
>   
> +static inline u32 get_children_go_addr(struct intel_context *ce)
> +{
> +	GEM_BUG_ON(!intel_context_is_parent(ce));
> +
> +	return i915_ggtt_offset(ce->state) +
> +		__get_process_desc_offset(ce) +
> +		sizeof(struct guc_process_desc);
> +}
> +
> +static inline u32 get_children_join_addr(struct intel_context *ce,
> +					 u8 child_index)
> +{
> +	GEM_BUG_ON(!intel_context_is_parent(ce));
> +
> +	return get_children_go_addr(ce) + (child_index + 1) * CACHELINE_BYTES;
> +}
> +
> +#define PARENT_GO_BB			1
> +#define PARENT_GO_FINI_BREADCRUMB	0
> +#define CHILD_GO_BB			1
> +#define CHILD_GO_FINI_BREADCRUMB	0
> +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,
> +						     u64 offset, u32 len,
> +						     const unsigned int flags)
> +{
> +	struct intel_context *ce = rq->context;
> +	u32 *cs;
> +	u8 i;
> +
> +	GEM_BUG_ON(!intel_context_is_parent(ce));
> +
> +	cs = intel_ring_begin(rq, 10 + 4 * ce->guc_number_children);
> +	if (IS_ERR(cs))
> +		return PTR_ERR(cs);
> +
> +	/* Wait on chidlren */
chidlren -> children

> +	for (i = 0; i < ce->guc_number_children; ++i) {
> +		*cs++ = (MI_SEMAPHORE_WAIT |
> +			 MI_SEMAPHORE_GLOBAL_GTT |
> +			 MI_SEMAPHORE_POLL |
> +			 MI_SEMAPHORE_SAD_EQ_SDD);
> +		*cs++ = PARENT_GO_BB;
> +		*cs++ = get_children_join_addr(ce, i);
> +		*cs++ = 0;
> +	}
> +
> +	/* Turn off preemption */
> +	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
> +	*cs++ = MI_NOOP;
> +
> +	/* Tell children go */
> +	cs = gen8_emit_ggtt_write(cs,
> +				  CHILD_GO_BB,
> +				  get_children_go_addr(ce),
> +				  0);
> +
> +	/* Jump to batch */
> +	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
> +		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
> +	*cs++ = lower_32_bits(offset);
> +	*cs++ = upper_32_bits(offset);
> +	*cs++ = MI_NOOP;
> +
> +	intel_ring_advance(rq, cs);
> +
> +	return 0;
> +}
> +
> +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,
> +						    u64 offset, u32 len,
> +						    const unsigned int flags)
> +{
> +	struct intel_context *ce = rq->context;
> +	u32 *cs;
> +
> +	GEM_BUG_ON(!intel_context_is_child(ce));
> +
> +	cs = intel_ring_begin(rq, 12);
> +	if (IS_ERR(cs))
> +		return PTR_ERR(cs);
> +
> +	/* Signal parent */
> +	cs = gen8_emit_ggtt_write(cs,
> +				  PARENT_GO_BB,
> +				  get_children_join_addr(ce->parent,
> +							 ce->guc_child_index),
> +				  0);
> +
> +	/* Wait parent on for go */
parent on -> on parent

> +	*cs++ = (MI_SEMAPHORE_WAIT |
> +		 MI_SEMAPHORE_GLOBAL_GTT |
> +		 MI_SEMAPHORE_POLL |
> +		 MI_SEMAPHORE_SAD_EQ_SDD);
> +	*cs++ = CHILD_GO_BB;
> +	*cs++ = get_children_go_addr(ce->parent);
> +	*cs++ = 0;
> +
> +	/* Turn off preemption */
> +	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
> +
> +	/* Jump to batch */
> +	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
> +		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
> +	*cs++ = lower_32_bits(offset);
> +	*cs++ = upper_32_bits(offset);
> +
> +	intel_ring_advance(rq, cs);
> +
> +	return 0;
> +}
> +
> +static u32 *
> +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,
> +						 u32 *cs)
> +{
> +	struct intel_context *ce = rq->context;
> +	u8 i;
> +
> +	GEM_BUG_ON(!intel_context_is_parent(ce));
> +
> +	/* Wait on children */
> +	for (i = 0; i < ce->guc_number_children; ++i) {
> +		*cs++ = (MI_SEMAPHORE_WAIT |
> +			 MI_SEMAPHORE_GLOBAL_GTT |
> +			 MI_SEMAPHORE_POLL |
> +			 MI_SEMAPHORE_SAD_EQ_SDD);
> +		*cs++ = PARENT_GO_FINI_BREADCRUMB;
> +		*cs++ = get_children_join_addr(ce, i);
> +		*cs++ = 0;
> +	}
> +
> +	/* Turn on preemption */
> +	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> +	*cs++ = MI_NOOP;
> +
> +	/* Tell children go */
> +	cs = gen8_emit_ggtt_write(cs,
> +				  CHILD_GO_FINI_BREADCRUMB,
> +				  get_children_go_addr(ce),
> +				  0);
> +
> +	/* Emit fini breadcrumb */
> +	cs = gen8_emit_ggtt_write(cs,
> +				  rq->fence.seqno,
> +				  i915_request_active_timeline(rq)->hwsp_offset,
> +				  0);
> +
> +	/* User interrupt */
> +	*cs++ = MI_USER_INTERRUPT;
> +	*cs++ = MI_NOOP;
> +
> +	rq->tail = intel_ring_offset(rq, cs);
> +
> +	return cs;
> +}
> +
> +static u32 *
> +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, u32 *cs)
> +{
> +	struct intel_context *ce = rq->context;
> +
> +	GEM_BUG_ON(!intel_context_is_child(ce));
> +
> +	/* Turn on preemption */
> +	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> +	*cs++ = MI_NOOP;
> +
> +	/* Signal parent */
> +	cs = gen8_emit_ggtt_write(cs,
> +				  PARENT_GO_FINI_BREADCRUMB,
> +				  get_children_join_addr(ce->parent,
> +							 ce->guc_child_index),
> +				  0);
> +
This is backwards compared to the parent?

Parent: wait on children then enable pre-emption
Child: enable pre-emption then signal parent

Makes for a window where the parent is waiting in atomic context for a 
signal from a child that has been pre-empted and might not get to run 
for some time?

John.


> +	/* Wait parent on for go */
> +	*cs++ = (MI_SEMAPHORE_WAIT |
> +		 MI_SEMAPHORE_GLOBAL_GTT |
> +		 MI_SEMAPHORE_POLL |
> +		 MI_SEMAPHORE_SAD_EQ_SDD);
> +	*cs++ = CHILD_GO_FINI_BREADCRUMB;
> +	*cs++ = get_children_go_addr(ce->parent);
> +	*cs++ = 0;
> +
> +	/* Emit fini breadcrumb */
> +	cs = gen8_emit_ggtt_write(cs,
> +				  rq->fence.seqno,
> +				  i915_request_active_timeline(rq)->hwsp_offset,
> +				  0);
> +
> +	/* User interrupt */
> +	*cs++ = MI_USER_INTERRUPT;
> +	*cs++ = MI_NOOP;
> +
> +	rq->tail = intel_ring_offset(rq, cs);
> +
> +	return cs;
> +}
> +
>   static struct intel_context *
>   g2h_context_lookup(struct intel_guc *guc, u32 desc_idx)
>   {
> @@ -3807,6 +4075,19 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
>   			drm_printf(p, "\t\tWQI Status: %u\n\n",
>   				   READ_ONCE(desc->wq_status));
>   
> +			drm_printf(p, "\t\tNumber Children: %u\n\n",
> +				   ce->guc_number_children);
> +			if (ce->engine->emit_bb_start ==
> +			    emit_bb_start_parent_no_preempt_mid_batch) {
> +				u8 i;
> +
> +				drm_printf(p, "\t\tChildren Go: %u\n\n",
> +					   get_children_go_value(ce));
> +				for (i = 0; i < ce->guc_number_children; ++i)
> +					drm_printf(p, "\t\tChildren Join: %u\n",
> +						   get_children_join_value(ce, i));
> +			}
> +
>   			for_each_child(ce, child)
>   				guc_log_context(p, child);
>   		}
Matthew Brost Sept. 28, 2021, 10:33 p.m. UTC | #5
On Tue, Sep 28, 2021 at 03:20:42PM -0700, John Harrison wrote:
> On 8/20/2021 15:44, Matthew Brost wrote:
> > For some users of multi-lrc, e.g. split frame, it isn't safe to preempt
> > mid BB. To safely enable preemption at the BB boundary, a handshake
> > between to parent and child is needed. This is implemented via custom
> > emit_bb_start & emit_fini_breadcrumb functions and enabled via by
> via by -> by
> 
> > default if a context is configured by set parallel extension.
> I tend to agree with Tvrtko that this should probably be an opt in change.
> Is there a flags word passed in when creating the context?
> 

I don't disagree but the uAPI in this series is where we landed. It has
been acked by all the relevant parties in the RFC, ported to our
internal tree, and the media UMD has been updated / posted. Concerns
with the uAPI should've been raised in the RFC phase, not now. I really
don't feel like changing this uAPI another time.

> Also, it's not just a change in pre-emption behaviour but a change in
> synchronisation too, right? Previously, if you had a whole bunch of back to
> back submissions then one child could run ahead of another and/or the
> parent. After this change, there is a forced regroup at the end of each
> batch. So while one could end sooner/later than the others, they can't ever
> get an entire batch (or more) ahead or behind. Or was that synchronisation
> already in there through other means anyway?
> 

Yes, each parent / child sync at the end of each batch - this is the only
way to safely insert preemption points. Without this the GuC could attempt
a preemption and hang the batches.

> > 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/intel_context.c       |   2 +-
> >   drivers/gpu/drm/i915/gt/intel_context_types.h |   3 +
> >   drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   2 +-
> >   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 283 +++++++++++++++++-
> >   4 files changed, 287 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> > index 5615be32879c..2de62649e275 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_context.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> > @@ -561,7 +561,7 @@ void intel_context_bind_parent_child(struct intel_context *parent,
> >   	GEM_BUG_ON(intel_context_is_child(child));
> >   	GEM_BUG_ON(intel_context_is_parent(child));
> > -	parent->guc_number_children++;
> > +	child->guc_child_index = parent->guc_number_children++;
> >   	list_add_tail(&child->guc_child_link,
> >   		      &parent->guc_child_list);
> >   	child->parent = parent;
> > diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > index 713d85b0b364..727f91e7f7c2 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > @@ -246,6 +246,9 @@ struct intel_context {
> >   		/** @guc_number_children: number of children if parent */
> >   		u8 guc_number_children;
> > +		/** @guc_child_index: index into guc_child_list if child */
> > +		u8 guc_child_index;
> > +
> >   		/**
> >   		 * @parent_page: page in context used by parent for work queue,
> >   		 * work queue descriptor
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> > index 6cd26dc060d1..9f61cfa5566a 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> > @@ -188,7 +188,7 @@ struct guc_process_desc {
> >   	u32 wq_status;
> >   	u32 engine_presence;
> >   	u32 priority;
> > -	u32 reserved[30];
> > +	u32 reserved[36];
> What is this extra space for? All the extra storage is grabbed from after
> the end of this structure, isn't it?
> 

This is the size of the process descriptor in the GuC spec. Even though this
is unused space, we really don't want the child go / join memory to use
anything within the process descriptor.

> >   } __packed;
> >   #define CONTEXT_REGISTRATION_FLAG_KMD	BIT(0)
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > index 91330525330d..1a18f99bf12a 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > @@ -11,6 +11,7 @@
> >   #include "gt/intel_context.h"
> >   #include "gt/intel_engine_pm.h"
> >   #include "gt/intel_engine_heartbeat.h"
> > +#include "gt/intel_gpu_commands.h"
> >   #include "gt/intel_gt.h"
> >   #include "gt/intel_gt_irq.h"
> >   #include "gt/intel_gt_pm.h"
> > @@ -366,10 +367,14 @@ static struct i915_priolist *to_priolist(struct rb_node *rb)
> >   /*
> >    * When using multi-lrc submission an extra page in the context state is
> > - * reserved for the process descriptor and work queue.
> > + * reserved for the process descriptor, work queue, and preempt BB boundary
> > + * handshake between the parent + childlren contexts.
> >    *
> >    * The layout of this page is below:
> >    * 0						guc_process_desc
> > + * + sizeof(struct guc_process_desc)		child go
> > + * + CACHELINE_BYTES				child join ...
> > + * + CACHELINE_BYTES ...
> Would be better written as '[num_children]' instead of '...' to make it
> clear it is a per child array.
>

I think this description is pretty clear.

> Also, maybe create a struct for this to get rid of the magic '+1's and
> 'BYTES / sizeof' constructs in the functions below.
> 

Let me see if I can create a struct that describes the layout.

> >    * ...						unused
> >    * PAGE_SIZE / 2				work queue start
> >    * ...						work queue
> > @@ -1785,6 +1790,30 @@ static int deregister_context(struct intel_context *ce, u32 guc_id, bool loop)
> >   	return __guc_action_deregister_context(guc, guc_id, loop);
> >   }
> > +static inline void clear_children_join_go_memory(struct intel_context *ce)
> > +{
> > +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
> > +	u8 i;
> > +
> > +	for (i = 0; i < ce->guc_number_children + 1; ++i)
> > +		mem[i * (CACHELINE_BYTES / sizeof(u32))] = 0;
> > +}
> > +
> > +static inline u32 get_children_go_value(struct intel_context *ce)
> > +{
> > +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
> > +
> > +	return mem[0];
> > +}
> > +
> > +static inline u32 get_children_join_value(struct intel_context *ce,
> > +					  u8 child_index)
> > +{
> > +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
> > +
> > +	return mem[(child_index + 1) * (CACHELINE_BYTES / sizeof(u32))];
> > +}
> > +
> >   static void guc_context_policy_init(struct intel_engine_cs *engine,
> >   				    struct guc_lrc_desc *desc)
> >   {
> > @@ -1867,6 +1896,8 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop)
> >   			desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD;
> >   			guc_context_policy_init(engine, desc);
> >   		}
> > +
> > +		clear_children_join_go_memory(ce);
> >   	}
> >   	/*
> > @@ -2943,6 +2974,31 @@ static const struct intel_context_ops virtual_child_context_ops = {
> >   	.get_sibling = guc_virtual_get_sibling,
> >   };
> > +/*
> > + * The below override of the breadcrumbs is enabled when the user configures a
> > + * context for parallel submission (multi-lrc, parent-child).
> > + *
> > + * The overridden breadcrumbs implements an algorithm which allows the GuC to
> > + * safely preempt all the hw contexts configured for parallel submission
> > + * between each BB. The contract between the i915 and GuC is if the parent
> > + * context can be preempted, all the children can be preempted, and the GuC will
> > + * always try to preempt the parent before the children. A handshake between the
> > + * parent / children breadcrumbs ensures the i915 holds up its end of the deal
> > + * creating a window to preempt between each set of BBs.
> > + */
> > +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,
> > +						     u64 offset, u32 len,
> > +						     const unsigned int flags);
> > +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,
> > +						    u64 offset, u32 len,
> > +						    const unsigned int flags);
> > +static u32 *
> > +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,
> > +						 u32 *cs);
> > +static u32 *
> > +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq,
> > +						u32 *cs);
> > +
> >   static struct intel_context *
> >   guc_create_parallel(struct intel_engine_cs **engines,
> >   		    unsigned int num_siblings,
> > @@ -2978,6 +3034,20 @@ guc_create_parallel(struct intel_engine_cs **engines,
> >   		}
> >   	}
> > +	parent->engine->emit_bb_start =
> > +		emit_bb_start_parent_no_preempt_mid_batch;
> > +	parent->engine->emit_fini_breadcrumb =
> > +		emit_fini_breadcrumb_parent_no_preempt_mid_batch;
> > +	parent->engine->emit_fini_breadcrumb_dw =
> > +		12 + 4 * parent->guc_number_children;
> > +	for_each_child(parent, ce) {
> > +		ce->engine->emit_bb_start =
> > +			emit_bb_start_child_no_preempt_mid_batch;
> > +		ce->engine->emit_fini_breadcrumb =
> > +			emit_fini_breadcrumb_child_no_preempt_mid_batch;
> > +		ce->engine->emit_fini_breadcrumb_dw = 16;
> > +	}
> > +
> >   	kfree(siblings);
> >   	return parent;
> > @@ -3362,6 +3432,204 @@ void intel_guc_submission_init_early(struct intel_guc *guc)
> >   	guc->submission_selected = __guc_submission_selected(guc);
> >   }
> > +static inline u32 get_children_go_addr(struct intel_context *ce)
> > +{
> > +	GEM_BUG_ON(!intel_context_is_parent(ce));
> > +
> > +	return i915_ggtt_offset(ce->state) +
> > +		__get_process_desc_offset(ce) +
> > +		sizeof(struct guc_process_desc);
> > +}
> > +
> > +static inline u32 get_children_join_addr(struct intel_context *ce,
> > +					 u8 child_index)
> > +{
> > +	GEM_BUG_ON(!intel_context_is_parent(ce));
> > +
> > +	return get_children_go_addr(ce) + (child_index + 1) * CACHELINE_BYTES;
> > +}
> > +
> > +#define PARENT_GO_BB			1
> > +#define PARENT_GO_FINI_BREADCRUMB	0
> > +#define CHILD_GO_BB			1
> > +#define CHILD_GO_FINI_BREADCRUMB	0
> > +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,
> > +						     u64 offset, u32 len,
> > +						     const unsigned int flags)
> > +{
> > +	struct intel_context *ce = rq->context;
> > +	u32 *cs;
> > +	u8 i;
> > +
> > +	GEM_BUG_ON(!intel_context_is_parent(ce));
> > +
> > +	cs = intel_ring_begin(rq, 10 + 4 * ce->guc_number_children);
> > +	if (IS_ERR(cs))
> > +		return PTR_ERR(cs);
> > +
> > +	/* Wait on chidlren */
> chidlren -> children
>

Yep.
 
> > +	for (i = 0; i < ce->guc_number_children; ++i) {
> > +		*cs++ = (MI_SEMAPHORE_WAIT |
> > +			 MI_SEMAPHORE_GLOBAL_GTT |
> > +			 MI_SEMAPHORE_POLL |
> > +			 MI_SEMAPHORE_SAD_EQ_SDD);
> > +		*cs++ = PARENT_GO_BB;
> > +		*cs++ = get_children_join_addr(ce, i);
> > +		*cs++ = 0;
> > +	}
> > +
> > +	/* Turn off preemption */
> > +	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
> > +	*cs++ = MI_NOOP;
> > +
> > +	/* Tell children go */
> > +	cs = gen8_emit_ggtt_write(cs,
> > +				  CHILD_GO_BB,
> > +				  get_children_go_addr(ce),
> > +				  0);
> > +
> > +	/* Jump to batch */
> > +	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
> > +		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
> > +	*cs++ = lower_32_bits(offset);
> > +	*cs++ = upper_32_bits(offset);
> > +	*cs++ = MI_NOOP;
> > +
> > +	intel_ring_advance(rq, cs);
> > +
> > +	return 0;
> > +}
> > +
> > +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,
> > +						    u64 offset, u32 len,
> > +						    const unsigned int flags)
> > +{
> > +	struct intel_context *ce = rq->context;
> > +	u32 *cs;
> > +
> > +	GEM_BUG_ON(!intel_context_is_child(ce));
> > +
> > +	cs = intel_ring_begin(rq, 12);
> > +	if (IS_ERR(cs))
> > +		return PTR_ERR(cs);
> > +
> > +	/* Signal parent */
> > +	cs = gen8_emit_ggtt_write(cs,
> > +				  PARENT_GO_BB,
> > +				  get_children_join_addr(ce->parent,
> > +							 ce->guc_child_index),
> > +				  0);
> > +
> > +	/* Wait parent on for go */
> parent on -> on parent
>

Yep.
 
> > +	*cs++ = (MI_SEMAPHORE_WAIT |
> > +		 MI_SEMAPHORE_GLOBAL_GTT |
> > +		 MI_SEMAPHORE_POLL |
> > +		 MI_SEMAPHORE_SAD_EQ_SDD);
> > +	*cs++ = CHILD_GO_BB;
> > +	*cs++ = get_children_go_addr(ce->parent);
> > +	*cs++ = 0;
> > +
> > +	/* Turn off preemption */
> > +	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
> > +
> > +	/* Jump to batch */
> > +	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
> > +		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
> > +	*cs++ = lower_32_bits(offset);
> > +	*cs++ = upper_32_bits(offset);
> > +
> > +	intel_ring_advance(rq, cs);
> > +
> > +	return 0;
> > +}
> > +
> > +static u32 *
> > +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,
> > +						 u32 *cs)
> > +{
> > +	struct intel_context *ce = rq->context;
> > +	u8 i;
> > +
> > +	GEM_BUG_ON(!intel_context_is_parent(ce));
> > +
> > +	/* Wait on children */
> > +	for (i = 0; i < ce->guc_number_children; ++i) {
> > +		*cs++ = (MI_SEMAPHORE_WAIT |
> > +			 MI_SEMAPHORE_GLOBAL_GTT |
> > +			 MI_SEMAPHORE_POLL |
> > +			 MI_SEMAPHORE_SAD_EQ_SDD);
> > +		*cs++ = PARENT_GO_FINI_BREADCRUMB;
> > +		*cs++ = get_children_join_addr(ce, i);
> > +		*cs++ = 0;
> > +	}
> > +
> > +	/* Turn on preemption */
> > +	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> > +	*cs++ = MI_NOOP;
> > +
> > +	/* Tell children go */
> > +	cs = gen8_emit_ggtt_write(cs,
> > +				  CHILD_GO_FINI_BREADCRUMB,
> > +				  get_children_go_addr(ce),
> > +				  0);
> > +
> > +	/* Emit fini breadcrumb */
> > +	cs = gen8_emit_ggtt_write(cs,
> > +				  rq->fence.seqno,
> > +				  i915_request_active_timeline(rq)->hwsp_offset,
> > +				  0);
> > +
> > +	/* User interrupt */
> > +	*cs++ = MI_USER_INTERRUPT;
> > +	*cs++ = MI_NOOP;
> > +
> > +	rq->tail = intel_ring_offset(rq, cs);
> > +
> > +	return cs;
> > +}
> > +
> > +static u32 *
> > +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, u32 *cs)
> > +{
> > +	struct intel_context *ce = rq->context;
> > +
> > +	GEM_BUG_ON(!intel_context_is_child(ce));
> > +
> > +	/* Turn on preemption */
> > +	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> > +	*cs++ = MI_NOOP;
> > +
> > +	/* Signal parent */
> > +	cs = gen8_emit_ggtt_write(cs,
> > +				  PARENT_GO_FINI_BREADCRUMB,
> > +				  get_children_join_addr(ce->parent,
> > +							 ce->guc_child_index),
> > +				  0);
> > +
> This is backwards compared to the parent?
> 
> Parent: wait on children then enable pre-emption
> Child: enable pre-emption then signal parent
> 
> Makes for a window where the parent is waiting in atomic context for a
> signal from a child that has been pre-empted and might not get to run for
> some time?
>

No, this is correct. The rule is: if the parent can be preempted, all the
children can be preempted. Thus we can't enable preemption on the parent
until all the children have preemption enabled, so the parent waits
for all the children to join before enabling its preemption.

Matt
 
> John.
> 
> 
> > +	/* Wait parent on for go */
> > +	*cs++ = (MI_SEMAPHORE_WAIT |
> > +		 MI_SEMAPHORE_GLOBAL_GTT |
> > +		 MI_SEMAPHORE_POLL |
> > +		 MI_SEMAPHORE_SAD_EQ_SDD);
> > +	*cs++ = CHILD_GO_FINI_BREADCRUMB;
> > +	*cs++ = get_children_go_addr(ce->parent);
> > +	*cs++ = 0;
> > +
> > +	/* Emit fini breadcrumb */
> > +	cs = gen8_emit_ggtt_write(cs,
> > +				  rq->fence.seqno,
> > +				  i915_request_active_timeline(rq)->hwsp_offset,
> > +				  0);
> > +
> > +	/* User interrupt */
> > +	*cs++ = MI_USER_INTERRUPT;
> > +	*cs++ = MI_NOOP;
> > +
> > +	rq->tail = intel_ring_offset(rq, cs);
> > +
> > +	return cs;
> > +}
> > +
> >   static struct intel_context *
> >   g2h_context_lookup(struct intel_guc *guc, u32 desc_idx)
> >   {
> > @@ -3807,6 +4075,19 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
> >   			drm_printf(p, "\t\tWQI Status: %u\n\n",
> >   				   READ_ONCE(desc->wq_status));
> > +			drm_printf(p, "\t\tNumber Children: %u\n\n",
> > +				   ce->guc_number_children);
> > +			if (ce->engine->emit_bb_start ==
> > +			    emit_bb_start_parent_no_preempt_mid_batch) {
> > +				u8 i;
> > +
> > +				drm_printf(p, "\t\tChildren Go: %u\n\n",
> > +					   get_children_go_value(ce));
> > +				for (i = 0; i < ce->guc_number_children; ++i)
> > +					drm_printf(p, "\t\tChildren Join: %u\n",
> > +						   get_children_join_value(ce, i));
> > +			}
> > +
> >   			for_each_child(ce, child)
> >   				guc_log_context(p, child);
> >   		}
>
John Harrison Sept. 28, 2021, 11:33 p.m. UTC | #6
On 9/28/2021 15:33, Matthew Brost wrote:
> On Tue, Sep 28, 2021 at 03:20:42PM -0700, John Harrison wrote:
>> On 8/20/2021 15:44, Matthew Brost wrote:
>>> For some users of multi-lrc, e.g. split frame, it isn't safe to preempt
>>> mid BB. To safely enable preemption at the BB boundary, a handshake
>>> between to parent and child is needed. This is implemented via custom
>>> emit_bb_start & emit_fini_breadcrumb functions and enabled via by
>> via by -> by
>>
>>> default if a context is configured by set parallel extension.
>> I tend to agree with Tvrtko that this should probably be an opt in change.
>> Is there a flags word passed in when creating the context?
>>
> I don't disagree but the uAPI in this series is where we landed. It has
> been acked all by the relevant parties in the RFC, ported to our
> internal tree, and the media UMD has been updated / posted. Concerns
> with the uAPI should've been raised in the RFC phase, not now. I really
> don't feel like changing this uAPI another time.
The counter argument is that once a UAPI has been merged, it cannot be 
changed. Ever. So it is worth taking the trouble to get it right first 
time.

The proposal isn't a major re-write of the interface. It is simply a 
request to set an extra flag when creating the context.


>
>> Also, it's not just a change in pre-emption behaviour but a change in
>> synchronisation too, right? Previously, if you had a whole bunch of back to
>> back submissions then one child could run ahead of another and/or the
>> parent. After this change, there is a forced regroup at the end of each
>> batch. So while one could end sooner/later than the others, they can't ever
>> get an entire batch (or more) ahead or behind. Or was that synchronisation
>> already in there through other means anyway?
>>
> Yes, each parent / child sync at the of each batch - this is the only
> way safely insert preemption points. Without this the GuC could attempt
> a preemption and hang the batches.
To be clear, I'm not saying that this is wrong. I'm just saying that 
this appears to be new behaviour with this patch but it is not 
explicitly called out in the description of the patch.


>
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/gt/intel_context.c       |   2 +-
>>>    drivers/gpu/drm/i915/gt/intel_context_types.h |   3 +
>>>    drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   2 +-
>>>    .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 283 +++++++++++++++++-
>>>    4 files changed, 287 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
>>> index 5615be32879c..2de62649e275 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_context.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
>>> @@ -561,7 +561,7 @@ void intel_context_bind_parent_child(struct intel_context *parent,
>>>    	GEM_BUG_ON(intel_context_is_child(child));
>>>    	GEM_BUG_ON(intel_context_is_parent(child));
>>> -	parent->guc_number_children++;
>>> +	child->guc_child_index = parent->guc_number_children++;
>>>    	list_add_tail(&child->guc_child_link,
>>>    		      &parent->guc_child_list);
>>>    	child->parent = parent;
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>> index 713d85b0b364..727f91e7f7c2 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
>>> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>> @@ -246,6 +246,9 @@ struct intel_context {
>>>    		/** @guc_number_children: number of children if parent */
>>>    		u8 guc_number_children;
>>> +		/** @guc_child_index: index into guc_child_list if child */
>>> +		u8 guc_child_index;
>>> +
>>>    		/**
>>>    		 * @parent_page: page in context used by parent for work queue,
>>>    		 * work queue descriptor
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>> index 6cd26dc060d1..9f61cfa5566a 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>> @@ -188,7 +188,7 @@ struct guc_process_desc {
>>>    	u32 wq_status;
>>>    	u32 engine_presence;
>>>    	u32 priority;
>>> -	u32 reserved[30];
>>> +	u32 reserved[36];
>> What is this extra space for? All the extra storage is grabbed from after
>> the end of this structure, isn't it?
>>
> This is the size of the process descriptor in the GuC spec. Even though
> this is unused space we really don't want the child go / join memory to
> use anything within the process descriptor.
Okay. So it's more that the code was previously broken and we just 
hadn't hit a problem because of it? Again, worth adding a comment in the 
description to call it out as a bug fix.

>
>>>    } __packed;
>>>    #define CONTEXT_REGISTRATION_FLAG_KMD	BIT(0)
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> index 91330525330d..1a18f99bf12a 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> @@ -11,6 +11,7 @@
>>>    #include "gt/intel_context.h"
>>>    #include "gt/intel_engine_pm.h"
>>>    #include "gt/intel_engine_heartbeat.h"
>>> +#include "gt/intel_gpu_commands.h"
>>>    #include "gt/intel_gt.h"
>>>    #include "gt/intel_gt_irq.h"
>>>    #include "gt/intel_gt_pm.h"
>>> @@ -366,10 +367,14 @@ static struct i915_priolist *to_priolist(struct rb_node *rb)
>>>    /*
>>>     * When using multi-lrc submission an extra page in the context state is
>>> - * reserved for the process descriptor and work queue.
>>> + * reserved for the process descriptor, work queue, and preempt BB boundary
>>> + * handshake between the parent + childlren contexts.
>>>     *
>>>     * The layout of this page is below:
>>>     * 0						guc_process_desc
>>> + * + sizeof(struct guc_process_desc)		child go
>>> + * + CACHELINE_BYTES				child join ...
>>> + * + CACHELINE_BYTES ...
>> Would be better written as '[num_children]' instead of '...' to make it
>> clear it is a per child array.
>>
> I think this description is pretty clear.
Evidently not because it confused me for a moment.

>
>> Also, maybe create a struct for this to get rid of the magic '+1's and
>> 'BYTES / sizeof' constructs in the functions below.
>>
> Let me see if I can create a struct that describes the layout.
That would definitely make the code a lot clearer.

>
>>>     * ...						unused
>>>     * PAGE_SIZE / 2				work queue start
>>>     * ...						work queue
>>> @@ -1785,6 +1790,30 @@ static int deregister_context(struct intel_context *ce, u32 guc_id, bool loop)
>>>    	return __guc_action_deregister_context(guc, guc_id, loop);
>>>    }
>>> +static inline void clear_children_join_go_memory(struct intel_context *ce)
>>> +{
>>> +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
>>> +	u8 i;
>>> +
>>> +	for (i = 0; i < ce->guc_number_children + 1; ++i)
>>> +		mem[i * (CACHELINE_BYTES / sizeof(u32))] = 0;
>>> +}
>>> +
>>> +static inline u32 get_children_go_value(struct intel_context *ce)
>>> +{
>>> +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
>>> +
>>> +	return mem[0];
>>> +}
>>> +
>>> +static inline u32 get_children_join_value(struct intel_context *ce,
>>> +					  u8 child_index)
>>> +{
>>> +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
>>> +
>>> +	return mem[(child_index + 1) * (CACHELINE_BYTES / sizeof(u32))];
>>> +}
>>> +
>>>    static void guc_context_policy_init(struct intel_engine_cs *engine,
>>>    				    struct guc_lrc_desc *desc)
>>>    {
>>> @@ -1867,6 +1896,8 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop)
>>>    			desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD;
>>>    			guc_context_policy_init(engine, desc);
>>>    		}
>>> +
>>> +		clear_children_join_go_memory(ce);
>>>    	}
>>>    	/*
>>> @@ -2943,6 +2974,31 @@ static const struct intel_context_ops virtual_child_context_ops = {
>>>    	.get_sibling = guc_virtual_get_sibling,
>>>    };
>>> +/*
>>> + * The below override of the breadcrumbs is enabled when the user configures a
>>> + * context for parallel submission (multi-lrc, parent-child).
>>> + *
>>> + * The overridden breadcrumbs implements an algorithm which allows the GuC to
>>> + * safely preempt all the hw contexts configured for parallel submission
>>> + * between each BB. The contract between the i915 and GuC is if the parent
>>> + * context can be preempted, all the children can be preempted, and the GuC will
>>> + * always try to preempt the parent before the children. A handshake between the
>>> + * parent / children breadcrumbs ensures the i915 holds up its end of the deal
>>> + * creating a window to preempt between each set of BBs.
>>> + */
>>> +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,
>>> +						     u64 offset, u32 len,
>>> +						     const unsigned int flags);
>>> +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,
>>> +						    u64 offset, u32 len,
>>> +						    const unsigned int flags);
>>> +static u32 *
>>> +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,
>>> +						 u32 *cs);
>>> +static u32 *
>>> +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq,
>>> +						u32 *cs);
>>> +
>>>    static struct intel_context *
>>>    guc_create_parallel(struct intel_engine_cs **engines,
>>>    		    unsigned int num_siblings,
>>> @@ -2978,6 +3034,20 @@ guc_create_parallel(struct intel_engine_cs **engines,
>>>    		}
>>>    	}
>>> +	parent->engine->emit_bb_start =
>>> +		emit_bb_start_parent_no_preempt_mid_batch;
>>> +	parent->engine->emit_fini_breadcrumb =
>>> +		emit_fini_breadcrumb_parent_no_preempt_mid_batch;
>>> +	parent->engine->emit_fini_breadcrumb_dw =
>>> +		12 + 4 * parent->guc_number_children;
>>> +	for_each_child(parent, ce) {
>>> +		ce->engine->emit_bb_start =
>>> +			emit_bb_start_child_no_preempt_mid_batch;
>>> +		ce->engine->emit_fini_breadcrumb =
>>> +			emit_fini_breadcrumb_child_no_preempt_mid_batch;
>>> +		ce->engine->emit_fini_breadcrumb_dw = 16;
>>> +	}
>>> +
>>>    	kfree(siblings);
>>>    	return parent;
>>> @@ -3362,6 +3432,204 @@ void intel_guc_submission_init_early(struct intel_guc *guc)
>>>    	guc->submission_selected = __guc_submission_selected(guc);
>>>    }
>>> +static inline u32 get_children_go_addr(struct intel_context *ce)
>>> +{
>>> +	GEM_BUG_ON(!intel_context_is_parent(ce));
>>> +
>>> +	return i915_ggtt_offset(ce->state) +
>>> +		__get_process_desc_offset(ce) +
>>> +		sizeof(struct guc_process_desc);
>>> +}
>>> +
>>> +static inline u32 get_children_join_addr(struct intel_context *ce,
>>> +					 u8 child_index)
>>> +{
>>> +	GEM_BUG_ON(!intel_context_is_parent(ce));
>>> +
>>> +	return get_children_go_addr(ce) + (child_index + 1) * CACHELINE_BYTES;
>>> +}
>>> +
>>> +#define PARENT_GO_BB			1
>>> +#define PARENT_GO_FINI_BREADCRUMB	0
>>> +#define CHILD_GO_BB			1
>>> +#define CHILD_GO_FINI_BREADCRUMB	0
>>> +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,
>>> +						     u64 offset, u32 len,
>>> +						     const unsigned int flags)
>>> +{
>>> +	struct intel_context *ce = rq->context;
>>> +	u32 *cs;
>>> +	u8 i;
>>> +
>>> +	GEM_BUG_ON(!intel_context_is_parent(ce));
>>> +
>>> +	cs = intel_ring_begin(rq, 10 + 4 * ce->guc_number_children);
>>> +	if (IS_ERR(cs))
>>> +		return PTR_ERR(cs);
>>> +
>>> +	/* Wait on chidlren */
>> chidlren -> children
>>
> Yep.
>   
>>> +	for (i = 0; i < ce->guc_number_children; ++i) {
>>> +		*cs++ = (MI_SEMAPHORE_WAIT |
>>> +			 MI_SEMAPHORE_GLOBAL_GTT |
>>> +			 MI_SEMAPHORE_POLL |
>>> +			 MI_SEMAPHORE_SAD_EQ_SDD);
>>> +		*cs++ = PARENT_GO_BB;
>>> +		*cs++ = get_children_join_addr(ce, i);
>>> +		*cs++ = 0;
>>> +	}
>>> +
>>> +	/* Turn off preemption */
>>> +	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
>>> +	*cs++ = MI_NOOP;
>>> +
>>> +	/* Tell children go */
>>> +	cs = gen8_emit_ggtt_write(cs,
>>> +				  CHILD_GO_BB,
>>> +				  get_children_go_addr(ce),
>>> +				  0);
>>> +
>>> +	/* Jump to batch */
>>> +	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
>>> +		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
>>> +	*cs++ = lower_32_bits(offset);
>>> +	*cs++ = upper_32_bits(offset);
>>> +	*cs++ = MI_NOOP;
>>> +
>>> +	intel_ring_advance(rq, cs);
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,
>>> +						    u64 offset, u32 len,
>>> +						    const unsigned int flags)
>>> +{
>>> +	struct intel_context *ce = rq->context;
>>> +	u32 *cs;
>>> +
>>> +	GEM_BUG_ON(!intel_context_is_child(ce));
>>> +
>>> +	cs = intel_ring_begin(rq, 12);
>>> +	if (IS_ERR(cs))
>>> +		return PTR_ERR(cs);
>>> +
>>> +	/* Signal parent */
>>> +	cs = gen8_emit_ggtt_write(cs,
>>> +				  PARENT_GO_BB,
>>> +				  get_children_join_addr(ce->parent,
>>> +							 ce->guc_child_index),
>>> +				  0);
>>> +
>>> +	/* Wait parent on for go */
>> parent on -> on parent
>>
> Yep.
>   
>>> +	*cs++ = (MI_SEMAPHORE_WAIT |
>>> +		 MI_SEMAPHORE_GLOBAL_GTT |
>>> +		 MI_SEMAPHORE_POLL |
>>> +		 MI_SEMAPHORE_SAD_EQ_SDD);
>>> +	*cs++ = CHILD_GO_BB;
>>> +	*cs++ = get_children_go_addr(ce->parent);
>>> +	*cs++ = 0;
>>> +
>>> +	/* Turn off preemption */
>>> +	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
>>> +
>>> +	/* Jump to batch */
>>> +	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
>>> +		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
>>> +	*cs++ = lower_32_bits(offset);
>>> +	*cs++ = upper_32_bits(offset);
>>> +
>>> +	intel_ring_advance(rq, cs);
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static u32 *
>>> +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,
>>> +						 u32 *cs)
>>> +{
>>> +	struct intel_context *ce = rq->context;
>>> +	u8 i;
>>> +
>>> +	GEM_BUG_ON(!intel_context_is_parent(ce));
>>> +
>>> +	/* Wait on children */
>>> +	for (i = 0; i < ce->guc_number_children; ++i) {
>>> +		*cs++ = (MI_SEMAPHORE_WAIT |
>>> +			 MI_SEMAPHORE_GLOBAL_GTT |
>>> +			 MI_SEMAPHORE_POLL |
>>> +			 MI_SEMAPHORE_SAD_EQ_SDD);
>>> +		*cs++ = PARENT_GO_FINI_BREADCRUMB;
>>> +		*cs++ = get_children_join_addr(ce, i);
>>> +		*cs++ = 0;
>>> +	}
>>> +
>>> +	/* Turn on preemption */
>>> +	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
>>> +	*cs++ = MI_NOOP;
>>> +
>>> +	/* Tell children go */
>>> +	cs = gen8_emit_ggtt_write(cs,
>>> +				  CHILD_GO_FINI_BREADCRUMB,
>>> +				  get_children_go_addr(ce),
>>> +				  0);
>>> +
>>> +	/* Emit fini breadcrumb */
>>> +	cs = gen8_emit_ggtt_write(cs,
>>> +				  rq->fence.seqno,
>>> +				  i915_request_active_timeline(rq)->hwsp_offset,
>>> +				  0);
>>> +
>>> +	/* User interrupt */
>>> +	*cs++ = MI_USER_INTERRUPT;
>>> +	*cs++ = MI_NOOP;
>>> +
>>> +	rq->tail = intel_ring_offset(rq, cs);
>>> +
>>> +	return cs;
>>> +}
>>> +
>>> +static u32 *
>>> +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, u32 *cs)
>>> +{
>>> +	struct intel_context *ce = rq->context;
>>> +
>>> +	GEM_BUG_ON(!intel_context_is_child(ce));
>>> +
>>> +	/* Turn on preemption */
>>> +	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
>>> +	*cs++ = MI_NOOP;
>>> +
>>> +	/* Signal parent */
>>> +	cs = gen8_emit_ggtt_write(cs,
>>> +				  PARENT_GO_FINI_BREADCRUMB,
>>> +				  get_children_join_addr(ce->parent,
>>> +							 ce->guc_child_index),
>>> +				  0);
>>> +
>> This is backwards compared to the parent?
>>
>> Parent: wait on children then enable pre-emption
>> Child: enable pre-emption then signal parent
>>
>> Makes for a window where the parent is waiting in atomic context for a
>> signal from a child that has been pre-empted and might not get to run for
>> some time?
>>
> No, this is correct. The rule is if the parent can be preempted all the
> children can be preempted, thus we can't enable preemption on the parent
> until all the children have preemption enabled, thus the parent waits
> for all the children to join before enabling its preemption.
>
> Matt
But,

The write to PARENT_GO_FINI can't fail or stall, right? So if it happens 
before the ARB_ON then the child is guaranteed to execute the ARB_ON 
once it has signalled the parent. Indeed, by the time the parent context 
gets to see the update memory value, the child is practically certain to 
have passed the ARB_ON. So, by the time the parent becomes pre-emptible, 
the children will all be pre-emptible. Even if the parent is superfast, 
the children are guaranteed to become pre-emptible immediately - 
certainly before any fail-to-preempt timeout could occur.

Whereas, with the current ordering, it is possible for the child to be 
preempted before it has issued the signal to the parent. So now you have 
a non-preemptible parent hogging the hardware, waiting for a signal that 
isn't going to come for an entire execution quantum. Indeed, it is 
actually quite likely the child would be preempted before it can signal 
the parent because any pre-emption request that was issued at any time 
during the child's execution will take effect immediately on the ARB_ON 
instruction.

John.


>   
>> John.
>>
>>
>>> +	/* Wait parent on for go */
>>> +	*cs++ = (MI_SEMAPHORE_WAIT |
>>> +		 MI_SEMAPHORE_GLOBAL_GTT |
>>> +		 MI_SEMAPHORE_POLL |
>>> +		 MI_SEMAPHORE_SAD_EQ_SDD);
>>> +	*cs++ = CHILD_GO_FINI_BREADCRUMB;
>>> +	*cs++ = get_children_go_addr(ce->parent);
>>> +	*cs++ = 0;
>>> +
>>> +	/* Emit fini breadcrumb */
>>> +	cs = gen8_emit_ggtt_write(cs,
>>> +				  rq->fence.seqno,
>>> +				  i915_request_active_timeline(rq)->hwsp_offset,
>>> +				  0);
>>> +
>>> +	/* User interrupt */
>>> +	*cs++ = MI_USER_INTERRUPT;
>>> +	*cs++ = MI_NOOP;
>>> +
>>> +	rq->tail = intel_ring_offset(rq, cs);
>>> +
>>> +	return cs;
>>> +}
>>> +
>>>    static struct intel_context *
>>>    g2h_context_lookup(struct intel_guc *guc, u32 desc_idx)
>>>    {
>>> @@ -3807,6 +4075,19 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
>>>    			drm_printf(p, "\t\tWQI Status: %u\n\n",
>>>    				   READ_ONCE(desc->wq_status));
>>> +			drm_printf(p, "\t\tNumber Children: %u\n\n",
>>> +				   ce->guc_number_children);
>>> +			if (ce->engine->emit_bb_start ==
>>> +			    emit_bb_start_parent_no_preempt_mid_batch) {
>>> +				u8 i;
>>> +
>>> +				drm_printf(p, "\t\tChildren Go: %u\n\n",
>>> +					   get_children_go_value(ce));
>>> +				for (i = 0; i < ce->guc_number_children; ++i)
>>> +					drm_printf(p, "\t\tChildren Join: %u\n",
>>> +						   get_children_join_value(ce, i));
>>> +			}
>>> +
>>>    			for_each_child(ce, child)
>>>    				guc_log_context(p, child);
>>>    		}
Matthew Brost Sept. 29, 2021, 12:22 a.m. UTC | #7
On Tue, Sep 28, 2021 at 04:33:24PM -0700, John Harrison wrote:
> On 9/28/2021 15:33, Matthew Brost wrote:
> > On Tue, Sep 28, 2021 at 03:20:42PM -0700, John Harrison wrote:
> > > On 8/20/2021 15:44, Matthew Brost wrote:
> > > > For some users of multi-lrc, e.g. split frame, it isn't safe to preempt
> > > > mid BB. To safely enable preemption at the BB boundary, a handshake
> > > > between to parent and child is needed. This is implemented via custom
> > > > emit_bb_start & emit_fini_breadcrumb functions and enabled via by
> > > via by -> by
> > > 
> > > > default if a context is configured by set parallel extension.
> > > I tend to agree with Tvrtko that this should probably be an opt in change.
> > > Is there a flags word passed in when creating the context?
> > > 
> > I don't disagree but the uAPI in this series is where we landed. It has
> > been acked by all the relevant parties in the RFC, ported to our
> > internal tree, and the media UMD has been updated / posted. Concerns
> > with the uAPI should've been raised in the RFC phase, not now. I really
> > don't feel like changing this uAPI another time.
> The counter argument is that once a UAPI has been merged, it cannot be
> changed. Ever. So it is worth taking the trouble to get it right first time.
> 
> The proposal isn't a major re-write of the interface. It is simply a request
> to set an extra flag when creating the context.
> 

We are basically just talking about the polarity of a flag at this
point. Either by default you can't be preempted mid-batch (the current
GPU / UMD requirement) or by default you can be preempted mid-batch (no
current GPU / UMD can do this yet, but we add flags that everyone opts
into). I think Daniel's opinion was just to default to what the current
GPU / UMD wants and, if future requirements arise, we add flags to the
interface. I understand both points of view on flag / no flag but in
the end it doesn't really matter. Either way the interface works now and
will in the future too.

> 
> > 
> > > Also, it's not just a change in pre-emption behaviour but a change in
> > > synchronisation too, right? Previously, if you had a whole bunch of back to
> > > back submissions then one child could run ahead of another and/or the
> > > parent. After this change, there is a forced regroup at the end of each
> > > batch. So while one could end sooner/later than the others, they can't ever
> > > get an entire batch (or more) ahead or behind. Or was that synchronisation
> > > already in there through other means anyway?
> > > 
> > Yes, each parent / child sync at the end of each batch - this is the
> > only way to safely insert preemption points. Without this the GuC could
> > attempt a preemption and hang the batches.
> To be clear, I'm not saying that this is wrong. I'm just saying that this
> appears to be new behaviour with this patch but it is not explicitly called
> out in the description of the patch.
> 

Will add some comments explaining this behavior (unless I already have
them).

> 
> > 
> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > ---
> > > >    drivers/gpu/drm/i915/gt/intel_context.c       |   2 +-
> > > >    drivers/gpu/drm/i915/gt/intel_context_types.h |   3 +
> > > >    drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   2 +-
> > > >    .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 283 +++++++++++++++++-
> > > >    4 files changed, 287 insertions(+), 3 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> > > > index 5615be32879c..2de62649e275 100644
> > > > --- a/drivers/gpu/drm/i915/gt/intel_context.c
> > > > +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> > > > @@ -561,7 +561,7 @@ void intel_context_bind_parent_child(struct intel_context *parent,
> > > >    	GEM_BUG_ON(intel_context_is_child(child));
> > > >    	GEM_BUG_ON(intel_context_is_parent(child));
> > > > -	parent->guc_number_children++;
> > > > +	child->guc_child_index = parent->guc_number_children++;
> > > >    	list_add_tail(&child->guc_child_link,
> > > >    		      &parent->guc_child_list);
> > > >    	child->parent = parent;
> > > > diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > > > index 713d85b0b364..727f91e7f7c2 100644
> > > > --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> > > > +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > > > @@ -246,6 +246,9 @@ struct intel_context {
> > > >    		/** @guc_number_children: number of children if parent */
> > > >    		u8 guc_number_children;
> > > > +		/** @guc_child_index: index into guc_child_list if child */
> > > > +		u8 guc_child_index;
> > > > +
> > > >    		/**
> > > >    		 * @parent_page: page in context used by parent for work queue,
> > > >    		 * work queue descriptor
> > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> > > > index 6cd26dc060d1..9f61cfa5566a 100644
> > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> > > > @@ -188,7 +188,7 @@ struct guc_process_desc {
> > > >    	u32 wq_status;
> > > >    	u32 engine_presence;
> > > >    	u32 priority;
> > > > -	u32 reserved[30];
> > > > +	u32 reserved[36];
> > > What is this extra space for? All the extra storage is grabbed from after
> > > the end of this structure, isn't it?
> > > 
> > This is the size of the process descriptor in the GuC spec. Even though
> > this is unused space we really don't want the child go / join memory to
> > use anything within the process descriptor.
> Okay. So it's more that the code was previously broken and we just hadn't
> hit a problem because of it? Again, worth adding a comment in the
> description to call it out as a bug fix.
>

Sure.
 
> > 
> > > >    } __packed;
> > > >    #define CONTEXT_REGISTRATION_FLAG_KMD	BIT(0)
> > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > > > index 91330525330d..1a18f99bf12a 100644
> > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > > > @@ -11,6 +11,7 @@
> > > >    #include "gt/intel_context.h"
> > > >    #include "gt/intel_engine_pm.h"
> > > >    #include "gt/intel_engine_heartbeat.h"
> > > > +#include "gt/intel_gpu_commands.h"
> > > >    #include "gt/intel_gt.h"
> > > >    #include "gt/intel_gt_irq.h"
> > > >    #include "gt/intel_gt_pm.h"
> > > > @@ -366,10 +367,14 @@ static struct i915_priolist *to_priolist(struct rb_node *rb)
> > > >    /*
> > > >     * When using multi-lrc submission an extra page in the context state is
> > > > - * reserved for the process descriptor and work queue.
> > > > + * reserved for the process descriptor, work queue, and preempt BB boundary
> > > > + * handshake between the parent + childlren contexts.
> > > >     *
> > > >     * The layout of this page is below:
> > > >     * 0						guc_process_desc
> > > > + * + sizeof(struct guc_process_desc)		child go
> > > > + * + CACHELINE_BYTES				child join ...
> > > > + * + CACHELINE_BYTES ...
> > > Would be better written as '[num_children]' instead of '...' to make it
> > > clear it is a per child array.
> > > 
> > I think this description is pretty clear.
> Evidently not because it confused me for a moment.
> 

Ok, let me see if I can make this a bit more clear.

> > 
> > > Also, maybe create a struct for this to get rid of the magic '+1's and
> > > 'BYTES / sizeof' constructs in the functions below.
> > > 
> > Let me see if I can create a struct that describes the layout.
> That would definitely make the code a lot clearer.
> 
> > 
> > > >     * ...						unused
> > > >     * PAGE_SIZE / 2				work queue start
> > > >     * ...						work queue
> > > > @@ -1785,6 +1790,30 @@ static int deregister_context(struct intel_context *ce, u32 guc_id, bool loop)
> > > >    	return __guc_action_deregister_context(guc, guc_id, loop);
> > > >    }
> > > > +static inline void clear_children_join_go_memory(struct intel_context *ce)
> > > > +{
> > > > +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
> > > > +	u8 i;
> > > > +
> > > > +	for (i = 0; i < ce->guc_number_children + 1; ++i)
> > > > +		mem[i * (CACHELINE_BYTES / sizeof(u32))] = 0;
> > > > +}
> > > > +
> > > > +static inline u32 get_children_go_value(struct intel_context *ce)
> > > > +{
> > > > +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
> > > > +
> > > > +	return mem[0];
> > > > +}
> > > > +
> > > > +static inline u32 get_children_join_value(struct intel_context *ce,
> > > > +					  u8 child_index)
> > > > +{
> > > > +	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
> > > > +
> > > > +	return mem[(child_index + 1) * (CACHELINE_BYTES / sizeof(u32))];
> > > > +}
> > > > +
> > > >    static void guc_context_policy_init(struct intel_engine_cs *engine,
> > > >    				    struct guc_lrc_desc *desc)
> > > >    {
> > > > @@ -1867,6 +1896,8 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop)
> > > >    			desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD;
> > > >    			guc_context_policy_init(engine, desc);
> > > >    		}
> > > > +
> > > > +		clear_children_join_go_memory(ce);
> > > >    	}
> > > >    	/*
> > > > @@ -2943,6 +2974,31 @@ static const struct intel_context_ops virtual_child_context_ops = {
> > > >    	.get_sibling = guc_virtual_get_sibling,
> > > >    };
> > > > +/*
> > > > + * The below override of the breadcrumbs is enabled when the user configures a
> > > > + * context for parallel submission (multi-lrc, parent-child).
> > > > + *
> > > > + * The overridden breadcrumbs implements an algorithm which allows the GuC to
> > > > + * safely preempt all the hw contexts configured for parallel submission
> > > > + * between each BB. The contract between the i915 and GuC is if the parent
> > > > + * context can be preempted, all the children can be preempted, and the GuC will
> > > > + * always try to preempt the parent before the children. A handshake between the
> > > > + * parent / children breadcrumbs ensures the i915 holds up its end of the deal
> > > > + * creating a window to preempt between each set of BBs.
> > > > + */
> > > > +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,
> > > > +						     u64 offset, u32 len,
> > > > +						     const unsigned int flags);
> > > > +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,
> > > > +						    u64 offset, u32 len,
> > > > +						    const unsigned int flags);
> > > > +static u32 *
> > > > +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,
> > > > +						 u32 *cs);
> > > > +static u32 *
> > > > +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq,
> > > > +						u32 *cs);
> > > > +
> > > >    static struct intel_context *
> > > >    guc_create_parallel(struct intel_engine_cs **engines,
> > > >    		    unsigned int num_siblings,
> > > > @@ -2978,6 +3034,20 @@ guc_create_parallel(struct intel_engine_cs **engines,
> > > >    		}
> > > >    	}
> > > > +	parent->engine->emit_bb_start =
> > > > +		emit_bb_start_parent_no_preempt_mid_batch;
> > > > +	parent->engine->emit_fini_breadcrumb =
> > > > +		emit_fini_breadcrumb_parent_no_preempt_mid_batch;
> > > > +	parent->engine->emit_fini_breadcrumb_dw =
> > > > +		12 + 4 * parent->guc_number_children;
> > > > +	for_each_child(parent, ce) {
> > > > +		ce->engine->emit_bb_start =
> > > > +			emit_bb_start_child_no_preempt_mid_batch;
> > > > +		ce->engine->emit_fini_breadcrumb =
> > > > +			emit_fini_breadcrumb_child_no_preempt_mid_batch;
> > > > +		ce->engine->emit_fini_breadcrumb_dw = 16;
> > > > +	}
> > > > +
> > > >    	kfree(siblings);
> > > >    	return parent;
> > > > @@ -3362,6 +3432,204 @@ void intel_guc_submission_init_early(struct intel_guc *guc)
> > > >    	guc->submission_selected = __guc_submission_selected(guc);
> > > >    }
> > > > +static inline u32 get_children_go_addr(struct intel_context *ce)
> > > > +{
> > > > +	GEM_BUG_ON(!intel_context_is_parent(ce));
> > > > +
> > > > +	return i915_ggtt_offset(ce->state) +
> > > > +		__get_process_desc_offset(ce) +
> > > > +		sizeof(struct guc_process_desc);
> > > > +}
> > > > +
> > > > +static inline u32 get_children_join_addr(struct intel_context *ce,
> > > > +					 u8 child_index)
> > > > +{
> > > > +	GEM_BUG_ON(!intel_context_is_parent(ce));
> > > > +
> > > > +	return get_children_go_addr(ce) + (child_index + 1) * CACHELINE_BYTES;
> > > > +}
> > > > +
> > > > +#define PARENT_GO_BB			1
> > > > +#define PARENT_GO_FINI_BREADCRUMB	0
> > > > +#define CHILD_GO_BB			1
> > > > +#define CHILD_GO_FINI_BREADCRUMB	0
> > > > +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,
> > > > +						     u64 offset, u32 len,
> > > > +						     const unsigned int flags)
> > > > +{
> > > > +	struct intel_context *ce = rq->context;
> > > > +	u32 *cs;
> > > > +	u8 i;
> > > > +
> > > > +	GEM_BUG_ON(!intel_context_is_parent(ce));
> > > > +
> > > > +	cs = intel_ring_begin(rq, 10 + 4 * ce->guc_number_children);
> > > > +	if (IS_ERR(cs))
> > > > +		return PTR_ERR(cs);
> > > > +
> > > > +	/* Wait on chidlren */
> > > chidlren -> children
> > > 
> > Yep.
> > > > +	for (i = 0; i < ce->guc_number_children; ++i) {
> > > > +		*cs++ = (MI_SEMAPHORE_WAIT |
> > > > +			 MI_SEMAPHORE_GLOBAL_GTT |
> > > > +			 MI_SEMAPHORE_POLL |
> > > > +			 MI_SEMAPHORE_SAD_EQ_SDD);
> > > > +		*cs++ = PARENT_GO_BB;
> > > > +		*cs++ = get_children_join_addr(ce, i);
> > > > +		*cs++ = 0;
> > > > +	}
> > > > +
> > > > +	/* Turn off preemption */
> > > > +	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
> > > > +	*cs++ = MI_NOOP;
> > > > +
> > > > +	/* Tell children go */
> > > > +	cs = gen8_emit_ggtt_write(cs,
> > > > +				  CHILD_GO_BB,
> > > > +				  get_children_go_addr(ce),
> > > > +				  0);
> > > > +
> > > > +	/* Jump to batch */
> > > > +	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
> > > > +		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
> > > > +	*cs++ = lower_32_bits(offset);
> > > > +	*cs++ = upper_32_bits(offset);
> > > > +	*cs++ = MI_NOOP;
> > > > +
> > > > +	intel_ring_advance(rq, cs);
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,
> > > > +						    u64 offset, u32 len,
> > > > +						    const unsigned int flags)
> > > > +{
> > > > +	struct intel_context *ce = rq->context;
> > > > +	u32 *cs;
> > > > +
> > > > +	GEM_BUG_ON(!intel_context_is_child(ce));
> > > > +
> > > > +	cs = intel_ring_begin(rq, 12);
> > > > +	if (IS_ERR(cs))
> > > > +		return PTR_ERR(cs);
> > > > +
> > > > +	/* Signal parent */
> > > > +	cs = gen8_emit_ggtt_write(cs,
> > > > +				  PARENT_GO_BB,
> > > > +				  get_children_join_addr(ce->parent,
> > > > +							 ce->guc_child_index),
> > > > +				  0);
> > > > +
> > > > +	/* Wait parent on for go */
> > > parent on -> on parent
> > > 
> > Yep.
> > > > +	*cs++ = (MI_SEMAPHORE_WAIT |
> > > > +		 MI_SEMAPHORE_GLOBAL_GTT |
> > > > +		 MI_SEMAPHORE_POLL |
> > > > +		 MI_SEMAPHORE_SAD_EQ_SDD);
> > > > +	*cs++ = CHILD_GO_BB;
> > > > +	*cs++ = get_children_go_addr(ce->parent);
> > > > +	*cs++ = 0;
> > > > +
> > > > +	/* Turn off preemption */
> > > > +	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
> > > > +
> > > > +	/* Jump to batch */
> > > > +	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
> > > > +		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
> > > > +	*cs++ = lower_32_bits(offset);
> > > > +	*cs++ = upper_32_bits(offset);
> > > > +
> > > > +	intel_ring_advance(rq, cs);
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static u32 *
> > > > +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,
> > > > +						 u32 *cs)
> > > > +{
> > > > +	struct intel_context *ce = rq->context;
> > > > +	u8 i;
> > > > +
> > > > +	GEM_BUG_ON(!intel_context_is_parent(ce));
> > > > +
> > > > +	/* Wait on children */
> > > > +	for (i = 0; i < ce->guc_number_children; ++i) {
> > > > +		*cs++ = (MI_SEMAPHORE_WAIT |
> > > > +			 MI_SEMAPHORE_GLOBAL_GTT |
> > > > +			 MI_SEMAPHORE_POLL |
> > > > +			 MI_SEMAPHORE_SAD_EQ_SDD);
> > > > +		*cs++ = PARENT_GO_FINI_BREADCRUMB;
> > > > +		*cs++ = get_children_join_addr(ce, i);
> > > > +		*cs++ = 0;
> > > > +	}
> > > > +
> > > > +	/* Turn on preemption */
> > > > +	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> > > > +	*cs++ = MI_NOOP;
> > > > +
> > > > +	/* Tell children go */
> > > > +	cs = gen8_emit_ggtt_write(cs,
> > > > +				  CHILD_GO_FINI_BREADCRUMB,
> > > > +				  get_children_go_addr(ce),
> > > > +				  0);
> > > > +
> > > > +	/* Emit fini breadcrumb */
> > > > +	cs = gen8_emit_ggtt_write(cs,
> > > > +				  rq->fence.seqno,
> > > > +				  i915_request_active_timeline(rq)->hwsp_offset,
> > > > +				  0);
> > > > +
> > > > +	/* User interrupt */
> > > > +	*cs++ = MI_USER_INTERRUPT;
> > > > +	*cs++ = MI_NOOP;
> > > > +
> > > > +	rq->tail = intel_ring_offset(rq, cs);
> > > > +
> > > > +	return cs;
> > > > +}
> > > > +
> > > > +static u32 *
> > > > +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, u32 *cs)
> > > > +{
> > > > +	struct intel_context *ce = rq->context;
> > > > +
> > > > +	GEM_BUG_ON(!intel_context_is_child(ce));
> > > > +
> > > > +	/* Turn on preemption */
> > > > +	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> > > > +	*cs++ = MI_NOOP;
> > > > +
> > > > +	/* Signal parent */
> > > > +	cs = gen8_emit_ggtt_write(cs,
> > > > +				  PARENT_GO_FINI_BREADCRUMB,
> > > > +				  get_children_join_addr(ce->parent,
> > > > +							 ce->guc_child_index),
> > > > +				  0);
> > > > +
> > > This is backwards compared to the parent?
> > > 
> > > Parent: wait on children then enable pre-emption
> > > Child: enable pre-emption then signal parent
> > > 
> > > Makes for a window where the parent is waiting in atomic context for a
> > > signal from a child that has been pre-empted and might not get to run for
> > > some time?
> > > 
> > No, this is correct. The rule is if the parent can be preempted all the
> > children can be preempted, thus we can't enable preemption on the parent
> > until all the children have preemption enabled, thus the parent waits
> > for all the children to join before enabling its preemption.
> > 
> > Matt
> But,
> 
> The write to PARENT_GO_FINI can't fail or stall, right? So if it happens
> before the ARB_ON then the child is guaranteed to execute the ARB_ON once it
> has signalled the parent. Indeed, by the time the parent context gets to see
> the update memory value, the child is practically certain to have passed the
> ARB_ON. So, by the time the parent becomes pre-emptible, the children will
> all be pre-emptible. Even if the parent is superfast, the children are
> guaranteed to become pre-emptible immediately - certainly before any
> fail-to-preempt timeout could occur.
>
> Whereas, with the current ordering, it is possible for the child to be
> preempted before it has issued the signal to the parent. So now you have a
> non-preemptible parent hogging the hardware, waiting for a signal that isn't

To be clear the parent is always preempted first by the GuC. The parent
can't be running if the child preempt is attempted.

> going to come for an entire execution quantum. Indeed, it is actually quite
> likely the child would be preempted before it can signal the parent because
> any pre-emption request that was issued at any time during the child's
> execution will take effect immediately on the ARB_ON instruction.
> 

Looking at the code, I do think I have a bug though.

I think I'm missing a MI_ARB_CHECK in the parent after turning on
preemption before releasing the children, right?

This covers the case where the GuC issues a preemption to the parent
while it is waiting on the children, all the children join, the parent
turns on preemption and is preempted with the added MI_ARB_CHECK
instruction, and the children all can be preempted waiting on the parent
go semaphore. Does that sound correct?

Matt

> John.
> 
> 
> > > John.
> > > 
> > > 
> > > > +	/* Wait parent on for go */
> > > > +	*cs++ = (MI_SEMAPHORE_WAIT |
> > > > +		 MI_SEMAPHORE_GLOBAL_GTT |
> > > > +		 MI_SEMAPHORE_POLL |
> > > > +		 MI_SEMAPHORE_SAD_EQ_SDD);
> > > > +	*cs++ = CHILD_GO_FINI_BREADCRUMB;
> > > > +	*cs++ = get_children_go_addr(ce->parent);
> > > > +	*cs++ = 0;
> > > > +
> > > > +	/* Emit fini breadcrumb */
> > > > +	cs = gen8_emit_ggtt_write(cs,
> > > > +				  rq->fence.seqno,
> > > > +				  i915_request_active_timeline(rq)->hwsp_offset,
> > > > +				  0);
> > > > +
> > > > +	/* User interrupt */
> > > > +	*cs++ = MI_USER_INTERRUPT;
> > > > +	*cs++ = MI_NOOP;
> > > > +
> > > > +	rq->tail = intel_ring_offset(rq, cs);
> > > > +
> > > > +	return cs;
> > > > +}
> > > > +
> > > >    static struct intel_context *
> > > >    g2h_context_lookup(struct intel_guc *guc, u32 desc_idx)
> > > >    {
> > > > @@ -3807,6 +4075,19 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
> > > >    			drm_printf(p, "\t\tWQI Status: %u\n\n",
> > > >    				   READ_ONCE(desc->wq_status));
> > > > +			drm_printf(p, "\t\tNumber Children: %u\n\n",
> > > > +				   ce->guc_number_children);
> > > > +			if (ce->engine->emit_bb_start ==
> > > > +			    emit_bb_start_parent_no_preempt_mid_batch) {
> > > > +				u8 i;
> > > > +
> > > > +				drm_printf(p, "\t\tChildren Go: %u\n\n",
> > > > +					   get_children_go_value(ce));
> > > > +				for (i = 0; i < ce->guc_number_children; ++i)
> > > > +					drm_printf(p, "\t\tChildren Join: %u\n",
> > > > +						   get_children_join_value(ce, i));
> > > > +			}
> > > > +
> > > >    			for_each_child(ce, child)
> > > >    				guc_log_context(p, child);
> > > >    		}
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
index 5615be32879c..2de62649e275 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -561,7 +561,7 @@  void intel_context_bind_parent_child(struct intel_context *parent,
 	GEM_BUG_ON(intel_context_is_child(child));
 	GEM_BUG_ON(intel_context_is_parent(child));
 
-	parent->guc_number_children++;
+	child->guc_child_index = parent->guc_number_children++;
 	list_add_tail(&child->guc_child_link,
 		      &parent->guc_child_list);
 	child->parent = parent;
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
index 713d85b0b364..727f91e7f7c2 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -246,6 +246,9 @@  struct intel_context {
 		/** @guc_number_children: number of children if parent */
 		u8 guc_number_children;
 
+		/** @guc_child_index: index into guc_child_list if child */
+		u8 guc_child_index;
+
 		/**
 		 * @parent_page: page in context used by parent for work queue,
 		 * work queue descriptor
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
index 6cd26dc060d1..9f61cfa5566a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
@@ -188,7 +188,7 @@  struct guc_process_desc {
 	u32 wq_status;
 	u32 engine_presence;
 	u32 priority;
-	u32 reserved[30];
+	u32 reserved[36];
 } __packed;
 
 #define CONTEXT_REGISTRATION_FLAG_KMD	BIT(0)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 91330525330d..1a18f99bf12a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -11,6 +11,7 @@ 
 #include "gt/intel_context.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_heartbeat.h"
+#include "gt/intel_gpu_commands.h"
 #include "gt/intel_gt.h"
 #include "gt/intel_gt_irq.h"
 #include "gt/intel_gt_pm.h"
@@ -366,10 +367,14 @@  static struct i915_priolist *to_priolist(struct rb_node *rb)
 
 /*
  * When using multi-lrc submission an extra page in the context state is
- * reserved for the process descriptor and work queue.
+ * reserved for the process descriptor, work queue, and preempt BB boundary
+ * handshake between the parent + childlren contexts.
  *
  * The layout of this page is below:
  * 0						guc_process_desc
+ * + sizeof(struct guc_process_desc)		child go
+ * + CACHELINE_BYTES				child join ...
+ * + CACHELINE_BYTES ...
  * ...						unused
  * PAGE_SIZE / 2				work queue start
  * ...						work queue
@@ -1785,6 +1790,30 @@  static int deregister_context(struct intel_context *ce, u32 guc_id, bool loop)
 	return __guc_action_deregister_context(guc, guc_id, loop);
 }
 
+static inline void clear_children_join_go_memory(struct intel_context *ce)
+{
+	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
+	u8 i;
+
+	for (i = 0; i < ce->guc_number_children + 1; ++i)
+		mem[i * (CACHELINE_BYTES / sizeof(u32))] = 0;
+}
+
+static inline u32 get_children_go_value(struct intel_context *ce)
+{
+	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
+
+	return mem[0];
+}
+
+static inline u32 get_children_join_value(struct intel_context *ce,
+					  u8 child_index)
+{
+	u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
+
+	return mem[(child_index + 1) * (CACHELINE_BYTES / sizeof(u32))];
+}
+
 static void guc_context_policy_init(struct intel_engine_cs *engine,
 				    struct guc_lrc_desc *desc)
 {
@@ -1867,6 +1896,8 @@  static int guc_lrc_desc_pin(struct intel_context *ce, bool loop)
 			desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD;
 			guc_context_policy_init(engine, desc);
 		}
+
+		clear_children_join_go_memory(ce);
 	}
 
 	/*
@@ -2943,6 +2974,31 @@  static const struct intel_context_ops virtual_child_context_ops = {
 	.get_sibling = guc_virtual_get_sibling,
 };
 
+/*
+ * The below override of the breadcrumbs is enabled when the user configures a
+ * context for parallel submission (multi-lrc, parent-child).
+ *
+ * The overridden breadcrumbs implements an algorithm which allows the GuC to
+ * safely preempt all the hw contexts configured for parallel submission
+ * between each BB. The contract between the i915 and GuC is if the parent
+ * context can be preempted, all the children can be preempted, and the GuC will
+ * always try to preempt the parent before the children. A handshake between the
+ * parent / children breadcrumbs ensures the i915 holds up its end of the deal
+ * creating a window to preempt between each set of BBs.
+ */
+static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,
+						     u64 offset, u32 len,
+						     const unsigned int flags);
+static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,
+						    u64 offset, u32 len,
+						    const unsigned int flags);
+static u32 *
+emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,
+						 u32 *cs);
+static u32 *
+emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq,
+						u32 *cs);
+
 static struct intel_context *
 guc_create_parallel(struct intel_engine_cs **engines,
 		    unsigned int num_siblings,
@@ -2978,6 +3034,20 @@  guc_create_parallel(struct intel_engine_cs **engines,
 		}
 	}
 
+	parent->engine->emit_bb_start =
+		emit_bb_start_parent_no_preempt_mid_batch;
+	parent->engine->emit_fini_breadcrumb =
+		emit_fini_breadcrumb_parent_no_preempt_mid_batch;
+	parent->engine->emit_fini_breadcrumb_dw =
+		12 + 4 * parent->guc_number_children;
+	for_each_child(parent, ce) {
+		ce->engine->emit_bb_start =
+			emit_bb_start_child_no_preempt_mid_batch;
+		ce->engine->emit_fini_breadcrumb =
+			emit_fini_breadcrumb_child_no_preempt_mid_batch;
+		ce->engine->emit_fini_breadcrumb_dw = 16;
+	}
+
 	kfree(siblings);
 	return parent;
 
@@ -3362,6 +3432,204 @@  void intel_guc_submission_init_early(struct intel_guc *guc)
 	guc->submission_selected = __guc_submission_selected(guc);
 }
 
+static inline u32 get_children_go_addr(struct intel_context *ce)
+{
+	GEM_BUG_ON(!intel_context_is_parent(ce));
+
+	return i915_ggtt_offset(ce->state) +
+		__get_process_desc_offset(ce) +
+		sizeof(struct guc_process_desc);
+}
+
+static inline u32 get_children_join_addr(struct intel_context *ce,
+					 u8 child_index)
+{
+	GEM_BUG_ON(!intel_context_is_parent(ce));
+
+	return get_children_go_addr(ce) + (child_index + 1) * CACHELINE_BYTES;
+}
+
+#define PARENT_GO_BB			1
+#define PARENT_GO_FINI_BREADCRUMB	0
+#define CHILD_GO_BB			1
+#define CHILD_GO_FINI_BREADCRUMB	0
+static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,
+						     u64 offset, u32 len,
+						     const unsigned int flags)
+{
+	struct intel_context *ce = rq->context;
+	u32 *cs;
+	u8 i;
+
+	GEM_BUG_ON(!intel_context_is_parent(ce));
+
+	cs = intel_ring_begin(rq, 10 + 4 * ce->guc_number_children);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	/* Wait on children */
+	for (i = 0; i < ce->guc_number_children; ++i) {
+		*cs++ = (MI_SEMAPHORE_WAIT |
+			 MI_SEMAPHORE_GLOBAL_GTT |
+			 MI_SEMAPHORE_POLL |
+			 MI_SEMAPHORE_SAD_EQ_SDD);
+		*cs++ = PARENT_GO_BB;
+		*cs++ = get_children_join_addr(ce, i);
+		*cs++ = 0;
+	}
+
+	/* Turn off preemption */
+	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+	*cs++ = MI_NOOP;
+
+	/* Tell children go */
+	cs = gen8_emit_ggtt_write(cs,
+				  CHILD_GO_BB,
+				  get_children_go_addr(ce),
+				  0);
+
+	/* Jump to batch */
+	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
+		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
+	*cs++ = lower_32_bits(offset);
+	*cs++ = upper_32_bits(offset);
+	*cs++ = MI_NOOP;
+
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
+static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,
+						    u64 offset, u32 len,
+						    const unsigned int flags)
+{
+	struct intel_context *ce = rq->context;
+	u32 *cs;
+
+	GEM_BUG_ON(!intel_context_is_child(ce));
+
+	cs = intel_ring_begin(rq, 12);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	/* Signal parent */
+	cs = gen8_emit_ggtt_write(cs,
+				  PARENT_GO_BB,
+				  get_children_join_addr(ce->parent,
+							 ce->guc_child_index),
+				  0);
+
+	/* Wait on parent for go */
+	*cs++ = (MI_SEMAPHORE_WAIT |
+		 MI_SEMAPHORE_GLOBAL_GTT |
+		 MI_SEMAPHORE_POLL |
+		 MI_SEMAPHORE_SAD_EQ_SDD);
+	*cs++ = CHILD_GO_BB;
+	*cs++ = get_children_go_addr(ce->parent);
+	*cs++ = 0;
+
+	/* Turn off preemption */
+	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
+	/* Jump to batch */
+	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
+		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
+	*cs++ = lower_32_bits(offset);
+	*cs++ = upper_32_bits(offset);
+
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
+static u32 *
+emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,
+						 u32 *cs)
+{
+	struct intel_context *ce = rq->context;
+	u8 i;
+
+	GEM_BUG_ON(!intel_context_is_parent(ce));
+
+	/* Wait on children */
+	for (i = 0; i < ce->guc_number_children; ++i) {
+		*cs++ = (MI_SEMAPHORE_WAIT |
+			 MI_SEMAPHORE_GLOBAL_GTT |
+			 MI_SEMAPHORE_POLL |
+			 MI_SEMAPHORE_SAD_EQ_SDD);
+		*cs++ = PARENT_GO_FINI_BREADCRUMB;
+		*cs++ = get_children_join_addr(ce, i);
+		*cs++ = 0;
+	}
+
+	/* Turn on preemption */
+	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+	*cs++ = MI_NOOP;
+
+	/* Tell children go */
+	cs = gen8_emit_ggtt_write(cs,
+				  CHILD_GO_FINI_BREADCRUMB,
+				  get_children_go_addr(ce),
+				  0);
+
+	/* Emit fini breadcrumb */
+	cs = gen8_emit_ggtt_write(cs,
+				  rq->fence.seqno,
+				  i915_request_active_timeline(rq)->hwsp_offset,
+				  0);
+
+	/* User interrupt */
+	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_NOOP;
+
+	rq->tail = intel_ring_offset(rq, cs);
+
+	return cs;
+}
+
+static u32 *
+emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, u32 *cs)
+{
+	struct intel_context *ce = rq->context;
+
+	GEM_BUG_ON(!intel_context_is_child(ce));
+
+	/* Turn on preemption */
+	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+	*cs++ = MI_NOOP;
+
+	/* Signal parent */
+	cs = gen8_emit_ggtt_write(cs,
+				  PARENT_GO_FINI_BREADCRUMB,
+				  get_children_join_addr(ce->parent,
+							 ce->guc_child_index),
+				  0);
+
+	/* Wait on parent for go */
+	*cs++ = (MI_SEMAPHORE_WAIT |
+		 MI_SEMAPHORE_GLOBAL_GTT |
+		 MI_SEMAPHORE_POLL |
+		 MI_SEMAPHORE_SAD_EQ_SDD);
+	*cs++ = CHILD_GO_FINI_BREADCRUMB;
+	*cs++ = get_children_go_addr(ce->parent);
+	*cs++ = 0;
+
+	/* Emit fini breadcrumb */
+	cs = gen8_emit_ggtt_write(cs,
+				  rq->fence.seqno,
+				  i915_request_active_timeline(rq)->hwsp_offset,
+				  0);
+
+	/* User interrupt */
+	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_NOOP;
+
+	rq->tail = intel_ring_offset(rq, cs);
+
+	return cs;
+}
+
 static struct intel_context *
 g2h_context_lookup(struct intel_guc *guc, u32 desc_idx)
 {
@@ -3807,6 +4075,19 @@  void intel_guc_submission_print_context_info(struct intel_guc *guc,
 			drm_printf(p, "\t\tWQI Status: %u\n\n",
 				   READ_ONCE(desc->wq_status));
 
+			drm_printf(p, "\t\tNumber Children: %u\n\n",
+				   ce->guc_number_children);
+			if (ce->engine->emit_bb_start ==
+			    emit_bb_start_parent_no_preempt_mid_batch) {
+				u8 i;
+
+				drm_printf(p, "\t\tChildren Go: %u\n\n",
+					   get_children_go_value(ce));
+				for (i = 0; i < ce->guc_number_children; ++i)
+					drm_printf(p, "\t\tChildren Join: %u\n",
+						   get_children_join_value(ce, i));
+			}
+
 			for_each_child(ce, child)
 				guc_log_context(p, child);
 		}