diff mbox series

[v2,2/2] drm/i915/gt: Enable only one CCS for compute workload

Message ID 20240220143526.259109-3-andi.shyti@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series Disable automatic load CCS load balancing | expand

Commit Message

Andi Shyti Feb. 20, 2024, 2:35 p.m. UTC
Enable only one CCS engine by default with all the compute sices
allocated to it.

While generating the list of UABI engines to be exposed to the
user, exclude any additional CCS engines beyond the first
instance.

This change can be tested with igt i915_query.

Fixes: d2eae8e98d59 ("drm/i915/dg2: Drop force_probe requirement")
Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
Cc: Chris Wilson <chris.p.wilson@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Matt Roper <matthew.d.roper@intel.com>
Cc: <stable@vger.kernel.org> # v6.2+
---
 drivers/gpu/drm/i915/gt/intel_engine_user.c |  9 +++++++++
 drivers/gpu/drm/i915/gt/intel_gt.c          | 11 +++++++++++
 drivers/gpu/drm/i915/gt/intel_gt_regs.h     |  2 ++
 drivers/gpu/drm/i915/i915_query.c           |  1 +
 4 files changed, 23 insertions(+)

Comments

Andi Shyti Feb. 20, 2024, 2:42 p.m. UTC | #1
Hi,

[...]

> diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
> index 3baa2f54a86e..d5a5143971f5 100644
> --- a/drivers/gpu/drm/i915/i915_query.c
> +++ b/drivers/gpu/drm/i915/i915_query.c
> @@ -124,6 +124,7 @@ static int query_geometry_subslices(struct drm_i915_private *i915,
>  	return fill_topology_info(sseu, query_item, sseu->geometry_subslice_mask);
>  }
>  
> +

sorry, this is a leftover from the cleanup.

Andi

>  static int
>  query_engine_info(struct drm_i915_private *i915,
>  		  struct drm_i915_query_item *query_item)
> -- 
> 2.43.0
Tvrtko Ursulin Feb. 20, 2024, 2:48 p.m. UTC | #2
On 20/02/2024 14:35, Andi Shyti wrote:
> Enable only one CCS engine by default with all the compute sices

slices

> allocated to it.
> 
> While generating the list of UABI engines to be exposed to the
> user, exclude any additional CCS engines beyond the first
> instance.
> 
> This change can be tested with igt i915_query.
> 
> Fixes: d2eae8e98d59 ("drm/i915/dg2: Drop force_probe requirement")
> Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> Cc: Chris Wilson <chris.p.wilson@linux.intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Matt Roper <matthew.d.roper@intel.com>
> Cc: <stable@vger.kernel.org> # v6.2+
> ---
>   drivers/gpu/drm/i915/gt/intel_engine_user.c |  9 +++++++++
>   drivers/gpu/drm/i915/gt/intel_gt.c          | 11 +++++++++++
>   drivers/gpu/drm/i915/gt/intel_gt_regs.h     |  2 ++
>   drivers/gpu/drm/i915/i915_query.c           |  1 +
>   4 files changed, 23 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> index 833987015b8b..7041acc77810 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> @@ -243,6 +243,15 @@ void intel_engines_driver_register(struct drm_i915_private *i915)
>   		if (engine->uabi_class == I915_NO_UABI_CLASS)
>   			continue;
>   
> +		/*
> +		 * Do not list and do not count CCS engines other than the first
> +		 */
> +		if (engine->uabi_class == I915_ENGINE_CLASS_COMPUTE &&
> +		    engine->uabi_instance > 0) {
> +			i915->engine_uabi_class_count[engine->uabi_class]--;
> +			continue;
> +		}

It's a bit ugly to decrement after increment, instead of somehow 
restructuring the loop to satisfy both cases more elegantly. And I 
wonder if internally (in dmesg when engine name is logged) we don't end 
up with ccs0 ccs0 ccs0 ccs0.. for all instances.

> +
>   		rb_link_node(&engine->uabi_node, prev, p);
>   		rb_insert_color(&engine->uabi_node, &i915->uabi_engines);
>   
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> index a425db5ed3a2..e19df4ef47f6 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> @@ -168,6 +168,14 @@ static void init_unused_rings(struct intel_gt *gt)
>   	}
>   }
>   
> +static void intel_gt_apply_ccs_mode(struct intel_gt *gt)
> +{
> +	if (!IS_DG2(gt->i915))
> +		return;
> +
> +	intel_uncore_write(gt->uncore, XEHP_CCS_MODE, 0);
> +}
> +
>   int intel_gt_init_hw(struct intel_gt *gt)
>   {
>   	struct drm_i915_private *i915 = gt->i915;
> @@ -195,6 +203,9 @@ int intel_gt_init_hw(struct intel_gt *gt)
>   
>   	intel_gt_init_swizzling(gt);
>   
> +	/* Configure CCS mode */
> +	intel_gt_apply_ccs_mode(gt);
> +
>   	/*
>   	 * At least 830 can leave some of the unused rings
>   	 * "active" (ie. head != tail) after resume which
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> index cf709f6c05ae..c148113770ea 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> @@ -1605,6 +1605,8 @@
>   #define   GEN12_VOLTAGE_MASK			REG_GENMASK(10, 0)
>   #define   GEN12_CAGF_MASK			REG_GENMASK(19, 11)
>   
> +#define XEHP_CCS_MODE                          _MMIO(0x14804)
> +
>   #define GEN11_GT_INTR_DW(x)			_MMIO(0x190018 + ((x) * 4))
>   #define   GEN11_CSME				(31)
>   #define   GEN12_HECI_2				(30)
> diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
> index 3baa2f54a86e..d5a5143971f5 100644
> --- a/drivers/gpu/drm/i915/i915_query.c
> +++ b/drivers/gpu/drm/i915/i915_query.c
> @@ -124,6 +124,7 @@ static int query_geometry_subslices(struct drm_i915_private *i915,
>   	return fill_topology_info(sseu, query_item, sseu->geometry_subslice_mask);
>   }
>   
> +

Zap please.

>   static int
>   query_engine_info(struct drm_i915_private *i915,
>   		  struct drm_i915_query_item *query_item)

Regards,

Tvrtko
Matt Roper Feb. 20, 2024, 11:39 p.m. UTC | #3
On Tue, Feb 20, 2024 at 03:35:26PM +0100, Andi Shyti wrote:
> Enable only one CCS engine by default with all the compute sices
> allocated to it.
> 
> While generating the list of UABI engines to be exposed to the
> user, exclude any additional CCS engines beyond the first
> instance.
> 
> This change can be tested with igt i915_query.
> 
> Fixes: d2eae8e98d59 ("drm/i915/dg2: Drop force_probe requirement")
> Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> Cc: Chris Wilson <chris.p.wilson@linux.intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Matt Roper <matthew.d.roper@intel.com>
> Cc: <stable@vger.kernel.org> # v6.2+
> ---
>  drivers/gpu/drm/i915/gt/intel_engine_user.c |  9 +++++++++
>  drivers/gpu/drm/i915/gt/intel_gt.c          | 11 +++++++++++
>  drivers/gpu/drm/i915/gt/intel_gt_regs.h     |  2 ++
>  drivers/gpu/drm/i915/i915_query.c           |  1 +
>  4 files changed, 23 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> index 833987015b8b..7041acc77810 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> @@ -243,6 +243,15 @@ void intel_engines_driver_register(struct drm_i915_private *i915)
>  		if (engine->uabi_class == I915_NO_UABI_CLASS)
>  			continue;
>  
> +		/*
> +		 * Do not list and do not count CCS engines other than the first
> +		 */
> +		if (engine->uabi_class == I915_ENGINE_CLASS_COMPUTE &&
> +		    engine->uabi_instance > 0) {
> +			i915->engine_uabi_class_count[engine->uabi_class]--;
> +			continue;
> +		}

Wouldn't it be simpler to just add a workaround to the end of
engine_mask_apply_compute_fuses() if we want to ensure only a single
compute engine gets exposed?  Then both the driver internals and uapi
will agree that's there's just one CCS (and on which one there is).

If we want to do something fancy with "hotplugging" a new engine later
on or whatever, that can be handled in the future series (although as
noted on the previous patch, it sounds like these changes might not
actually be aligned with the workaround we were trying to address).

> +
>  		rb_link_node(&engine->uabi_node, prev, p);
>  		rb_insert_color(&engine->uabi_node, &i915->uabi_engines);
>  
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> index a425db5ed3a2..e19df4ef47f6 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> @@ -168,6 +168,14 @@ static void init_unused_rings(struct intel_gt *gt)
>  	}
>  }
>  
> +static void intel_gt_apply_ccs_mode(struct intel_gt *gt)
> +{
> +	if (!IS_DG2(gt->i915))
> +		return;
> +
> +	intel_uncore_write(gt->uncore, XEHP_CCS_MODE, 0);

This doesn't look right to me.  A value of 0 means every cslice gets
associated with CCS0.  On a DG2-G11 platform, that will flat out break
compute since CCS0 is never present (G11 only has a single CCS and it's
always the hardware's CCS1).  Even on a G10 or G12 this could also break
things depending on the fusing of your card if the hardware CCS0 happens
to be missing.

Also, the register says that we need a field value of 0x7 for each
cslice that's fused off.  By passing 0, we're telling the CCS engine
that it can use cslices that may not actually exist.

> +}
> +
>  int intel_gt_init_hw(struct intel_gt *gt)
>  {
>  	struct drm_i915_private *i915 = gt->i915;
> @@ -195,6 +203,9 @@ int intel_gt_init_hw(struct intel_gt *gt)
>  
>  	intel_gt_init_swizzling(gt);
>  
> +	/* Configure CCS mode */
> +	intel_gt_apply_ccs_mode(gt);
> +
>  	/*
>  	 * At least 830 can leave some of the unused rings
>  	 * "active" (ie. head != tail) after resume which
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> index cf709f6c05ae..c148113770ea 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> @@ -1605,6 +1605,8 @@
>  #define   GEN12_VOLTAGE_MASK			REG_GENMASK(10, 0)
>  #define   GEN12_CAGF_MASK			REG_GENMASK(19, 11)
>  
> +#define XEHP_CCS_MODE                          _MMIO(0x14804)

Nitpick:  this doesn't seem to be in the proper place and also breaks
the file's convention of using tabs to move over to column 48 for the
definition value.


Matt

> +
>  #define GEN11_GT_INTR_DW(x)			_MMIO(0x190018 + ((x) * 4))
>  #define   GEN11_CSME				(31)
>  #define   GEN12_HECI_2				(30)
> diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
> index 3baa2f54a86e..d5a5143971f5 100644
> --- a/drivers/gpu/drm/i915/i915_query.c
> +++ b/drivers/gpu/drm/i915/i915_query.c
> @@ -124,6 +124,7 @@ static int query_geometry_subslices(struct drm_i915_private *i915,
>  	return fill_topology_info(sseu, query_item, sseu->geometry_subslice_mask);
>  }
>  
> +
>  static int
>  query_engine_info(struct drm_i915_private *i915,
>  		  struct drm_i915_query_item *query_item)
> -- 
> 2.43.0
>
Andi Shyti Feb. 21, 2024, 12:12 a.m. UTC | #4
Hi Matt,

thanks a lot for looking into this.

On Tue, Feb 20, 2024 at 03:39:18PM -0800, Matt Roper wrote:
> On Tue, Feb 20, 2024 at 03:35:26PM +0100, Andi Shyti wrote:

[...]

> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> > index 833987015b8b..7041acc77810 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> > @@ -243,6 +243,15 @@ void intel_engines_driver_register(struct drm_i915_private *i915)
> >  		if (engine->uabi_class == I915_NO_UABI_CLASS)
> >  			continue;
> >  
> > +		/*
> > +		 * Do not list and do not count CCS engines other than the first
> > +		 */
> > +		if (engine->uabi_class == I915_ENGINE_CLASS_COMPUTE &&
> > +		    engine->uabi_instance > 0) {
> > +			i915->engine_uabi_class_count[engine->uabi_class]--;
> > +			continue;
> > +		}
> 
> Wouldn't it be simpler to just add a workaround to the end of
> engine_mask_apply_compute_fuses() if we want to ensure only a single
> compute engine gets exposed?  Then both the driver internals and uapi
> will agree that's there's just one CCS (and on which one there is).
> 
> If we want to do something fancy with "hotplugging" a new engine later
> on or whatever, that can be handled in the future series (although as
> noted on the previous patch, it sounds like these changes might not
> actually be aligned with the workaround we were trying to address).

The hotplugging capability is one of the features I was looking
for, actually.

I have done some more refactoring in this piece of code in
upcoming patches.

Will check, though, if I can do something with compute_fuses(),
even though, the other cslices are not really fused off (read
below).

> > +
> >  		rb_link_node(&engine->uabi_node, prev, p);
> >  		rb_insert_color(&engine->uabi_node, &i915->uabi_engines);
> >  
> > diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> > index a425db5ed3a2..e19df4ef47f6 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> > @@ -168,6 +168,14 @@ static void init_unused_rings(struct intel_gt *gt)
> >  	}
> >  }
> >  
> > +static void intel_gt_apply_ccs_mode(struct intel_gt *gt)
> > +{
> > +	if (!IS_DG2(gt->i915))
> > +		return;
> > +
> > +	intel_uncore_write(gt->uncore, XEHP_CCS_MODE, 0);
> 
> This doesn't look right to me.  A value of 0 means every cslice gets
> associated with CCS0.

Yes, that's what I'm trying to do. The behavior I'm looking for
is this one:

	 /*
	  ...
          * With 1 engine (ccs0):
          *   slice 0, 1, 2, 3: ccs0
          *
          * With 2 engines (ccs0, ccs1):
          *   slice 0, 2: ccs0
          *   slice 1, 3: ccs1
          *
          * With 4 engines (ccs0, ccs1, ccs2, ccs3):
          *   slice 0: ccs0
          *   slice 1: ccs1
          *   slice 2: ccs2
          *   slice 3: ccs3
	  ...
	  */

where the user can configure runtime the mode, making sure that
no client is connected to i915.

But, this needs to be written 

As we are now forcing mode '1', then all cslices are connected
with ccs0.

> On a DG2-G11 platform, that will flat out break
> compute since CCS0 is never present (G11 only has a single CCS and it's
> always the hardware's CCS1).  Even on a G10 or G12 this could also break
> things depending on the fusing of your card if the hardware CCS0 happens
> to be missing.
> 
> Also, the register says that we need a field value of 0x7 for each
> cslice that's fused off.  By passing 0, we're telling the CCS engine
> that it can use cslices that may not actually exist.

does it? Or do I need to write the id (0x0-0x3) of the user
engine? That's how the mode is calculated in this algorithm.

> > +}
> > +

[...]

> > diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> > index cf709f6c05ae..c148113770ea 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> > @@ -1605,6 +1605,8 @@
> >  #define   GEN12_VOLTAGE_MASK			REG_GENMASK(10, 0)
> >  #define   GEN12_CAGF_MASK			REG_GENMASK(19, 11)
> >  
> > +#define XEHP_CCS_MODE                          _MMIO(0x14804)
> 
> Nitpick:  this doesn't seem to be in the proper place and also breaks
> the file's convention of using tabs to move over to column 48 for the
> definition value.

This was something I actually forgot to fix. Thanks!
Andi Shyti Feb. 21, 2024, 12:14 a.m. UTC | #5
Hi Tvrtko,

On Tue, Feb 20, 2024 at 02:48:31PM +0000, Tvrtko Ursulin wrote:
> On 20/02/2024 14:35, Andi Shyti wrote:
> > Enable only one CCS engine by default with all the compute sices
> 
> slices

Thanks!

> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> > index 833987015b8b..7041acc77810 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> > @@ -243,6 +243,15 @@ void intel_engines_driver_register(struct drm_i915_private *i915)
> >   		if (engine->uabi_class == I915_NO_UABI_CLASS)
> >   			continue;
> > +		/*
> > +		 * Do not list and do not count CCS engines other than the first
> > +		 */
> > +		if (engine->uabi_class == I915_ENGINE_CLASS_COMPUTE &&
> > +		    engine->uabi_instance > 0) {
> > +			i915->engine_uabi_class_count[engine->uabi_class]--;
> > +			continue;
> > +		}
> 
> It's a bit ugly to decrement after increment, instead of somehow
> restructuring the loop to satisfy both cases more elegantly.

yes, agree, indeed I had a hard time here to accept this change
myself.

But moving the check above where the counter was incremented it
would have been much uglier.

This check looks ugly everywhere you place it :-)

In any case, I'm working on a patch that is splitting this
function in two parts and there is some refactoring happening
here (for the first initialization and the dynamic update).

Please let me know if it's OK with you or you want me to fix it
in this run.

> And I wonder if
> internally (in dmesg when engine name is logged) we don't end up with ccs0
> ccs0 ccs0 ccs0.. for all instances.

I don't see this. Even in sysfs we see only one ccs. Where is it?

> > +
> >   		rb_link_node(&engine->uabi_node, prev, p);
> >   		rb_insert_color(&engine->uabi_node, &i915->uabi_engines);

[...]

> > diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
> > index 3baa2f54a86e..d5a5143971f5 100644
> > --- a/drivers/gpu/drm/i915/i915_query.c
> > +++ b/drivers/gpu/drm/i915/i915_query.c
> > @@ -124,6 +124,7 @@ static int query_geometry_subslices(struct drm_i915_private *i915,
> >   	return fill_topology_info(sseu, query_item, sseu->geometry_subslice_mask);
> >   }
> > +
> 
> Zap please.

yes... yes... I noticed it after sending the patch :-)

Thanks,
Andi
Tvrtko Ursulin Feb. 21, 2024, 8:19 a.m. UTC | #6
On 21/02/2024 00:14, Andi Shyti wrote:
> Hi Tvrtko,
> 
> On Tue, Feb 20, 2024 at 02:48:31PM +0000, Tvrtko Ursulin wrote:
>> On 20/02/2024 14:35, Andi Shyti wrote:
>>> Enable only one CCS engine by default with all the compute sices
>>
>> slices
> 
> Thanks!
> 
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c
>>> index 833987015b8b..7041acc77810 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
>>> @@ -243,6 +243,15 @@ void intel_engines_driver_register(struct drm_i915_private *i915)
>>>    		if (engine->uabi_class == I915_NO_UABI_CLASS)
>>>    			continue;
>>> +		/*
>>> +		 * Do not list and do not count CCS engines other than the first
>>> +		 */
>>> +		if (engine->uabi_class == I915_ENGINE_CLASS_COMPUTE &&
>>> +		    engine->uabi_instance > 0) {
>>> +			i915->engine_uabi_class_count[engine->uabi_class]--;
>>> +			continue;
>>> +		}
>>
>> It's a bit ugly to decrement after increment, instead of somehow
>> restructuring the loop to satisfy both cases more elegantly.
> 
> yes, agree, indeed I had a hard time here to accept this change
> myself.
> 
> But moving the check above where the counter was incremented it
> would have been much uglier.
> 
> This check looks ugly everywhere you place it :-)

One idea would be to introduce a separate local counter array for 
name_instance, so not use i915->engine_uabi_class_count[]. First one 
increments for every engine, second only for the exposed ones. That way 
feels wouldn't be too ugly.

> In any case, I'm working on a patch that is splitting this
> function in two parts and there is some refactoring happening
> here (for the first initialization and the dynamic update).
> 
> Please let me know if it's OK with you or you want me to fix it
> in this run.
> 
>> And I wonder if
>> internally (in dmesg when engine name is logged) we don't end up with ccs0
>> ccs0 ccs0 ccs0.. for all instances.
> 
> I don't see this. Even in sysfs we see only one ccs. Where is it?

When you run this patch on something with two or more ccs-es, the 
"renamed ccs... to ccs.." debug logs do not all log the new name as ccs0?

Regards,

Tvrtko

> 
>>> +
>>>    		rb_link_node(&engine->uabi_node, prev, p);
>>>    		rb_insert_color(&engine->uabi_node, &i915->uabi_engines);
> 
> [...]
> 
>>> diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
>>> index 3baa2f54a86e..d5a5143971f5 100644
>>> --- a/drivers/gpu/drm/i915/i915_query.c
>>> +++ b/drivers/gpu/drm/i915/i915_query.c
>>> @@ -124,6 +124,7 @@ static int query_geometry_subslices(struct drm_i915_private *i915,
>>>    	return fill_topology_info(sseu, query_item, sseu->geometry_subslice_mask);
>>>    }
>>> +
>>
>> Zap please.
> 
> yes... yes... I noticed it after sending the patch :-)
> 
> Thanks,
> Andi
Andi Shyti Feb. 21, 2024, 11:19 a.m. UTC | #7
Hi Tvrtko,

On Wed, Feb 21, 2024 at 08:19:34AM +0000, Tvrtko Ursulin wrote:
> On 21/02/2024 00:14, Andi Shyti wrote:
> > On Tue, Feb 20, 2024 at 02:48:31PM +0000, Tvrtko Ursulin wrote:
> > > On 20/02/2024 14:35, Andi Shyti wrote:
> > > > Enable only one CCS engine by default with all the compute sices
> > > 
> > > slices
> > 
> > Thanks!
> > 
> > > > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> > > > index 833987015b8b..7041acc77810 100644
> > > > --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
> > > > +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> > > > @@ -243,6 +243,15 @@ void intel_engines_driver_register(struct drm_i915_private *i915)
> > > >    		if (engine->uabi_class == I915_NO_UABI_CLASS)
> > > >    			continue;
> > > > +		/*
> > > > +		 * Do not list and do not count CCS engines other than the first
> > > > +		 */
> > > > +		if (engine->uabi_class == I915_ENGINE_CLASS_COMPUTE &&
> > > > +		    engine->uabi_instance > 0) {
> > > > +			i915->engine_uabi_class_count[engine->uabi_class]--;
> > > > +			continue;
> > > > +		}
> > > 
> > > It's a bit ugly to decrement after increment, instead of somehow
> > > restructuring the loop to satisfy both cases more elegantly.
> > 
> > yes, agree, indeed I had a hard time here to accept this change
> > myself.
> > 
> > But moving the check above where the counter was incremented it
> > would have been much uglier.
> > 
> > This check looks ugly everywhere you place it :-)
> 
> One idea would be to introduce a separate local counter array for
> name_instance, so not use i915->engine_uabi_class_count[]. First one
> increments for every engine, second only for the exposed ones. That way
> feels wouldn't be too ugly.

Ah... you mean that whenever we change the CCS mode, we update
the indexes of the exposed engines from list of the real engines.
Will try.

My approach was to regenerate the list everytime the CCS mode was
changed, but your suggestion looks a bit simplier.

> > In any case, I'm working on a patch that is splitting this
> > function in two parts and there is some refactoring happening
> > here (for the first initialization and the dynamic update).
> > 
> > Please let me know if it's OK with you or you want me to fix it
> > in this run.
> > 
> > > And I wonder if
> > > internally (in dmesg when engine name is logged) we don't end up with ccs0
> > > ccs0 ccs0 ccs0.. for all instances.
> > 
> > I don't see this. Even in sysfs we see only one ccs. Where is it?
> 
> When you run this patch on something with two or more ccs-es, the "renamed
> ccs... to ccs.." debug logs do not all log the new name as ccs0?

it shouldn't, because the name_instance is anyway incremented
normally... anyway, I will test it.

Thanks a lot!
Andi
Tvrtko Ursulin Feb. 21, 2024, 12:08 p.m. UTC | #8
On 21/02/2024 11:19, Andi Shyti wrote:
> Hi Tvrtko,
> 
> On Wed, Feb 21, 2024 at 08:19:34AM +0000, Tvrtko Ursulin wrote:
>> On 21/02/2024 00:14, Andi Shyti wrote:
>>> On Tue, Feb 20, 2024 at 02:48:31PM +0000, Tvrtko Ursulin wrote:
>>>> On 20/02/2024 14:35, Andi Shyti wrote:
>>>>> Enable only one CCS engine by default with all the compute sices
>>>>
>>>> slices
>>>
>>> Thanks!
>>>
>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c
>>>>> index 833987015b8b..7041acc77810 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
>>>>> @@ -243,6 +243,15 @@ void intel_engines_driver_register(struct drm_i915_private *i915)
>>>>>     		if (engine->uabi_class == I915_NO_UABI_CLASS)
>>>>>     			continue;
>>>>> +		/*
>>>>> +		 * Do not list and do not count CCS engines other than the first
>>>>> +		 */
>>>>> +		if (engine->uabi_class == I915_ENGINE_CLASS_COMPUTE &&
>>>>> +		    engine->uabi_instance > 0) {
>>>>> +			i915->engine_uabi_class_count[engine->uabi_class]--;
>>>>> +			continue;
>>>>> +		}
>>>>
>>>> It's a bit ugly to decrement after increment, instead of somehow
>>>> restructuring the loop to satisfy both cases more elegantly.
>>>
>>> yes, agree, indeed I had a hard time here to accept this change
>>> myself.
>>>
>>> But moving the check above where the counter was incremented it
>>> would have been much uglier.
>>>
>>> This check looks ugly everywhere you place it :-)
>>
>> One idea would be to introduce a separate local counter array for
>> name_instance, so not use i915->engine_uabi_class_count[]. First one
>> increments for every engine, second only for the exposed ones. That way
>> feels wouldn't be too ugly.
> 
> Ah... you mean that whenever we change the CCS mode, we update
> the indexes of the exposed engines from list of the real engines.
> Will try.
> 
> My approach was to regenerate the list everytime the CCS mode was
> changed, but your suggestion looks a bit simplier.

No, I meant just for this first stage of permanently single engine. For avoiding the decrement after increment. Something like this, but not compile tested even:

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c
index 833987015b8b..4c33f30612c4 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
@@ -203,7 +203,8 @@ static void engine_rename(struct intel_engine_cs *engine, const char *name, u16
  
  void intel_engines_driver_register(struct drm_i915_private *i915)
  {
-       u16 name_instance, other_instance = 0;
+       u16 class_instance[I915_LAST_UABI_ENGINE_CLASS + 2] = { };
+       u16 uabi_class, other_instance = 0;
         struct legacy_ring ring = {};
         struct list_head *it, *next;
         struct rb_node **p, *prev;
@@ -222,15 +223,14 @@ void intel_engines_driver_register(struct drm_i915_private *i915)
  
                 GEM_BUG_ON(engine->class >= ARRAY_SIZE(uabi_classes));
                 engine->uabi_class = uabi_classes[engine->class];
+
                 if (engine->uabi_class == I915_NO_UABI_CLASS) {
-                       name_instance = other_instance++;
-               } else {
-                       GEM_BUG_ON(engine->uabi_class >=
-                                  ARRAY_SIZE(i915->engine_uabi_class_count));
-                       name_instance =
-                               i915->engine_uabi_class_count[engine->uabi_class]++;
-               }
-               engine->uabi_instance = name_instance;
+                       uabi_class = I915_LAST_UABI_ENGINE_CLASS + 1;
+               else
+                       uabi_class = engine->uabi_class;
+
+               GEM_BUG_ON(uabi_class >= ARRAY_SIZE(class_instance));
+               engine->uabi_instance = class_instance[uabi_class]++;
  
                 /*
                  * Replace the internal name with the final user and log facing
@@ -238,11 +238,15 @@ void intel_engines_driver_register(struct drm_i915_private *i915)
                  */
                 engine_rename(engine,
                               intel_engine_class_repr(engine->class),
-                             name_instance);
+                             engine->uabi_instance);
  
-               if (engine->uabi_class == I915_NO_UABI_CLASS)
+               if (uabi_class == I915_NO_UABI_CLASS)
                         continue;
  
+               GEM_BUG_ON(uabi_class >=
+                          ARRAY_SIZE(i915->engine_uabi_class_count));
+               i915->engine_uabi_class_count[uabi_class]++;
+
                 rb_link_node(&engine->uabi_node, prev, p);
                 rb_insert_color(&engine->uabi_node, &i915->uabi_engines);


>>> In any case, I'm working on a patch that is splitting this
>>> function in two parts and there is some refactoring happening
>>> here (for the first initialization and the dynamic update).
>>>
>>> Please let me know if it's OK with you or you want me to fix it
>>> in this run.
>>>
>>>> And I wonder if
>>>> internally (in dmesg when engine name is logged) we don't end up with ccs0
>>>> ccs0 ccs0 ccs0.. for all instances.
>>>
>>> I don't see this. Even in sysfs we see only one ccs. Where is it?
>>
>> When you run this patch on something with two or more ccs-es, the "renamed
>> ccs... to ccs.." debug logs do not all log the new name as ccs0?
> 
> it shouldn't, because the name_instance is anyway incremented
> normally... anyway, I will test it.

Hm maybe it needs more than two ccs engines and then it would be ccs0, ccs1, ccs2, ccs2, on a four ccs part. Or something.. It just feels the decrement of i915->engine_uabi_class_count, which engine_instance currently uses, has to mess this up somehow.

Regards,

Tvrtko
Tvrtko Ursulin Feb. 21, 2024, 12:11 p.m. UTC | #9
On 21/02/2024 12:08, Tvrtko Ursulin wrote:
> 
> On 21/02/2024 11:19, Andi Shyti wrote:
>> Hi Tvrtko,
>>
>> On Wed, Feb 21, 2024 at 08:19:34AM +0000, Tvrtko Ursulin wrote:
>>> On 21/02/2024 00:14, Andi Shyti wrote:
>>>> On Tue, Feb 20, 2024 at 02:48:31PM +0000, Tvrtko Ursulin wrote:
>>>>> On 20/02/2024 14:35, Andi Shyti wrote:
>>>>>> Enable only one CCS engine by default with all the compute sices
>>>>>
>>>>> slices
>>>>
>>>> Thanks!
>>>>
>>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c 
>>>>>> b/drivers/gpu/drm/i915/gt/intel_engine_user.c
>>>>>> index 833987015b8b..7041acc77810 100644
>>>>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
>>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
>>>>>> @@ -243,6 +243,15 @@ void intel_engines_driver_register(struct 
>>>>>> drm_i915_private *i915)
>>>>>>             if (engine->uabi_class == I915_NO_UABI_CLASS)
>>>>>>                 continue;
>>>>>> +        /*
>>>>>> +         * Do not list and do not count CCS engines other than 
>>>>>> the first
>>>>>> +         */
>>>>>> +        if (engine->uabi_class == I915_ENGINE_CLASS_COMPUTE &&
>>>>>> +            engine->uabi_instance > 0) {
>>>>>> +            i915->engine_uabi_class_count[engine->uabi_class]--;
>>>>>> +            continue;
>>>>>> +        }
>>>>>
>>>>> It's a bit ugly to decrement after increment, instead of somehow
>>>>> restructuring the loop to satisfy both cases more elegantly.
>>>>
>>>> yes, agree, indeed I had a hard time here to accept this change
>>>> myself.
>>>>
>>>> But moving the check above where the counter was incremented it
>>>> would have been much uglier.
>>>>
>>>> This check looks ugly everywhere you place it :-)
>>>
>>> One idea would be to introduce a separate local counter array for
>>> name_instance, so not use i915->engine_uabi_class_count[]. First one
>>> increments for every engine, second only for the exposed ones. That way
>>> feels wouldn't be too ugly.
>>
>> Ah... you mean that whenever we change the CCS mode, we update
>> the indexes of the exposed engines from list of the real engines.
>> Will try.
>>
>> My approach was to regenerate the list everytime the CCS mode was
>> changed, but your suggestion looks a bit simplier.
> 
> No, I meant just for this first stage of permanently single engine. For 
> avoiding the decrement after increment. Something like this, but not 
> compile tested even:
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c 
> b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> index 833987015b8b..4c33f30612c4 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> @@ -203,7 +203,8 @@ static void engine_rename(struct intel_engine_cs 
> *engine, const char *name, u16
> 
>   void intel_engines_driver_register(struct drm_i915_private *i915)
>   {
> -       u16 name_instance, other_instance = 0;
> +       u16 class_instance[I915_LAST_UABI_ENGINE_CLASS + 2] = { };
> +       u16 uabi_class, other_instance = 0;
>          struct legacy_ring ring = {};
>          struct list_head *it, *next;
>          struct rb_node **p, *prev;
> @@ -222,15 +223,14 @@ void intel_engines_driver_register(struct 
> drm_i915_private *i915)
> 
>                  GEM_BUG_ON(engine->class >= ARRAY_SIZE(uabi_classes));
>                  engine->uabi_class = uabi_classes[engine->class];
> +
>                  if (engine->uabi_class == I915_NO_UABI_CLASS) {
> -                       name_instance = other_instance++;
> -               } else {
> -                       GEM_BUG_ON(engine->uabi_class >=
> -                                  
> ARRAY_SIZE(i915->engine_uabi_class_count));
> -                       name_instance =
> -                               
> i915->engine_uabi_class_count[engine->uabi_class]++;
> -               }
> -               engine->uabi_instance = name_instance;
> +                       uabi_class = I915_LAST_UABI_ENGINE_CLASS + 1;
> +               else
> +                       uabi_class = engine->uabi_class;
> +
> +               GEM_BUG_ON(uabi_class >= ARRAY_SIZE(class_instance));
> +               engine->uabi_instance = class_instance[uabi_class]++;
> 
>                  /*
>                   * Replace the internal name with the final user and 
> log facing
> @@ -238,11 +238,15 @@ void intel_engines_driver_register(struct 
> drm_i915_private *i915)
>                   */
>                  engine_rename(engine,
>                                intel_engine_class_repr(engine->class),
> -                             name_instance);
> +                             engine->uabi_instance);
> 
> -               if (engine->uabi_class == I915_NO_UABI_CLASS)
> +               if (uabi_class == I915_NO_UABI_CLASS)
>                          continue;

Here you just add the ccs skip condition.

Anyway.. I rushed it a bit so see what you think.

Regards,

Tvrtko

> 
> +               GEM_BUG_ON(uabi_class >=
> +                          ARRAY_SIZE(i915->engine_uabi_class_count));
> +               i915->engine_uabi_class_count[uabi_class]++;
> +
>                  rb_link_node(&engine->uabi_node, prev, p);
>                  rb_insert_color(&engine->uabi_node, &i915->uabi_engines);
> 
> 
>>>> In any case, I'm working on a patch that is splitting this
>>>> function in two parts and there is some refactoring happening
>>>> here (for the first initialization and the dynamic update).
>>>>
>>>> Please let me know if it's OK with you or you want me to fix it
>>>> in this run.
>>>>
>>>>> And I wonder if
>>>>> internally (in dmesg when engine name is logged) we don't end up 
>>>>> with ccs0
>>>>> ccs0 ccs0 ccs0.. for all instances.
>>>>
>>>> I don't see this. Even in sysfs we see only one ccs. Where is it?
>>>
>>> When you run this patch on something with two or more ccs-es, the 
>>> "renamed
>>> ccs... to ccs.." debug logs do not all log the new name as ccs0?
>>
>> it shouldn't, because the name_instance is anyway incremented
>> normally... anyway, I will test it.
> 
> Hm maybe it needs more than two ccs engines and then it would be ccs0, 
> ccs1, ccs2, ccs2, on a four ccs part. Or something.. It just feels the 
> decrement of i915->engine_uabi_class_count, which engine_instance 
> currently uses, has to mess this up somehow.
> 
> Regards,
> 
> Tvrtko
Matt Roper Feb. 21, 2024, 8:51 p.m. UTC | #10
On Wed, Feb 21, 2024 at 01:12:18AM +0100, Andi Shyti wrote:
> Hi Matt,
> 
> thanks a lot for looking into this.
> 
> On Tue, Feb 20, 2024 at 03:39:18PM -0800, Matt Roper wrote:
> > On Tue, Feb 20, 2024 at 03:35:26PM +0100, Andi Shyti wrote:
> 
> [...]
> 
> > > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> > > index 833987015b8b..7041acc77810 100644
> > > --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
> > > +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> > > @@ -243,6 +243,15 @@ void intel_engines_driver_register(struct drm_i915_private *i915)
> > >  		if (engine->uabi_class == I915_NO_UABI_CLASS)
> > >  			continue;
> > >  
> > > +		/*
> > > +		 * Do not list and do not count CCS engines other than the first
> > > +		 */
> > > +		if (engine->uabi_class == I915_ENGINE_CLASS_COMPUTE &&
> > > +		    engine->uabi_instance > 0) {
> > > +			i915->engine_uabi_class_count[engine->uabi_class]--;
> > > +			continue;
> > > +		}
> > 
> > Wouldn't it be simpler to just add a workaround to the end of
> > engine_mask_apply_compute_fuses() if we want to ensure only a single
> > compute engine gets exposed?  Then both the driver internals and uapi
> > will agree that's there's just one CCS (and on which one there is).
> > 
> > If we want to do something fancy with "hotplugging" a new engine later
> > on or whatever, that can be handled in the future series (although as
> > noted on the previous patch, it sounds like these changes might not
> > actually be aligned with the workaround we were trying to address).
> 
> The hotplugging capability is one of the features I was looking
> for, actually.
> 
> I have done some more refactoring in this piece of code in
> upcoming patches.
> 
> Will check, though, if I can do something with compute_fuses(),
> even though, the other cslices are not really fused off (read
> below).
> 
> > > +
> > >  		rb_link_node(&engine->uabi_node, prev, p);
> > >  		rb_insert_color(&engine->uabi_node, &i915->uabi_engines);
> > >  
> > > diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> > > index a425db5ed3a2..e19df4ef47f6 100644
> > > --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> > > +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> > > @@ -168,6 +168,14 @@ static void init_unused_rings(struct intel_gt *gt)
> > >  	}
> > >  }
> > >  
> > > +static void intel_gt_apply_ccs_mode(struct intel_gt *gt)
> > > +{
> > > +	if (!IS_DG2(gt->i915))
> > > +		return;
> > > +
> > > +	intel_uncore_write(gt->uncore, XEHP_CCS_MODE, 0);
> > 
> > This doesn't look right to me.  A value of 0 means every cslice gets
> > associated with CCS0.
> 
> Yes, that's what I'm trying to do. The behavior I'm looking for
> is this one:
> 
> 	 /*
> 	  ...
>           * With 1 engine (ccs0):
>           *   slice 0, 1, 2, 3: ccs0
>           *
>           * With 2 engines (ccs0, ccs1):
>           *   slice 0, 2: ccs0
>           *   slice 1, 3: ccs1
>           *
>           * With 4 engines (ccs0, ccs1, ccs2, ccs3):
>           *   slice 0: ccs0
>           *   slice 1: ccs1
>           *   slice 2: ccs2
>           *   slice 3: ccs3
> 	  ...
> 	  */
> 
> where the user can configure runtime the mode, making sure that
> no client is connected to i915.
> 
> But, this needs to be written 
> 
> As we are now forcing mode '1', then all cslices are connected
> with ccs0.

Right --- and that's what I'm pointing out as illegal.  I think that
code comment above was taken out of context from a different RFC series;
that's not an accurate description of the behavior we want here.

First, the above comment is using ccs# to refer to userspace engines,
not hardware engines.  As a simple example, DG2-G11 only ever has a
single CCS which userspace sees as "instance 0" but which is actually
CCS1 at the hardware level.  If you try to follow the comment above when
programming CCS_MODE, you've assigned all of the cslices to a
non-existent engine and assigned no cslices to the CCS engine that
actually exists.  For DG2-G10 (and I think DG2-G12), there are different
combinations of fused-off / not-fused-off engines that will always show
up in userspace as CCS0-CCSn, even if those don't match the hardware
IDs.

Second, the above comment is assuming that you have a part with a
maximum fusing config (i.e., all cslices present).  Using DG2-G11 again
as an example, there's also only a single cslice (cslice1), so if you
tell CCS1 that it's allowed to use EUs from non-existent cslice0,
cslice2, and cslice3, you might not get the behavior you were hoping
for.

> 
> > On a DG2-G11 platform, that will flat out break
> > compute since CCS0 is never present (G11 only has a single CCS and it's
> > always the hardware's CCS1).  Even on a G10 or G12 this could also break
> > things depending on the fusing of your card if the hardware CCS0 happens
> > to be missing.
> > 
> > Also, the register says that we need a field value of 0x7 for each
> > cslice that's fused off.  By passing 0, we're telling the CCS engine
> > that it can use cslices that may not actually exist.
> 
> does it? Or do I need to write the id (0x0-0x3) of the user
> engine? That's how the mode is calculated in this algorithm.

0x0 - 0x3 are how you specify that a specific CCS engine can use the
cslice.  If the cslice can't be used at all (i.e., it's fused off), then
you need to program a 0x7 to ensure no engines try to use the
non-existent DSS/EUs.


Matt

> 
> > > +}
> > > +
> 
> [...]
> 
> > > diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> > > index cf709f6c05ae..c148113770ea 100644
> > > --- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> > > +++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> > > @@ -1605,6 +1605,8 @@
> > >  #define   GEN12_VOLTAGE_MASK			REG_GENMASK(10, 0)
> > >  #define   GEN12_CAGF_MASK			REG_GENMASK(19, 11)
> > >  
> > > +#define XEHP_CCS_MODE                          _MMIO(0x14804)
> > 
> > Nitpick:  this doesn't seem to be in the proper place and also breaks
> > the file's convention of using tabs to move over to column 48 for the
> > definition value.
> 
> This was something I actually forgot to fix. Thanks!
Andi Shyti Feb. 22, 2024, 10:03 p.m. UTC | #11
Hi Matt,

first of all thanks a lot for the observations you are raising.

On Wed, Feb 21, 2024 at 12:51:04PM -0800, Matt Roper wrote:
> On Wed, Feb 21, 2024 at 01:12:18AM +0100, Andi Shyti wrote:
> > On Tue, Feb 20, 2024 at 03:39:18PM -0800, Matt Roper wrote:
> > > On Tue, Feb 20, 2024 at 03:35:26PM +0100, Andi Shyti wrote:

...

> > > > diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> > > > index a425db5ed3a2..e19df4ef47f6 100644
> > > > --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> > > > +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> > > > @@ -168,6 +168,14 @@ static void init_unused_rings(struct intel_gt *gt)
> > > >  	}
> > > >  }
> > > >  
> > > > +static void intel_gt_apply_ccs_mode(struct intel_gt *gt)
> > > > +{
> > > > +	if (!IS_DG2(gt->i915))
> > > > +		return;
> > > > +
> > > > +	intel_uncore_write(gt->uncore, XEHP_CCS_MODE, 0);
> > > 
> > > This doesn't look right to me.  A value of 0 means every cslice gets
> > > associated with CCS0.
> > 
> > Yes, that's what I'm trying to do. The behavior I'm looking for
> > is this one:
> > 
> > 	 /*
> > 	  ...
> >           * With 1 engine (ccs0):
> >           *   slice 0, 1, 2, 3: ccs0
> >           *
> >           * With 2 engines (ccs0, ccs1):
> >           *   slice 0, 2: ccs0
> >           *   slice 1, 3: ccs1
> >           *
> >           * With 4 engines (ccs0, ccs1, ccs2, ccs3):
> >           *   slice 0: ccs0
> >           *   slice 1: ccs1
> >           *   slice 2: ccs2
> >           *   slice 3: ccs3
> > 	  ...
> > 	  */
> > 
> > where the user can configure runtime the mode, making sure that
> > no client is connected to i915.
> > 
> > But, this needs to be written 
> > 
> > As we are now forcing mode '1', then all cslices are connected
> > with ccs0.
> 
> Right --- and that's what I'm pointing out as illegal.  I think that
> code comment above was taken out of context from a different RFC series;
> that's not an accurate description of the behavior we want here.
> 
> First, the above comment is using ccs# to refer to userspace engines,
> not hardware engines.  As a simple example, DG2-G11 only ever has a
> single CCS which userspace sees as "instance 0" but which is actually
> CCS1 at the hardware level.  If you try to follow the comment above when
> programming CCS_MODE, you've assigned all of the cslices to a
> non-existent engine and assigned no cslices to the CCS engine that
> actually exists.  For DG2-G10 (and I think DG2-G12), there are different
> combinations of fused-off / not-fused-off engines that will always show
> up in userspace as CCS0-CCSn, even if those don't match the hardware
> IDs.
> 
> Second, the above comment is assuming that you have a part with a
> maximum fusing config (i.e., all cslices present).  Using DG2-G11 again
> as an example, there's also only a single cslice (cslice1), so if you
> tell CCS1 that it's allowed to use EUs from non-existent cslice0,
> cslice2, and cslice3, you might not get the behavior you were hoping
> for.

if the hardware slices are fused off we wouldn't see them in a
first place, right? And that's anyway a permanent configuration
that wouldn't affect the patch.

BTW, is there any machine I can test this scenario?

> > > On a DG2-G11 platform, that will flat out break
> > > compute since CCS0 is never present (G11 only has a single CCS and it's
> > > always the hardware's CCS1).  Even on a G10 or G12 this could also break
> > > things depending on the fusing of your card if the hardware CCS0 happens
> > > to be missing.
> > > 
> > > Also, the register says that we need a field value of 0x7 for each
> > > cslice that's fused off.  By passing 0, we're telling the CCS engine
> > > that it can use cslices that may not actually exist.
> > 
> > does it? Or do I need to write the id (0x0-0x3) of the user
> > engine? That's how the mode is calculated in this algorithm.
> 
> 0x0 - 0x3 are how you specify that a specific CCS engine can use the
> cslice.  If the cslice can't be used at all (i.e., it's fused off), then
> you need to program a 0x7 to ensure no engines try to use the
> non-existent DSS/EUs.

I planned to limit this to the only DG2 (and ATSM, of course).
Do you think it would it help?

Andi
Matt Roper Feb. 22, 2024, 10:33 p.m. UTC | #12
On Thu, Feb 22, 2024 at 11:03:27PM +0100, Andi Shyti wrote:
> Hi Matt,
> 
> first of all thanks a lot for the observations you are raising.
> 
> On Wed, Feb 21, 2024 at 12:51:04PM -0800, Matt Roper wrote:
> > On Wed, Feb 21, 2024 at 01:12:18AM +0100, Andi Shyti wrote:
> > > On Tue, Feb 20, 2024 at 03:39:18PM -0800, Matt Roper wrote:
> > > > On Tue, Feb 20, 2024 at 03:35:26PM +0100, Andi Shyti wrote:
> 
> ...
> 
> > > > > diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> > > > > index a425db5ed3a2..e19df4ef47f6 100644
> > > > > --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> > > > > +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> > > > > @@ -168,6 +168,14 @@ static void init_unused_rings(struct intel_gt *gt)
> > > > >  	}
> > > > >  }
> > > > >  
> > > > > +static void intel_gt_apply_ccs_mode(struct intel_gt *gt)
> > > > > +{
> > > > > +	if (!IS_DG2(gt->i915))
> > > > > +		return;
> > > > > +
> > > > > +	intel_uncore_write(gt->uncore, XEHP_CCS_MODE, 0);
> > > > 
> > > > This doesn't look right to me.  A value of 0 means every cslice gets
> > > > associated with CCS0.
> > > 
> > > Yes, that's what I'm trying to do. The behavior I'm looking for
> > > is this one:
> > > 
> > > 	 /*
> > > 	  ...
> > >           * With 1 engine (ccs0):
> > >           *   slice 0, 1, 2, 3: ccs0
> > >           *
> > >           * With 2 engines (ccs0, ccs1):
> > >           *   slice 0, 2: ccs0
> > >           *   slice 1, 3: ccs1
> > >           *
> > >           * With 4 engines (ccs0, ccs1, ccs2, ccs3):
> > >           *   slice 0: ccs0
> > >           *   slice 1: ccs1
> > >           *   slice 2: ccs2
> > >           *   slice 3: ccs3
> > > 	  ...
> > > 	  */
> > > 
> > > where the user can configure runtime the mode, making sure that
> > > no client is connected to i915.
> > > 
> > > But, this needs to be written 
> > > 
> > > As we are now forcing mode '1', then all cslices are connected
> > > with ccs0.
> > 
> > Right --- and that's what I'm pointing out as illegal.  I think that
> > code comment above was taken out of context from a different RFC series;
> > that's not an accurate description of the behavior we want here.
> > 
> > First, the above comment is using ccs# to refer to userspace engines,
> > not hardware engines.  As a simple example, DG2-G11 only ever has a
> > single CCS which userspace sees as "instance 0" but which is actually
> > CCS1 at the hardware level.  If you try to follow the comment above when
> > programming CCS_MODE, you've assigned all of the cslices to a
> > non-existent engine and assigned no cslices to the CCS engine that
> > actually exists.  For DG2-G10 (and I think DG2-G12), there are different
> > combinations of fused-off / not-fused-off engines that will always show
> > up in userspace as CCS0-CCSn, even if those don't match the hardware
> > IDs.
> > 
> > Second, the above comment is assuming that you have a part with a
> > maximum fusing config (i.e., all cslices present).  Using DG2-G11 again
> > as an example, there's also only a single cslice (cslice1), so if you
> > tell CCS1 that it's allowed to use EUs from non-existent cslice0,
> > cslice2, and cslice3, you might not get the behavior you were hoping
> > for.
> 
> if the hardware slices are fused off we wouldn't see them in a
> first place, right? And that's anyway a permanent configuration
> that wouldn't affect the patch.

There are physically four possible cslices in the IP design.  The
presence/absence of each of those cslices can vary both by SKU and by
part-specific fusing.  Some SKUs (DG2-G11) wind up only ever having a
single possible configuration as far as I know, but the larger SKUs have
more part-to-part variation in terms of exactly which specific subset of
DSS (and by extension cslices) are present/absent.  The KMD determines
the configuration at runtime by reading the DSS fuse registers and
deriving the cslice presence/absence from that.

The register you're writing in this patch tells the CCS engine which
cslice(s) it can use to execute work.  If the KMD already knows that
cslice<x> doesn't exist, but it tells CCS<y> that it can go ahead and
use it anyway, things probably won't work properly.  That's why the spec
mandates that we always program 0x7 in the register for any cslices that
we know aren't present so that none of the CCS engines will incorrectly
try to utilize them.  If we ignore that rule, then it's a driver bug.

> 
> BTW, is there any machine I can test this scenario?

There should differently-fused DG2 systems in our internal pool,
although I'm not sure what the breakdown is.  I don't think the
reservation system makes the low-level cslice configuration immediately
obvious on the summary page, so you might just need to log into a few
systems and read the fuse registers to check which ones are best for
testing these cases.

> 
> > > > On a DG2-G11 platform, that will flat out break
> > > > compute since CCS0 is never present (G11 only has a single CCS and it's
> > > > always the hardware's CCS1).  Even on a G10 or G12 this could also break
> > > > things depending on the fusing of your card if the hardware CCS0 happens
> > > > to be missing.
> > > > 
> > > > Also, the register says that we need a field value of 0x7 for each
> > > > cslice that's fused off.  By passing 0, we're telling the CCS engine
> > > > that it can use cslices that may not actually exist.
> > > 
> > > does it? Or do I need to write the id (0x0-0x3) of the user
> > > engine? That's how the mode is calculated in this algorithm.
> > 
> > 0x0 - 0x3 are how you specify that a specific CCS engine can use the
> > cslice.  If the cslice can't be used at all (i.e., it's fused off), then
> > you need to program a 0x7 to ensure no engines try to use the
> > non-existent DSS/EUs.
> 
> I planned to limit this to the only DG2 (and ATSM, of course).
> Do you think it would it help?

Yes, definitely.  It's mandatory programming for these platforms.


Matt

> 
> Andi
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c
index 833987015b8b..7041acc77810 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
@@ -243,6 +243,15 @@  void intel_engines_driver_register(struct drm_i915_private *i915)
 		if (engine->uabi_class == I915_NO_UABI_CLASS)
 			continue;
 
+		/*
+		 * Do not list and do not count CCS engines other than the first
+		 */
+		if (engine->uabi_class == I915_ENGINE_CLASS_COMPUTE &&
+		    engine->uabi_instance > 0) {
+			i915->engine_uabi_class_count[engine->uabi_class]--;
+			continue;
+		}
+
 		rb_link_node(&engine->uabi_node, prev, p);
 		rb_insert_color(&engine->uabi_node, &i915->uabi_engines);
 
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
index a425db5ed3a2..e19df4ef47f6 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -168,6 +168,14 @@  static void init_unused_rings(struct intel_gt *gt)
 	}
 }
 
+static void intel_gt_apply_ccs_mode(struct intel_gt *gt)
+{
+	if (!IS_DG2(gt->i915))
+		return;
+
+	intel_uncore_write(gt->uncore, XEHP_CCS_MODE, 0);
+}
+
 int intel_gt_init_hw(struct intel_gt *gt)
 {
 	struct drm_i915_private *i915 = gt->i915;
@@ -195,6 +203,9 @@  int intel_gt_init_hw(struct intel_gt *gt)
 
 	intel_gt_init_swizzling(gt);
 
+	/* Configure CCS mode */
+	intel_gt_apply_ccs_mode(gt);
+
 	/*
 	 * At least 830 can leave some of the unused rings
 	 * "active" (ie. head != tail) after resume which
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
index cf709f6c05ae..c148113770ea 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
@@ -1605,6 +1605,8 @@ 
 #define   GEN12_VOLTAGE_MASK			REG_GENMASK(10, 0)
 #define   GEN12_CAGF_MASK			REG_GENMASK(19, 11)
 
+#define XEHP_CCS_MODE                          _MMIO(0x14804)
+
 #define GEN11_GT_INTR_DW(x)			_MMIO(0x190018 + ((x) * 4))
 #define   GEN11_CSME				(31)
 #define   GEN12_HECI_2				(30)
diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
index 3baa2f54a86e..d5a5143971f5 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -124,6 +124,7 @@  static int query_geometry_subslices(struct drm_i915_private *i915,
 	return fill_topology_info(sseu, query_item, sseu->geometry_subslice_mask);
 }
 
+
 static int
 query_engine_info(struct drm_i915_private *i915,
 		  struct drm_i915_query_item *query_item)