diff mbox series

[v4,3/3] drm/i915/gt: Enable only one CCS for compute workload

Message ID 20240306012247.246003-4-andi.shyti@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series Disable automatic load CCS load balancing | expand

Commit Message

Andi Shyti March 6, 2024, 1:22 a.m. UTC
Enable only one CCS engine by default with all the compute slices
allocated to it.

While generating the list of UABI engines to be exposed to the
user, exclude any additional CCS engines beyond the first
instance.

This change can be tested with igt i915_query.

Fixes: d2eae8e98d59 ("drm/i915/dg2: Drop force_probe requirement")
Requires: 97aba5e46038 ("drm/i915/gt: Refactor uabi engine class/instance list creation")
Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
Cc: Chris Wilson <chris.p.wilson@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Matt Roper <matthew.d.roper@intel.com>
Cc: <stable@vger.kernel.org> # v6.2+
---
 drivers/gpu/drm/i915/gt/intel_engine_user.c | 11 ++++++++++
 drivers/gpu/drm/i915/gt/intel_gt.c          | 23 +++++++++++++++++++++
 drivers/gpu/drm/i915/gt/intel_gt_regs.h     |  5 +++++
 3 files changed, 39 insertions(+)

Comments

Matt Roper March 7, 2024, 12:14 a.m. UTC | #1
On Wed, Mar 06, 2024 at 02:22:47AM +0100, Andi Shyti wrote:
> Enable only one CCS engine by default with all the compute slices
> allocated to it.
> 
> While generating the list of UABI engines to be exposed to the
> user, exclude any additional CCS engines beyond the first
> instance.
> 
> This change can be tested with igt i915_query.
> 
> Fixes: d2eae8e98d59 ("drm/i915/dg2: Drop force_probe requirement")
> Requires: 97aba5e46038 ("drm/i915/gt: Refactor uabi engine class/instance list creation")
> Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> Cc: Chris Wilson <chris.p.wilson@linux.intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Matt Roper <matthew.d.roper@intel.com>
> Cc: <stable@vger.kernel.org> # v6.2+
> ---
>  drivers/gpu/drm/i915/gt/intel_engine_user.c | 11 ++++++++++
>  drivers/gpu/drm/i915/gt/intel_gt.c          | 23 +++++++++++++++++++++
>  drivers/gpu/drm/i915/gt/intel_gt_regs.h     |  5 +++++
>  3 files changed, 39 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> index 11cc06c0c785..9ef1c4ce252d 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> @@ -208,6 +208,7 @@ void intel_engines_driver_register(struct drm_i915_private *i915)
>  	struct list_head *it, *next;
>  	struct rb_node **p, *prev;
>  	LIST_HEAD(engines);
> +	u16 uabi_ccs = 0;
>  
>  	sort_engines(i915, &engines);
>  
> @@ -244,6 +245,16 @@ void intel_engines_driver_register(struct drm_i915_private *i915)
>  		if (uabi_class > I915_LAST_UABI_ENGINE_CLASS)
>  			continue;
>  
> +		/*
> +		 * The load is balanced among all the available compute
> +		 * slices. Expose only the first instance of the compute
> +		 * engine.
> +		 */
> +		if (IS_DG2(i915) &&
> +		    uabi_class == I915_ENGINE_CLASS_COMPUTE &&
> +		    uabi_ccs++)
> +			continue;
> +
>  		GEM_BUG_ON(uabi_class >=
>  			   ARRAY_SIZE(i915->engine_uabi_class_count));
>  		i915->engine_uabi_class_count[uabi_class]++;
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> index a425db5ed3a2..0aac97439552 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> @@ -168,6 +168,26 @@ static void init_unused_rings(struct intel_gt *gt)
>  	}
>  }
>  
> +static void intel_gt_apply_ccs_mode(struct intel_gt *gt)
> +{
> +	u32 mode;
> +	int cslice;
> +
> +	if (!IS_DG2(gt->i915))
> +		return;
> +
> +	/* Set '0' as a default CCS id to all the cslices */
> +	mode = 0;
> +
> +	for (cslice = 0; cslice < hweight32(CCS_MASK(gt)); cslice++)
> +		/* Write 0x7 if no CCS context dispatches to this cslice */
> +		if (!(CCS_MASK(gt) & BIT(cslice)))
> +			mode |= XEHP_CCS_MODE_CSLICE(cslice,
> +						     XEHP_CCS_MODE_CSLICE_MASK);
> +
> +	intel_uncore_write(gt->uncore, XEHP_CCS_MODE, mode);

This is still going to hook all available cslices up to hardware engine
ccs0.  But what you actually want is to hook them all up to what
userspace sees as CCS0 (i.e., the first CCS engine that wasn't fused
off).  Hardware's engine numbering and userspace's numbering aren't the
same.

Also, if you have a part that only has hardware ccs1/cslice1 for
example, you're not going to set cslices 2 & 3 to 0x7 properly.

So probably what you want is something like this (untested):

/*
 * Build a value for the XEHP_CCS_MODE register and write it once.
 *
 * For each of the four cslice fields: if the matching bit is set in
 * CCS_MASK(gt), the field is set to the index of the lowest set bit of
 * CCS_MASK(gt) (i.e. the first CCS engine that is present); otherwise the
 * field is filled with XEHP_CCS_MODE_CSLICE_MASK (all field bits set),
 * which marks the cslice as unused.
 *
 * NOTE(review): unlike the version in the patch under review, this sketch
 * has no IS_DG2() guard -- confirm whether the caller is expected to
 * filter platforms.
 */
static void intel_gt_apply_ccs_mode(struct intel_gt *gt)
{
        u32 mode = 0;
        /*
         * Index of the lowest set bit in the CCS mask.
         * NOTE(review): __ffs() is presumably undefined for a zero
         * argument -- confirm CCS_MASK(gt) is never 0 on this path.
         */
        int first_ccs = __ffs(CCS_MASK(gt));

        /*
         * Re-assign every present cslice to the first available CCS
         * engine; mark unavailable cslices as unused.
         */
        /*
         * Iterate over a fixed 4 cslice fields rather than the number of
         * set mask bits, so fields above the highest present cslice are
         * also written (presumably matching CCS0-3 -- TODO confirm).
         */
        for (int cslice = 0; cslice < 4; cslice++) {
                if (CCS_MASK(gt) & BIT(cslice))
                        mode |= XEHP_CCS_MODE_CSLICE(cslice, first_ccs);
                else
                        mode |= XEHP_CCS_MODE_CSLICE(cslice,
                                                     XEHP_CCS_MODE_CSLICE_MASK);
        }

        /* Single register write with the fully assembled value. */
        intel_uncore_write(gt->uncore, XEHP_CCS_MODE, mode);
}

> +}
> +
>  int intel_gt_init_hw(struct intel_gt *gt)
>  {
>  	struct drm_i915_private *i915 = gt->i915;
> @@ -195,6 +215,9 @@ int intel_gt_init_hw(struct intel_gt *gt)
>  
>  	intel_gt_init_swizzling(gt);
>  
> +	/* Configure CCS mode */
> +	intel_gt_apply_ccs_mode(gt);

This is only setting this once during init.  The value gets lost on
every RCS/CCS reset, so we need to make sure it gets reapplied when
necessary.  That means you either need to add this to the GuC regset, or
you need to implement the programming as a "fake workaround" so that the
workaround framework will take care of the re-application for you.


Matt

> +
>  	/*
>  	 * At least 830 can leave some of the unused rings
>  	 * "active" (ie. head != tail) after resume which
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> index cf709f6c05ae..8224dd99c7d7 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> @@ -1480,6 +1480,11 @@
>  #define   GEN12_RCU_MODE_CCS_ENABLE		REG_BIT(0)
>  #define   XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE	REG_BIT(1)
>  
> +#define XEHP_CCS_MODE				_MMIO(0x14804)
> +#define   XEHP_CCS_MODE_CSLICE_MASK		REG_GENMASK(2, 0) /* CCS0-3 + rsvd */
> +#define   XEHP_CCS_MODE_CSLICE_WIDTH		ilog2(XEHP_CCS_MODE_CSLICE_MASK + 1)
> +#define   XEHP_CCS_MODE_CSLICE(cslice, ccs)	(ccs << (cslice * XEHP_CCS_MODE_CSLICE_WIDTH))
> +
>  #define CHV_FUSE_GT				_MMIO(VLV_GUNIT_BASE + 0x2168)
>  #define   CHV_FGT_DISABLE_SS0			(1 << 10)
>  #define   CHV_FGT_DISABLE_SS1			(1 << 11)
> -- 
> 2.43.0
>
Andi Shyti March 7, 2024, 11:52 p.m. UTC | #2
Hi Matt,

> > +static void intel_gt_apply_ccs_mode(struct intel_gt *gt)
> > +{
> > +	u32 mode;
> > +	int cslice;
> > +
> > +	if (!IS_DG2(gt->i915))
> > +		return;
> > +
> > +	/* Set '0' as a default CCS id to all the cslices */
> > +	mode = 0;
> > +
> > +	for (cslice = 0; cslice < hweight32(CCS_MASK(gt)); cslice++)
> > +		/* Write 0x7 if no CCS context dispatches to this cslice */
> > +		if (!(CCS_MASK(gt) & BIT(cslice)))
> > +			mode |= XEHP_CCS_MODE_CSLICE(cslice,
> > +						     XEHP_CCS_MODE_CSLICE_MASK);
> > +
> > +	intel_uncore_write(gt->uncore, XEHP_CCS_MODE, mode);
> 
> This is still going to hook all available cslices up to hardware engine
> ccs0.  But what you actually want is to hook them all up to what
> userspace sees as CCS0 (i.e., the first CCS engine that wasn't fused
> off).  Hardware's engine numbering and userspace's numbering aren't the
> same.

Yes, correct... we had so many discussions and I forgot about it :-)

> Also, if you have a part that only has hardware ccs1/cslice1 for
> example, you're not going to set cslices 2 & 3 to 0x7 properly.

Good point also here, the XEHP_CCS_MODE register is indeed
generic to all platforms.

> So probably what you want is something like this (untested):
> 
> static void intel_gt_apply_ccs_mode(struct intel_gt *gt)
> {
>         u32 mode = 0;
>         int first_ccs = __ffs(CCS_MASK(gt));
> 
>         /*
>          * Re-assign every present cslice to the first available CCS
>          * engine; mark unavailable cslices as unused.
>          */
>         for (int cslice = 0; cslice < 4; cslice++) {
>                 if (CCS_MASK(gt) & BIT(cslice))
>                         mode |= XEHP_CCS_MODE_CSLICE(cslice, first_ccs);
>                 else
>                         mode |= XEHP_CCS_MODE_CSLICE(cslice,
>                                                      XEHP_CCS_MODE_CSLICE_MASK);
>         }
> 
>         intel_uncore_write(gt->uncore, XEHP_CCS_MODE, mode);
> }
> 
> > +}
> > +
> >  int intel_gt_init_hw(struct intel_gt *gt)
> >  {
> >  	struct drm_i915_private *i915 = gt->i915;
> > @@ -195,6 +215,9 @@ int intel_gt_init_hw(struct intel_gt *gt)
> >  
> >  	intel_gt_init_swizzling(gt);
> >  
> > +	/* Configure CCS mode */
> > +	intel_gt_apply_ccs_mode(gt);
> 
> This is only setting this once during init.  The value gets lost on
> every RCS/CCS reset, so we need to make sure it gets reapplied when
> necessary.  That means you either need to add this to the GuC regset, or
> you need to implement the programming as a "fake workaround" so that the
> workaround framework will take care of the re-application for you.

OK, I'll hook everything up in the ccs_engine_wa_init().

Thanks,
Andi
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c
index 11cc06c0c785..9ef1c4ce252d 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
@@ -208,6 +208,7 @@  void intel_engines_driver_register(struct drm_i915_private *i915)
 	struct list_head *it, *next;
 	struct rb_node **p, *prev;
 	LIST_HEAD(engines);
+	u16 uabi_ccs = 0;
 
 	sort_engines(i915, &engines);
 
@@ -244,6 +245,16 @@  void intel_engines_driver_register(struct drm_i915_private *i915)
 		if (uabi_class > I915_LAST_UABI_ENGINE_CLASS)
 			continue;
 
+		/*
+		 * The load is balanced among all the available compute
+		 * slices. Expose only the first instance of the compute
+		 * engine.
+		 */
+		if (IS_DG2(i915) &&
+		    uabi_class == I915_ENGINE_CLASS_COMPUTE &&
+		    uabi_ccs++)
+			continue;
+
 		GEM_BUG_ON(uabi_class >=
 			   ARRAY_SIZE(i915->engine_uabi_class_count));
 		i915->engine_uabi_class_count[uabi_class]++;
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
index a425db5ed3a2..0aac97439552 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -168,6 +168,26 @@  static void init_unused_rings(struct intel_gt *gt)
 	}
 }
 
+static void intel_gt_apply_ccs_mode(struct intel_gt *gt)
+{
+	u32 mode;
+	int cslice;
+
+	if (!IS_DG2(gt->i915))
+		return;
+
+	/* Set '0' as a default CCS id to all the cslices */
+	mode = 0;
+
+	for (cslice = 0; cslice < hweight32(CCS_MASK(gt)); cslice++)
+		/* Write 0x7 if no CCS context dispatches to this cslice */
+		if (!(CCS_MASK(gt) & BIT(cslice)))
+			mode |= XEHP_CCS_MODE_CSLICE(cslice,
+						     XEHP_CCS_MODE_CSLICE_MASK);
+
+	intel_uncore_write(gt->uncore, XEHP_CCS_MODE, mode);
+}
+
 int intel_gt_init_hw(struct intel_gt *gt)
 {
 	struct drm_i915_private *i915 = gt->i915;
@@ -195,6 +215,9 @@  int intel_gt_init_hw(struct intel_gt *gt)
 
 	intel_gt_init_swizzling(gt);
 
+	/* Configure CCS mode */
+	intel_gt_apply_ccs_mode(gt);
+
 	/*
 	 * At least 830 can leave some of the unused rings
 	 * "active" (ie. head != tail) after resume which
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
index cf709f6c05ae..8224dd99c7d7 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
@@ -1480,6 +1480,11 @@ 
 #define   GEN12_RCU_MODE_CCS_ENABLE		REG_BIT(0)
 #define   XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE	REG_BIT(1)
 
+#define XEHP_CCS_MODE				_MMIO(0x14804)
+#define   XEHP_CCS_MODE_CSLICE_MASK		REG_GENMASK(2, 0) /* CCS0-3 + rsvd */
+#define   XEHP_CCS_MODE_CSLICE_WIDTH		ilog2(XEHP_CCS_MODE_CSLICE_MASK + 1)
+#define   XEHP_CCS_MODE_CSLICE(cslice, ccs)	(ccs << (cslice * XEHP_CCS_MODE_CSLICE_WIDTH))
+
 #define CHV_FUSE_GT				_MMIO(VLV_GUNIT_BASE + 0x2168)
 #define   CHV_FGT_DISABLE_SS0			(1 << 10)
 #define   CHV_FGT_DISABLE_SS1			(1 << 11)