diff mbox

[8/8] drm/i915: Expose RPCS (SSEU) configuration to userspace

Message ID 20180425114521.7524-9-lionel.g.landwerlin@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Lionel Landwerlin April 25, 2018, 11:45 a.m. UTC
From: Chris Wilson <chris@chris-wilson.co.uk>

We want to allow userspace to reconfigure the subslice configuration for
its own use case. To do so, we expose a context parameter to allow
adjustment of the RPCS register stored within the context image (and
currently not accessible via LRI). If the context is adjusted before
first use, the adjustment is for "free"; otherwise if the context is
active we flush the context off the GPU (stalling all users) and force
the GPU to save the context to memory where we can modify it and so
ensure that the register is reloaded on next execution.

The overhead of managing additional EU subslices can be significant,
especially in multi-context workloads. Non-GPGPU contexts should
preferably disable the subslices they are not using, and others should
fine-tune the number to match their workload.

We expose complete control over the RPCS register, allowing
configuration of slice/subslice, via masks packed into a u64 for
simplicity. For example,

	struct drm_i915_gem_context_param arg;
	struct drm_i915_gem_context_param_sseu sseu = { .flags = I915_EXEC_RENDER };

	memset(&arg, 0, sizeof(arg));
	arg.ctx_id = ctx;
	arg.param = I915_CONTEXT_PARAM_SSEU;
	arg.value = (uintptr_t) &sseu;
	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg) == 0) {
		sseu.packed.subslice_mask = 0;

		drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &arg);
	}

could be used to disable all subslices where supported.

v2: Fix offset of CTX_R_PWR_CLK_STATE in intel_lr_context_set_sseu() (Lionel)

v3: Add ability to program this per engine (Chris)

v4: Move most get_sseu() into i915_gem_context.c (Lionel)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100899
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
CC: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
CC: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
CC: Zhipeng Gong <zhipeng.gong@intel.com>
CC: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem_context.c | 82 ++++++++++++++++++++++++-
 drivers/gpu/drm/i915/intel_lrc.c        | 55 +++++++++++++++++
 drivers/gpu/drm/i915/intel_lrc.h        |  4 ++
 include/uapi/drm/i915_drm.h             | 28 +++++++++
 4 files changed, 168 insertions(+), 1 deletion(-)

Comments

Joonas Lahtinen April 26, 2018, 10 a.m. UTC | #1
Quoting Lionel Landwerlin (2018-04-25 14:45:21)
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> We want to allow userspace to reconfigure the subslice configuration for
> its own use case. To do so, we expose a context parameter to allow
> adjustment of the RPCS register stored within the context image (and
> currently not accessible via LRI). If the context is adjusted before
> first use, the adjustment is for "free"; otherwise if the context is
> active we flush the context off the GPU (stalling all users) and forcing
> the GPU to save the context to memory where we can modify it and so
> ensure that the register is reloaded on next execution.
> 
> The overhead of managing additional EU subslices can be significant,
> especially in multi-context workloads. Non-GPGPU contexts should
> preferably disable the subslices it is not using, and others should
> fine-tune the number to match their workload.

This hit a dead end last time due to the system wide policy needed to
avoid two parties fighting over the slice count (and going back and
forth between two slice counts would counter the benefits received from
this).

Do we now have a solution for the contention? I don't see code to
negotiate a global value, just raw setter.

Regards, Joonas

> 
> We expose complete control over the RPCS register, allowing
> configuration of slice/subslice, via masks packed into a u64 for
> simplicity. For example,
> 
>         struct drm_i915_gem_context_param arg;
>         struct drm_i915_gem_context_param_sseu sseu = { .flags = I915_EXEC_RENDER };
> 
>         memset(&arg, 0, sizeof(arg));
>         arg.ctx_id = ctx;
>         arg.param = I915_CONTEXT_PARAM_SSEU;
>         arg.value = (uintptr_t) &sseu;
>         if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg) == 0) {
>                 sseu.packed.subslice_mask = 0;
> 
>                 drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &arg);
>         }
> 
> could be used to disable all subslices where supported.
> 
> v2: Fix offset of CTX_R_PWR_CLK_STATE in intel_lr_context_set_sseu() (Lionel)
> 
> v3: Add ability to program this per engine (Chris)
> 
> v4: Move most get_sseu() into i915_gem_context.c (Lionel)
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100899
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> c: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
> CC: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> CC: Zhipeng Gong <zhipeng.gong@intel.com>
> CC: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem_context.c | 82 ++++++++++++++++++++++++-
>  drivers/gpu/drm/i915/intel_lrc.c        | 55 +++++++++++++++++
>  drivers/gpu/drm/i915/intel_lrc.h        |  4 ++
>  include/uapi/drm/i915_drm.h             | 28 +++++++++
>  4 files changed, 168 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index bdf050beeb94..b97ddcf47514 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -740,6 +740,26 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
>         return 0;
>  }
>  
> +static struct i915_gem_context_sseu
> +i915_gem_context_sseu_from_user_sseu(const struct sseu_dev_info *sseu,
> +                                    const struct drm_i915_gem_context_param_sseu *user_sseu)
> +{
> +       struct i915_gem_context_sseu value = {
> +               .slice_mask = user_sseu->packed.slice_mask == 0 ?
> +                             sseu->slice_mask :
> +                             (user_sseu->packed.slice_mask & sseu->slice_mask),
> +               .subslice_mask = user_sseu->packed.subslice_mask == 0 ?
> +                                sseu->subslice_mask[0] :
> +                                (user_sseu->packed.subslice_mask & sseu->subslice_mask[0]),
> +               .min_eus_per_subslice = min(user_sseu->packed.min_eus_per_subslice,
> +                                           sseu->max_eus_per_subslice),
> +               .max_eus_per_subslice = min(user_sseu->packed.max_eus_per_subslice,
> +                                           sseu->max_eus_per_subslice),
> +       };
> +
> +       return value;
> +}
> +
>  int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>                                     struct drm_file *file)
>  {
> @@ -777,6 +797,37 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>         case I915_CONTEXT_PARAM_PRIORITY:
>                 args->value = ctx->sched.priority;
>                 break;
> +       case I915_CONTEXT_PARAM_SSEU: {
> +               struct drm_i915_gem_context_param_sseu param_sseu;
> +               struct intel_engine_cs *engine;
> +
> +               if (copy_from_user(&param_sseu, u64_to_user_ptr(args->value),
> +                                  sizeof(param_sseu))) {
> +                       ret = -EFAULT;
> +                       break;
> +               }
> +
> +               engine = i915_gem_engine_from_flags(to_i915(dev), file,
> +                                                   param_sseu.flags);
> +               if (!engine) {
> +                       ret = -EINVAL;
> +                       break;
> +               }
> +
> +               param_sseu.packed.slice_mask =
> +                       ctx->engine[engine->id].sseu.slice_mask;
> +               param_sseu.packed.subslice_mask =
> +                       ctx->engine[engine->id].sseu.subslice_mask;
> +               param_sseu.packed.min_eus_per_subslice =
> +                       ctx->engine[engine->id].sseu.min_eus_per_subslice;
> +               param_sseu.packed.max_eus_per_subslice =
> +                       ctx->engine[engine->id].sseu.max_eus_per_subslice;
> +
> +               if (copy_to_user(u64_to_user_ptr(args->value), &param_sseu,
> +                                sizeof(param_sseu)))
> +                       ret = -EFAULT;
> +               break;
> +       }
>         default:
>                 ret = -EINVAL;
>                 break;
> @@ -832,7 +883,6 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>                 else
>                         i915_gem_context_clear_bannable(ctx);
>                 break;
> -
>         case I915_CONTEXT_PARAM_PRIORITY:
>                 {
>                         s64 priority = args->value;
> @@ -851,7 +901,37 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>                                 ctx->sched.priority = priority;
>                 }
>                 break;
> +       case I915_CONTEXT_PARAM_SSEU:
> +               if (args->size)
> +                       ret = -EINVAL;
> +               else if (!HAS_EXECLISTS(ctx->i915))
> +                       ret = -ENODEV;
> +               else {
> +                       struct drm_i915_private *dev_priv = to_i915(dev);
> +                       struct drm_i915_gem_context_param_sseu user_sseu;
> +                       struct i915_gem_context_sseu ctx_sseu;
> +                       struct intel_engine_cs *engine;
> +
> +                       if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value),
> +                                          sizeof(user_sseu))) {
> +                               ret = -EFAULT;
> +                               break;
> +                       }
> +
> +                       engine = i915_gem_engine_from_flags(dev_priv, file,
> +                                                           user_sseu.flags);
> +                       if (!engine) {
> +                               ret = -EINVAL;
> +                               break;
> +                       }
> +
> +                       ctx_sseu =
> +                               i915_gem_context_sseu_from_user_sseu(&INTEL_INFO(dev_priv)->sseu,
> +                                                                    &user_sseu);
>  
> +                       ret = intel_lr_context_set_sseu(ctx, engine, &ctx_sseu);
> +               }
> +               break;
>         default:
>                 ret = -EINVAL;
>                 break;
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index dca17ef24de5..9caddcb1f234 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -2711,6 +2711,61 @@ void intel_lr_context_resume(struct drm_i915_private *dev_priv)
>         }
>  }
>  
> +int intel_lr_context_set_sseu(struct i915_gem_context *ctx,
> +                             struct intel_engine_cs *engine,
> +                             struct i915_gem_context_sseu *sseu)
> +{
> +       struct drm_i915_private *dev_priv = ctx->i915;
> +       struct intel_context *ce;
> +       enum intel_engine_id id;
> +       int ret;
> +
> +       lockdep_assert_held(&dev_priv->drm.struct_mutex);
> +
> +       if (memcmp(sseu, &ctx->engine[engine->id].sseu, sizeof(*sseu)) == 0)
> +               return 0;
> +
> +       /*
> +        * We can only program this on render ring.
> +        */
> +       ce = &ctx->engine[RCS];
> +
> +       if (ce->pin_count) { /* Assume that the context is active! */
> +               ret = i915_gem_switch_to_kernel_context(dev_priv);
> +               if (ret)
> +                       return ret;
> +
> +               ret = i915_gem_wait_for_idle(dev_priv,
> +                                            I915_WAIT_INTERRUPTIBLE |
> +                                            I915_WAIT_LOCKED);
> +               if (ret)
> +                       return ret;
> +       }
> +
> +       if (ce->state) {
> +               void *vaddr = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
> +               u32 *regs;
> +
> +               if (IS_ERR(vaddr))
> +                       return PTR_ERR(vaddr);
> +
> +               regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
> +
> +               regs[CTX_R_PWR_CLK_STATE + 1] =
> +                       make_rpcs(&INTEL_INFO(dev_priv)->sseu, sseu);
> +               i915_gem_object_unpin_map(ce->state->obj);
> +       }
> +
> +       /*
> +        * Apply the configuration to all engine. Our hardware doesn't
> +        * currently support different configurations for each engine.
> +        */
> +       for_each_engine(engine, dev_priv, id)
> +               ctx->engine[id].sseu = *sseu;
> +
> +       return 0;
> +}
> +
>  #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
>  #include "selftests/intel_lrc.c"
>  #endif
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index d91d69a17206..4c84266814fa 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -112,4 +112,8 @@ intel_lr_context_descriptor(struct i915_gem_context *ctx,
>         return ctx->engine[engine->id].lrc_desc;
>  }
>  
> +int intel_lr_context_set_sseu(struct i915_gem_context *ctx,
> +                             struct intel_engine_cs *engine,
> +                             struct i915_gem_context_sseu *sseu);
> +
>  #endif /* _INTEL_LRC_H_ */
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 96eda8176030..75749ce11c03 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1456,9 +1456,37 @@ struct drm_i915_gem_context_param {
>  #define   I915_CONTEXT_MAX_USER_PRIORITY       1023 /* inclusive */
>  #define   I915_CONTEXT_DEFAULT_PRIORITY                0
>  #define   I915_CONTEXT_MIN_USER_PRIORITY       -1023 /* inclusive */
> +       /*
> +        * When using the following param, value should be a pointer to
> +        * drm_i915_gem_context_param_sseu.
> +        */
> +#define I915_CONTEXT_PARAM_SSEU                0x7
>         __u64 value;
>  };
>  
> +struct drm_i915_gem_context_param_sseu {
> +       /*
> +        * Engine to be configured or queried. Same value you would use with
> +        * drm_i915_gem_execbuffer2.
> +        */
> +       __u64 flags;
> +
> +       /*
> +        * Setting slice_mask or subslice_mask to 0 will make the context use
> +        * masks reported respectively by I915_PARAM_SLICE_MASK or
> +        * I915_PARAM_SUBSLICE_MASK.
> +        */
> +       union {
> +               struct {
> +                       __u8 slice_mask;
> +                       __u8 subslice_mask;
> +                       __u8 min_eus_per_subslice;
> +                       __u8 max_eus_per_subslice;
> +               } packed;
> +               __u64 value;
> +       };
> +};
> +
>  enum drm_i915_oa_format {
>         I915_OA_FORMAT_A13 = 1,     /* HSW only */
>         I915_OA_FORMAT_A29,         /* HSW only */
> -- 
> 2.17.0
>
Lionel Landwerlin April 26, 2018, 10:22 a.m. UTC | #2
On 26/04/18 11:00, Joonas Lahtinen wrote:
> Quoting Lionel Landwerlin (2018-04-25 14:45:21)
>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>
>> We want to allow userspace to reconfigure the subslice configuration for
>> its own use case. To do so, we expose a context parameter to allow
>> adjustment of the RPCS register stored within the context image (and
>> currently not accessible via LRI). If the context is adjusted before
>> first use, the adjustment is for "free"; otherwise if the context is
>> active we flush the context off the GPU (stalling all users) and forcing
>> the GPU to save the context to memory where we can modify it and so
>> ensure that the register is reloaded on next execution.
>>
>> The overhead of managing additional EU subslices can be significant,
>> especially in multi-context workloads. Non-GPGPU contexts should
>> preferably disable the subslices it is not using, and others should
>> fine-tune the number to match their workload.
> This hit a dead end last time due to the system wide policy needed to
> avoid two parties fighting over the slice count (and going back and
> forth between two slice counts would counter the benefits received from
> this).
>
> Do we now have a solution for the contention? I don't see code to
> negotiate a global value, just raw setter.
>
> Regards, Joonas

I've tried to come up with some numbers about the cost of the back & 
forth (see igt series).

Other than that, I don't think we can expect the kernel to work around 
the inefficient use of the hardware by userspace.

>
>> We expose complete control over the RPCS register, allowing
>> configuration of slice/subslice, via masks packed into a u64 for
>> simplicity. For example,
>>
>>          struct drm_i915_gem_context_param arg;
>>          struct drm_i915_gem_context_param_sseu sseu = { .flags = I915_EXEC_RENDER };
>>
>>          memset(&arg, 0, sizeof(arg));
>>          arg.ctx_id = ctx;
>>          arg.param = I915_CONTEXT_PARAM_SSEU;
>>          arg.value = (uintptr_t) &sseu;
>>          if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg) == 0) {
>>                  sseu.packed.subslice_mask = 0;
>>
>>                  drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &arg);
>>          }
>>
>> could be used to disable all subslices where supported.
>>
>> v2: Fix offset of CTX_R_PWR_CLK_STATE in intel_lr_context_set_sseu() (Lionel)
>>
>> v3: Add ability to program this per engine (Chris)
>>
>> v4: Move most get_sseu() into i915_gem_context.c (Lionel)
>>
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100899
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>> c: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
>> CC: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> CC: Zhipeng Gong <zhipeng.gong@intel.com>
>> CC: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_gem_context.c | 82 ++++++++++++++++++++++++-
>>   drivers/gpu/drm/i915/intel_lrc.c        | 55 +++++++++++++++++
>>   drivers/gpu/drm/i915/intel_lrc.h        |  4 ++
>>   include/uapi/drm/i915_drm.h             | 28 +++++++++
>>   4 files changed, 168 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
>> index bdf050beeb94..b97ddcf47514 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> @@ -740,6 +740,26 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
>>          return 0;
>>   }
>>   
>> +static struct i915_gem_context_sseu
>> +i915_gem_context_sseu_from_user_sseu(const struct sseu_dev_info *sseu,
>> +                                    const struct drm_i915_gem_context_param_sseu *user_sseu)
>> +{
>> +       struct i915_gem_context_sseu value = {
>> +               .slice_mask = user_sseu->packed.slice_mask == 0 ?
>> +                             sseu->slice_mask :
>> +                             (user_sseu->packed.slice_mask & sseu->slice_mask),
>> +               .subslice_mask = user_sseu->packed.subslice_mask == 0 ?
>> +                                sseu->subslice_mask[0] :
>> +                                (user_sseu->packed.subslice_mask & sseu->subslice_mask[0]),
>> +               .min_eus_per_subslice = min(user_sseu->packed.min_eus_per_subslice,
>> +                                           sseu->max_eus_per_subslice),
>> +               .max_eus_per_subslice = min(user_sseu->packed.max_eus_per_subslice,
>> +                                           sseu->max_eus_per_subslice),
>> +       };
>> +
>> +       return value;
>> +}
>> +
>>   int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>>                                      struct drm_file *file)
>>   {
>> @@ -777,6 +797,37 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>>          case I915_CONTEXT_PARAM_PRIORITY:
>>                  args->value = ctx->sched.priority;
>>                  break;
>> +       case I915_CONTEXT_PARAM_SSEU: {
>> +               struct drm_i915_gem_context_param_sseu param_sseu;
>> +               struct intel_engine_cs *engine;
>> +
>> +               if (copy_from_user(&param_sseu, u64_to_user_ptr(args->value),
>> +                                  sizeof(param_sseu))) {
>> +                       ret = -EFAULT;
>> +                       break;
>> +               }
>> +
>> +               engine = i915_gem_engine_from_flags(to_i915(dev), file,
>> +                                                   param_sseu.flags);
>> +               if (!engine) {
>> +                       ret = -EINVAL;
>> +                       break;
>> +               }
>> +
>> +               param_sseu.packed.slice_mask =
>> +                       ctx->engine[engine->id].sseu.slice_mask;
>> +               param_sseu.packed.subslice_mask =
>> +                       ctx->engine[engine->id].sseu.subslice_mask;
>> +               param_sseu.packed.min_eus_per_subslice =
>> +                       ctx->engine[engine->id].sseu.min_eus_per_subslice;
>> +               param_sseu.packed.max_eus_per_subslice =
>> +                       ctx->engine[engine->id].sseu.max_eus_per_subslice;
>> +
>> +               if (copy_to_user(u64_to_user_ptr(args->value), &param_sseu,
>> +                                sizeof(param_sseu)))
>> +                       ret = -EFAULT;
>> +               break;
>> +       }
>>          default:
>>                  ret = -EINVAL;
>>                  break;
>> @@ -832,7 +883,6 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>>                  else
>>                          i915_gem_context_clear_bannable(ctx);
>>                  break;
>> -
>>          case I915_CONTEXT_PARAM_PRIORITY:
>>                  {
>>                          s64 priority = args->value;
>> @@ -851,7 +901,37 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>>                                  ctx->sched.priority = priority;
>>                  }
>>                  break;
>> +       case I915_CONTEXT_PARAM_SSEU:
>> +               if (args->size)
>> +                       ret = -EINVAL;
>> +               else if (!HAS_EXECLISTS(ctx->i915))
>> +                       ret = -ENODEV;
>> +               else {
>> +                       struct drm_i915_private *dev_priv = to_i915(dev);
>> +                       struct drm_i915_gem_context_param_sseu user_sseu;
>> +                       struct i915_gem_context_sseu ctx_sseu;
>> +                       struct intel_engine_cs *engine;
>> +
>> +                       if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value),
>> +                                          sizeof(user_sseu))) {
>> +                               ret = -EFAULT;
>> +                               break;
>> +                       }
>> +
>> +                       engine = i915_gem_engine_from_flags(dev_priv, file,
>> +                                                           user_sseu.flags);
>> +                       if (!engine) {
>> +                               ret = -EINVAL;
>> +                               break;
>> +                       }
>> +
>> +                       ctx_sseu =
>> +                               i915_gem_context_sseu_from_user_sseu(&INTEL_INFO(dev_priv)->sseu,
>> +                                                                    &user_sseu);
>>   
>> +                       ret = intel_lr_context_set_sseu(ctx, engine, &ctx_sseu);
>> +               }
>> +               break;
>>          default:
>>                  ret = -EINVAL;
>>                  break;
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index dca17ef24de5..9caddcb1f234 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -2711,6 +2711,61 @@ void intel_lr_context_resume(struct drm_i915_private *dev_priv)
>>          }
>>   }
>>   
>> +int intel_lr_context_set_sseu(struct i915_gem_context *ctx,
>> +                             struct intel_engine_cs *engine,
>> +                             struct i915_gem_context_sseu *sseu)
>> +{
>> +       struct drm_i915_private *dev_priv = ctx->i915;
>> +       struct intel_context *ce;
>> +       enum intel_engine_id id;
>> +       int ret;
>> +
>> +       lockdep_assert_held(&dev_priv->drm.struct_mutex);
>> +
>> +       if (memcmp(sseu, &ctx->engine[engine->id].sseu, sizeof(*sseu)) == 0)
>> +               return 0;
>> +
>> +       /*
>> +        * We can only program this on render ring.
>> +        */
>> +       ce = &ctx->engine[RCS];
>> +
>> +       if (ce->pin_count) { /* Assume that the context is active! */
>> +               ret = i915_gem_switch_to_kernel_context(dev_priv);
>> +               if (ret)
>> +                       return ret;
>> +
>> +               ret = i915_gem_wait_for_idle(dev_priv,
>> +                                            I915_WAIT_INTERRUPTIBLE |
>> +                                            I915_WAIT_LOCKED);
>> +               if (ret)
>> +                       return ret;
>> +       }
>> +
>> +       if (ce->state) {
>> +               void *vaddr = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
>> +               u32 *regs;
>> +
>> +               if (IS_ERR(vaddr))
>> +                       return PTR_ERR(vaddr);
>> +
>> +               regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
>> +
>> +               regs[CTX_R_PWR_CLK_STATE + 1] =
>> +                       make_rpcs(&INTEL_INFO(dev_priv)->sseu, sseu);
>> +               i915_gem_object_unpin_map(ce->state->obj);
>> +       }
>> +
>> +       /*
>> +        * Apply the configuration to all engine. Our hardware doesn't
>> +        * currently support different configurations for each engine.
>> +        */
>> +       for_each_engine(engine, dev_priv, id)
>> +               ctx->engine[id].sseu = *sseu;
>> +
>> +       return 0;
>> +}
>> +
>>   #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
>>   #include "selftests/intel_lrc.c"
>>   #endif
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
>> index d91d69a17206..4c84266814fa 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.h
>> +++ b/drivers/gpu/drm/i915/intel_lrc.h
>> @@ -112,4 +112,8 @@ intel_lr_context_descriptor(struct i915_gem_context *ctx,
>>          return ctx->engine[engine->id].lrc_desc;
>>   }
>>   
>> +int intel_lr_context_set_sseu(struct i915_gem_context *ctx,
>> +                             struct intel_engine_cs *engine,
>> +                             struct i915_gem_context_sseu *sseu);
>> +
>>   #endif /* _INTEL_LRC_H_ */
>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>> index 96eda8176030..75749ce11c03 100644
>> --- a/include/uapi/drm/i915_drm.h
>> +++ b/include/uapi/drm/i915_drm.h
>> @@ -1456,9 +1456,37 @@ struct drm_i915_gem_context_param {
>>   #define   I915_CONTEXT_MAX_USER_PRIORITY       1023 /* inclusive */
>>   #define   I915_CONTEXT_DEFAULT_PRIORITY                0
>>   #define   I915_CONTEXT_MIN_USER_PRIORITY       -1023 /* inclusive */
>> +       /*
>> +        * When using the following param, value should be a pointer to
>> +        * drm_i915_gem_context_param_sseu.
>> +        */
>> +#define I915_CONTEXT_PARAM_SSEU                0x7
>>          __u64 value;
>>   };
>>   
>> +struct drm_i915_gem_context_param_sseu {
>> +       /*
>> +        * Engine to be configured or queried. Same value you would use with
>> +        * drm_i915_gem_execbuffer2.
>> +        */
>> +       __u64 flags;
>> +
>> +       /*
>> +        * Setting slice_mask or subslice_mask to 0 will make the context use
>> +        * masks reported respectively by I915_PARAM_SLICE_MASK or
>> +        * I915_PARAM_SUBSLICE_MASK.
>> +        */
>> +       union {
>> +               struct {
>> +                       __u8 slice_mask;
>> +                       __u8 subslice_mask;
>> +                       __u8 min_eus_per_subslice;
>> +                       __u8 max_eus_per_subslice;
>> +               } packed;
>> +               __u64 value;
>> +       };
>> +};
>> +
>>   enum drm_i915_oa_format {
>>          I915_OA_FORMAT_A13 = 1,     /* HSW only */
>>          I915_OA_FORMAT_A29,         /* HSW only */
>> -- 
>> 2.17.0
>>
Joonas Lahtinen May 3, 2018, 4:04 p.m. UTC | #3
Quoting Lionel Landwerlin (2018-04-26 13:22:30)
> On 26/04/18 11:00, Joonas Lahtinen wrote:
> > Quoting Lionel Landwerlin (2018-04-25 14:45:21)
> >> From: Chris Wilson <chris@chris-wilson.co.uk>
> >>
> >> We want to allow userspace to reconfigure the subslice configuration for
> >> its own use case. To do so, we expose a context parameter to allow
> >> adjustment of the RPCS register stored within the context image (and
> >> currently not accessible via LRI). If the context is adjusted before
> >> first use, the adjustment is for "free"; otherwise if the context is
> >> active we flush the context off the GPU (stalling all users) and forcing
> >> the GPU to save the context to memory where we can modify it and so
> >> ensure that the register is reloaded on next execution.
> >>
> >> The overhead of managing additional EU subslices can be significant,
> >> especially in multi-context workloads. Non-GPGPU contexts should
> >> preferably disable the subslices it is not using, and others should
> >> fine-tune the number to match their workload.
> > This hit a dead end last time due to the system wide policy needed to
> > avoid two parties fighting over the slice count (and going back and
> > forth between two slice counts would counter the benefits received from
> > this).
> >
> > Do we now have a solution for the contention? I don't see code to
> > negotiate a global value, just raw setter.
> >
> > Regards, Joonas
> 
> I've tried to come up with some numbers about the cost of the back & 
> forth (see igt series).
> 
> Other than that, I don't think we can expect the kernel to workaround 
> the inefficient use of the hardware by userspace.

Well, I'm pretty sure we should not try to make the situation too
miserable for the basic usecases.

If we allow two contexts to fight over the slice count, countering any
perceived benefit, I don't think that'll be a good default.

My recollection of the last round of discussion was that reasonable
thing to do would be to only disable slices that everyone is willing to
let go of. Then it would become a system maintainer level decision to
only run on two slices for when they see fit (configuring the userspace).

More advanced tactics would include scheduling work so that we try to
avoid the slice count changes and deduct the switching time from the
execution budget of the app requesting fewer slices (if we had fair
time slicing).

Regards, Joonas

> 
> >
> >> We expose complete control over the RPCS register, allowing
> >> configuration of slice/subslice, via masks packed into a u64 for
> >> simplicity. For example,
> >>
> >>          struct drm_i915_gem_context_param arg;
> >>          struct drm_i915_gem_context_param_sseu sseu = { .flags = I915_EXEC_RENDER };
> >>
> >>          memset(&arg, 0, sizeof(arg));
> >>          arg.ctx_id = ctx;
> >>          arg.param = I915_CONTEXT_PARAM_SSEU;
> >>          arg.value = (uintptr_t) &sseu;
> >>          if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg) == 0) {
> >>                  sseu.packed.subslice_mask = 0;
> >>
> >>                  drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &arg);
> >>          }
> >>
> >> could be used to disable all subslices where supported.
> >>
> >> v2: Fix offset of CTX_R_PWR_CLK_STATE in intel_lr_context_set_sseu() (Lionel)
> >>
> >> v3: Add ability to program this per engine (Chris)
> >>
> >> v4: Move most get_sseu() into i915_gem_context.c (Lionel)
> >>
> >> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100899
> >> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> >> c: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
> >> CC: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >> CC: Zhipeng Gong <zhipeng.gong@intel.com>
> >> CC: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> >> ---
> >>   drivers/gpu/drm/i915/i915_gem_context.c | 82 ++++++++++++++++++++++++-
> >>   drivers/gpu/drm/i915/intel_lrc.c        | 55 +++++++++++++++++
> >>   drivers/gpu/drm/i915/intel_lrc.h        |  4 ++
> >>   include/uapi/drm/i915_drm.h             | 28 +++++++++
> >>   4 files changed, 168 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> >> index bdf050beeb94..b97ddcf47514 100644
> >> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> >> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> >> @@ -740,6 +740,26 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
> >>          return 0;
> >>   }
> >>   
> >> +static struct i915_gem_context_sseu
> >> +i915_gem_context_sseu_from_user_sseu(const struct sseu_dev_info *sseu,
> >> +                                    const struct drm_i915_gem_context_param_sseu *user_sseu)
> >> +{
> >> +       struct i915_gem_context_sseu value = {
> >> +               .slice_mask = user_sseu->packed.slice_mask == 0 ?
> >> +                             sseu->slice_mask :
> >> +                             (user_sseu->packed.slice_mask & sseu->slice_mask),
> >> +               .subslice_mask = user_sseu->packed.subslice_mask == 0 ?
> >> +                                sseu->subslice_mask[0] :
> >> +                                (user_sseu->packed.subslice_mask & sseu->subslice_mask[0]),
> >> +               .min_eus_per_subslice = min(user_sseu->packed.min_eus_per_subslice,
> >> +                                           sseu->max_eus_per_subslice),
> >> +               .max_eus_per_subslice = min(user_sseu->packed.max_eus_per_subslice,
> >> +                                           sseu->max_eus_per_subslice),
> >> +       };
> >> +
> >> +       return value;
> >> +}
> >> +
> >>   int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
> >>                                      struct drm_file *file)
> >>   {
> >> @@ -777,6 +797,37 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
> >>          case I915_CONTEXT_PARAM_PRIORITY:
> >>                  args->value = ctx->sched.priority;
> >>                  break;
> >> +       case I915_CONTEXT_PARAM_SSEU: {
> >> +               struct drm_i915_gem_context_param_sseu param_sseu;
> >> +               struct intel_engine_cs *engine;
> >> +
> >> +               if (copy_from_user(&param_sseu, u64_to_user_ptr(args->value),
> >> +                                  sizeof(param_sseu))) {
> >> +                       ret = -EFAULT;
> >> +                       break;
> >> +               }
> >> +
> >> +               engine = i915_gem_engine_from_flags(to_i915(dev), file,
> >> +                                                   param_sseu.flags);
> >> +               if (!engine) {
> >> +                       ret = -EINVAL;
> >> +                       break;
> >> +               }
> >> +
> >> +               param_sseu.packed.slice_mask =
> >> +                       ctx->engine[engine->id].sseu.slice_mask;
> >> +               param_sseu.packed.subslice_mask =
> >> +                       ctx->engine[engine->id].sseu.subslice_mask;
> >> +               param_sseu.packed.min_eus_per_subslice =
> >> +                       ctx->engine[engine->id].sseu.min_eus_per_subslice;
> >> +               param_sseu.packed.max_eus_per_subslice =
> >> +                       ctx->engine[engine->id].sseu.max_eus_per_subslice;
> >> +
> >> +               if (copy_to_user(u64_to_user_ptr(args->value), &param_sseu,
> >> +                                sizeof(param_sseu)))
> >> +                       ret = -EFAULT;
> >> +               break;
> >> +       }
> >>          default:
> >>                  ret = -EINVAL;
> >>                  break;
> >> @@ -832,7 +883,6 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
> >>                  else
> >>                          i915_gem_context_clear_bannable(ctx);
> >>                  break;
> >> -
> >>          case I915_CONTEXT_PARAM_PRIORITY:
> >>                  {
> >>                          s64 priority = args->value;
> >> @@ -851,7 +901,37 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
> >>                                  ctx->sched.priority = priority;
> >>                  }
> >>                  break;
> >> +       case I915_CONTEXT_PARAM_SSEU:
> >> +               if (args->size)
> >> +                       ret = -EINVAL;
> >> +               else if (!HAS_EXECLISTS(ctx->i915))
> >> +                       ret = -ENODEV;
> >> +               else {
> >> +                       struct drm_i915_private *dev_priv = to_i915(dev);
> >> +                       struct drm_i915_gem_context_param_sseu user_sseu;
> >> +                       struct i915_gem_context_sseu ctx_sseu;
> >> +                       struct intel_engine_cs *engine;
> >> +
> >> +                       if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value),
> >> +                                          sizeof(user_sseu))) {
> >> +                               ret = -EFAULT;
> >> +                               break;
> >> +                       }
> >> +
> >> +                       engine = i915_gem_engine_from_flags(dev_priv, file,
> >> +                                                           user_sseu.flags);
> >> +                       if (!engine) {
> >> +                               ret = -EINVAL;
> >> +                               break;
> >> +                       }
> >> +
> >> +                       ctx_sseu =
> >> +                               i915_gem_context_sseu_from_user_sseu(&INTEL_INFO(dev_priv)->sseu,
> >> +                                                                    &user_sseu);
> >>   
> >> +                       ret = intel_lr_context_set_sseu(ctx, engine, &ctx_sseu);
> >> +               }
> >> +               break;
> >>          default:
> >>                  ret = -EINVAL;
> >>                  break;
> >> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> >> index dca17ef24de5..9caddcb1f234 100644
> >> --- a/drivers/gpu/drm/i915/intel_lrc.c
> >> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> >> @@ -2711,6 +2711,61 @@ void intel_lr_context_resume(struct drm_i915_private *dev_priv)
> >>          }
> >>   }
> >>   
> >> +int intel_lr_context_set_sseu(struct i915_gem_context *ctx,
> >> +                             struct intel_engine_cs *engine,
> >> +                             struct i915_gem_context_sseu *sseu)
> >> +{
> >> +       struct drm_i915_private *dev_priv = ctx->i915;
> >> +       struct intel_context *ce;
> >> +       enum intel_engine_id id;
> >> +       int ret;
> >> +
> >> +       lockdep_assert_held(&dev_priv->drm.struct_mutex);
> >> +
> >> +       if (memcmp(sseu, &ctx->engine[engine->id].sseu, sizeof(*sseu)) == 0)
> >> +               return 0;
> >> +
> >> +       /*
> >> +        * We can only program this on render ring.
> >> +        */
> >> +       ce = &ctx->engine[RCS];
> >> +
> >> +       if (ce->pin_count) { /* Assume that the context is active! */
> >> +               ret = i915_gem_switch_to_kernel_context(dev_priv);
> >> +               if (ret)
> >> +                       return ret;
> >> +
> >> +               ret = i915_gem_wait_for_idle(dev_priv,
> >> +                                            I915_WAIT_INTERRUPTIBLE |
> >> +                                            I915_WAIT_LOCKED);
> >> +               if (ret)
> >> +                       return ret;
> >> +       }
> >> +
> >> +       if (ce->state) {
> >> +               void *vaddr = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
> >> +               u32 *regs;
> >> +
> >> +               if (IS_ERR(vaddr))
> >> +                       return PTR_ERR(vaddr);
> >> +
> >> +               regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
> >> +
> >> +               regs[CTX_R_PWR_CLK_STATE + 1] =
> >> +                       make_rpcs(&INTEL_INFO(dev_priv)->sseu, sseu);
> >> +               i915_gem_object_unpin_map(ce->state->obj);
> >> +       }
> >> +
> >> +       /*
> >> +        * Apply the configuration to all engine. Our hardware doesn't
> >> +        * currently support different configurations for each engine.
> >> +        */
> >> +       for_each_engine(engine, dev_priv, id)
> >> +               ctx->engine[id].sseu = *sseu;
> >> +
> >> +       return 0;
> >> +}
> >> +
> >>   #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> >>   #include "selftests/intel_lrc.c"
> >>   #endif
> >> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> >> index d91d69a17206..4c84266814fa 100644
> >> --- a/drivers/gpu/drm/i915/intel_lrc.h
> >> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> >> @@ -112,4 +112,8 @@ intel_lr_context_descriptor(struct i915_gem_context *ctx,
> >>          return ctx->engine[engine->id].lrc_desc;
> >>   }
> >>   
> >> +int intel_lr_context_set_sseu(struct i915_gem_context *ctx,
> >> +                             struct intel_engine_cs *engine,
> >> +                             struct i915_gem_context_sseu *sseu);
> >> +
> >>   #endif /* _INTEL_LRC_H_ */
> >> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> >> index 96eda8176030..75749ce11c03 100644
> >> --- a/include/uapi/drm/i915_drm.h
> >> +++ b/include/uapi/drm/i915_drm.h
> >> @@ -1456,9 +1456,37 @@ struct drm_i915_gem_context_param {
> >>   #define   I915_CONTEXT_MAX_USER_PRIORITY       1023 /* inclusive */
> >>   #define   I915_CONTEXT_DEFAULT_PRIORITY                0
> >>   #define   I915_CONTEXT_MIN_USER_PRIORITY       -1023 /* inclusive */
> >> +       /*
> >> +        * When using the following param, value should be a pointer to
> >> +        * drm_i915_gem_context_param_sseu.
> >> +        */
> >> +#define I915_CONTEXT_PARAM_SSEU                0x7
> >>          __u64 value;
> >>   };
> >>   
> >> +struct drm_i915_gem_context_param_sseu {
> >> +       /*
> >> +        * Engine to be configured or queried. Same value you would use with
> >> +        * drm_i915_gem_execbuffer2.
> >> +        */
> >> +       __u64 flags;
> >> +
> >> +       /*
> >> +        * Setting slice_mask or subslice_mask to 0 will make the context use
> >> +        * masks reported respectively by I915_PARAM_SLICE_MASK or
> >> +        * I915_PARAM_SUBSLICE_MASK.
> >> +        */
> >> +       union {
> >> +               struct {
> >> +                       __u8 slice_mask;
> >> +                       __u8 subslice_mask;
> >> +                       __u8 min_eus_per_subslice;
> >> +                       __u8 max_eus_per_subslice;
> >> +               } packed;
> >> +               __u64 value;
> >> +       };
> >> +};
> >> +
> >>   enum drm_i915_oa_format {
> >>          I915_OA_FORMAT_A13 = 1,     /* HSW only */
> >>          I915_OA_FORMAT_A29,         /* HSW only */
> >> -- 
> >> 2.17.0
> >>
>
Chris Wilson May 3, 2018, 4:14 p.m. UTC | #4
Quoting Joonas Lahtinen (2018-05-03 17:04:31)
> More advanced tactics would include scheduling work so that we try to
> avoid the slice count changes and deduct the switching time from the
> execution budget of the app requesting less slices (if we had fair
> time slicing).

Have you been peeking at my reading material? ;)
-Chris
Lionel Landwerlin May 3, 2018, 4:25 p.m. UTC | #5
On 03/05/18 17:04, Joonas Lahtinen wrote:
> Quoting Lionel Landwerlin (2018-04-26 13:22:30)
>> On 26/04/18 11:00, Joonas Lahtinen wrote:
>>> Quoting Lionel Landwerlin (2018-04-25 14:45:21)
>>>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>>>
>>>> We want to allow userspace to reconfigure the subslice configuration for
>>>> its own use case. To do so, we expose a context parameter to allow
>>>> adjustment of the RPCS register stored within the context image (and
>>>> currently not accessible via LRI). If the context is adjusted before
>>>> first use, the adjustment is for "free"; otherwise if the context is
>>>> active we flush the context off the GPU (stalling all users) and forcing
>>>> the GPU to save the context to memory where we can modify it and so
>>>> ensure that the register is reloaded on next execution.
>>>>
>>>> The overhead of managing additional EU subslices can be significant,
>>>> especially in multi-context workloads. Non-GPGPU contexts should
>>>> preferably disable the subslices it is not using, and others should
>>>> fine-tune the number to match their workload.
>>> This hit a dead end last time due to the system wide policy needed to
>>> avoid two parties fighting over the slice count (and going back and
>>> forth between two slice counts would counter the benefits received from
>>> this).
>>>
>>> Do we now have a solution for the contention? I don't see code to
>>> negotiate a global value, just raw setter.
>>>
>>> Regards, Joonas
>> I've tried to come up with some numbers about the cost of the back &
>> forth (see igt series).
>>
>> Other than that, I don't think we can expect the kernel to workaround
>> the inefficient use of the hardware by userspace.
> Well, I'm pretty sure we should not try to make the situation too
> miserable for the basic usecases.
>
> If we allow two contexts to fight over the slice count, countering any
> perceived benefit, I don't think that'll be a good default.
>
> My recollection of the last round of discussion was that reasonable
> thing to do would be to only disable slices when everyone is willing to
> let go of. Then it would become a system maintainer level decision to
> only run on two slices for when they see fit (configuring the userspace).

How would you detect that everybody is willing to let go?
We don't appear to have a mechanism to detect that.
Wouldn't that require teaching all the userspace drivers?

>
> More advanced tactics would include scheduling work so that we try to
> avoid the slice count changes and deduct the switching time from the
> execution budget of the app requesting less slices (if we had fair
> time slicing).

That sounds more workable, although fairly complicated.
Maybe we could tweak the priority based on the slice count and say:
     for the next (1 second / number of slice configs), priority is bumped 
for the configuration with x slices,
     and rotate every (1 second / number of slice configs).

Would that resolve the back & forth issue completely though?

Moving a particular context back & forth between different 
configurations is costly too.
You need to drain the context, then pin the context before you can edit 
and resubmit.

Thanks for your feedback,

-
Lionel

>
> Regards, Joonas
>
Tvrtko Ursulin May 3, 2018, 4:30 p.m. UTC | #6
On 03/05/2018 17:04, Joonas Lahtinen wrote:
> Quoting Lionel Landwerlin (2018-04-26 13:22:30)
>> On 26/04/18 11:00, Joonas Lahtinen wrote:
>>> Quoting Lionel Landwerlin (2018-04-25 14:45:21)
>>>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>>>
>>>> We want to allow userspace to reconfigure the subslice configuration for
>>>> its own use case. To do so, we expose a context parameter to allow
>>>> adjustment of the RPCS register stored within the context image (and
>>>> currently not accessible via LRI). If the context is adjusted before
>>>> first use, the adjustment is for "free"; otherwise if the context is
>>>> active we flush the context off the GPU (stalling all users) and forcing
>>>> the GPU to save the context to memory where we can modify it and so
>>>> ensure that the register is reloaded on next execution.
>>>>
>>>> The overhead of managing additional EU subslices can be significant,
>>>> especially in multi-context workloads. Non-GPGPU contexts should
>>>> preferably disable the subslices it is not using, and others should
>>>> fine-tune the number to match their workload.
>>> This hit a dead end last time due to the system wide policy needed to
>>> avoid two parties fighting over the slice count (and going back and
>>> forth between two slice counts would counter the benefits received from
>>> this).
>>>
>>> Do we now have a solution for the contention? I don't see code to
>>> negotiate a global value, just raw setter.
>>>
>>> Regards, Joonas
>>
>> I've tried to come up with some numbers about the cost of the back &
>> forth (see igt series).
>>
>> Other than that, I don't think we can expect the kernel to workaround
>> the inefficient use of the hardware by userspace.
> 
> Well, I'm pretty sure we should not try to make the situation too
> miserable for the basic usecases.
> 
> If we allow two contexts to fight over the slice count, countering any
> perceived benefit, I don't think that'll be a good default.
> 
> My recollection of the last round of discussion was that reasonable
> thing to do would be to only disable slices when everyone is willing to
> let go of. Then it would become a system maintainer level decision to
> only run on two slices for when they see fit (configuring the userspace).

Yes there was definitely talk about co-ordination between OpenCL and 
media stacks.

Btw we mentioned on IRC a few days back that a sysfs knob like 
allow_media_priority, defaulting to 0, could be a good start.

That way userspace could use the uAPI, but unless the sysadmin allows it, 
there will be no fighting over slice counts.

Then at some future time, as long as the uAPI is sensible, more advanced 
policies could be explored. Regardless if in userspace or i915.

Regards,

Tvrtko

> More advanced tactics would include scheduling work so that we try to
> avoid the slice count changes and deduct the switching time from the
> execution budget of the app requesting less slices (if we had fair
> time slicing).
> 
> Regards, Joonas
> 
>>
>>>
>>>> We expose complete control over the RPCS register, allowing
>>>> configuration of slice/subslice, via masks packed into a u64 for
>>>> simplicity. For example,
>>>>
>>>>           struct drm_i915_gem_context_param arg;
>>>>           struct drm_i915_gem_context_param_sseu sseu = { .flags = I915_EXEC_RENDER };
>>>>
>>>>           memset(&arg, 0, sizeof(arg));
>>>>           arg.ctx_id = ctx;
>>>>           arg.param = I915_CONTEXT_PARAM_SSEU;
>>>>           arg.value = (uintptr_t) &sseu;
>>>>           if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg) == 0) {
>>>>                   sseu.packed.subslice_mask = 0;
>>>>
>>>>                   drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &arg);
>>>>           }
>>>>
>>>> could be used to disable all subslices where supported.
>>>>
>>>> v2: Fix offset of CTX_R_PWR_CLK_STATE in intel_lr_context_set_sseu() (Lionel)
>>>>
>>>> v3: Add ability to program this per engine (Chris)
>>>>
>>>> v4: Move most get_sseu() into i915_gem_context.c (Lionel)
>>>>
>>>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100899
>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>>>> c: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
>>>> CC: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>> CC: Zhipeng Gong <zhipeng.gong@intel.com>
>>>> CC: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
>>>> ---
>>>>    drivers/gpu/drm/i915/i915_gem_context.c | 82 ++++++++++++++++++++++++-
>>>>    drivers/gpu/drm/i915/intel_lrc.c        | 55 +++++++++++++++++
>>>>    drivers/gpu/drm/i915/intel_lrc.h        |  4 ++
>>>>    include/uapi/drm/i915_drm.h             | 28 +++++++++
>>>>    4 files changed, 168 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
>>>> index bdf050beeb94..b97ddcf47514 100644
>>>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>>>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>>>> @@ -740,6 +740,26 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
>>>>           return 0;
>>>>    }
>>>>    
>>>> +static struct i915_gem_context_sseu
>>>> +i915_gem_context_sseu_from_user_sseu(const struct sseu_dev_info *sseu,
>>>> +                                    const struct drm_i915_gem_context_param_sseu *user_sseu)
>>>> +{
>>>> +       struct i915_gem_context_sseu value = {
>>>> +               .slice_mask = user_sseu->packed.slice_mask == 0 ?
>>>> +                             sseu->slice_mask :
>>>> +                             (user_sseu->packed.slice_mask & sseu->slice_mask),
>>>> +               .subslice_mask = user_sseu->packed.subslice_mask == 0 ?
>>>> +                                sseu->subslice_mask[0] :
>>>> +                                (user_sseu->packed.subslice_mask & sseu->subslice_mask[0]),
>>>> +               .min_eus_per_subslice = min(user_sseu->packed.min_eus_per_subslice,
>>>> +                                           sseu->max_eus_per_subslice),
>>>> +               .max_eus_per_subslice = min(user_sseu->packed.max_eus_per_subslice,
>>>> +                                           sseu->max_eus_per_subslice),
>>>> +       };
>>>> +
>>>> +       return value;
>>>> +}
>>>> +
>>>>    int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>>>>                                       struct drm_file *file)
>>>>    {
>>>> @@ -777,6 +797,37 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>>>>           case I915_CONTEXT_PARAM_PRIORITY:
>>>>                   args->value = ctx->sched.priority;
>>>>                   break;
>>>> +       case I915_CONTEXT_PARAM_SSEU: {
>>>> +               struct drm_i915_gem_context_param_sseu param_sseu;
>>>> +               struct intel_engine_cs *engine;
>>>> +
>>>> +               if (copy_from_user(&param_sseu, u64_to_user_ptr(args->value),
>>>> +                                  sizeof(param_sseu))) {
>>>> +                       ret = -EFAULT;
>>>> +                       break;
>>>> +               }
>>>> +
>>>> +               engine = i915_gem_engine_from_flags(to_i915(dev), file,
>>>> +                                                   param_sseu.flags);
>>>> +               if (!engine) {
>>>> +                       ret = -EINVAL;
>>>> +                       break;
>>>> +               }
>>>> +
>>>> +               param_sseu.packed.slice_mask =
>>>> +                       ctx->engine[engine->id].sseu.slice_mask;
>>>> +               param_sseu.packed.subslice_mask =
>>>> +                       ctx->engine[engine->id].sseu.subslice_mask;
>>>> +               param_sseu.packed.min_eus_per_subslice =
>>>> +                       ctx->engine[engine->id].sseu.min_eus_per_subslice;
>>>> +               param_sseu.packed.max_eus_per_subslice =
>>>> +                       ctx->engine[engine->id].sseu.max_eus_per_subslice;
>>>> +
>>>> +               if (copy_to_user(u64_to_user_ptr(args->value), &param_sseu,
>>>> +                                sizeof(param_sseu)))
>>>> +                       ret = -EFAULT;
>>>> +               break;
>>>> +       }
>>>>           default:
>>>>                   ret = -EINVAL;
>>>>                   break;
>>>> @@ -832,7 +883,6 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>>>>                   else
>>>>                           i915_gem_context_clear_bannable(ctx);
>>>>                   break;
>>>> -
>>>>           case I915_CONTEXT_PARAM_PRIORITY:
>>>>                   {
>>>>                           s64 priority = args->value;
>>>> @@ -851,7 +901,37 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>>>>                                   ctx->sched.priority = priority;
>>>>                   }
>>>>                   break;
>>>> +       case I915_CONTEXT_PARAM_SSEU:
>>>> +               if (args->size)
>>>> +                       ret = -EINVAL;
>>>> +               else if (!HAS_EXECLISTS(ctx->i915))
>>>> +                       ret = -ENODEV;
>>>> +               else {
>>>> +                       struct drm_i915_private *dev_priv = to_i915(dev);
>>>> +                       struct drm_i915_gem_context_param_sseu user_sseu;
>>>> +                       struct i915_gem_context_sseu ctx_sseu;
>>>> +                       struct intel_engine_cs *engine;
>>>> +
>>>> +                       if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value),
>>>> +                                          sizeof(user_sseu))) {
>>>> +                               ret = -EFAULT;
>>>> +                               break;
>>>> +                       }
>>>> +
>>>> +                       engine = i915_gem_engine_from_flags(dev_priv, file,
>>>> +                                                           user_sseu.flags);
>>>> +                       if (!engine) {
>>>> +                               ret = -EINVAL;
>>>> +                               break;
>>>> +                       }
>>>> +
>>>> +                       ctx_sseu =
>>>> +                               i915_gem_context_sseu_from_user_sseu(&INTEL_INFO(dev_priv)->sseu,
>>>> +                                                                    &user_sseu);
>>>>    
>>>> +                       ret = intel_lr_context_set_sseu(ctx, engine, &ctx_sseu);
>>>> +               }
>>>> +               break;
>>>>           default:
>>>>                   ret = -EINVAL;
>>>>                   break;
>>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>>>> index dca17ef24de5..9caddcb1f234 100644
>>>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>>>> @@ -2711,6 +2711,61 @@ void intel_lr_context_resume(struct drm_i915_private *dev_priv)
>>>>           }
>>>>    }
>>>>    
>>>> +int intel_lr_context_set_sseu(struct i915_gem_context *ctx,
>>>> +                             struct intel_engine_cs *engine,
>>>> +                             struct i915_gem_context_sseu *sseu)
>>>> +{
>>>> +       struct drm_i915_private *dev_priv = ctx->i915;
>>>> +       struct intel_context *ce;
>>>> +       enum intel_engine_id id;
>>>> +       int ret;
>>>> +
>>>> +       lockdep_assert_held(&dev_priv->drm.struct_mutex);
>>>> +
>>>> +       if (memcmp(sseu, &ctx->engine[engine->id].sseu, sizeof(*sseu)) == 0)
>>>> +               return 0;
>>>> +
>>>> +       /*
>>>> +        * We can only program this on render ring.
>>>> +        */
>>>> +       ce = &ctx->engine[RCS];
>>>> +
>>>> +       if (ce->pin_count) { /* Assume that the context is active! */
>>>> +               ret = i915_gem_switch_to_kernel_context(dev_priv);
>>>> +               if (ret)
>>>> +                       return ret;
>>>> +
>>>> +               ret = i915_gem_wait_for_idle(dev_priv,
>>>> +                                            I915_WAIT_INTERRUPTIBLE |
>>>> +                                            I915_WAIT_LOCKED);
>>>> +               if (ret)
>>>> +                       return ret;
>>>> +       }
>>>> +
>>>> +       if (ce->state) {
>>>> +               void *vaddr = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
>>>> +               u32 *regs;
>>>> +
>>>> +               if (IS_ERR(vaddr))
>>>> +                       return PTR_ERR(vaddr);
>>>> +
>>>> +               regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
>>>> +
>>>> +               regs[CTX_R_PWR_CLK_STATE + 1] =
>>>> +                       make_rpcs(&INTEL_INFO(dev_priv)->sseu, sseu);
>>>> +               i915_gem_object_unpin_map(ce->state->obj);
>>>> +       }
>>>> +
>>>> +       /*
>>>> +        * Apply the configuration to all engine. Our hardware doesn't
>>>> +        * currently support different configurations for each engine.
>>>> +        */
>>>> +       for_each_engine(engine, dev_priv, id)
>>>> +               ctx->engine[id].sseu = *sseu;
>>>> +
>>>> +       return 0;
>>>> +}
>>>> +
>>>>    #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
>>>>    #include "selftests/intel_lrc.c"
>>>>    #endif
>>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
>>>> index d91d69a17206..4c84266814fa 100644
>>>> --- a/drivers/gpu/drm/i915/intel_lrc.h
>>>> +++ b/drivers/gpu/drm/i915/intel_lrc.h
>>>> @@ -112,4 +112,8 @@ intel_lr_context_descriptor(struct i915_gem_context *ctx,
>>>>           return ctx->engine[engine->id].lrc_desc;
>>>>    }
>>>>    
>>>> +int intel_lr_context_set_sseu(struct i915_gem_context *ctx,
>>>> +                             struct intel_engine_cs *engine,
>>>> +                             struct i915_gem_context_sseu *sseu);
>>>> +
>>>>    #endif /* _INTEL_LRC_H_ */
>>>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>>>> index 96eda8176030..75749ce11c03 100644
>>>> --- a/include/uapi/drm/i915_drm.h
>>>> +++ b/include/uapi/drm/i915_drm.h
>>>> @@ -1456,9 +1456,37 @@ struct drm_i915_gem_context_param {
>>>>    #define   I915_CONTEXT_MAX_USER_PRIORITY       1023 /* inclusive */
>>>>    #define   I915_CONTEXT_DEFAULT_PRIORITY                0
>>>>    #define   I915_CONTEXT_MIN_USER_PRIORITY       -1023 /* inclusive */
>>>> +       /*
>>>> +        * When using the following param, value should be a pointer to
>>>> +        * drm_i915_gem_context_param_sseu.
>>>> +        */
>>>> +#define I915_CONTEXT_PARAM_SSEU                0x7
>>>>           __u64 value;
>>>>    };
>>>>    
>>>> +struct drm_i915_gem_context_param_sseu {
>>>> +       /*
>>>> +        * Engine to be configured or queried. Same value you would use with
>>>> +        * drm_i915_gem_execbuffer2.
>>>> +        */
>>>> +       __u64 flags;
>>>> +
>>>> +       /*
>>>> +        * Setting slice_mask or subslice_mask to 0 will make the context use
>>>> +        * masks reported respectively by I915_PARAM_SLICE_MASK or
>>>> +        * I915_PARAM_SUBSLICE_MASK.
>>>> +        */
>>>> +       union {
>>>> +               struct {
>>>> +                       __u8 slice_mask;
>>>> +                       __u8 subslice_mask;
>>>> +                       __u8 min_eus_per_subslice;
>>>> +                       __u8 max_eus_per_subslice;
>>>> +               } packed;
>>>> +               __u64 value;
>>>> +       };
>>>> +};
>>>> +
>>>>    enum drm_i915_oa_format {
>>>>           I915_OA_FORMAT_A13 = 1,     /* HSW only */
>>>>           I915_OA_FORMAT_A29,         /* HSW only */
>>>> -- 
>>>> 2.17.0
>>>>
>>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
Tvrtko Ursulin May 3, 2018, 5:18 p.m. UTC | #7
On 25/04/2018 12:45, Lionel Landwerlin wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> We want to allow userspace to reconfigure the subslice configuration for
> its own use case. To do so, we expose a context parameter to allow
> adjustment of the RPCS register stored within the context image (and
> currently not accessible via LRI). If the context is adjusted before
> first use, the adjustment is for "free"; otherwise if the context is
> active we flush the context off the GPU (stalling all users) and forcing
> the GPU to save the context to memory where we can modify it and so
> ensure that the register is reloaded on next execution.
> 
> The overhead of managing additional EU subslices can be significant,
> especially in multi-context workloads. Non-GPGPU contexts should
> preferably disable the subslices it is not using, and others should
> fine-tune the number to match their workload.
> 
> We expose complete control over the RPCS register, allowing
> configuration of slice/subslice, via masks packed into a u64 for
> simplicity. For example,
> 
> 	struct drm_i915_gem_context_param arg;
> 	struct drm_i915_gem_context_param_sseu sseu = { .flags = I915_EXEC_RENDER };
> 
> 	memset(&arg, 0, sizeof(arg));
> 	arg.ctx_id = ctx;
> 	arg.param = I915_CONTEXT_PARAM_SSEU;
> 	arg.value = (uintptr_t) &sseu;
> 	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg) == 0) {
> 		sseu.packed.subslice_mask = 0;
> 
> 		drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &arg);
> 	}
> 
> could be used to disable all subslices where supported.
> 
> v2: Fix offset of CTX_R_PWR_CLK_STATE in intel_lr_context_set_sseu() (Lionel)
> 
> v3: Add ability to program this per engine (Chris)
> 
> v4: Move most get_sseu() into i915_gem_context.c (Lionel)
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100899
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> c: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
> CC: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> CC: Zhipeng Gong <zhipeng.gong@intel.com>
> CC: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem_context.c | 82 ++++++++++++++++++++++++-
>   drivers/gpu/drm/i915/intel_lrc.c        | 55 +++++++++++++++++
>   drivers/gpu/drm/i915/intel_lrc.h        |  4 ++
>   include/uapi/drm/i915_drm.h             | 28 +++++++++
>   4 files changed, 168 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index bdf050beeb94..b97ddcf47514 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -740,6 +740,26 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
>   	return 0;
>   }
>   
> +static struct i915_gem_context_sseu
> +i915_gem_context_sseu_from_user_sseu(const struct sseu_dev_info *sseu,
> +				     const struct drm_i915_gem_context_param_sseu *user_sseu)
> +{
> +	struct i915_gem_context_sseu value = {
> +		.slice_mask = user_sseu->packed.slice_mask == 0 ?
> +			      sseu->slice_mask :
> +			      (user_sseu->packed.slice_mask & sseu->slice_mask),
> +		.subslice_mask = user_sseu->packed.subslice_mask == 0 ?
> +				 sseu->subslice_mask[0] :
> +				 (user_sseu->packed.subslice_mask & sseu->subslice_mask[0]),
> +		.min_eus_per_subslice = min(user_sseu->packed.min_eus_per_subslice,
> +					    sseu->max_eus_per_subslice),
> +		.max_eus_per_subslice = min(user_sseu->packed.max_eus_per_subslice,
> +					    sseu->max_eus_per_subslice),
> +	};
> +
> +	return value;
> +}
> +
>   int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>   				    struct drm_file *file)
>   {
> @@ -777,6 +797,37 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>   	case I915_CONTEXT_PARAM_PRIORITY:
>   		args->value = ctx->sched.priority;
>   		break;
> +	case I915_CONTEXT_PARAM_SSEU: {
> +		struct drm_i915_gem_context_param_sseu param_sseu;
> +		struct intel_engine_cs *engine;
> +
> +		if (copy_from_user(&param_sseu, u64_to_user_ptr(args->value),
> +				   sizeof(param_sseu))) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +
> +		engine = i915_gem_engine_from_flags(to_i915(dev), file,
> +						    param_sseu.flags);
> +		if (!engine) {
> +			ret = -EINVAL;
> +			break;
> +		}
> +
> +		param_sseu.packed.slice_mask =
> +			ctx->engine[engine->id].sseu.slice_mask;
> +		param_sseu.packed.subslice_mask =
> +			ctx->engine[engine->id].sseu.subslice_mask;
> +		param_sseu.packed.min_eus_per_subslice =
> +			ctx->engine[engine->id].sseu.min_eus_per_subslice;
> +		param_sseu.packed.max_eus_per_subslice =
> +			ctx->engine[engine->id].sseu.max_eus_per_subslice;
> +
> +		if (copy_to_user(u64_to_user_ptr(args->value), &param_sseu,
> +				 sizeof(param_sseu)))
> +			ret = -EFAULT;
> +		break;
> +	}
>   	default:
>   		ret = -EINVAL;
>   		break;
> @@ -832,7 +883,6 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>   		else
>   			i915_gem_context_clear_bannable(ctx);
>   		break;
> -
>   	case I915_CONTEXT_PARAM_PRIORITY:
>   		{
>   			s64 priority = args->value;
> @@ -851,7 +901,37 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>   				ctx->sched.priority = priority;
>   		}
>   		break;
> +	case I915_CONTEXT_PARAM_SSEU:
> +		if (args->size)
> +			ret = -EINVAL;
> +		else if (!HAS_EXECLISTS(ctx->i915))
> +			ret = -ENODEV;
> +		else {
> +			struct drm_i915_private *dev_priv = to_i915(dev);
> +			struct drm_i915_gem_context_param_sseu user_sseu;
> +			struct i915_gem_context_sseu ctx_sseu;
> +			struct intel_engine_cs *engine;
> +
> +			if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value),
> +					   sizeof(user_sseu))) {
> +				ret = -EFAULT;
> +				break;
> +			}
> +
> +			engine = i915_gem_engine_from_flags(dev_priv, file,
> +							    user_sseu.flags);
> +			if (!engine) {
> +				ret = -EINVAL;
> +				break;
> +			}
> +
> +			ctx_sseu =
> +				i915_gem_context_sseu_from_user_sseu(&INTEL_INFO(dev_priv)->sseu,
> +								     &user_sseu);
>   
> +			ret = intel_lr_context_set_sseu(ctx, engine, &ctx_sseu);
> +		}
> +		break;
>   	default:
>   		ret = -EINVAL;
>   		break;
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index dca17ef24de5..9caddcb1f234 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -2711,6 +2711,61 @@ void intel_lr_context_resume(struct drm_i915_private *dev_priv)
>   	}
>   }
>   
> +int intel_lr_context_set_sseu(struct i915_gem_context *ctx,
> +			      struct intel_engine_cs *engine,
> +			      struct i915_gem_context_sseu *sseu)
> +{
> +	struct drm_i915_private *dev_priv = ctx->i915;
> +	struct intel_context *ce;
> +	enum intel_engine_id id;
> +	int ret;
> +
> +	lockdep_assert_held(&dev_priv->drm.struct_mutex);
> +
> +	if (memcmp(sseu, &ctx->engine[engine->id].sseu, sizeof(*sseu)) == 0)
> +		return 0;
> +
> +	/*
> +	 * We can only program this on render ring.
> +	 */
> +	ce = &ctx->engine[RCS];
> +
> +	if (ce->pin_count) { /* Assume that the context is active! */
> +		ret = i915_gem_switch_to_kernel_context(dev_priv);
> +		if (ret)
> +			return ret;
> +
> +		ret = i915_gem_wait_for_idle(dev_priv,
> +					     I915_WAIT_INTERRUPTIBLE |
> +					     I915_WAIT_LOCKED);

Could we consider the alternative of only allowing this to be configured 
on context create? That way we would not need to idle the GPU every time 
userspace decides to fiddle with it. It is unprivileged, so it would be quite 
an easy way for a random app to ruin GPU performance for everyone.

Regards,

Tvrtko

> +		if (ret)
> +			return ret;
> +	}
> +
> +	if (ce->state) {
> +		void *vaddr = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
> +		u32 *regs;
> +
> +		if (IS_ERR(vaddr))
> +			return PTR_ERR(vaddr);
> +
> +		regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
> +
> +		regs[CTX_R_PWR_CLK_STATE + 1] =
> +			make_rpcs(&INTEL_INFO(dev_priv)->sseu, sseu);
> +		i915_gem_object_unpin_map(ce->state->obj);
> +	}
> +
> +	/*
> +	 * Apply the configuration to all engine. Our hardware doesn't
> +	 * currently support different configurations for each engine.
> +	 */
> +	for_each_engine(engine, dev_priv, id)
> +		ctx->engine[id].sseu = *sseu;
> +
> +	return 0;
> +}
> +
>   #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
>   #include "selftests/intel_lrc.c"
>   #endif
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index d91d69a17206..4c84266814fa 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -112,4 +112,8 @@ intel_lr_context_descriptor(struct i915_gem_context *ctx,
>   	return ctx->engine[engine->id].lrc_desc;
>   }
>   
> +int intel_lr_context_set_sseu(struct i915_gem_context *ctx,
> +			      struct intel_engine_cs *engine,
> +			      struct i915_gem_context_sseu *sseu);
> +
>   #endif /* _INTEL_LRC_H_ */
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 96eda8176030..75749ce11c03 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1456,9 +1456,37 @@ struct drm_i915_gem_context_param {
>   #define   I915_CONTEXT_MAX_USER_PRIORITY	1023 /* inclusive */
>   #define   I915_CONTEXT_DEFAULT_PRIORITY		0
>   #define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
> +	/*
> +	 * When using the following param, value should be a pointer to
> +	 * drm_i915_gem_context_param_sseu.
> +	 */
> +#define I915_CONTEXT_PARAM_SSEU		0x7
>   	__u64 value;
>   };
>   
> +struct drm_i915_gem_context_param_sseu {
> +	/*
> +	 * Engine to be configured or queried. Same value you would use with
> +	 * drm_i915_gem_execbuffer2.
> +	 */
> +	__u64 flags;
> +
> +	/*
> +	 * Setting slice_mask or subslice_mask to 0 will make the context use
> +	 * masks reported respectively by I915_PARAM_SLICE_MASK or
> +	 * I915_PARAM_SUBSLICE_MASK.
> +	 */
> +	union {
> +		struct {
> +			__u8 slice_mask;
> +			__u8 subslice_mask;
> +			__u8 min_eus_per_subslice;
> +			__u8 max_eus_per_subslice;
> +		} packed;
> +		__u64 value;
> +	};
> +};
> +
>   enum drm_i915_oa_format {
>   	I915_OA_FORMAT_A13 = 1,	    /* HSW only */
>   	I915_OA_FORMAT_A29,	    /* HSW only */
>
Lionel Landwerlin May 4, 2018, 4:25 p.m. UTC | #8
On 03/05/18 18:18, Tvrtko Ursulin wrote:
>>   +int intel_lr_context_set_sseu(struct i915_gem_context *ctx,
>> +                  struct intel_engine_cs *engine,
>> +                  struct i915_gem_context_sseu *sseu)
>> +{
>> +    struct drm_i915_private *dev_priv = ctx->i915;
>> +    struct intel_context *ce;
>> +    enum intel_engine_id id;
>> +    int ret;
>> +
>> +    lockdep_assert_held(&dev_priv->drm.struct_mutex);
>> +
>> +    if (memcmp(sseu, &ctx->engine[engine->id].sseu, sizeof(*sseu)) 
>> == 0)
>> +        return 0;
>> +
>> +    /*
>> +     * We can only program this on render ring.
>> +     */
>> +    ce = &ctx->engine[RCS];
>> +
>> +    if (ce->pin_count) { /* Assume that the context is active! */
>> +        ret = i915_gem_switch_to_kernel_context(dev_priv);
>> +        if (ret)
>> +            return ret;
>> +
>> +        ret = i915_gem_wait_for_idle(dev_priv,
>> +                         I915_WAIT_INTERRUPTIBLE |
>> +                         I915_WAIT_LOCKED);
>
> Could we consider the alternative of only allowing this to be 
> configured on context create? That way we would not need to idle the 
> GPU every time userspace decides to fiddle with it. It is unprivileged 
> so quite an easy way for random app to ruin GPU performance for everyone.
>
> Regards,
>
> Tvrtko 


I'm pretty sure Dmitry wants dynamic configurations.

Dmitry?
Rogozhkin, Dmitry V May 8, 2018, 4:04 a.m. UTC | #9
>> I'm pretty sure Dmitry wants dynamic configurations.


Yes, I'm afraid we really need dynamic slice configurations for media.

From: Landwerlin, Lionel G

Sent: Friday, May 4, 2018 9:25 AM
To: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>; intel-gfx@lists.freedesktop.org; Rogozhkin, Dmitry V <dmitry.v.rogozhkin@intel.com>
Subject: Re: [Intel-gfx] [PATCH 8/8] drm/i915: Expose RPCS (SSEU) configuration to userspace

On 03/05/18 18:18, Tvrtko Ursulin wrote:
  +int intel_lr_context_set_sseu(struct i915_gem_context *ctx,
+                  struct intel_engine_cs *engine,
+                  struct i915_gem_context_sseu *sseu)
+{
+    struct drm_i915_private *dev_priv = ctx->i915;
+    struct intel_context *ce;
+    enum intel_engine_id id;
+    int ret;
+
+    lockdep_assert_held(&dev_priv->drm.struct_mutex);
+
+    if (memcmp(sseu, &ctx->engine[engine->id].sseu, sizeof(*sseu)) == 0)
+        return 0;
+
+    /*
+     * We can only program this on render ring.
+     */
+    ce = &ctx->engine[RCS];
+
+    if (ce->pin_count) { /* Assume that the context is active! */
+        ret = i915_gem_switch_to_kernel_context(dev_priv);
+        if (ret)
+            return ret;
+
+        ret = i915_gem_wait_for_idle(dev_priv,
+                         I915_WAIT_INTERRUPTIBLE |
+                         I915_WAIT_LOCKED);

Could we consider the alternative of only allowing this to be configured on context create? That way we would not need to idle the GPU every time userspace decides to fiddle with it. It is unprivileged so quite an easy way for random app to ruin GPU performance for everyone.

Regards,

Tvrtko


I'm pretty sure Dmitry wants dynamic configurations.

Dmitry?
Tvrtko Ursulin May 8, 2018, 8:24 a.m. UTC | #10
On 08/05/2018 05:04, Rogozhkin, Dmitry V wrote:
>>>  I'm pretty sure Dmitry wants dynamic configurations.
> 
> Yes, I afraid we really need dynamic slice configurations for media.

The context of the question was whether you need to change the slice 
configuration for a single GEM context between submitting batch buffers?

Regards,

Tvrtko


> *From:*Landwerlin, Lionel G
> *Sent:* Friday, May 4, 2018 9:25 AM
> *To:* Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>; 
> intel-gfx@lists.freedesktop.org; Rogozhkin, Dmitry V 
> <dmitry.v.rogozhkin@intel.com>
> *Subject:* Re: [Intel-gfx] [PATCH 8/8] drm/i915: Expose RPCS (SSEU) 
> configuration to userspace
> 
> On 03/05/18 18:18, Tvrtko Ursulin wrote:
> 
>            +int intel_lr_context_set_sseu(struct i915_gem_context *ctx,
>         +                  struct intel_engine_cs *engine,
>         +                  struct i915_gem_context_sseu *sseu)
>         +{
>         +    struct drm_i915_private *dev_priv = ctx->i915;
>         +    struct intel_context *ce;
>         +    enum intel_engine_id id;
>         +    int ret;
>         +
>         +    lockdep_assert_held(&dev_priv->drm.struct_mutex);
>         +
>         +    if (memcmp(sseu, &ctx->engine[engine->id].sseu,
>         sizeof(*sseu)) == 0)
>         +        return 0;
>         +
>         +    /*
>         +     * We can only program this on render ring.
>         +     */
>         +    ce = &ctx->engine[RCS];
>         +
>         +    if (ce->pin_count) { /* Assume that the context is active! */
>         +        ret = i915_gem_switch_to_kernel_context(dev_priv);
>         +        if (ret)
>         +            return ret;
>         +
>         +        ret = i915_gem_wait_for_idle(dev_priv,
>         +                         I915_WAIT_INTERRUPTIBLE |
>         +                         I915_WAIT_LOCKED);
> 
> 
>     Could we consider the alternative of only allowing this to be
>     configured on context create? That way we would not need to idle the
>     GPU every time userspace decides to fiddle with it. It is
>     unprivileged so quite an easy way for random app to ruin GPU
>     performance for everyone.
> 
>     Regards,
> 
>     Tvrtko
> 
> I'm pretty sure Dmitry wants dynamic configurations.
> 
> Dmitry?
>
Rogozhkin, Dmitry V May 8, 2018, 4 p.m. UTC | #11
>> Context of the question was whether you need to change slice configuration for a single GEM context between submitting batch buffers?


When we create a context we know the optimal slice count for it. And this optimal point does not change for this context in unperturbed conditions, i.e. when the context runs alone. However, there are critical use cases where we never run a single context, but instead run a few contexts in parallel, and each of them has its own optimal slice count operating point. As a result, the major question is: what is the cost of a context switch if there is an associated slice configuration change, powering the slice(s) on or off? In the ideal situation where the cost is 0, we don't need any change of slice configuration for a single GEM context between submitting batch buffers. The problem is that the cost is far from 0. And the cost is intolerable in the worst case, where we would have a switch on every batch buffer. As a result, we are forced to have some negotiation channel between different contexts and make them agree on a single slice configuration which will persist for a reasonably long period of time, so that the associated cost is negligible in general. During this period we will submit a number of batch buffers before the next reconfiguration attempt. So, that's the situation in which we need to reconfigure the slice configuration for a single GEM context between submitting batches.

Dmitry.

-----Original Message-----
From: Tvrtko Ursulin [mailto:tvrtko.ursulin@linux.intel.com] 

Sent: Tuesday, May 8, 2018 1:25 AM
To: Rogozhkin, Dmitry V <dmitry.v.rogozhkin@intel.com>; Landwerlin, Lionel G <lionel.g.landwerlin@intel.com>; intel-gfx@lists.freedesktop.org
Subject: Re: [Intel-gfx] [PATCH 8/8] drm/i915: Expose RPCS (SSEU) configuration to userspace


On 08/05/2018 05:04, Rogozhkin, Dmitry V wrote:
>>>  I'm pretty sure Dmitry wants dynamic configurations.

> 

> Yes, I afraid we really need dynamic slice configurations for media.


Context of the question was whether you need to change slice configuration for a single GEM context between submitting batch buffers?

Regards,

Tvrtko


> *From:*Landwerlin, Lionel G

> *Sent:* Friday, May 4, 2018 9:25 AM

> *To:* Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>; 

> intel-gfx@lists.freedesktop.org; Rogozhkin, Dmitry V 

> <dmitry.v.rogozhkin@intel.com>

> *Subject:* Re: [Intel-gfx] [PATCH 8/8] drm/i915: Expose RPCS (SSEU) 

> configuration to userspace

> 

> On 03/05/18 18:18, Tvrtko Ursulin wrote:

> 

>            +int intel_lr_context_set_sseu(struct i915_gem_context *ctx,

>         +                  struct intel_engine_cs *engine,

>         +                  struct i915_gem_context_sseu *sseu)

>         +{

>         +    struct drm_i915_private *dev_priv = ctx->i915;

>         +    struct intel_context *ce;

>         +    enum intel_engine_id id;

>         +    int ret;

>         +

>         +    lockdep_assert_held(&dev_priv->drm.struct_mutex);

>         +

>         +    if (memcmp(sseu, &ctx->engine[engine->id].sseu,

>         sizeof(*sseu)) == 0)

>         +        return 0;

>         +

>         +    /*

>         +     * We can only program this on render ring.

>         +     */

>         +    ce = &ctx->engine[RCS];

>         +

>         +    if (ce->pin_count) { /* Assume that the context is active! */

>         +        ret = i915_gem_switch_to_kernel_context(dev_priv);

>         +        if (ret)

>         +            return ret;

>         +

>         +        ret = i915_gem_wait_for_idle(dev_priv,

>         +                         I915_WAIT_INTERRUPTIBLE |

>         +                         I915_WAIT_LOCKED);

> 

> 

>     Could we consider the alternative of only allowing this to be

>     configured on context create? That way we would not need to idle the

>     GPU every time userspace decides to fiddle with it. It is

>     unprivileged so quite an easy way for random app to ruin GPU

>     performance for everyone.

> 

>     Regards,

> 

>     Tvrtko

> 

> I'm pretty sure Dmitry wants dynamic configurations.

> 

> Dmitry?

>
Chris Wilson May 8, 2018, 8:56 p.m. UTC | #12
Quoting Tvrtko Ursulin (2018-05-03 18:18:43)
> 
> On 25/04/2018 12:45, Lionel Landwerlin wrote:
> > From: Chris Wilson <chris@chris-wilson.co.uk>
> > 
> > We want to allow userspace to reconfigure the subslice configuration for
> > its own use case. To do so, we expose a context parameter to allow
> > adjustment of the RPCS register stored within the context image (and
> > currently not accessible via LRI). If the context is adjusted before
> > first use, the adjustment is for "free"; otherwise if the context is
> > active we flush the context off the GPU (stalling all users) and forcing
> > the GPU to save the context to memory where we can modify it and so
> > ensure that the register is reloaded on next execution.
> > 
> > The overhead of managing additional EU subslices can be significant,
> > especially in multi-context workloads. Non-GPGPU contexts should
> > preferably disable the subslices it is not using, and others should
> > fine-tune the number to match their workload.
> > 
> > We expose complete control over the RPCS register, allowing
> > configuration of slice/subslice, via masks packed into a u64 for
> > simplicity. For example,
> > 
> >       struct drm_i915_gem_context_param arg;
> >       struct drm_i915_gem_context_param_sseu sseu = { .flags = I915_EXEC_RENDER };
> > 
> >       memset(&arg, 0, sizeof(arg));
> >       arg.ctx_id = ctx;
> >       arg.param = I915_CONTEXT_PARAM_SSEU;
> >       arg.value = (uintptr_t) &sseu;
> >       if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg) == 0) {
> >               sseu.packed.subslice_mask = 0;
> > 
> >               drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &arg);
> >       }
> > 
> > could be used to disable all subslices where supported.
> > 
> > v2: Fix offset of CTX_R_PWR_CLK_STATE in intel_lr_context_set_sseu() (Lionel)
> > 
> > v3: Add ability to program this per engine (Chris)
> > 
> > v4: Move most get_sseu() into i915_gem_context.c (Lionel)
> > 
> > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100899
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> > c: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
> > CC: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > CC: Zhipeng Gong <zhipeng.gong@intel.com>
> > CC: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> > ---
> >   drivers/gpu/drm/i915/i915_gem_context.c | 82 ++++++++++++++++++++++++-
> >   drivers/gpu/drm/i915/intel_lrc.c        | 55 +++++++++++++++++
> >   drivers/gpu/drm/i915/intel_lrc.h        |  4 ++
> >   include/uapi/drm/i915_drm.h             | 28 +++++++++
> >   4 files changed, 168 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> > index bdf050beeb94..b97ddcf47514 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > @@ -740,6 +740,26 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
> >       return 0;
> >   }
> >   
> > +static struct i915_gem_context_sseu
> > +i915_gem_context_sseu_from_user_sseu(const struct sseu_dev_info *sseu,
> > +                                  const struct drm_i915_gem_context_param_sseu *user_sseu)
> > +{
> > +     struct i915_gem_context_sseu value = {
> > +             .slice_mask = user_sseu->packed.slice_mask == 0 ?
> > +                           sseu->slice_mask :
> > +                           (user_sseu->packed.slice_mask & sseu->slice_mask),
> > +             .subslice_mask = user_sseu->packed.subslice_mask == 0 ?
> > +                              sseu->subslice_mask[0] :
> > +                              (user_sseu->packed.subslice_mask & sseu->subslice_mask[0]),
> > +             .min_eus_per_subslice = min(user_sseu->packed.min_eus_per_subslice,
> > +                                         sseu->max_eus_per_subslice),
> > +             .max_eus_per_subslice = min(user_sseu->packed.max_eus_per_subslice,
> > +                                         sseu->max_eus_per_subslice),
> > +     };
> > +
> > +     return value;
> > +}
> > +
> >   int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
> >                                   struct drm_file *file)
> >   {
> > @@ -777,6 +797,37 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
> >       case I915_CONTEXT_PARAM_PRIORITY:
> >               args->value = ctx->sched.priority;
> >               break;
> > +     case I915_CONTEXT_PARAM_SSEU: {
> > +             struct drm_i915_gem_context_param_sseu param_sseu;
> > +             struct intel_engine_cs *engine;
> > +
> > +             if (copy_from_user(&param_sseu, u64_to_user_ptr(args->value),
> > +                                sizeof(param_sseu))) {
> > +                     ret = -EFAULT;
> > +                     break;
> > +             }
> > +
> > +             engine = i915_gem_engine_from_flags(to_i915(dev), file,
> > +                                                 param_sseu.flags);
> > +             if (!engine) {
> > +                     ret = -EINVAL;
> > +                     break;
> > +             }
> > +
> > +             param_sseu.packed.slice_mask =
> > +                     ctx->engine[engine->id].sseu.slice_mask;
> > +             param_sseu.packed.subslice_mask =
> > +                     ctx->engine[engine->id].sseu.subslice_mask;
> > +             param_sseu.packed.min_eus_per_subslice =
> > +                     ctx->engine[engine->id].sseu.min_eus_per_subslice;
> > +             param_sseu.packed.max_eus_per_subslice =
> > +                     ctx->engine[engine->id].sseu.max_eus_per_subslice;
> > +
> > +             if (copy_to_user(u64_to_user_ptr(args->value), &param_sseu,
> > +                              sizeof(param_sseu)))
> > +                     ret = -EFAULT;
> > +             break;
> > +     }
> >       default:
> >               ret = -EINVAL;
> >               break;
> > @@ -832,7 +883,6 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
> >               else
> >                       i915_gem_context_clear_bannable(ctx);
> >               break;
> > -
> >       case I915_CONTEXT_PARAM_PRIORITY:
> >               {
> >                       s64 priority = args->value;
> > @@ -851,7 +901,37 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
> >                               ctx->sched.priority = priority;
> >               }
> >               break;
> > +     case I915_CONTEXT_PARAM_SSEU:
> > +             if (args->size)
> > +                     ret = -EINVAL;
> > +             else if (!HAS_EXECLISTS(ctx->i915))
> > +                     ret = -ENODEV;
> > +             else {
> > +                     struct drm_i915_private *dev_priv = to_i915(dev);
> > +                     struct drm_i915_gem_context_param_sseu user_sseu;
> > +                     struct i915_gem_context_sseu ctx_sseu;
> > +                     struct intel_engine_cs *engine;
> > +
> > +                     if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value),
> > +                                        sizeof(user_sseu))) {
> > +                             ret = -EFAULT;
> > +                             break;
> > +                     }
> > +
> > +                     engine = i915_gem_engine_from_flags(dev_priv, file,
> > +                                                         user_sseu.flags);
> > +                     if (!engine) {
> > +                             ret = -EINVAL;
> > +                             break;
> > +                     }
> > +
> > +                     ctx_sseu =
> > +                             i915_gem_context_sseu_from_user_sseu(&INTEL_INFO(dev_priv)->sseu,
> > +                                                                  &user_sseu);
> >   
> > +                     ret = intel_lr_context_set_sseu(ctx, engine, &ctx_sseu);
> > +             }
> > +             break;
> >       default:
> >               ret = -EINVAL;
> >               break;
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> > index dca17ef24de5..9caddcb1f234 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -2711,6 +2711,61 @@ void intel_lr_context_resume(struct drm_i915_private *dev_priv)
> >       }
> >   }
> >   
> > +int intel_lr_context_set_sseu(struct i915_gem_context *ctx,
> > +                           struct intel_engine_cs *engine,
> > +                           struct i915_gem_context_sseu *sseu)
> > +{
> > +     struct drm_i915_private *dev_priv = ctx->i915;
> > +     struct intel_context *ce;
> > +     enum intel_engine_id id;
> > +     int ret;
> > +
> > +     lockdep_assert_held(&dev_priv->drm.struct_mutex);
> > +
> > +     if (memcmp(sseu, &ctx->engine[engine->id].sseu, sizeof(*sseu)) == 0)
> > +             return 0;
> > +
> > +     /*
> > +      * We can only program this on render ring.
> > +      */
> > +     ce = &ctx->engine[RCS];
> > +
> > +     if (ce->pin_count) { /* Assume that the context is active! */
> > +             ret = i915_gem_switch_to_kernel_context(dev_priv);
> > +             if (ret)
> > +                     return ret;
> > +
> > +             ret = i915_gem_wait_for_idle(dev_priv,
> > +                                          I915_WAIT_INTERRUPTIBLE |
> > +                                          I915_WAIT_LOCKED);
> 
> Could we consider the alternative of only allowing this to be configured 
> on context create? That way we would not need to idle the GPU every time 
> userspace decides to fiddle with it. It is unprivileged so quite an easy 
> way for random app to ruin GPU performance for everyone.

I think we can do dynamic reconfiguration of the context using LRI. And
if we can't do so from within the context, we can use SDM from another.
-Chris
Lionel Landwerlin May 9, 2018, 3:35 p.m. UTC | #13
On 08/05/18 21:56, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-05-03 18:18:43)
>> On 25/04/2018 12:45, Lionel Landwerlin wrote:
>>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>>
>>> We want to allow userspace to reconfigure the subslice configuration for
>>> its own use case. To do so, we expose a context parameter to allow
>>> adjustment of the RPCS register stored within the context image (and
>>> currently not accessible via LRI). If the context is adjusted before
>>> first use, the adjustment is for "free"; otherwise if the context is
>>> active we flush the context off the GPU (stalling all users) and forcing
>>> the GPU to save the context to memory where we can modify it and so
>>> ensure that the register is reloaded on next execution.
>>>
>>> The overhead of managing additional EU subslices can be significant,
>>> especially in multi-context workloads. Non-GPGPU contexts should
>>> preferably disable the subslices it is not using, and others should
>>> fine-tune the number to match their workload.
>>>
>>> We expose complete control over the RPCS register, allowing
>>> configuration of slice/subslice, via masks packed into a u64 for
>>> simplicity. For example,
>>>
>>>        struct drm_i915_gem_context_param arg;
>>>        struct drm_i915_gem_context_param_sseu sseu = { .flags = I915_EXEC_RENDER };
>>>
>>>        memset(&arg, 0, sizeof(arg));
>>>        arg.ctx_id = ctx;
>>>        arg.param = I915_CONTEXT_PARAM_SSEU;
>>>        arg.value = (uintptr_t) &sseu;
>>>        if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg) == 0) {
>>>                sseu.packed.subslice_mask = 0;
>>>
>>>                drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &arg);
>>>        }
>>>
>>> could be used to disable all subslices where supported.
>>>
>>> v2: Fix offset of CTX_R_PWR_CLK_STATE in intel_lr_context_set_sseu() (Lionel)
>>>
>>> v3: Add ability to program this per engine (Chris)
>>>
>>> v4: Move most get_sseu() into i915_gem_context.c (Lionel)
>>>
>>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100899
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>>> CC: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
>>> CC: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> CC: Zhipeng Gong <zhipeng.gong@intel.com>
>>> CC: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/i915_gem_context.c | 82 ++++++++++++++++++++++++-
>>>    drivers/gpu/drm/i915/intel_lrc.c        | 55 +++++++++++++++++
>>>    drivers/gpu/drm/i915/intel_lrc.h        |  4 ++
>>>    include/uapi/drm/i915_drm.h             | 28 +++++++++
>>>    4 files changed, 168 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
>>> index bdf050beeb94..b97ddcf47514 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>>> @@ -740,6 +740,26 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
>>>        return 0;
>>>    }
>>>    
>>> +static struct i915_gem_context_sseu
>>> +i915_gem_context_sseu_from_user_sseu(const struct sseu_dev_info *sseu,
>>> +                                  const struct drm_i915_gem_context_param_sseu *user_sseu)
>>> +{
>>> +     struct i915_gem_context_sseu value = {
>>> +             .slice_mask = user_sseu->packed.slice_mask == 0 ?
>>> +                           sseu->slice_mask :
>>> +                           (user_sseu->packed.slice_mask & sseu->slice_mask),
>>> +             .subslice_mask = user_sseu->packed.subslice_mask == 0 ?
>>> +                              sseu->subslice_mask[0] :
>>> +                              (user_sseu->packed.subslice_mask & sseu->subslice_mask[0]),
>>> +             .min_eus_per_subslice = min(user_sseu->packed.min_eus_per_subslice,
>>> +                                         sseu->max_eus_per_subslice),
>>> +             .max_eus_per_subslice = min(user_sseu->packed.max_eus_per_subslice,
>>> +                                         sseu->max_eus_per_subslice),
>>> +     };
>>> +
>>> +     return value;
>>> +}
>>> +
>>>    int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>>>                                    struct drm_file *file)
>>>    {
>>> @@ -777,6 +797,37 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>>>        case I915_CONTEXT_PARAM_PRIORITY:
>>>                args->value = ctx->sched.priority;
>>>                break;
>>> +     case I915_CONTEXT_PARAM_SSEU: {
>>> +             struct drm_i915_gem_context_param_sseu param_sseu;
>>> +             struct intel_engine_cs *engine;
>>> +
>>> +             if (copy_from_user(&param_sseu, u64_to_user_ptr(args->value),
>>> +                                sizeof(param_sseu))) {
>>> +                     ret = -EFAULT;
>>> +                     break;
>>> +             }
>>> +
>>> +             engine = i915_gem_engine_from_flags(to_i915(dev), file,
>>> +                                                 param_sseu.flags);
>>> +             if (!engine) {
>>> +                     ret = -EINVAL;
>>> +                     break;
>>> +             }
>>> +
>>> +             param_sseu.packed.slice_mask =
>>> +                     ctx->engine[engine->id].sseu.slice_mask;
>>> +             param_sseu.packed.subslice_mask =
>>> +                     ctx->engine[engine->id].sseu.subslice_mask;
>>> +             param_sseu.packed.min_eus_per_subslice =
>>> +                     ctx->engine[engine->id].sseu.min_eus_per_subslice;
>>> +             param_sseu.packed.max_eus_per_subslice =
>>> +                     ctx->engine[engine->id].sseu.max_eus_per_subslice;
>>> +
>>> +             if (copy_to_user(u64_to_user_ptr(args->value), &param_sseu,
>>> +                              sizeof(param_sseu)))
>>> +                     ret = -EFAULT;
>>> +             break;
>>> +     }
>>>        default:
>>>                ret = -EINVAL;
>>>                break;
>>> @@ -832,7 +883,6 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>>>                else
>>>                        i915_gem_context_clear_bannable(ctx);
>>>                break;
>>> -
>>>        case I915_CONTEXT_PARAM_PRIORITY:
>>>                {
>>>                        s64 priority = args->value;
>>> @@ -851,7 +901,37 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>>>                                ctx->sched.priority = priority;
>>>                }
>>>                break;
>>> +     case I915_CONTEXT_PARAM_SSEU:
>>> +             if (args->size)
>>> +                     ret = -EINVAL;
>>> +             else if (!HAS_EXECLISTS(ctx->i915))
>>> +                     ret = -ENODEV;
>>> +             else {
>>> +                     struct drm_i915_private *dev_priv = to_i915(dev);
>>> +                     struct drm_i915_gem_context_param_sseu user_sseu;
>>> +                     struct i915_gem_context_sseu ctx_sseu;
>>> +                     struct intel_engine_cs *engine;
>>> +
>>> +                     if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value),
>>> +                                        sizeof(user_sseu))) {
>>> +                             ret = -EFAULT;
>>> +                             break;
>>> +                     }
>>> +
>>> +                     engine = i915_gem_engine_from_flags(dev_priv, file,
>>> +                                                         user_sseu.flags);
>>> +                     if (!engine) {
>>> +                             ret = -EINVAL;
>>> +                             break;
>>> +                     }
>>> +
>>> +                     ctx_sseu =
>>> +                             i915_gem_context_sseu_from_user_sseu(&INTEL_INFO(dev_priv)->sseu,
>>> +                                                                  &user_sseu);
>>>    
>>> +                     ret = intel_lr_context_set_sseu(ctx, engine, &ctx_sseu);
>>> +             }
>>> +             break;
>>>        default:
>>>                ret = -EINVAL;
>>>                break;
>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>>> index dca17ef24de5..9caddcb1f234 100644
>>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>>> @@ -2711,6 +2711,61 @@ void intel_lr_context_resume(struct drm_i915_private *dev_priv)
>>>        }
>>>    }
>>>    
>>> +int intel_lr_context_set_sseu(struct i915_gem_context *ctx,
>>> +                           struct intel_engine_cs *engine,
>>> +                           struct i915_gem_context_sseu *sseu)
>>> +{
>>> +     struct drm_i915_private *dev_priv = ctx->i915;
>>> +     struct intel_context *ce;
>>> +     enum intel_engine_id id;
>>> +     int ret;
>>> +
>>> +     lockdep_assert_held(&dev_priv->drm.struct_mutex);
>>> +
>>> +     if (memcmp(sseu, &ctx->engine[engine->id].sseu, sizeof(*sseu)) == 0)
>>> +             return 0;
>>> +
>>> +     /*
>>> +      * We can only program this on render ring.
>>> +      */
>>> +     ce = &ctx->engine[RCS];
>>> +
>>> +     if (ce->pin_count) { /* Assume that the context is active! */
>>> +             ret = i915_gem_switch_to_kernel_context(dev_priv);
>>> +             if (ret)
>>> +                     return ret;
>>> +
>>> +             ret = i915_gem_wait_for_idle(dev_priv,
>>> +                                          I915_WAIT_INTERRUPTIBLE |
>>> +                                          I915_WAIT_LOCKED);
>> Could we consider the alternative of only allowing this to be configured
>> on context create? That way we would not need to idle the GPU every time
>> userspace decides to fiddle with it. It is unprivileged so quite an easy
>> way for a random app to ruin GPU performance for everyone.
> I think we can do dynamic reconfiguration of the context using LRI. And
> if we can't do so from within the context, we can use SDM from another.
> -Chris
>
Documentation says we can't use LRI. So that leaves SDM from another context.
Should we use the kernel one?

Thanks,

-
Lionel
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index bdf050beeb94..b97ddcf47514 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -740,6 +740,26 @@  int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
 	return 0;
 }
 
+static struct i915_gem_context_sseu
+i915_gem_context_sseu_from_user_sseu(const struct sseu_dev_info *sseu,
+				     const struct drm_i915_gem_context_param_sseu *user_sseu)
+{
+	struct i915_gem_context_sseu value = {
+		.slice_mask = user_sseu->packed.slice_mask == 0 ?
+			      sseu->slice_mask :
+			      (user_sseu->packed.slice_mask & sseu->slice_mask),
+		.subslice_mask = user_sseu->packed.subslice_mask == 0 ?
+				 sseu->subslice_mask[0] :
+				 (user_sseu->packed.subslice_mask & sseu->subslice_mask[0]),
+		.min_eus_per_subslice = min(user_sseu->packed.min_eus_per_subslice,
+					    sseu->max_eus_per_subslice),
+		.max_eus_per_subslice = min(user_sseu->packed.max_eus_per_subslice,
+					    sseu->max_eus_per_subslice),
+	};
+
+	return value;
+}
+
 int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 				    struct drm_file *file)
 {
@@ -777,6 +797,37 @@  int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_CONTEXT_PARAM_PRIORITY:
 		args->value = ctx->sched.priority;
 		break;
+	case I915_CONTEXT_PARAM_SSEU: {
+		struct drm_i915_gem_context_param_sseu param_sseu;
+		struct intel_engine_cs *engine;
+
+		if (copy_from_user(&param_sseu, u64_to_user_ptr(args->value),
+				   sizeof(param_sseu))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		engine = i915_gem_engine_from_flags(to_i915(dev), file,
+						    param_sseu.flags);
+		if (!engine) {
+			ret = -EINVAL;
+			break;
+		}
+
+		param_sseu.packed.slice_mask =
+			ctx->engine[engine->id].sseu.slice_mask;
+		param_sseu.packed.subslice_mask =
+			ctx->engine[engine->id].sseu.subslice_mask;
+		param_sseu.packed.min_eus_per_subslice =
+			ctx->engine[engine->id].sseu.min_eus_per_subslice;
+		param_sseu.packed.max_eus_per_subslice =
+			ctx->engine[engine->id].sseu.max_eus_per_subslice;
+
+		if (copy_to_user(u64_to_user_ptr(args->value), &param_sseu,
+				 sizeof(param_sseu)))
+			ret = -EFAULT;
+		break;
+	}
 	default:
 		ret = -EINVAL;
 		break;
@@ -832,7 +883,6 @@  int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 		else
 			i915_gem_context_clear_bannable(ctx);
 		break;
-
 	case I915_CONTEXT_PARAM_PRIORITY:
 		{
 			s64 priority = args->value;
@@ -851,7 +901,37 @@  int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 				ctx->sched.priority = priority;
 		}
 		break;
+	case I915_CONTEXT_PARAM_SSEU:
+		if (args->size)
+			ret = -EINVAL;
+		else if (!HAS_EXECLISTS(ctx->i915))
+			ret = -ENODEV;
+		else {
+			struct drm_i915_private *dev_priv = to_i915(dev);
+			struct drm_i915_gem_context_param_sseu user_sseu;
+			struct i915_gem_context_sseu ctx_sseu;
+			struct intel_engine_cs *engine;
+
+			if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value),
+					   sizeof(user_sseu))) {
+				ret = -EFAULT;
+				break;
+			}
+
+			engine = i915_gem_engine_from_flags(dev_priv, file,
+							    user_sseu.flags);
+			if (!engine) {
+				ret = -EINVAL;
+				break;
+			}
+
+			ctx_sseu =
+				i915_gem_context_sseu_from_user_sseu(&INTEL_INFO(dev_priv)->sseu,
+								     &user_sseu);
 
+			ret = intel_lr_context_set_sseu(ctx, engine, &ctx_sseu);
+		}
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index dca17ef24de5..9caddcb1f234 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2711,6 +2711,61 @@  void intel_lr_context_resume(struct drm_i915_private *dev_priv)
 	}
 }
 
+int intel_lr_context_set_sseu(struct i915_gem_context *ctx,
+			      struct intel_engine_cs *engine,
+			      struct i915_gem_context_sseu *sseu)
+{
+	struct drm_i915_private *dev_priv = ctx->i915;
+	struct intel_context *ce;
+	enum intel_engine_id id;
+	int ret;
+
+	lockdep_assert_held(&dev_priv->drm.struct_mutex);
+
+	if (memcmp(sseu, &ctx->engine[engine->id].sseu, sizeof(*sseu)) == 0)
+		return 0;
+
+	/*
+	 * We can only program this on render ring.
+	 */
+	ce = &ctx->engine[RCS];
+
+	if (ce->pin_count) { /* Assume that the context is active! */
+		ret = i915_gem_switch_to_kernel_context(dev_priv);
+		if (ret)
+			return ret;
+
+		ret = i915_gem_wait_for_idle(dev_priv,
+					     I915_WAIT_INTERRUPTIBLE |
+					     I915_WAIT_LOCKED);
+		if (ret)
+			return ret;
+	}
+
+	if (ce->state) {
+		void *vaddr = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
+		u32 *regs;
+
+		if (IS_ERR(vaddr))
+			return PTR_ERR(vaddr);
+
+		regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
+
+		regs[CTX_R_PWR_CLK_STATE + 1] =
+			make_rpcs(&INTEL_INFO(dev_priv)->sseu, sseu);
+		i915_gem_object_unpin_map(ce->state->obj);
+	}
+
+	/*
+	 * Apply the configuration to all engines. Our hardware doesn't
+	 * currently support different configurations for each engine.
+	 */
+	for_each_engine(engine, dev_priv, id)
+		ctx->engine[id].sseu = *sseu;
+
+	return 0;
+}
+
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
 #include "selftests/intel_lrc.c"
 #endif
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index d91d69a17206..4c84266814fa 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -112,4 +112,8 @@  intel_lr_context_descriptor(struct i915_gem_context *ctx,
 	return ctx->engine[engine->id].lrc_desc;
 }
 
+int intel_lr_context_set_sseu(struct i915_gem_context *ctx,
+			      struct intel_engine_cs *engine,
+			      struct i915_gem_context_sseu *sseu);
+
 #endif /* _INTEL_LRC_H_ */
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 96eda8176030..75749ce11c03 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1456,9 +1456,37 @@  struct drm_i915_gem_context_param {
 #define   I915_CONTEXT_MAX_USER_PRIORITY	1023 /* inclusive */
 #define   I915_CONTEXT_DEFAULT_PRIORITY		0
 #define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
+	/*
+	 * When using the following param, value should be a pointer to
+	 * drm_i915_gem_context_param_sseu.
+	 */
+#define I915_CONTEXT_PARAM_SSEU		0x7
 	__u64 value;
 };
 
+struct drm_i915_gem_context_param_sseu {
+	/*
+	 * Engine to be configured or queried. Same value you would use with
+	 * drm_i915_gem_execbuffer2.
+	 */
+	__u64 flags;
+
+	/*
+	 * Setting slice_mask or subslice_mask to 0 will make the context use
+	 * masks reported respectively by I915_PARAM_SLICE_MASK or
+	 * I915_PARAM_SUBSLICE_MASK.
+	 */
+	union {
+		struct {
+			__u8 slice_mask;
+			__u8 subslice_mask;
+			__u8 min_eus_per_subslice;
+			__u8 max_eus_per_subslice;
+		} packed;
+		__u64 value;
+	};
+};
+
 enum drm_i915_oa_format {
 	I915_OA_FORMAT_A13 = 1,	    /* HSW only */
 	I915_OA_FORMAT_A29,	    /* HSW only */