diff mbox

[v6,6/7] drm/i915: Expose RPCS (SSEU) configuration to userspace

Message ID 20180522180002.11522-7-lionel.g.landwerlin@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Lionel Landwerlin May 22, 2018, 6 p.m. UTC
From: Chris Wilson <chris@chris-wilson.co.uk>

We want to allow userspace to reconfigure the subslice configuration for
its own use case. To do so, we expose a context parameter to allow
adjustment of the RPCS register stored within the context image (and
currently not accessible via LRI). If the context is adjusted before
first use, the adjustment is for "free"; otherwise if the context is
active we flush the context off the GPU (stalling all users) and forcing
the GPU to save the context to memory where we can modify it and so
ensure that the register is reloaded on next execution.

The overhead of managing additional EU subslices can be significant,
especially in multi-context workloads. Non-GPGPU contexts should
preferably disable the subslices it is not using, and others should
fine-tune the number to match their workload.

We expose complete control over the RPCS register, allowing
configuration of slice/subslice, via masks packed into a u64 for
simplicity. For example,

	struct drm_i915_gem_context_param arg;
	struct drm_i915_gem_context_param_sseu sseu = { .class = 0,
	                                                .instance = 0, };

	memset(&arg, 0, sizeof(arg));
	arg.ctx_id = ctx;
	arg.param = I915_CONTEXT_PARAM_SSEU;
	arg.value = (uintptr_t) &sseu;
	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg) == 0) {
		sseu.packed.subslice_mask = 0;

		drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &arg);
	}

could be used to disable all subslices where supported.

v2: Fix offset of CTX_R_PWR_CLK_STATE in intel_lr_context_set_sseu() (Lionel)

v3: Add ability to program this per engine (Chris)

v4: Move most get_sseu() into i915_gem_context.c (Lionel)

v5: Validate sseu configuration against the device's capabilities (Lionel)

v6: Change context powergating settings through MI_SDM on kernel context (Chris)

v7: Synchronize the requests following a powergating setting change using a global
    dependency (Chris)
    Iterate timelines through dev_priv.gt.active_rings (Tvrtko)
    Disable RPCS configuration setting for non capable users (Lionel/Tvrtko)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100899
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
c: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
CC: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
CC: Zhipeng Gong <zhipeng.gong@intel.com>
CC: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |  13 ++
 drivers/gpu/drm/i915/i915_gem.c         |   2 +
 drivers/gpu/drm/i915/i915_gem_context.c | 167 ++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_request.c     |  20 +++
 drivers/gpu/drm/i915/intel_lrc.c        | 103 ++++++++++-----
 drivers/gpu/drm/i915/intel_ringbuffer.c |   2 +
 drivers/gpu/drm/i915/intel_ringbuffer.h |   4 +
 include/uapi/drm/i915_drm.h             |  38 ++++++
 8 files changed, 314 insertions(+), 35 deletions(-)

Comments

Tvrtko Ursulin May 23, 2018, 3:13 p.m. UTC | #1
On 22/05/2018 19:00, Lionel Landwerlin wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> We want to allow userspace to reconfigure the subslice configuration for
> its own use case. To do so, we expose a context parameter to allow
> adjustment of the RPCS register stored within the context image (and
> currently not accessible via LRI). If the context is adjusted before
> first use, the adjustment is for "free"; otherwise if the context is
> active we flush the context off the GPU (stalling all users) and forcing
> the GPU to save the context to memory where we can modify it and so
> ensure that the register is reloaded on next execution.
> 
> The overhead of managing additional EU subslices can be significant,
> especially in multi-context workloads. Non-GPGPU contexts should
> preferably disable the subslices it is not using, and others should
> fine-tune the number to match their workload.
> 
> We expose complete control over the RPCS register, allowing
> configuration of slice/subslice, via masks packed into a u64 for
> simplicity. For example,
> 
> 	struct drm_i915_gem_context_param arg;
> 	struct drm_i915_gem_context_param_sseu sseu = { .class = 0,
> 	                                                .instance = 0, };
> 
> 	memset(&arg, 0, sizeof(arg));
> 	arg.ctx_id = ctx;
> 	arg.param = I915_CONTEXT_PARAM_SSEU;
> 	arg.value = (uintptr_t) &sseu;
> 	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg) == 0) {
> 		sseu.packed.subslice_mask = 0;
> 
> 		drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &arg);
> 	}
> 
> could be used to disable all subslices where supported.
> 
> v2: Fix offset of CTX_R_PWR_CLK_STATE in intel_lr_context_set_sseu() (Lionel)
> 
> v3: Add ability to program this per engine (Chris)
> 
> v4: Move most get_sseu() into i915_gem_context.c (Lionel)
> 
> v5: Validate sseu configuration against the device's capabilities (Lionel)
> 
> v6: Change context powergating settings through MI_SDM on kernel context (Chris)
> 
> v7: Synchronize the requests following a powergating setting change using a global
>      dependency (Chris)
>      Iterate timelines through dev_priv.gt.active_rings (Tvrtko)
>      Disable RPCS configuration setting for non capable users (Lionel/Tvrtko)
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100899
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> c: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
> CC: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> CC: Zhipeng Gong <zhipeng.gong@intel.com>
> CC: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.h         |  13 ++
>   drivers/gpu/drm/i915/i915_gem.c         |   2 +
>   drivers/gpu/drm/i915/i915_gem_context.c | 167 ++++++++++++++++++++++++
>   drivers/gpu/drm/i915/i915_request.c     |  20 +++
>   drivers/gpu/drm/i915/intel_lrc.c        | 103 ++++++++++-----
>   drivers/gpu/drm/i915/intel_ringbuffer.c |   2 +
>   drivers/gpu/drm/i915/intel_ringbuffer.h |   4 +
>   include/uapi/drm/i915_drm.h             |  38 ++++++
>   8 files changed, 314 insertions(+), 35 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 21631b51b37b..09cfcfe1c339 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2067,6 +2067,12 @@ struct drm_i915_private {
>   		u32 active_requests;
>   		u32 request_serial;
>   
> +		/**
> +		 * Global barrier to ensuring ordering of sseu transitions
> +		 * requests.
> +		 */
> +		struct i915_gem_active global_barrier;
> +
>   		/**
>   		 * Is the GPU currently considered idle, or busy executing
>   		 * userspace requests? Whilst idle, we allow runtime power
> @@ -3227,6 +3233,13 @@ i915_vm_to_ppgtt(struct i915_address_space *vm)
>   	return container_of(vm, struct i915_hw_ppgtt, base);
>   }
>   
> +static inline void i915_gem_set_global_barrier(struct drm_i915_private *i915,
> +					       struct i915_request *rq)
> +{
> +	lockdep_assert_held(&i915->drm.struct_mutex);
> +	i915_gem_active_set(&i915->gt.global_barrier, rq);
> +}
> +
>   /* i915_gem_fence_reg.c */
>   struct drm_i915_fence_reg *
>   i915_reserve_fence(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 03874b50ada9..9c2a0d04bd39 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -5548,6 +5548,8 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
>   	if (!dev_priv->priorities)
>   		goto err_dependencies;
>   
> +	init_request_active(&dev_priv->gt.global_barrier, NULL);
> +
>   	INIT_LIST_HEAD(&dev_priv->gt.timelines);
>   	INIT_LIST_HEAD(&dev_priv->gt.active_rings);
>   	INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index ea9ae1046827..5c5a12f1c265 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -730,6 +730,103 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
>   	return 0;
>   }
>   
> +static int
> +intel_sseu_from_user_sseu(const struct sseu_dev_info *sseu,
> +			  const struct drm_i915_gem_context_param_sseu *user_sseu,
> +			  union intel_sseu *ctx_sseu)
> +{
> +	if ((user_sseu->slice_mask & ~sseu->slice_mask) != 0 ||
> +	    user_sseu->slice_mask == 0)
> +		return -EINVAL;
> +
> +	if ((user_sseu->subslice_mask & ~sseu->subslice_mask[0]) != 0 ||
> +	    user_sseu->subslice_mask == 0)
> +		return -EINVAL;
> +
> +	if (user_sseu->min_eus_per_subslice > sseu->max_eus_per_subslice)
> +		return -EINVAL;
> +
> +	if (user_sseu->max_eus_per_subslice > sseu->max_eus_per_subslice ||
> +	    user_sseu->max_eus_per_subslice < user_sseu->min_eus_per_subslice ||
> +	    user_sseu->max_eus_per_subslice == 0)
> +		return -EINVAL;
> +
> +	ctx_sseu->slice_mask = user_sseu->slice_mask;
> +	ctx_sseu->subslice_mask = user_sseu->subslice_mask;
> +	ctx_sseu->min_eus_per_subslice = user_sseu->min_eus_per_subslice;
> +	ctx_sseu->max_eus_per_subslice = user_sseu->max_eus_per_subslice;
> +
> +	return 0;
> +}
> +
> +static int
> +i915_gem_context_reconfigure_sseu(struct i915_gem_context *ctx,
> +				  struct intel_engine_cs *engine,
> +				  union intel_sseu sseu)
> +{
> +	struct drm_i915_private *dev_priv = ctx->i915;
> +	struct i915_request *rq;
> +	struct intel_ring *ring;
> +	enum intel_engine_id id;
> +	int ret;
> +
> +	/*
> +	 * First notify user when this capability is not available so that it
> +	 * can be detected with any valid input.
> +	 */
> +	if (!engine->emit_rpcs_config)
> +		return -ENODEV;
> +
> +	if (memcmp(&to_intel_context(ctx, engine)->sseu,
> +		   &sseu, sizeof(sseu)) == 0) {
> +		return 0;

But union is still there so I don't know.

> +	}
> +
> +	lockdep_assert_held(&dev_priv->drm.struct_mutex);

Best to move above return -ENODEV line.

> +
> +	i915_retire_requests(dev_priv);
> +
> +	/* Now use the RCS to actually reconfigure. */
> +	engine = dev_priv->engine[RCS];
> +
> +	rq = i915_request_alloc(engine, dev_priv->kernel_context);
> +	if (IS_ERR(rq))
> +		return PTR_ERR(rq);
> +
> +	ret = engine->emit_rpcs_config(rq, ctx,
> +				       intel_engine_prepare_sseu(engine, sseu));
> +	if (ret) {
> +		__i915_request_add(rq, true);
> +		return ret;
> +	}
> +
> +	/* Queue this switch after all other activity */
> +	list_for_each_entry(ring, &dev_priv->gt.active_rings, active_link) {
> +		struct i915_request *prev;
> +
> +		prev = last_request_on_engine(ring->timeline, engine);
> +		if (prev)
> +			i915_sw_fence_await_sw_fence_gfp(&rq->submit,
> +							 &prev->submit,
> +							 I915_FENCE_GFP);
> +	}
> +
> +	i915_gem_set_global_barrier(dev_priv, rq);
> +	__i915_request_add(rq, true);
> +
> +	/*
> +	 * Apply the configuration to all engine. Our hardware doesn't
> +	 * currently support different configurations for each engine.
> +	 */
> +	for_each_engine(engine, dev_priv, id) {
> +		struct intel_context *ce = to_intel_context(ctx, engine);
> +
> +		ce->sseu.value = sseu.value;
> +	}
> +
> +	return 0;
> +}
> +
>   int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>   				    struct drm_file *file)
>   {
> @@ -767,6 +864,37 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>   	case I915_CONTEXT_PARAM_PRIORITY:
>   		args->value = ctx->sched.priority;
>   		break;
> +	case I915_CONTEXT_PARAM_SSEU: {
> +		struct drm_i915_gem_context_param_sseu param_sseu;
> +		struct intel_engine_cs *engine;
> +		struct intel_context *ce;
> +
> +		if (copy_from_user(&param_sseu, u64_to_user_ptr(args->value),
> +				   sizeof(param_sseu))) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +
> +		engine = intel_engine_lookup_user(to_i915(dev),
> +						  param_sseu.class,
> +						  param_sseu.instance);
> +		if (!engine) {
> +			ret = -EINVAL;
> +			break;
> +		}
> +
> +		ce = to_intel_context(ctx, engine);
> +
> +		param_sseu.slice_mask = ce->sseu.slice_mask;
> +		param_sseu.subslice_mask = ce->sseu.subslice_mask;
> +		param_sseu.min_eus_per_subslice = ce->sseu.min_eus_per_subslice;
> +		param_sseu.max_eus_per_subslice = ce->sseu.max_eus_per_subslice;
> +
> +		if (copy_to_user(u64_to_user_ptr(args->value), &param_sseu,
> +				 sizeof(param_sseu)))
> +			ret = -EFAULT;
> +		break;

Should we think about maybe not implementing the getter? I mean, is it 
useful or just code for the driver which could be dropped?

> +	}
>   	default:
>   		ret = -EINVAL;
>   		break;
> @@ -841,7 +969,46 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>   				ctx->sched.priority = priority;
>   		}
>   		break;
> +	case I915_CONTEXT_PARAM_SSEU:
> +		{
> +			struct drm_i915_private *dev_priv = to_i915(dev);
> +			struct drm_i915_gem_context_param_sseu user_sseu;
> +			struct intel_engine_cs *engine;
> +			union intel_sseu ctx_sseu;
>   
> +			if (args->size) {
> +				ret = -EINVAL;
> +				break;
> +			}
> +
> +			if (!capable(CAP_SYS_ADMIN)) {

Hm not sure, will come back to it in the next patch.

> +				ret = -EPERM;
> +				break;
> +			}
> +
> +			if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value),
> +					   sizeof(user_sseu))) {
> +				ret = -EFAULT;
> +				break;
> +			}
> +
> +			engine = intel_engine_lookup_user(dev_priv,
> +							  user_sseu.class,
> +							  user_sseu.instance);
> +			if (!engine) {
> +				ret = -EINVAL;
> +				break;
> +			}
> +
> +			ret = intel_sseu_from_user_sseu(&INTEL_INFO(dev_priv)->sseu,
> +							&user_sseu, &ctx_sseu);
> +			if (ret)
> +				break;
> +
> +			ret = i915_gem_context_reconfigure_sseu(ctx, engine,
> +								ctx_sseu);
> +		}
> +		break;
>   	default:
>   		ret = -EINVAL;
>   		break;
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index fc499bcbd105..9f0b965125c4 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -643,6 +643,22 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>   	return NOTIFY_DONE;
>   }
>   
> +static int
> +i915_request_await_request(struct i915_request *to, struct i915_request *from);
> +
> +static int add_global_barrier(struct i915_request *rq)
> +{
> +	struct i915_request *barrier;
> +	int ret = 0;
> +
> +	barrier = i915_gem_active_raw(&rq->i915->gt.global_barrier,
> +				      &rq->i915->drm.struct_mutex);
> +	if (barrier)
> +		ret = i915_request_await_request(rq, barrier);
> +
> +	return ret;
> +}
> +
>   /**
>    * i915_request_alloc - allocate a request structure
>    *
> @@ -804,6 +820,10 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	 */
>   	rq->head = rq->ring->emit;
>   
> +	ret = add_global_barrier(rq);
> +	if (ret)
> +		goto err_unwind;

Who ever clears the barrier?

> +
>   	/* Unconditionally invalidate GPU caches and TLBs. */
>   	ret = engine->emit_flush(rq, EMIT_INVALIDATE);
>   	if (ret)
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 8fb6e66a7a84..e52c9511b5fb 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -2240,6 +2240,72 @@ static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
>   }
>   static const int gen8_emit_breadcrumb_rcs_sz = 8 + WA_TAIL_DWORDS;
>   
> +u32 gen8_make_rpcs(const struct sseu_dev_info *sseu,
> +		   union intel_sseu ctx_sseu)
> +{
> +	u32 rpcs = 0;
> +
> +	/*
> +	 * Starting in Gen9, render power gating can leave
> +	 * slice/subslice/EU in a partially enabled state. We
> +	 * must make an explicit request through RPCS for full
> +	 * enablement.
> +	 */
> +	if (sseu->has_slice_pg) {
> +		rpcs |= GEN8_RPCS_S_CNT_ENABLE;
> +		rpcs |= hweight8(ctx_sseu.slice_mask) << GEN8_RPCS_S_CNT_SHIFT;
> +		rpcs |= GEN8_RPCS_ENABLE;
> +	}
> +
> +	if (sseu->has_subslice_pg) {
> +		rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
> +		rpcs |= hweight8(ctx_sseu.subslice_mask) <<
> +			GEN8_RPCS_SS_CNT_SHIFT;
> +		rpcs |= GEN8_RPCS_ENABLE;
> +	}
> +
> +	if (sseu->has_eu_pg) {
> +		rpcs |= ctx_sseu.min_eus_per_subslice <<
> +			GEN8_RPCS_EU_MIN_SHIFT;
> +		rpcs |= ctx_sseu.max_eus_per_subslice <<
> +			GEN8_RPCS_EU_MAX_SHIFT;
> +		rpcs |= GEN8_RPCS_ENABLE;
> +	}
> +
> +	return rpcs;
> +}
> +
> +static int gen8_emit_rpcs_config(struct i915_request *rq,
> +				 struct i915_gem_context *ctx,
> +				 union intel_sseu sseu)
> +{
> +	struct drm_i915_private *dev_priv = rq->i915;
> +	struct intel_context *ce = to_intel_context(ctx, dev_priv->engine[RCS]);
> +	u64 offset;
> +	u32 *cs;
> +
> +	/* Let the deferred state allocation take care of this. */
> +	if (!ce->state)
> +		return 0;
> +
> +	cs = intel_ring_begin(rq, 4);
> +	if (IS_ERR(cs))
> +		return PTR_ERR(cs);
> +
> +	offset = ce->state->node.start +
> +		LRC_STATE_PN * PAGE_SIZE +
> +		(CTX_R_PWR_CLK_STATE + 1) * 4;
> +
> +	*cs++ = MI_STORE_DWORD_IMM_GEN4;
> +	*cs++ = lower_32_bits(offset);
> +	*cs++ = upper_32_bits(offset);
> +	*cs++ = gen8_make_rpcs(&INTEL_INFO(dev_priv)->sseu, sseu);
> +
> +	intel_ring_advance(rq, cs);
> +
> +	return 0;
> +}
> +
>   static int gen8_init_rcs_context(struct i915_request *rq)
>   {
>   	int ret;
> @@ -2333,6 +2399,8 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>   	engine->emit_breadcrumb = gen8_emit_breadcrumb;
>   	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
>   
> +	engine->emit_rpcs_config = gen8_emit_rpcs_config;
> +
>   	engine->set_default_submission = execlists_set_default_submission;
>   
>   	if (INTEL_GEN(engine->i915) < 11) {
> @@ -2481,41 +2549,6 @@ int logical_xcs_ring_init(struct intel_engine_cs *engine)
>   	return logical_ring_init(engine);
>   }
>   
> -u32 gen8_make_rpcs(const struct sseu_dev_info *sseu,
> -		   union intel_sseu ctx_sseu)
> -{
> -	u32 rpcs = 0;
> -
> -	/*
> -	 * Starting in Gen9, render power gating can leave
> -	 * slice/subslice/EU in a partially enabled state. We
> -	 * must make an explicit request through RPCS for full
> -	 * enablement.
> -	*/
> -	if (sseu->has_slice_pg) {
> -		rpcs |= GEN8_RPCS_S_CNT_ENABLE;
> -		rpcs |= hweight8(ctx_sseu.slice_mask) << GEN8_RPCS_S_CNT_SHIFT;
> -		rpcs |= GEN8_RPCS_ENABLE;
> -	}
> -
> -	if (sseu->has_subslice_pg) {
> -		rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
> -		rpcs |= hweight8(ctx_sseu.subslice_mask) <<
> -			GEN8_RPCS_SS_CNT_SHIFT;
> -		rpcs |= GEN8_RPCS_ENABLE;
> -	}
> -
> -	if (sseu->has_eu_pg) {
> -		rpcs |= ctx_sseu.min_eus_per_subslice <<
> -			GEN8_RPCS_EU_MIN_SHIFT;
> -		rpcs |= ctx_sseu.max_eus_per_subslice <<
> -			GEN8_RPCS_EU_MAX_SHIFT;
> -		rpcs |= GEN8_RPCS_ENABLE;
> -	}
> -
> -	return rpcs;
> -}
> -
>   static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
>   {
>   	u32 indirect_ctx_offset;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 001cf6bcb349..643466d4fa2d 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -2061,6 +2061,8 @@ static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv,
>   			engine->emit_breadcrumb_sz++;
>   	}
>   
> +	engine->emit_rpcs_config = NULL; /* Only supported on Gen8+ */
> +
>   	engine->set_default_submission = i9xx_set_default_submission;
>   
>   	if (INTEL_GEN(dev_priv) >= 6)
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index acef385c4c80..6bf4d3b57ced 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -456,6 +456,10 @@ struct intel_engine_cs {
>   	void		(*emit_breadcrumb)(struct i915_request *rq, u32 *cs);
>   	int		emit_breadcrumb_sz;
>   
> +	int		(*emit_rpcs_config)(struct i915_request *rq,
> +					    struct i915_gem_context *ctx,
> +					    union intel_sseu sseu);
> +
>   	/* Pass the request to the hardware queue (e.g. directly into
>   	 * the legacy ringbuffer or to the end of an execlist).
>   	 *
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 7f5634ce8e88..24b90836ce1d 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1456,9 +1456,47 @@ struct drm_i915_gem_context_param {
>   #define   I915_CONTEXT_MAX_USER_PRIORITY	1023 /* inclusive */
>   #define   I915_CONTEXT_DEFAULT_PRIORITY		0
>   #define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
> +	/*
> +	 * When using the following param, value should be a pointer to
> +	 * drm_i915_gem_context_param_sseu.
> +	 */
> +#define I915_CONTEXT_PARAM_SSEU		0x7
>   	__u64 value;
>   };
>   
> +struct drm_i915_gem_context_param_sseu {
> +	/*
> +	 * Engine class & instance to be configured or queried.
> +	 */
> +	__u32 class;
> +	__u32 instance;

Chris and I were talking about whether u32 is overkill and we should 
settle for u16:u16 for class:instance. I think 16-bit should be enough. 
But it can also be u32, I don't think there are any real downsides here 
unless we want to be consistent in all places.

> +
> +	/*
> +	 * Mask of slices to enable for the context. Valid values are a subset
> +	 * of the bitmask value returned for I915_PARAM_SLICE_MASK.
> +	 */
> +	__u8 slice_mask;
> +
> +	/*
> +	 * Mask of subslices to enable for the context. Valid values are a
> +	 * subset of the bitmask value return by I915_PARAM_SUBSLICE_MASK.
> +	 */
> +	__u8 subslice_mask;

Is this future proof enough, say for Gen11?

> +
> +	/*
> +	 * Minimum/Maximum number of EUs to enable per subslice for the
> +	 * context. min_eus_per_subslice must be inferior or equal to
> +	 * max_eus_per_subslice.
> +	 */
> +	__u8 min_eus_per_subslice;
> +	__u8 max_eus_per_subslice;
> +
> +	/*
> +	 * Unused for now. Must be cleared to zero.
> +	 */
> +	__u32 rsvd;
> +};
> +
>   enum drm_i915_oa_format {
>   	I915_OA_FORMAT_A13 = 1,	    /* HSW only */
>   	I915_OA_FORMAT_A29,	    /* HSW only */
> 

Regards,

Tvrtko
Chris Wilson May 23, 2018, 3:18 p.m. UTC | #2
Quoting Tvrtko Ursulin (2018-05-23 16:13:38)
> 
> On 22/05/2018 19:00, Lionel Landwerlin wrote:
> > @@ -804,6 +820,10 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
> >        */
> >       rq->head = rq->ring->emit;
> >   
> > +     ret = add_global_barrier(rq);
> > +     if (ret)
> > +             goto err_unwind;
> 
> Who ever clears the barrier?

Automatically cleared on request retirement (as it is hooked in with a
i915_gem_active).
-Chris
Lionel Landwerlin May 23, 2018, 5:12 p.m. UTC | #3
On 23/05/18 16:13, Tvrtko Ursulin wrote:
>
> On 22/05/2018 19:00, Lionel Landwerlin wrote:
>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>
>> We want to allow userspace to reconfigure the subslice configuration for
>> its own use case. To do so, we expose a context parameter to allow
>> adjustment of the RPCS register stored within the context image (and
>> currently not accessible via LRI). If the context is adjusted before
>> first use, the adjustment is for "free"; otherwise if the context is
>> active we flush the context off the GPU (stalling all users) and forcing
>> the GPU to save the context to memory where we can modify it and so
>> ensure that the register is reloaded on next execution.
>>
>> The overhead of managing additional EU subslices can be significant,
>> especially in multi-context workloads. Non-GPGPU contexts should
>> preferably disable the subslices it is not using, and others should
>> fine-tune the number to match their workload.
>>
>> We expose complete control over the RPCS register, allowing
>> configuration of slice/subslice, via masks packed into a u64 for
>> simplicity. For example,
>>
>>     struct drm_i915_gem_context_param arg;
>>     struct drm_i915_gem_context_param_sseu sseu = { .class = 0,
>>                                                     .instance = 0, };
>>
>>     memset(&arg, 0, sizeof(arg));
>>     arg.ctx_id = ctx;
>>     arg.param = I915_CONTEXT_PARAM_SSEU;
>>     arg.value = (uintptr_t) &sseu;
>>     if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg) == 0) {
>>         sseu.packed.subslice_mask = 0;
>>
>>         drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &arg);
>>     }
>>
>> could be used to disable all subslices where supported.
>>
>> v2: Fix offset of CTX_R_PWR_CLK_STATE in intel_lr_context_set_sseu() 
>> (Lionel)
>>
>> v3: Add ability to program this per engine (Chris)
>>
>> v4: Move most get_sseu() into i915_gem_context.c (Lionel)
>>
>> v5: Validate sseu configuration against the device's capabilities 
>> (Lionel)
>>
>> v6: Change context powergating settings through MI_SDM on kernel 
>> context (Chris)
>>
>> v7: Synchronize the requests following a powergating setting change 
>> using a global
>>      dependency (Chris)
>>      Iterate timelines through dev_priv.gt.active_rings (Tvrtko)
>>      Disable RPCS configuration setting for non capable users 
>> (Lionel/Tvrtko)
>>
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100899
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>> c: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
>> CC: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> CC: Zhipeng Gong <zhipeng.gong@intel.com>
>> CC: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h         |  13 ++
>>   drivers/gpu/drm/i915/i915_gem.c         |   2 +
>>   drivers/gpu/drm/i915/i915_gem_context.c | 167 ++++++++++++++++++++++++
>>   drivers/gpu/drm/i915/i915_request.c     |  20 +++
>>   drivers/gpu/drm/i915/intel_lrc.c        | 103 ++++++++++-----
>>   drivers/gpu/drm/i915/intel_ringbuffer.c |   2 +
>>   drivers/gpu/drm/i915/intel_ringbuffer.h |   4 +
>>   include/uapi/drm/i915_drm.h             |  38 ++++++
>>   8 files changed, 314 insertions(+), 35 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h 
>> b/drivers/gpu/drm/i915/i915_drv.h
>> index 21631b51b37b..09cfcfe1c339 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -2067,6 +2067,12 @@ struct drm_i915_private {
>>           u32 active_requests;
>>           u32 request_serial;
>>   +        /**
>> +         * Global barrier to ensuring ordering of sseu transitions
>> +         * requests.
>> +         */
>> +        struct i915_gem_active global_barrier;
>> +
>>           /**
>>            * Is the GPU currently considered idle, or busy executing
>>            * userspace requests? Whilst idle, we allow runtime power
>> @@ -3227,6 +3233,13 @@ i915_vm_to_ppgtt(struct i915_address_space *vm)
>>       return container_of(vm, struct i915_hw_ppgtt, base);
>>   }
>>   +static inline void i915_gem_set_global_barrier(struct 
>> drm_i915_private *i915,
>> +                           struct i915_request *rq)
>> +{
>> +    lockdep_assert_held(&i915->drm.struct_mutex);
>> +    i915_gem_active_set(&i915->gt.global_barrier, rq);
>> +}
>> +
>>   /* i915_gem_fence_reg.c */
>>   struct drm_i915_fence_reg *
>>   i915_reserve_fence(struct drm_i915_private *dev_priv);
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c 
>> b/drivers/gpu/drm/i915/i915_gem.c
>> index 03874b50ada9..9c2a0d04bd39 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -5548,6 +5548,8 @@ int i915_gem_init_early(struct drm_i915_private 
>> *dev_priv)
>>       if (!dev_priv->priorities)
>>           goto err_dependencies;
>>   +    init_request_active(&dev_priv->gt.global_barrier, NULL);
>> +
>>       INIT_LIST_HEAD(&dev_priv->gt.timelines);
>>       INIT_LIST_HEAD(&dev_priv->gt.active_rings);
>>       INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
>> b/drivers/gpu/drm/i915/i915_gem_context.c
>> index ea9ae1046827..5c5a12f1c265 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> @@ -730,6 +730,103 @@ int i915_gem_context_destroy_ioctl(struct 
>> drm_device *dev, void *data,
>>       return 0;
>>   }
>>   +static int
>> +intel_sseu_from_user_sseu(const struct sseu_dev_info *sseu,
>> +              const struct drm_i915_gem_context_param_sseu *user_sseu,
>> +              union intel_sseu *ctx_sseu)
>> +{
>> +    if ((user_sseu->slice_mask & ~sseu->slice_mask) != 0 ||
>> +        user_sseu->slice_mask == 0)
>> +        return -EINVAL;
>> +
>> +    if ((user_sseu->subslice_mask & ~sseu->subslice_mask[0]) != 0 ||
>> +        user_sseu->subslice_mask == 0)
>> +        return -EINVAL;
>> +
>> +    if (user_sseu->min_eus_per_subslice > sseu->max_eus_per_subslice)
>> +        return -EINVAL;
>> +
>> +    if (user_sseu->max_eus_per_subslice > sseu->max_eus_per_subslice ||
>> +        user_sseu->max_eus_per_subslice < 
>> user_sseu->min_eus_per_subslice ||
>> +        user_sseu->max_eus_per_subslice == 0)
>> +        return -EINVAL;
>> +
>> +    ctx_sseu->slice_mask = user_sseu->slice_mask;
>> +    ctx_sseu->subslice_mask = user_sseu->subslice_mask;
>> +    ctx_sseu->min_eus_per_subslice = user_sseu->min_eus_per_subslice;
>> +    ctx_sseu->max_eus_per_subslice = user_sseu->max_eus_per_subslice;
>> +
>> +    return 0;
>> +}
>> +
>> +static int
>> +i915_gem_context_reconfigure_sseu(struct i915_gem_context *ctx,
>> +                  struct intel_engine_cs *engine,
>> +                  union intel_sseu sseu)
>> +{
>> +    struct drm_i915_private *dev_priv = ctx->i915;
>> +    struct i915_request *rq;
>> +    struct intel_ring *ring;
>> +    enum intel_engine_id id;
>> +    int ret;
>> +
>> +    /*
>> +     * First notify user when this capability is not available so 
>> that it
>> +     * can be detected with any valid input.
>> +     */
>> +    if (!engine->emit_rpcs_config)
>> +        return -ENODEV;
>> +
>> +    if (memcmp(&to_intel_context(ctx, engine)->sseu,
>> +           &sseu, sizeof(sseu)) == 0) {
>> +        return 0;
>
> But union is still there so I don't know.

Moving to struct.

>
>> +    }
>> +
>> +    lockdep_assert_held(&dev_priv->drm.struct_mutex);
>
> Best to move above return -ENODEV line.

Done.

>
>> +
>> +    i915_retire_requests(dev_priv);
>> +
>> +    /* Now use the RCS to actually reconfigure. */
>> +    engine = dev_priv->engine[RCS];
>> +
>> +    rq = i915_request_alloc(engine, dev_priv->kernel_context);
>> +    if (IS_ERR(rq))
>> +        return PTR_ERR(rq);
>> +
>> +    ret = engine->emit_rpcs_config(rq, ctx,
>> +                       intel_engine_prepare_sseu(engine, sseu));
>> +    if (ret) {
>> +        __i915_request_add(rq, true);
>> +        return ret;
>> +    }
>> +
>> +    /* Queue this switch after all other activity */
>> +    list_for_each_entry(ring, &dev_priv->gt.active_rings, 
>> active_link) {
>> +        struct i915_request *prev;
>> +
>> +        prev = last_request_on_engine(ring->timeline, engine);
>> +        if (prev)
>> + i915_sw_fence_await_sw_fence_gfp(&rq->submit,
>> +                             &prev->submit,
>> +                             I915_FENCE_GFP);
>> +    }
>> +
>> +    i915_gem_set_global_barrier(dev_priv, rq);
>> +    __i915_request_add(rq, true);
>> +
>> +    /*
>> +     * Apply the configuration to all engine. Our hardware doesn't
>> +     * currently support different configurations for each engine.
>> +     */
>> +    for_each_engine(engine, dev_priv, id) {
>> +        struct intel_context *ce = to_intel_context(ctx, engine);
>> +
>> +        ce->sseu.value = sseu.value;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>   int i915_gem_context_getparam_ioctl(struct drm_device *dev, void 
>> *data,
>>                       struct drm_file *file)
>>   {
>> @@ -767,6 +864,37 @@ int i915_gem_context_getparam_ioctl(struct 
>> drm_device *dev, void *data,
>>       case I915_CONTEXT_PARAM_PRIORITY:
>>           args->value = ctx->sched.priority;
>>           break;
>> +    case I915_CONTEXT_PARAM_SSEU: {
>> +        struct drm_i915_gem_context_param_sseu param_sseu;
>> +        struct intel_engine_cs *engine;
>> +        struct intel_context *ce;
>> +
>> +        if (copy_from_user(&param_sseu, u64_to_user_ptr(args->value),
>> +                   sizeof(param_sseu))) {
>> +            ret = -EFAULT;
>> +            break;
>> +        }
>> +
>> +        engine = intel_engine_lookup_user(to_i915(dev),
>> +                          param_sseu.class,
>> +                          param_sseu.instance);
>> +        if (!engine) {
>> +            ret = -EINVAL;
>> +            break;
>> +        }
>> +
>> +        ce = to_intel_context(ctx, engine);
>> +
>> +        param_sseu.slice_mask = ce->sseu.slice_mask;
>> +        param_sseu.subslice_mask = ce->sseu.subslice_mask;
>> +        param_sseu.min_eus_per_subslice = 
>> ce->sseu.min_eus_per_subslice;
>> +        param_sseu.max_eus_per_subslice = 
>> ce->sseu.max_eus_per_subslice;
>> +
>> +        if (copy_to_user(u64_to_user_ptr(args->value), &param_sseu,
>> +                 sizeof(param_sseu)))
>> +            ret = -EFAULT;
>> +        break;
>
> Should we think about maybe not implementing the getter? I mean, is it 
> useful or just code for the driver which could be dropped?

Well, render fd can be transfer between processes, so I think it makes 
sense to have a way to tell what is the current configuration.

>
>> +    }
>>       default:
>>           ret = -EINVAL;
>>           break;
>> @@ -841,7 +969,46 @@ int i915_gem_context_setparam_ioctl(struct 
>> drm_device *dev, void *data,
>>                   ctx->sched.priority = priority;
>>           }
>>           break;
>> +    case I915_CONTEXT_PARAM_SSEU:
>> +        {
>> +            struct drm_i915_private *dev_priv = to_i915(dev);
>> +            struct drm_i915_gem_context_param_sseu user_sseu;
>> +            struct intel_engine_cs *engine;
>> +            union intel_sseu ctx_sseu;
>>   +            if (args->size) {
>> +                ret = -EINVAL;
>> +                break;
>> +            }
>> +
>> +            if (!capable(CAP_SYS_ADMIN)) {
>
> Hm not sure, will come back to it in the next patch.
>
>> +                ret = -EPERM;
>> +                break;
>> +            }
>> +
>> +            if (copy_from_user(&user_sseu, 
>> u64_to_user_ptr(args->value),
>> +                       sizeof(user_sseu))) {
>> +                ret = -EFAULT;
>> +                break;
>> +            }
>> +
>> +            engine = intel_engine_lookup_user(dev_priv,
>> +                              user_sseu.class,
>> +                              user_sseu.instance);
>> +            if (!engine) {
>> +                ret = -EINVAL;
>> +                break;
>> +            }
>> +
>> +            ret = 
>> intel_sseu_from_user_sseu(&INTEL_INFO(dev_priv)->sseu,
>> +                            &user_sseu, &ctx_sseu);
>> +            if (ret)
>> +                break;
>> +
>> +            ret = i915_gem_context_reconfigure_sseu(ctx, engine,
>> +                                ctx_sseu);
>> +        }
>> +        break;
>>       default:
>>           ret = -EINVAL;
>>           break;
>> diff --git a/drivers/gpu/drm/i915/i915_request.c 
>> b/drivers/gpu/drm/i915/i915_request.c
>> index fc499bcbd105..9f0b965125c4 100644
>> --- a/drivers/gpu/drm/i915/i915_request.c
>> +++ b/drivers/gpu/drm/i915/i915_request.c
>> @@ -643,6 +643,22 @@ submit_notify(struct i915_sw_fence *fence, enum 
>> i915_sw_fence_notify state)
>>       return NOTIFY_DONE;
>>   }
>>   +static int
>> +i915_request_await_request(struct i915_request *to, struct 
>> i915_request *from);
>> +
>> +static int add_global_barrier(struct i915_request *rq)
>> +{
>> +    struct i915_request *barrier;
>> +    int ret = 0;
>> +
>> +    barrier = i915_gem_active_raw(&rq->i915->gt.global_barrier,
>> +                      &rq->i915->drm.struct_mutex);
>> +    if (barrier)
>> +        ret = i915_request_await_request(rq, barrier);
>> +
>> +    return ret;
>> +}
>> +
>>   /**
>>    * i915_request_alloc - allocate a request structure
>>    *
>> @@ -804,6 +820,10 @@ i915_request_alloc(struct intel_engine_cs 
>> *engine, struct i915_gem_context *ctx)
>>        */
>>       rq->head = rq->ring->emit;
>>   +    ret = add_global_barrier(rq);
>> +    if (ret)
>> +        goto err_unwind;
>
> Who ever clears the barrier?

It's cleared when the request is retired.

>
>> +
>>       /* Unconditionally invalidate GPU caches and TLBs. */
>>       ret = engine->emit_flush(rq, EMIT_INVALIDATE);
>>       if (ret)
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c 
>> b/drivers/gpu/drm/i915/intel_lrc.c
>> index 8fb6e66a7a84..e52c9511b5fb 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -2240,6 +2240,72 @@ static void gen8_emit_breadcrumb_rcs(struct 
>> i915_request *request, u32 *cs)
>>   }
>>   static const int gen8_emit_breadcrumb_rcs_sz = 8 + WA_TAIL_DWORDS;
>>   +u32 gen8_make_rpcs(const struct sseu_dev_info *sseu,
>> +           union intel_sseu ctx_sseu)
>> +{
>> +    u32 rpcs = 0;
>> +
>> +    /*
>> +     * Starting in Gen9, render power gating can leave
>> +     * slice/subslice/EU in a partially enabled state. We
>> +     * must make an explicit request through RPCS for full
>> +     * enablement.
>> +     */
>> +    if (sseu->has_slice_pg) {
>> +        rpcs |= GEN8_RPCS_S_CNT_ENABLE;
>> +        rpcs |= hweight8(ctx_sseu.slice_mask) << GEN8_RPCS_S_CNT_SHIFT;
>> +        rpcs |= GEN8_RPCS_ENABLE;
>> +    }
>> +
>> +    if (sseu->has_subslice_pg) {
>> +        rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
>> +        rpcs |= hweight8(ctx_sseu.subslice_mask) <<
>> +            GEN8_RPCS_SS_CNT_SHIFT;
>> +        rpcs |= GEN8_RPCS_ENABLE;
>> +    }
>> +
>> +    if (sseu->has_eu_pg) {
>> +        rpcs |= ctx_sseu.min_eus_per_subslice <<
>> +            GEN8_RPCS_EU_MIN_SHIFT;
>> +        rpcs |= ctx_sseu.max_eus_per_subslice <<
>> +            GEN8_RPCS_EU_MAX_SHIFT;
>> +        rpcs |= GEN8_RPCS_ENABLE;
>> +    }
>> +
>> +    return rpcs;
>> +}
>> +
>> +static int gen8_emit_rpcs_config(struct i915_request *rq,
>> +                 struct i915_gem_context *ctx,
>> +                 union intel_sseu sseu)
>> +{
>> +    struct drm_i915_private *dev_priv = rq->i915;
>> +    struct intel_context *ce = to_intel_context(ctx, 
>> dev_priv->engine[RCS]);
>> +    u64 offset;
>> +    u32 *cs;
>> +
>> +    /* Let the deferred state allocation take care of this. */
>> +    if (!ce->state)
>> +        return 0;
>> +
>> +    cs = intel_ring_begin(rq, 4);
>> +    if (IS_ERR(cs))
>> +        return PTR_ERR(cs);
>> +
>> +    offset = ce->state->node.start +
>> +        LRC_STATE_PN * PAGE_SIZE +
>> +        (CTX_R_PWR_CLK_STATE + 1) * 4;
>> +
>> +    *cs++ = MI_STORE_DWORD_IMM_GEN4;
>> +    *cs++ = lower_32_bits(offset);
>> +    *cs++ = upper_32_bits(offset);
>> +    *cs++ = gen8_make_rpcs(&INTEL_INFO(dev_priv)->sseu, sseu);
>> +
>> +    intel_ring_advance(rq, cs);
>> +
>> +    return 0;
>> +}
>> +
>>   static int gen8_init_rcs_context(struct i915_request *rq)
>>   {
>>       int ret;
>> @@ -2333,6 +2399,8 @@ logical_ring_default_vfuncs(struct 
>> intel_engine_cs *engine)
>>       engine->emit_breadcrumb = gen8_emit_breadcrumb;
>>       engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
>>   +    engine->emit_rpcs_config = gen8_emit_rpcs_config;
>> +
>>       engine->set_default_submission = execlists_set_default_submission;
>>         if (INTEL_GEN(engine->i915) < 11) {
>> @@ -2481,41 +2549,6 @@ int logical_xcs_ring_init(struct 
>> intel_engine_cs *engine)
>>       return logical_ring_init(engine);
>>   }
>>   -u32 gen8_make_rpcs(const struct sseu_dev_info *sseu,
>> -           union intel_sseu ctx_sseu)
>> -{
>> -    u32 rpcs = 0;
>> -
>> -    /*
>> -     * Starting in Gen9, render power gating can leave
>> -     * slice/subslice/EU in a partially enabled state. We
>> -     * must make an explicit request through RPCS for full
>> -     * enablement.
>> -    */
>> -    if (sseu->has_slice_pg) {
>> -        rpcs |= GEN8_RPCS_S_CNT_ENABLE;
>> -        rpcs |= hweight8(ctx_sseu.slice_mask) << GEN8_RPCS_S_CNT_SHIFT;
>> -        rpcs |= GEN8_RPCS_ENABLE;
>> -    }
>> -
>> -    if (sseu->has_subslice_pg) {
>> -        rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
>> -        rpcs |= hweight8(ctx_sseu.subslice_mask) <<
>> -            GEN8_RPCS_SS_CNT_SHIFT;
>> -        rpcs |= GEN8_RPCS_ENABLE;
>> -    }
>> -
>> -    if (sseu->has_eu_pg) {
>> -        rpcs |= ctx_sseu.min_eus_per_subslice <<
>> -            GEN8_RPCS_EU_MIN_SHIFT;
>> -        rpcs |= ctx_sseu.max_eus_per_subslice <<
>> -            GEN8_RPCS_EU_MAX_SHIFT;
>> -        rpcs |= GEN8_RPCS_ENABLE;
>> -    }
>> -
>> -    return rpcs;
>> -}
>> -
>>   static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs 
>> *engine)
>>   {
>>       u32 indirect_ctx_offset;
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c 
>> b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> index 001cf6bcb349..643466d4fa2d 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> @@ -2061,6 +2061,8 @@ static void intel_ring_default_vfuncs(struct 
>> drm_i915_private *dev_priv,
>>               engine->emit_breadcrumb_sz++;
>>       }
>>   +    engine->emit_rpcs_config = NULL; /* Only supported on Gen8+ */
>> +
>>       engine->set_default_submission = i9xx_set_default_submission;
>>         if (INTEL_GEN(dev_priv) >= 6)
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h 
>> b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> index acef385c4c80..6bf4d3b57ced 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> @@ -456,6 +456,10 @@ struct intel_engine_cs {
>>       void        (*emit_breadcrumb)(struct i915_request *rq, u32 *cs);
>>       int        emit_breadcrumb_sz;
>>   +    int        (*emit_rpcs_config)(struct i915_request *rq,
>> +                        struct i915_gem_context *ctx,
>> +                        union intel_sseu sseu);
>> +
>>       /* Pass the request to the hardware queue (e.g. directly into
>>        * the legacy ringbuffer or to the end of an execlist).
>>        *
>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>> index 7f5634ce8e88..24b90836ce1d 100644
>> --- a/include/uapi/drm/i915_drm.h
>> +++ b/include/uapi/drm/i915_drm.h
>> @@ -1456,9 +1456,47 @@ struct drm_i915_gem_context_param {
>>   #define   I915_CONTEXT_MAX_USER_PRIORITY    1023 /* inclusive */
>>   #define   I915_CONTEXT_DEFAULT_PRIORITY        0
>>   #define   I915_CONTEXT_MIN_USER_PRIORITY    -1023 /* inclusive */
>> +    /*
>> +     * When using the following param, value should be a pointer to
>> +     * drm_i915_gem_context_param_sseu.
>> +     */
>> +#define I915_CONTEXT_PARAM_SSEU        0x7
>>       __u64 value;
>>   };
>>   +struct drm_i915_gem_context_param_sseu {
>> +    /*
>> +     * Engine class & instance to be configured or queried.
>> +     */
>> +    __u32 class;
>> +    __u32 instance;
>
> Chris and I were talking about whether u32 is overkill and we should 
> settle for u16:u16 for class:instance. I think 16-bit should be 
> enough. But it can also be u32, I don't think there are any real 
> downsides here unless we want to be consistent in all places.

Let me know what you think is best.

>
>> +
>> +    /*
>> +     * Mask of slices to enable for the context. Valid values are a 
>> subset
>> +     * of the bitmask value returned for I915_PARAM_SLICE_MASK.
>> +     */
>> +    __u8 slice_mask;
>> +
>> +    /*
>> +     * Mask of subslices to enable for the context. Valid values are a
>> +     * subset of the bitmask value return by I915_PARAM_SUBSLICE_MASK.
>> +     */
>> +    __u8 subslice_mask;
>
> Is this future proof enough, say for Gen11?

As far as I can see, this fits.
No objection to bump it to 16/32bits if you'd like.

>
>> +
>> +    /*
>> +     * Minimum/Maximum number of EUs to enable per subslice for the
>> +     * context. min_eus_per_subslice must be inferior or equal to
>> +     * max_eus_per_subslice.
>> +     */
>> +    __u8 min_eus_per_subslice;
>> +    __u8 max_eus_per_subslice;
>> +
>> +    /*
>> +     * Unused for now. Must be cleared to zero.
>> +     */
>> +    __u32 rsvd;
>> +};
>> +
>>   enum drm_i915_oa_format {
>>       I915_OA_FORMAT_A13 = 1,        /* HSW only */
>>       I915_OA_FORMAT_A29,        /* HSW only */
>>
>
> Regards,
>
> Tvrtko
>
Tvrtko Ursulin May 24, 2018, 10:43 a.m. UTC | #4
On 23/05/2018 18:12, Lionel Landwerlin wrote:
> On 23/05/18 16:13, Tvrtko Ursulin wrote:
>>
>> On 22/05/2018 19:00, Lionel Landwerlin wrote:
>>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>>
>>> We want to allow userspace to reconfigure the subslice configuration for
>>> its own use case. To do so, we expose a context parameter to allow
>>> adjustment of the RPCS register stored within the context image (and
>>> currently not accessible via LRI). If the context is adjusted before
>>> first use, the adjustment is for "free"; otherwise if the context is
>>> active we flush the context off the GPU (stalling all users) and forcing
>>> the GPU to save the context to memory where we can modify it and so
>>> ensure that the register is reloaded on next execution.
>>>
>>> The overhead of managing additional EU subslices can be significant,
>>> especially in multi-context workloads. Non-GPGPU contexts should
>>> preferably disable the subslices it is not using, and others should
>>> fine-tune the number to match their workload.
>>>
>>> We expose complete control over the RPCS register, allowing
>>> configuration of slice/subslice, via masks packed into a u64 for
>>> simplicity. For example,
>>>
>>>     struct drm_i915_gem_context_param arg;
>>>     struct drm_i915_gem_context_param_sseu sseu = { .class = 0,
>>>                                                     .instance = 0, };
>>>
>>>     memset(&arg, 0, sizeof(arg));
>>>     arg.ctx_id = ctx;
>>>     arg.param = I915_CONTEXT_PARAM_SSEU;
>>>     arg.value = (uintptr_t) &sseu;
>>>     if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg) == 0) {
>>>         sseu.packed.subslice_mask = 0;
>>>
>>>         drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &arg);
>>>     }
>>>
>>> could be used to disable all subslices where supported.
>>>
>>> v2: Fix offset of CTX_R_PWR_CLK_STATE in intel_lr_context_set_sseu() 
>>> (Lionel)
>>>
>>> v3: Add ability to program this per engine (Chris)
>>>
>>> v4: Move most get_sseu() into i915_gem_context.c (Lionel)
>>>
>>> v5: Validate sseu configuration against the device's capabilities 
>>> (Lionel)
>>>
>>> v6: Change context powergating settings through MI_SDM on kernel 
>>> context (Chris)
>>>
>>> v7: Synchronize the requests following a powergating setting change 
>>> using a global
>>>      dependency (Chris)
>>>      Iterate timelines through dev_priv.gt.active_rings (Tvrtko)
>>>      Disable RPCS configuration setting for non capable users 
>>> (Lionel/Tvrtko)
>>>
>>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100899
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>>> c: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
>>> CC: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> CC: Zhipeng Gong <zhipeng.gong@intel.com>
>>> CC: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/i915_drv.h         |  13 ++
>>>   drivers/gpu/drm/i915/i915_gem.c         |   2 +
>>>   drivers/gpu/drm/i915/i915_gem_context.c | 167 ++++++++++++++++++++++++
>>>   drivers/gpu/drm/i915/i915_request.c     |  20 +++
>>>   drivers/gpu/drm/i915/intel_lrc.c        | 103 ++++++++++-----
>>>   drivers/gpu/drm/i915/intel_ringbuffer.c |   2 +
>>>   drivers/gpu/drm/i915/intel_ringbuffer.h |   4 +
>>>   include/uapi/drm/i915_drm.h             |  38 ++++++
>>>   8 files changed, 314 insertions(+), 35 deletions(-)

[snip]

>>>   int i915_gem_context_getparam_ioctl(struct drm_device *dev, void 
>>> *data,
>>>                       struct drm_file *file)
>>>   {
>>> @@ -767,6 +864,37 @@ int i915_gem_context_getparam_ioctl(struct 
>>> drm_device *dev, void *data,
>>>       case I915_CONTEXT_PARAM_PRIORITY:
>>>           args->value = ctx->sched.priority;
>>>           break;
>>> +    case I915_CONTEXT_PARAM_SSEU: {
>>> +        struct drm_i915_gem_context_param_sseu param_sseu;
>>> +        struct intel_engine_cs *engine;
>>> +        struct intel_context *ce;
>>> +
>>> +        if (copy_from_user(&param_sseu, u64_to_user_ptr(args->value),
>>> +                   sizeof(param_sseu))) {
>>> +            ret = -EFAULT;
>>> +            break;
>>> +        }
>>> +
>>> +        engine = intel_engine_lookup_user(to_i915(dev),
>>> +                          param_sseu.class,
>>> +                          param_sseu.instance);
>>> +        if (!engine) {
>>> +            ret = -EINVAL;
>>> +            break;
>>> +        }
>>> +
>>> +        ce = to_intel_context(ctx, engine);
>>> +
>>> +        param_sseu.slice_mask = ce->sseu.slice_mask;
>>> +        param_sseu.subslice_mask = ce->sseu.subslice_mask;
>>> +        param_sseu.min_eus_per_subslice = 
>>> ce->sseu.min_eus_per_subslice;
>>> +        param_sseu.max_eus_per_subslice = 
>>> ce->sseu.max_eus_per_subslice;
>>> +
>>> +        if (copy_to_user(u64_to_user_ptr(args->value), &param_sseu,
>>> +                 sizeof(param_sseu)))
>>> +            ret = -EFAULT;
>>> +        break;
>>
>> Should we think about maybe not implementing the getter? I mean, is it 
>> useful or just code for the driver which could be dropped?
> 
> Well, render fd can be transfer between processes, so I think it makes 
> sense to have a way to tell what is the current configuration.

I was thinking that userspace can already get the default configuration 
via existing get params / topology query.

But if you change the ctx sseu config and pass that fd out its a bit 
evil. :)

Anyway, no strong feelings to keep it. Was just thinking whether we can 
save ourselves adding some code.

[snip]

>>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>>> index 7f5634ce8e88..24b90836ce1d 100644
>>> --- a/include/uapi/drm/i915_drm.h
>>> +++ b/include/uapi/drm/i915_drm.h
>>> @@ -1456,9 +1456,47 @@ struct drm_i915_gem_context_param {
>>>   #define   I915_CONTEXT_MAX_USER_PRIORITY    1023 /* inclusive */
>>>   #define   I915_CONTEXT_DEFAULT_PRIORITY        0
>>>   #define   I915_CONTEXT_MIN_USER_PRIORITY    -1023 /* inclusive */
>>> +    /*
>>> +     * When using the following param, value should be a pointer to
>>> +     * drm_i915_gem_context_param_sseu.
>>> +     */
>>> +#define I915_CONTEXT_PARAM_SSEU        0x7
>>>       __u64 value;
>>>   };
>>>   +struct drm_i915_gem_context_param_sseu {
>>> +    /*
>>> +     * Engine class & instance to be configured or queried.
>>> +     */
>>> +    __u32 class;
>>> +    __u32 instance;
>>
>> Chris and I were talking about whether u32 is overkill and we should 
>> settle for u16:u16 for class:instance. I think 16-bit should be 
>> enough. But it can also be u32, I don't think there are any real 
>> downsides here unless we want to be consistent in all places.
> 
> Let me know what you think is best.

I'd say u16:u16, Chris?

> 
>>
>>> +
>>> +    /*
>>> +     * Mask of slices to enable for the context. Valid values are a 
>>> subset
>>> +     * of the bitmask value returned for I915_PARAM_SLICE_MASK.
>>> +     */
>>> +    __u8 slice_mask;
>>> +
>>> +    /*
>>> +     * Mask of subslices to enable for the context. Valid values are a
>>> +     * subset of the bitmask value return by I915_PARAM_SUBSLICE_MASK.
>>> +     */
>>> +    __u8 subslice_mask;
>>
>> Is this future proof enough, say for Gen11?
> 
> As far as I can see, this fits.
> No objection to bump it to 16/32bits if you'd like.

Feel like I've asked you this before, sorry - nothing in the future will 
need per slice subslice mask?

Regards,

Tvrtko
Lionel Landwerlin May 24, 2018, 11:01 a.m. UTC | #5
On 24/05/18 11:43, Tvrtko Ursulin wrote:
>
>>
>>>
>>>> +
>>>> +    /*
>>>> +     * Mask of slices to enable for the context. Valid values are 
>>>> a subset
>>>> +     * of the bitmask value returned for I915_PARAM_SLICE_MASK.
>>>> +     */
>>>> +    __u8 slice_mask;
>>>> +
>>>> +    /*
>>>> +     * Mask of subslices to enable for the context. Valid values 
>>>> are a
>>>> +     * subset of the bitmask value return by 
>>>> I915_PARAM_SUBSLICE_MASK.
>>>> +     */
>>>> +    __u8 subslice_mask;
>>>
>>> Is this future proof enough, say for Gen11?
>>
>> As far as I can see, this fits.
>> No objection to bump it to 16/32bits if you'd like.
>
> Feel like I've asked you this before, sorry - nothing in the future 
> will need per slice subslice mask?

As far as I can see this remains the same uniform subslice per slice 
programming style.
We could play it safe and put all the masks in 64bits in the uAPI.

What do you think?

>
> Regards,
>
> Tvrtko
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 21631b51b37b..09cfcfe1c339 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2067,6 +2067,12 @@  struct drm_i915_private {
 		u32 active_requests;
 		u32 request_serial;
 
+		/**
+		 * Global barrier to ensuring ordering of sseu transitions
+		 * requests.
+		 */
+		struct i915_gem_active global_barrier;
+
 		/**
 		 * Is the GPU currently considered idle, or busy executing
 		 * userspace requests? Whilst idle, we allow runtime power
@@ -3227,6 +3233,13 @@  i915_vm_to_ppgtt(struct i915_address_space *vm)
 	return container_of(vm, struct i915_hw_ppgtt, base);
 }
 
+static inline void i915_gem_set_global_barrier(struct drm_i915_private *i915,
+					       struct i915_request *rq)
+{
+	lockdep_assert_held(&i915->drm.struct_mutex);
+	i915_gem_active_set(&i915->gt.global_barrier, rq);
+}
+
 /* i915_gem_fence_reg.c */
 struct drm_i915_fence_reg *
 i915_reserve_fence(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 03874b50ada9..9c2a0d04bd39 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -5548,6 +5548,8 @@  int i915_gem_init_early(struct drm_i915_private *dev_priv)
 	if (!dev_priv->priorities)
 		goto err_dependencies;
 
+	init_request_active(&dev_priv->gt.global_barrier, NULL);
+
 	INIT_LIST_HEAD(&dev_priv->gt.timelines);
 	INIT_LIST_HEAD(&dev_priv->gt.active_rings);
 	INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index ea9ae1046827..5c5a12f1c265 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -730,6 +730,103 @@  int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
 	return 0;
 }
 
+static int
+intel_sseu_from_user_sseu(const struct sseu_dev_info *sseu,
+			  const struct drm_i915_gem_context_param_sseu *user_sseu,
+			  union intel_sseu *ctx_sseu)
+{
+	if ((user_sseu->slice_mask & ~sseu->slice_mask) != 0 ||
+	    user_sseu->slice_mask == 0)
+		return -EINVAL;
+
+	if ((user_sseu->subslice_mask & ~sseu->subslice_mask[0]) != 0 ||
+	    user_sseu->subslice_mask == 0)
+		return -EINVAL;
+
+	if (user_sseu->min_eus_per_subslice > sseu->max_eus_per_subslice)
+		return -EINVAL;
+
+	if (user_sseu->max_eus_per_subslice > sseu->max_eus_per_subslice ||
+	    user_sseu->max_eus_per_subslice < user_sseu->min_eus_per_subslice ||
+	    user_sseu->max_eus_per_subslice == 0)
+		return -EINVAL;
+
+	ctx_sseu->slice_mask = user_sseu->slice_mask;
+	ctx_sseu->subslice_mask = user_sseu->subslice_mask;
+	ctx_sseu->min_eus_per_subslice = user_sseu->min_eus_per_subslice;
+	ctx_sseu->max_eus_per_subslice = user_sseu->max_eus_per_subslice;
+
+	return 0;
+}
+
+static int
+i915_gem_context_reconfigure_sseu(struct i915_gem_context *ctx,
+				  struct intel_engine_cs *engine,
+				  union intel_sseu sseu)
+{
+	struct drm_i915_private *dev_priv = ctx->i915;
+	struct i915_request *rq;
+	struct intel_ring *ring;
+	enum intel_engine_id id;
+	int ret;
+
+	/*
+	 * First notify user when this capability is not available so that it
+	 * can be detected with any valid input.
+	 */
+	if (!engine->emit_rpcs_config)
+		return -ENODEV;
+
+	if (memcmp(&to_intel_context(ctx, engine)->sseu,
+		   &sseu, sizeof(sseu)) == 0) {
+		return 0;
+	}
+
+	lockdep_assert_held(&dev_priv->drm.struct_mutex);
+
+	i915_retire_requests(dev_priv);
+
+	/* Now use the RCS to actually reconfigure. */
+	engine = dev_priv->engine[RCS];
+
+	rq = i915_request_alloc(engine, dev_priv->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	ret = engine->emit_rpcs_config(rq, ctx,
+				       intel_engine_prepare_sseu(engine, sseu));
+	if (ret) {
+		__i915_request_add(rq, true);
+		return ret;
+	}
+
+	/* Queue this switch after all other activity */
+	list_for_each_entry(ring, &dev_priv->gt.active_rings, active_link) {
+		struct i915_request *prev;
+
+		prev = last_request_on_engine(ring->timeline, engine);
+		if (prev)
+			i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+							 &prev->submit,
+							 I915_FENCE_GFP);
+	}
+
+	i915_gem_set_global_barrier(dev_priv, rq);
+	__i915_request_add(rq, true);
+
+	/*
+	 * Apply the configuration to all engine. Our hardware doesn't
+	 * currently support different configurations for each engine.
+	 */
+	for_each_engine(engine, dev_priv, id) {
+		struct intel_context *ce = to_intel_context(ctx, engine);
+
+		ce->sseu.value = sseu.value;
+	}
+
+	return 0;
+}
+
 int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 				    struct drm_file *file)
 {
@@ -767,6 +864,37 @@  int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_CONTEXT_PARAM_PRIORITY:
 		args->value = ctx->sched.priority;
 		break;
+	case I915_CONTEXT_PARAM_SSEU: {
+		struct drm_i915_gem_context_param_sseu param_sseu;
+		struct intel_engine_cs *engine;
+		struct intel_context *ce;
+
+		if (copy_from_user(&param_sseu, u64_to_user_ptr(args->value),
+				   sizeof(param_sseu))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		engine = intel_engine_lookup_user(to_i915(dev),
+						  param_sseu.class,
+						  param_sseu.instance);
+		if (!engine) {
+			ret = -EINVAL;
+			break;
+		}
+
+		ce = to_intel_context(ctx, engine);
+
+		param_sseu.slice_mask = ce->sseu.slice_mask;
+		param_sseu.subslice_mask = ce->sseu.subslice_mask;
+		param_sseu.min_eus_per_subslice = ce->sseu.min_eus_per_subslice;
+		param_sseu.max_eus_per_subslice = ce->sseu.max_eus_per_subslice;
+
+		if (copy_to_user(u64_to_user_ptr(args->value), &param_sseu,
+				 sizeof(param_sseu)))
+			ret = -EFAULT;
+		break;
+	}
 	default:
 		ret = -EINVAL;
 		break;
@@ -841,7 +969,46 @@  int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 				ctx->sched.priority = priority;
 		}
 		break;
+	case I915_CONTEXT_PARAM_SSEU:
+		{
+			struct drm_i915_private *dev_priv = to_i915(dev);
+			struct drm_i915_gem_context_param_sseu user_sseu;
+			struct intel_engine_cs *engine;
+			union intel_sseu ctx_sseu;
 
+			if (args->size) {
+				ret = -EINVAL;
+				break;
+			}
+
+			if (!capable(CAP_SYS_ADMIN)) {
+				ret = -EPERM;
+				break;
+			}
+
+			if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value),
+					   sizeof(user_sseu))) {
+				ret = -EFAULT;
+				break;
+			}
+
+			engine = intel_engine_lookup_user(dev_priv,
+							  user_sseu.class,
+							  user_sseu.instance);
+			if (!engine) {
+				ret = -EINVAL;
+				break;
+			}
+
+			ret = intel_sseu_from_user_sseu(&INTEL_INFO(dev_priv)->sseu,
+							&user_sseu, &ctx_sseu);
+			if (ret)
+				break;
+
+			ret = i915_gem_context_reconfigure_sseu(ctx, engine,
+								ctx_sseu);
+		}
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index fc499bcbd105..9f0b965125c4 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -643,6 +643,22 @@  submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 	return NOTIFY_DONE;
 }
 
+static int
+i915_request_await_request(struct i915_request *to, struct i915_request *from);
+
+static int add_global_barrier(struct i915_request *rq)
+{
+	struct i915_request *barrier;
+	int ret = 0;
+
+	barrier = i915_gem_active_raw(&rq->i915->gt.global_barrier,
+				      &rq->i915->drm.struct_mutex);
+	if (barrier)
+		ret = i915_request_await_request(rq, barrier);
+
+	return ret;
+}
+
 /**
  * i915_request_alloc - allocate a request structure
  *
@@ -804,6 +820,10 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	 */
 	rq->head = rq->ring->emit;
 
+	ret = add_global_barrier(rq);
+	if (ret)
+		goto err_unwind;
+
 	/* Unconditionally invalidate GPU caches and TLBs. */
 	ret = engine->emit_flush(rq, EMIT_INVALIDATE);
 	if (ret)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 8fb6e66a7a84..e52c9511b5fb 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2240,6 +2240,72 @@  static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
 }
 static const int gen8_emit_breadcrumb_rcs_sz = 8 + WA_TAIL_DWORDS;
 
+u32 gen8_make_rpcs(const struct sseu_dev_info *sseu,
+		   union intel_sseu ctx_sseu)
+{
+	u32 rpcs = 0;
+
+	/*
+	 * Starting in Gen9, render power gating can leave
+	 * slice/subslice/EU in a partially enabled state. We
+	 * must make an explicit request through RPCS for full
+	 * enablement.
+	 */
+	if (sseu->has_slice_pg) {
+		rpcs |= GEN8_RPCS_S_CNT_ENABLE;
+		rpcs |= hweight8(ctx_sseu.slice_mask) << GEN8_RPCS_S_CNT_SHIFT;
+		rpcs |= GEN8_RPCS_ENABLE;
+	}
+
+	if (sseu->has_subslice_pg) {
+		rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
+		rpcs |= hweight8(ctx_sseu.subslice_mask) <<
+			GEN8_RPCS_SS_CNT_SHIFT;
+		rpcs |= GEN8_RPCS_ENABLE;
+	}
+
+	if (sseu->has_eu_pg) {
+		rpcs |= ctx_sseu.min_eus_per_subslice <<
+			GEN8_RPCS_EU_MIN_SHIFT;
+		rpcs |= ctx_sseu.max_eus_per_subslice <<
+			GEN8_RPCS_EU_MAX_SHIFT;
+		rpcs |= GEN8_RPCS_ENABLE;
+	}
+
+	return rpcs;
+}
+
+static int gen8_emit_rpcs_config(struct i915_request *rq,
+				 struct i915_gem_context *ctx,
+				 union intel_sseu sseu)
+{
+	struct drm_i915_private *dev_priv = rq->i915;
+	struct intel_context *ce = to_intel_context(ctx, dev_priv->engine[RCS]);
+	u64 offset;
+	u32 *cs;
+
+	/* Let the deferred state allocation take care of this. */
+	if (!ce->state)
+		return 0;
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	offset = ce->state->node.start +
+		LRC_STATE_PN * PAGE_SIZE +
+		(CTX_R_PWR_CLK_STATE + 1) * 4;
+
+	*cs++ = MI_STORE_DWORD_IMM_GEN4;
+	*cs++ = lower_32_bits(offset);
+	*cs++ = upper_32_bits(offset);
+	*cs++ = gen8_make_rpcs(&INTEL_INFO(dev_priv)->sseu, sseu);
+
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
 static int gen8_init_rcs_context(struct i915_request *rq)
 {
 	int ret;
@@ -2333,6 +2399,8 @@  logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 	engine->emit_breadcrumb = gen8_emit_breadcrumb;
 	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
 
+	engine->emit_rpcs_config = gen8_emit_rpcs_config;
+
 	engine->set_default_submission = execlists_set_default_submission;
 
 	if (INTEL_GEN(engine->i915) < 11) {
@@ -2481,41 +2549,6 @@  int logical_xcs_ring_init(struct intel_engine_cs *engine)
 	return logical_ring_init(engine);
 }
 
-u32 gen8_make_rpcs(const struct sseu_dev_info *sseu,
-		   union intel_sseu ctx_sseu)
-{
-	u32 rpcs = 0;
-
-	/*
-	 * Starting in Gen9, render power gating can leave
-	 * slice/subslice/EU in a partially enabled state. We
-	 * must make an explicit request through RPCS for full
-	 * enablement.
-	*/
-	if (sseu->has_slice_pg) {
-		rpcs |= GEN8_RPCS_S_CNT_ENABLE;
-		rpcs |= hweight8(ctx_sseu.slice_mask) << GEN8_RPCS_S_CNT_SHIFT;
-		rpcs |= GEN8_RPCS_ENABLE;
-	}
-
-	if (sseu->has_subslice_pg) {
-		rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
-		rpcs |= hweight8(ctx_sseu.subslice_mask) <<
-			GEN8_RPCS_SS_CNT_SHIFT;
-		rpcs |= GEN8_RPCS_ENABLE;
-	}
-
-	if (sseu->has_eu_pg) {
-		rpcs |= ctx_sseu.min_eus_per_subslice <<
-			GEN8_RPCS_EU_MIN_SHIFT;
-		rpcs |= ctx_sseu.max_eus_per_subslice <<
-			GEN8_RPCS_EU_MAX_SHIFT;
-		rpcs |= GEN8_RPCS_ENABLE;
-	}
-
-	return rpcs;
-}
-
 static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
 {
 	u32 indirect_ctx_offset;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 001cf6bcb349..643466d4fa2d 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2061,6 +2061,8 @@  static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv,
 			engine->emit_breadcrumb_sz++;
 	}
 
+	engine->emit_rpcs_config = NULL; /* Only supported on Gen8+ */
+
 	engine->set_default_submission = i9xx_set_default_submission;
 
 	if (INTEL_GEN(dev_priv) >= 6)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index acef385c4c80..6bf4d3b57ced 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -456,6 +456,10 @@  struct intel_engine_cs {
 	void		(*emit_breadcrumb)(struct i915_request *rq, u32 *cs);
 	int		emit_breadcrumb_sz;
 
+	int		(*emit_rpcs_config)(struct i915_request *rq,
+					    struct i915_gem_context *ctx,
+					    union intel_sseu sseu);
+
 	/* Pass the request to the hardware queue (e.g. directly into
 	 * the legacy ringbuffer or to the end of an execlist).
 	 *
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 7f5634ce8e88..24b90836ce1d 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1456,9 +1456,47 @@  struct drm_i915_gem_context_param {
 #define   I915_CONTEXT_MAX_USER_PRIORITY	1023 /* inclusive */
 #define   I915_CONTEXT_DEFAULT_PRIORITY		0
 #define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
+	/*
+	 * When using the following param, value should be a pointer to
+	 * drm_i915_gem_context_param_sseu.
+	 */
+#define I915_CONTEXT_PARAM_SSEU		0x7
 	__u64 value;
 };
 
+struct drm_i915_gem_context_param_sseu {
+	/*
+	 * Engine class & instance to be configured or queried.
+	 */
+	__u32 class;
+	__u32 instance;
+
+	/*
+	 * Mask of slices to enable for the context. Valid values are a subset
+	 * of the bitmask value returned for I915_PARAM_SLICE_MASK.
+	 */
+	__u8 slice_mask;
+
+	/*
+	 * Mask of subslices to enable for the context. Valid values are a
+	 * subset of the bitmask value return by I915_PARAM_SUBSLICE_MASK.
+	 */
+	__u8 subslice_mask;
+
+	/*
+	 * Minimum/Maximum number of EUs to enable per subslice for the
+	 * context. min_eus_per_subslice must be inferior or equal to
+	 * max_eus_per_subslice.
+	 */
+	__u8 min_eus_per_subslice;
+	__u8 max_eus_per_subslice;
+
+	/*
+	 * Unused for now. Must be cleared to zero.
+	 */
+	__u32 rsvd;
+};
+
 enum drm_i915_oa_format {
 	I915_OA_FORMAT_A13 = 1,	    /* HSW only */
 	I915_OA_FORMAT_A29,	    /* HSW only */