
drm/i915/oa: Reconfigure contexts on the fly

Message ID 20190705123057.19346-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Series drm/i915/oa: Reconfigure contexts on the fly

Commit Message

Chris Wilson July 5, 2019, 12:30 p.m. UTC
Avoid a global idle barrier by reconfiguring each context in place,
rewriting its image with MI_STORE_DWORD from the kernel context.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c |   2 +
 drivers/gpu/drm/i915/gt/intel_lrc.c         |   7 +-
 drivers/gpu/drm/i915/i915_perf.c            | 255 +++++++++++++++-----
 3 files changed, 200 insertions(+), 64 deletions(-)

Comments

Lionel Landwerlin July 5, 2019, 12:42 p.m. UTC | #1
Wow nice. I didn't have the courage to actually write it, knowing how 
easy it could be to screw an offset and write at random in GGTT...

I only have one concern below.

Thanks a lot,

-Lionel

On 05/07/2019 15:30, Chris Wilson wrote:
> Avoid a global idle barrier by reconfiguring each context in place,
> rewriting its image with MI_STORE_DWORD from the kernel context.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_context.c |   2 +
>   drivers/gpu/drm/i915/gt/intel_lrc.c         |   7 +-
>   drivers/gpu/drm/i915/i915_perf.c            | 255 +++++++++++++++-----
>   3 files changed, 200 insertions(+), 64 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index e367dce2a696..1f0d10bb88c1 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -624,7 +624,9 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio)
>   	ctx->sched.priority = I915_USER_PRIORITY(prio);
>   	ctx->ring_size = PAGE_SIZE;
>   
> +	/* Isolate the kernel context from prying eyes and sticky fingers */
>   	GEM_BUG_ON(!i915_gem_context_is_kernel(ctx));
> +	list_del_init(&ctx->link);
>   
>   	return ctx;
>   }
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index e1ae1399c72b..9cc5374401e1 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -1552,9 +1552,12 @@ __execlists_update_reg_state(struct intel_context *ce,
>   	regs[CTX_RING_TAIL + 1] = ring->tail;
>   
>   	/* RPCS */
> -	if (engine->class == RENDER_CLASS)
> +	if (engine->class == RENDER_CLASS) {
>   		regs[CTX_R_PWR_CLK_STATE + 1] =
>   			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
> +
> +		i915_oa_init_reg_state(engine, ce, regs);
> +	}
>   }
>   
>   static int
> @@ -2966,8 +2969,6 @@ static void execlists_init_reg_state(u32 *regs,
>   	if (rcs) {
>   		regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
>   		CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0);
> -
> -		i915_oa_init_reg_state(engine, ce, regs);
>   	}
>   
>   	regs[CTX_END] = MI_BATCH_BUFFER_END;
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index 357e63beb373..a2a0752ffa57 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -1630,6 +1630,27 @@ static void hsw_disable_metric_set(struct drm_i915_private *dev_priv)
>   				      ~GT_NOA_ENABLE));
>   }
>   
> +static u32 oa_config_flex_reg(const struct i915_oa_config *oa_config,
> +			      i915_reg_t reg)
> +{
> +	u32 mmio = i915_mmio_reg_offset(reg);
> +	int i;
> +
> +	/*
> +	 * This arbitrary default will select the 'EU FPU0 Pipeline
> +	 * Active' event. In the future it's anticipated that there
> +	 * will be an explicit 'No Event' we can select, but not yet...
> +	 */
> +	if (!oa_config)
> +		return 0;
> +
> +	for (i = 0; i < oa_config->flex_regs_len; i++) {
> +		if (i915_mmio_reg_offset(oa_config->flex_regs[i].addr) == mmio)
> +			return oa_config->flex_regs[i].value;
> +	}
> +
> +	return 0;
> +}
>   /*
>    * NB: It must always remain pointer safe to run this even if the OA unit
>    * has been disabled.
> @@ -1663,28 +1684,8 @@ gen8_update_reg_state_unlocked(struct intel_context *ce,
>   		GEN8_OA_COUNTER_RESUME);
>   
>   	for (i = 0; i < ARRAY_SIZE(flex_regs); i++) {
> -		u32 state_offset = ctx_flexeu0 + i * 2;
> -		u32 mmio = i915_mmio_reg_offset(flex_regs[i]);
> -
> -		/*
> -		 * This arbitrary default will select the 'EU FPU0 Pipeline
> -		 * Active' event. In the future it's anticipated that there
> -		 * will be an explicit 'No Event' we can select, but not yet...
> -		 */
> -		u32 value = 0;
> -
> -		if (oa_config) {
> -			u32 j;
> -
> -			for (j = 0; j < oa_config->flex_regs_len; j++) {
> -				if (i915_mmio_reg_offset(oa_config->flex_regs[j].addr) == mmio) {
> -					value = oa_config->flex_regs[j].value;
> -					break;
> -				}
> -			}
> -		}
> -
> -		CTX_REG(reg_state, state_offset, flex_regs[i], value);
> +		CTX_REG(reg_state, ctx_flexeu0 + i * 2, flex_regs[i],
> +			oa_config_flex_reg(oa_config, flex_regs[i]));
>   	}
>   
>   	CTX_REG(reg_state,
> @@ -1692,6 +1693,150 @@ gen8_update_reg_state_unlocked(struct intel_context *ce,
>   		intel_sseu_make_rpcs(i915, &ce->sseu));
>   }
>   
> +struct flex {
> +	i915_reg_t reg;
> +	u32 offset;
> +	u32 value;
> +};
> +
> +static int
> +gen8_store_flex(struct i915_request *rq,
> +		struct intel_context *ce,
> +		const struct flex *flex, unsigned int count)
> +{
> +	u32 offset;
> +	u32 *cs;
> +
> +	cs = intel_ring_begin(rq, 4 * count);
> +	if (IS_ERR(cs))
> +		return PTR_ERR(cs);


Is the right of the kernel context large enough to hold the MI_SDIs for 
all the contexts?


> +
> +	offset = i915_ggtt_offset(ce->state) + LRC_STATE_PN * PAGE_SIZE;
> +	do {
> +		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> +		*cs++ = offset + (flex->offset + 1) * sizeof(u32);
> +		*cs++ = 0;
> +		*cs++ = flex->value;
> +	} while (flex++, --count);
> +
> +	intel_ring_advance(rq, cs);
> +
> +	return 0;
> +}
> +
> +static int
> +gen8_load_flex(struct i915_request *rq,
> +	       struct intel_context *ce,
> +	       const struct flex *flex, unsigned int count)
> +{
> +	u32 *cs;
> +
> +	GEM_BUG_ON(!count || count > 63);
> +
> +	cs = intel_ring_begin(rq, 2 * count + 2);
> +	if (IS_ERR(cs))
> +		return PTR_ERR(cs);
> +
> +	*cs++ = MI_LOAD_REGISTER_IMM(count);
> +	do {
> +		*cs++ = i915_mmio_reg_offset(flex->reg);
> +		*cs++ = flex->value;
> +	} while (flex++, --count);
> +	*cs++ = MI_NOOP;
> +
> +	intel_ring_advance(rq, cs);
> +
> +	return 0;
> +}
> +
> +static int
> +gen8_emit_oa_config(struct i915_request *rq,
> +		    struct intel_context *ce,
> +		    const struct i915_oa_config *oa_config,
> +		    bool self)
> +{
> +	struct drm_i915_private *i915 = rq->i915;
> +	/* The MMIO offsets for Flex EU registers aren't contiguous */
> +	const u32 ctx_flexeu0 = i915->perf.oa.ctx_flexeu0_offset;
> +#define ctx_flexeuN(N) (ctx_flexeu0 + 2 * (N))
> +	struct flex regs[] = {
> +		{
> +			GEN8_R_PWR_CLK_STATE,
> +			CTX_R_PWR_CLK_STATE,
> +			intel_sseu_make_rpcs(i915, &ce->sseu)
> +		},
> +		{
> +			GEN8_OACTXCONTROL,
> +			i915->perf.oa.ctx_oactxctrl_offset,
> +			((i915->perf.oa.period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
> +			 (i915->perf.oa.periodic ? GEN8_OA_TIMER_ENABLE : 0) |
> +			 GEN8_OA_COUNTER_RESUME)
> +		},
> +		{ EU_PERF_CNTL0, ctx_flexeuN(0) },
> +		{ EU_PERF_CNTL1, ctx_flexeuN(1) },
> +		{ EU_PERF_CNTL2, ctx_flexeuN(2) },
> +		{ EU_PERF_CNTL3, ctx_flexeuN(3) },
> +		{ EU_PERF_CNTL4, ctx_flexeuN(4) },
> +		{ EU_PERF_CNTL5, ctx_flexeuN(5) },
> +		{ EU_PERF_CNTL6, ctx_flexeuN(6) },
> +	};
> +#undef ctx_flexeuN
> +	int i;
> +
> +	for (i = 2; i < ARRAY_SIZE(regs); i++)
> +		regs[i].value = oa_config_flex_reg(oa_config, regs[i].reg);


I guess this mapping could be done only once in 
gen8_configure_all_contexts().


> +
> +	if (self)
> +		return gen8_load_flex(rq, ce, regs, ARRAY_SIZE(regs));
> +	else
> +		return gen8_store_flex(rq, ce, regs, ARRAY_SIZE(regs));
> +}
> +
> +static int gen8_modify_context(struct intel_context *ce,
> +			       const struct i915_oa_config *oa_config)
> +{
> +	struct i915_request *rq;
> +	int err;
> +
> +	lockdep_assert_held(&ce->pin_mutex);
> +
> +	rq = i915_request_create(ce->engine->kernel_context);
> +	if (IS_ERR(rq))
> +		return PTR_ERR(rq);
> +
> +	/* Serialise with the remote context */
> +	err = i915_active_request_set(&ce->ring->timeline->last_request, rq);
> +	if (err)
> +		goto out_add;
> +
> +	/* Keep the remote context alive until after we finish editing */
> +	err = i915_active_ref(&ce->active, rq->fence.context, rq);
> +	if (err)
> +		goto out_add;
> +
> +	err = gen8_emit_oa_config(rq, ce, oa_config, false);
> +
> +out_add:
> +	i915_request_add(rq);
> +	return err;
> +}
> +
> +static int gen8_modify_self(struct intel_context *ce,
> +			    const struct i915_oa_config *oa_config)
> +{
> +	struct i915_request *rq;
> +	int err;
> +
> +	rq = i915_request_create(ce);
> +	if (IS_ERR(rq))
> +		return PTR_ERR(rq);
> +
> +	err = gen8_emit_oa_config(rq, ce, oa_config, true);
> +
> +	i915_request_add(rq);
> +	return err;
> +}
> +
>   /*
>    * Manages updating the per-context aspects of the OA stream
>    * configuration across all contexts.
> @@ -1716,15 +1861,15 @@ gen8_update_reg_state_unlocked(struct intel_context *ce,
>    *
>    * Note: it's only the RCS/Render context that has any OA state.
>    */
> -static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
> +static int gen8_configure_all_contexts(struct drm_i915_private *i915,
>   				       const struct i915_oa_config *oa_config)
>   {
> -	unsigned int map_type = i915_coherent_map_type(dev_priv);
>   	struct i915_gem_context *ctx;
> -	struct i915_request *rq;
> -	int ret;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	int err;
>   
> -	lockdep_assert_held(&dev_priv->drm.struct_mutex);
> +	lockdep_assert_held(&i915->drm.struct_mutex);
>   
>   	/*
>   	 * The OA register config is setup through the context image. This image
> @@ -1735,59 +1880,47 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
>   	 * We could emit the OA register config through the batch buffer but
>   	 * this might leave small interval of time where the OA unit is
>   	 * configured at an invalid sampling period.
> -	 *
> -	 * So far the best way to work around this issue seems to be draining
> -	 * the GPU from any submitted work.
>   	 */
> -	ret = i915_gem_wait_for_idle(dev_priv,
> -				     I915_WAIT_LOCKED,
> -				     MAX_SCHEDULE_TIMEOUT);
> -	if (ret)
> -		return ret;
> -
> -	/* Update all contexts now that we've stalled the submission. */
> -	list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
> +	list_for_each_entry(ctx, &i915->contexts.list, link) {
>   		struct i915_gem_engines_iter it;
>   		struct intel_context *ce;
>   
>   		for_each_gem_engine(ce,
>   				    i915_gem_context_lock_engines(ctx),
>   				    it) {
> -			u32 *regs;
> -
>   			if (ce->engine->class != RENDER_CLASS)
>   				continue;
>   
> -			/* OA settings will be set upon first use */
> -			if (!ce->state)
> -				continue;
> -
> -			regs = i915_gem_object_pin_map(ce->state->obj,
> -						       map_type);
> -			if (IS_ERR(regs)) {
> -				i915_gem_context_unlock_engines(ctx);
> -				return PTR_ERR(regs);
> -			}
> -
> -			ce->state->obj->mm.dirty = true;
> -			regs += LRC_STATE_PN * PAGE_SIZE / sizeof(*regs);
> +			err = intel_context_lock_pinned(ce);
> +			if (err)
> +				break;
>   
> -			gen8_update_reg_state_unlocked(ce, regs, oa_config);
> +			/* Otherwise OA settings will be set upon first use */
> +			if (intel_context_is_pinned(ce))
> +				err = gen8_modify_context(ce, oa_config);
>   
> -			i915_gem_object_unpin_map(ce->state->obj);
> +			intel_context_unlock_pinned(ce);
> +			if (err)
> +				break;
>   		}
>   		i915_gem_context_unlock_engines(ctx);
> +		if (err)
> +			return err;
>   	}
>   
>   	/*
> -	 * Apply the configuration by doing one context restore of the edited
> -	 * context image.
> +	 * After updating all other contexts, we need to modify ourselves.
> +	 * If we don't modify the kernel_context, we do not get events while
> +	 * idle.
>   	 */
> -	rq = i915_request_create(dev_priv->engine[RCS0]->kernel_context);
> -	if (IS_ERR(rq))
> -		return PTR_ERR(rq);
> +	for_each_engine(engine, i915, id) {
> +		if (engine->class != RENDER_CLASS)
> +			continue;
>   
> -	i915_request_add(rq);
> +		err = gen8_modify_self(engine->kernel_context, oa_config);
> +		if (err)
> +			return err;
> +	}
>   
>   	return 0;
>   }
Lionel Landwerlin July 5, 2019, 12:43 p.m. UTC | #2
On 05/07/2019 15:42, Lionel Landwerlin wrote:
>>
>> +
>> +static int
>> +gen8_store_flex(struct i915_request *rq,
>> +        struct intel_context *ce,
>> +        const struct flex *flex, unsigned int count)
>> +{
>> +    u32 offset;
>> +    u32 *cs;
>> +
>> +    cs = intel_ring_begin(rq, 4 * count);
>> +    if (IS_ERR(cs))
>> +        return PTR_ERR(cs);
>
>
> Is the right of the kernel context large enough to hold the MI_SDIs 
> for all the contexts?
>
>

s/right/ring/
Chris Wilson July 5, 2019, 12:54 p.m. UTC | #3
Quoting Lionel Landwerlin (2019-07-05 13:42:51)
> Wow nice. I didn't have the courage to actually write it, knowing how 
> easy it could be to screw an offset and write at random in GGTT...
> 
> I only have one concern below.
> 
> Thanks a lot,
> 
> -Lionel
> 
> On 05/07/2019 15:30, Chris Wilson wrote:
> >       CTX_REG(reg_state,
> > @@ -1692,6 +1693,150 @@ gen8_update_reg_state_unlocked(struct intel_context *ce,
> >               intel_sseu_make_rpcs(i915, &ce->sseu));
> >   }
> >   
> > +struct flex {
> > +     i915_reg_t reg;
> > +     u32 offset;
> > +     u32 value;
> > +};
> > +
> > +static int
> > +gen8_store_flex(struct i915_request *rq,
> > +             struct intel_context *ce,
> > +             const struct flex *flex, unsigned int count)
> > +{
> > +     u32 offset;
> > +     u32 *cs;
> > +
> > +     cs = intel_ring_begin(rq, 4 * count);
> > +     if (IS_ERR(cs))
> > +             return PTR_ERR(cs);
> 
> 
> Is the right of the kernel context large enough to hold the MI_SDIs for 
> all the contexts?

At the moment we are using 9 registers, add in say 128 bytes tops for
the request overhead, that's <300 bytes. The kernel context uses 4k
rings, so enough for a few updates before we have to flush. We may have
to wait for external rings to idle and be interrupted -- but that's the
same as before, including the chance that we may be interrupted in the
middle of conversion.

The worst case is that we overrun the ring and we should get a juicy
warning (GEM_BUG_ON or -ENOSPC) in that case. We can increase the
kernel_context ring if that's an issue or just fall back to suballocating
a batchbuffer for the updates.
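
For reference, the space estimate above works out roughly as below (a
sketch using the figures assumed in this reply, i.e. 9 registers, 128
bytes of request overhead and a 4k ring, rather than constants taken
from the driver):

	#define OA_FLEX_REGS		9	/* registers written per context update */
	#define OA_SDI_BYTES		(4 * sizeof(u32))	/* one MI_STORE_DWORD_IMM packet */
	#define OA_REQUEST_OVERHEAD	128	/* assumed worst-case request overhead */

	/* 9 * 16 + 128 = 272 bytes per update, so a 4KiB kernel-context ring
	 * holds roughly 4096 / 272 ~= 15 updates before intel_ring_begin()
	 * has to wait for earlier requests to retire. */
	#define OA_UPDATE_BYTES	(OA_FLEX_REGS * OA_SDI_BYTES + OA_REQUEST_OVERHEAD)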

> > +static int
> > +gen8_emit_oa_config(struct i915_request *rq,
> > +                 struct intel_context *ce,
> > +                 const struct i915_oa_config *oa_config,
> > +                 bool self)
> > +{
> > +     struct drm_i915_private *i915 = rq->i915;
> > +     /* The MMIO offsets for Flex EU registers aren't contiguous */
> > +     const u32 ctx_flexeu0 = i915->perf.oa.ctx_flexeu0_offset;
> > +#define ctx_flexeuN(N) (ctx_flexeu0 + 2 * (N))
> > +     struct flex regs[] = {
> > +             {
> > +                     GEN8_R_PWR_CLK_STATE,
> > +                     CTX_R_PWR_CLK_STATE,
> > +                     intel_sseu_make_rpcs(i915, &ce->sseu)
> > +             },
> > +             {
> > +                     GEN8_OACTXCONTROL,
> > +                     i915->perf.oa.ctx_oactxctrl_offset,
> > +                     ((i915->perf.oa.period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
> > +                      (i915->perf.oa.periodic ? GEN8_OA_TIMER_ENABLE : 0) |
> > +                      GEN8_OA_COUNTER_RESUME)
> > +             },
> > +             { EU_PERF_CNTL0, ctx_flexeuN(0) },
> > +             { EU_PERF_CNTL1, ctx_flexeuN(1) },
> > +             { EU_PERF_CNTL2, ctx_flexeuN(2) },
> > +             { EU_PERF_CNTL3, ctx_flexeuN(3) },
> > +             { EU_PERF_CNTL4, ctx_flexeuN(4) },
> > +             { EU_PERF_CNTL5, ctx_flexeuN(5) },
> > +             { EU_PERF_CNTL6, ctx_flexeuN(6) },
> > +     };
> > +#undef ctx_flexeuN
> > +     int i;
> > +
> > +     for (i = 2; i < ARRAY_SIZE(regs); i++)
> > +             regs[i].value = oa_config_flex_reg(oa_config, regs[i].reg);
> 
> 
> I guess this mapping could be done only once in 
> gen8_configure_all_contexts().

Ah, it's just ce->sseu giving the appearance of it being context
specific :)
-Chris
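
To spell out the observation above (a hypothetical restructuring, not
code from the patch): every entry of regs[] except the RPCS one depends
only on oa_config, so the flex lookup could indeed be done once, with
just the ce->sseu derived value patched in per context.

	/* Hypothetical helper: only regs[0] (GEN8_R_PWR_CLK_STATE) is
	 * per-context; the rest of the table is a function of oa_config. */
	static void oa_fill_per_context_regs(struct flex *regs,
					     struct intel_context *ce)
	{
		regs[0].value = intel_sseu_make_rpcs(ce->engine->i915, &ce->sseu);
	}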
Lionel Landwerlin July 5, 2019, 1:06 p.m. UTC | #4
On 05/07/2019 15:54, Chris Wilson wrote:
> Quoting Lionel Landwerlin (2019-07-05 13:42:51)
>> Wow nice. I didn't have the courage to actually write it, knowing how
>> easy it could be to screw an offset and write at random in GGTT...
>>
>> I only have one concern below.
>>
>> Thanks a lot,
>>
>> -Lionel
>>
>> On 05/07/2019 15:30, Chris Wilson wrote:
>>>        CTX_REG(reg_state,
>>> @@ -1692,6 +1693,150 @@ gen8_update_reg_state_unlocked(struct intel_context *ce,
>>>                intel_sseu_make_rpcs(i915, &ce->sseu));
>>>    }
>>>    
>>> +struct flex {
>>> +     i915_reg_t reg;
>>> +     u32 offset;
>>> +     u32 value;
>>> +};
>>> +
>>> +static int
>>> +gen8_store_flex(struct i915_request *rq,
>>> +             struct intel_context *ce,
>>> +             const struct flex *flex, unsigned int count)
>>> +{
>>> +     u32 offset;
>>> +     u32 *cs;
>>> +
>>> +     cs = intel_ring_begin(rq, 4 * count);
>>> +     if (IS_ERR(cs))
>>> +             return PTR_ERR(cs);
>>
>> Is the right of the kernel context large enough to hold the MI_SDIs for
>> all the contexts?
> At the moment we are using 9 registers, add in say 128 bytes tops for
> the request overhead, that's <300 bytes. The kernel context uses 4k
> rings, so enough for a few updates before we have to flush. We may have
> to wait for external rings to idle and be interrupted -- but that's the
> same as before, including the chance that we may be interrupted in the
> middle of conversion.
>
> The worst case is that we overrun the ring and we should get a juicy
> warning (GEM_BUG_ON or -ENOSPC) in that case. We can increase the
> kernel_context ring if that's an issue or just fall back to suballocating
> a batchbuffer for the updates.


Ah, thanks. I didn't notice it was one request per context to reconfigure.

Still I wouldn't want to have this fail somewhat randomly because 
something else stayed on the HW just a bit too long.


Maybe checking the available space in the ring in 
gen8_configure_all_contexts() and waiting on last_request if there isn't 
enough?


-Lionel


>
>>> +static int
>>> +gen8_emit_oa_config(struct i915_request *rq,
>>> +                 struct intel_context *ce,
>>> +                 const struct i915_oa_config *oa_config,
>>> +                 bool self)
>>> +{
>>> +     struct drm_i915_private *i915 = rq->i915;
>>> +     /* The MMIO offsets for Flex EU registers aren't contiguous */
>>> +     const u32 ctx_flexeu0 = i915->perf.oa.ctx_flexeu0_offset;
>>> +#define ctx_flexeuN(N) (ctx_flexeu0 + 2 * (N))
>>> +     struct flex regs[] = {
>>> +             {
>>> +                     GEN8_R_PWR_CLK_STATE,
>>> +                     CTX_R_PWR_CLK_STATE,
>>> +                     intel_sseu_make_rpcs(i915, &ce->sseu)
>>> +             },
>>> +             {
>>> +                     GEN8_OACTXCONTROL,
>>> +                     i915->perf.oa.ctx_oactxctrl_offset,
>>> +                     ((i915->perf.oa.period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
>>> +                      (i915->perf.oa.periodic ? GEN8_OA_TIMER_ENABLE : 0) |
>>> +                      GEN8_OA_COUNTER_RESUME)
>>> +             },
>>> +             { EU_PERF_CNTL0, ctx_flexeuN(0) },
>>> +             { EU_PERF_CNTL1, ctx_flexeuN(1) },
>>> +             { EU_PERF_CNTL2, ctx_flexeuN(2) },
>>> +             { EU_PERF_CNTL3, ctx_flexeuN(3) },
>>> +             { EU_PERF_CNTL4, ctx_flexeuN(4) },
>>> +             { EU_PERF_CNTL5, ctx_flexeuN(5) },
>>> +             { EU_PERF_CNTL6, ctx_flexeuN(6) },
>>> +     };
>>> +#undef ctx_flexeuN
>>> +     int i;
>>> +
>>> +     for (i = 2; i < ARRAY_SIZE(regs); i++)
>>> +             regs[i].value = oa_config_flex_reg(oa_config, regs[i].reg);
>>
>> I guess this mapping could be done only once in
>> gen8_configure_all_contexts().
> Ah, it's just ce->sseu giving the appearance of it being context
> specific :)
> -Chris
>
Chris Wilson July 5, 2019, 1:14 p.m. UTC | #5
Quoting Lionel Landwerlin (2019-07-05 14:06:25)
> On 05/07/2019 15:54, Chris Wilson wrote:
> > Quoting Lionel Landwerlin (2019-07-05 13:42:51)
> >> Wow nice. I didn't have the courage to actually write it, knowing how
> >> easy it could be to screw an offset and write at random in GGTT...
> >>
> >> I only have one concern below.
> >>
> >> Thanks a lot,
> >>
> >> -Lionel
> >>
> >> On 05/07/2019 15:30, Chris Wilson wrote:
> >>>        CTX_REG(reg_state,
> >>> @@ -1692,6 +1693,150 @@ gen8_update_reg_state_unlocked(struct intel_context *ce,
> >>>                intel_sseu_make_rpcs(i915, &ce->sseu));
> >>>    }
> >>>    
> >>> +struct flex {
> >>> +     i915_reg_t reg;
> >>> +     u32 offset;
> >>> +     u32 value;
> >>> +};
> >>> +
> >>> +static int
> >>> +gen8_store_flex(struct i915_request *rq,
> >>> +             struct intel_context *ce,
> >>> +             const struct flex *flex, unsigned int count)
> >>> +{
> >>> +     u32 offset;
> >>> +     u32 *cs;
> >>> +
> >>> +     cs = intel_ring_begin(rq, 4 * count);
> >>> +     if (IS_ERR(cs))
> >>> +             return PTR_ERR(cs);
> >>
> >> Is the right of the kernel context large enough to hold the MI_SDIs for
> >> all the contexts?
> > At the moment we are using 9 registers, add in say 128 bytes tops for
> > the request overhead, that's <300 bytes. The kernel context uses 4k
> > rings, so enough for a few updates before we have to flush. We may have
> > to wait for external rings to idle and be interrupted -- but that's the
> > same as before, including the chance that we may be interrupted in the
> > middle of conversion.
> >
> > The worst case is that we overrun the ring and we should get a juicy
> > warning (GEM_BUG_ON or -ENOSPC) in that case. We can increase the
> > kernel_context ring if that's an issue or just fall back to suballocating
> > a batchbuffer for the updates.
> 
> 
> Ah, thanks. I didn't notice it was one request per context to reconfigure.
> 
> Still I wouldn't want to have this fail somewhat randomly because 
> something else stayed on the HW just a bit too long.

Same problem we have now, since the wait-for-idle may randomly be
interrupted (as may the mapping of the context images).

> Maybe checking the available space in the ring in 
> gen8_configure_all_contexts() and waiting on last_request if there isn't 
> enough?

The wait is automatic by virtue of intel_ring_begin. The error I was
alluding to before is that if we try and create a packet
(intel_ring_begin) too large, it will fail an assert. The number of
registers we need to write should have an upper bound, so we should be
able to spot a problem before it happens.
-Chris
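
As a concrete illustration of that upper bound, a guard along these
lines could flag an oversized update before any commands are emitted (a
hypothetical sketch; oa_update_fits_ring() and the choice of headroom
are not part of the patch):

	/* Hypothetical check: refuse to emit an update whose worst-case
	 * payload would consume more than half of the target ring. */
	static int oa_update_fits_ring(const struct intel_ring *ring,
				       unsigned int count)
	{
		const u32 worst = count * 4 * sizeof(u32) + 128; /* SDI packets + assumed overhead */

		return worst <= ring->size / 2 ? 0 : -ENOSPC;
	}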

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index e367dce2a696..1f0d10bb88c1 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -624,7 +624,9 @@  i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio)
 	ctx->sched.priority = I915_USER_PRIORITY(prio);
 	ctx->ring_size = PAGE_SIZE;
 
+	/* Isolate the kernel context from prying eyes and sticky fingers */
 	GEM_BUG_ON(!i915_gem_context_is_kernel(ctx));
+	list_del_init(&ctx->link);
 
 	return ctx;
 }
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index e1ae1399c72b..9cc5374401e1 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1552,9 +1552,12 @@  __execlists_update_reg_state(struct intel_context *ce,
 	regs[CTX_RING_TAIL + 1] = ring->tail;
 
 	/* RPCS */
-	if (engine->class == RENDER_CLASS)
+	if (engine->class == RENDER_CLASS) {
 		regs[CTX_R_PWR_CLK_STATE + 1] =
 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
+
+		i915_oa_init_reg_state(engine, ce, regs);
+	}
 }
 
 static int
@@ -2966,8 +2969,6 @@  static void execlists_init_reg_state(u32 *regs,
 	if (rcs) {
 		regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
 		CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0);
-
-		i915_oa_init_reg_state(engine, ce, regs);
 	}
 
 	regs[CTX_END] = MI_BATCH_BUFFER_END;
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 357e63beb373..a2a0752ffa57 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1630,6 +1630,27 @@  static void hsw_disable_metric_set(struct drm_i915_private *dev_priv)
 				      ~GT_NOA_ENABLE));
 }
 
+static u32 oa_config_flex_reg(const struct i915_oa_config *oa_config,
+			      i915_reg_t reg)
+{
+	u32 mmio = i915_mmio_reg_offset(reg);
+	int i;
+
+	/*
+	 * This arbitrary default will select the 'EU FPU0 Pipeline
+	 * Active' event. In the future it's anticipated that there
+	 * will be an explicit 'No Event' we can select, but not yet...
+	 */
+	if (!oa_config)
+		return 0;
+
+	for (i = 0; i < oa_config->flex_regs_len; i++) {
+		if (i915_mmio_reg_offset(oa_config->flex_regs[i].addr) == mmio)
+			return oa_config->flex_regs[i].value;
+	}
+
+	return 0;
+}
 /*
  * NB: It must always remain pointer safe to run this even if the OA unit
  * has been disabled.
@@ -1663,28 +1684,8 @@  gen8_update_reg_state_unlocked(struct intel_context *ce,
 		GEN8_OA_COUNTER_RESUME);
 
 	for (i = 0; i < ARRAY_SIZE(flex_regs); i++) {
-		u32 state_offset = ctx_flexeu0 + i * 2;
-		u32 mmio = i915_mmio_reg_offset(flex_regs[i]);
-
-		/*
-		 * This arbitrary default will select the 'EU FPU0 Pipeline
-		 * Active' event. In the future it's anticipated that there
-		 * will be an explicit 'No Event' we can select, but not yet...
-		 */
-		u32 value = 0;
-
-		if (oa_config) {
-			u32 j;
-
-			for (j = 0; j < oa_config->flex_regs_len; j++) {
-				if (i915_mmio_reg_offset(oa_config->flex_regs[j].addr) == mmio) {
-					value = oa_config->flex_regs[j].value;
-					break;
-				}
-			}
-		}
-
-		CTX_REG(reg_state, state_offset, flex_regs[i], value);
+		CTX_REG(reg_state, ctx_flexeu0 + i * 2, flex_regs[i],
+			oa_config_flex_reg(oa_config, flex_regs[i]));
 	}
 
 	CTX_REG(reg_state,
@@ -1692,6 +1693,150 @@  gen8_update_reg_state_unlocked(struct intel_context *ce,
 		intel_sseu_make_rpcs(i915, &ce->sseu));
 }
 
+struct flex {
+	i915_reg_t reg;
+	u32 offset;
+	u32 value;
+};
+
+static int
+gen8_store_flex(struct i915_request *rq,
+		struct intel_context *ce,
+		const struct flex *flex, unsigned int count)
+{
+	u32 offset;
+	u32 *cs;
+
+	cs = intel_ring_begin(rq, 4 * count);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	offset = i915_ggtt_offset(ce->state) + LRC_STATE_PN * PAGE_SIZE;
+	do {
+		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+		*cs++ = offset + (flex->offset + 1) * sizeof(u32);
+		*cs++ = 0;
+		*cs++ = flex->value;
+	} while (flex++, --count);
+
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
+static int
+gen8_load_flex(struct i915_request *rq,
+	       struct intel_context *ce,
+	       const struct flex *flex, unsigned int count)
+{
+	u32 *cs;
+
+	GEM_BUG_ON(!count || count > 63);
+
+	cs = intel_ring_begin(rq, 2 * count + 2);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	*cs++ = MI_LOAD_REGISTER_IMM(count);
+	do {
+		*cs++ = i915_mmio_reg_offset(flex->reg);
+		*cs++ = flex->value;
+	} while (flex++, --count);
+	*cs++ = MI_NOOP;
+
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
+static int
+gen8_emit_oa_config(struct i915_request *rq,
+		    struct intel_context *ce,
+		    const struct i915_oa_config *oa_config,
+		    bool self)
+{
+	struct drm_i915_private *i915 = rq->i915;
+	/* The MMIO offsets for Flex EU registers aren't contiguous */
+	const u32 ctx_flexeu0 = i915->perf.oa.ctx_flexeu0_offset;
+#define ctx_flexeuN(N) (ctx_flexeu0 + 2 * (N))
+	struct flex regs[] = {
+		{
+			GEN8_R_PWR_CLK_STATE,
+			CTX_R_PWR_CLK_STATE,
+			intel_sseu_make_rpcs(i915, &ce->sseu)
+		},
+		{
+			GEN8_OACTXCONTROL,
+			i915->perf.oa.ctx_oactxctrl_offset,
+			((i915->perf.oa.period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
+			 (i915->perf.oa.periodic ? GEN8_OA_TIMER_ENABLE : 0) |
+			 GEN8_OA_COUNTER_RESUME)
+		},
+		{ EU_PERF_CNTL0, ctx_flexeuN(0) },
+		{ EU_PERF_CNTL1, ctx_flexeuN(1) },
+		{ EU_PERF_CNTL2, ctx_flexeuN(2) },
+		{ EU_PERF_CNTL3, ctx_flexeuN(3) },
+		{ EU_PERF_CNTL4, ctx_flexeuN(4) },
+		{ EU_PERF_CNTL5, ctx_flexeuN(5) },
+		{ EU_PERF_CNTL6, ctx_flexeuN(6) },
+	};
+#undef ctx_flexeuN
+	int i;
+
+	for (i = 2; i < ARRAY_SIZE(regs); i++)
+		regs[i].value = oa_config_flex_reg(oa_config, regs[i].reg);
+
+	if (self)
+		return gen8_load_flex(rq, ce, regs, ARRAY_SIZE(regs));
+	else
+		return gen8_store_flex(rq, ce, regs, ARRAY_SIZE(regs));
+}
+
+static int gen8_modify_context(struct intel_context *ce,
+			       const struct i915_oa_config *oa_config)
+{
+	struct i915_request *rq;
+	int err;
+
+	lockdep_assert_held(&ce->pin_mutex);
+
+	rq = i915_request_create(ce->engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	/* Serialise with the remote context */
+	err = i915_active_request_set(&ce->ring->timeline->last_request, rq);
+	if (err)
+		goto out_add;
+
+	/* Keep the remote context alive until after we finish editing */
+	err = i915_active_ref(&ce->active, rq->fence.context, rq);
+	if (err)
+		goto out_add;
+
+	err = gen8_emit_oa_config(rq, ce, oa_config, false);
+
+out_add:
+	i915_request_add(rq);
+	return err;
+}
+
+static int gen8_modify_self(struct intel_context *ce,
+			    const struct i915_oa_config *oa_config)
+{
+	struct i915_request *rq;
+	int err;
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	err = gen8_emit_oa_config(rq, ce, oa_config, true);
+
+	i915_request_add(rq);
+	return err;
+}
+
 /*
  * Manages updating the per-context aspects of the OA stream
  * configuration across all contexts.
@@ -1716,15 +1861,15 @@  gen8_update_reg_state_unlocked(struct intel_context *ce,
  *
  * Note: it's only the RCS/Render context that has any OA state.
  */
-static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
+static int gen8_configure_all_contexts(struct drm_i915_private *i915,
 				       const struct i915_oa_config *oa_config)
 {
-	unsigned int map_type = i915_coherent_map_type(dev_priv);
 	struct i915_gem_context *ctx;
-	struct i915_request *rq;
-	int ret;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	int err;
 
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
+	lockdep_assert_held(&i915->drm.struct_mutex);
 
 	/*
 	 * The OA register config is setup through the context image. This image
@@ -1735,59 +1880,47 @@  static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 	 * We could emit the OA register config through the batch buffer but
 	 * this might leave small interval of time where the OA unit is
 	 * configured at an invalid sampling period.
-	 *
-	 * So far the best way to work around this issue seems to be draining
-	 * the GPU from any submitted work.
 	 */
-	ret = i915_gem_wait_for_idle(dev_priv,
-				     I915_WAIT_LOCKED,
-				     MAX_SCHEDULE_TIMEOUT);
-	if (ret)
-		return ret;
-
-	/* Update all contexts now that we've stalled the submission. */
-	list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
+	list_for_each_entry(ctx, &i915->contexts.list, link) {
 		struct i915_gem_engines_iter it;
 		struct intel_context *ce;
 
 		for_each_gem_engine(ce,
 				    i915_gem_context_lock_engines(ctx),
 				    it) {
-			u32 *regs;
-
 			if (ce->engine->class != RENDER_CLASS)
 				continue;
 
-			/* OA settings will be set upon first use */
-			if (!ce->state)
-				continue;
-
-			regs = i915_gem_object_pin_map(ce->state->obj,
-						       map_type);
-			if (IS_ERR(regs)) {
-				i915_gem_context_unlock_engines(ctx);
-				return PTR_ERR(regs);
-			}
-
-			ce->state->obj->mm.dirty = true;
-			regs += LRC_STATE_PN * PAGE_SIZE / sizeof(*regs);
+			err = intel_context_lock_pinned(ce);
+			if (err)
+				break;
 
-			gen8_update_reg_state_unlocked(ce, regs, oa_config);
+			/* Otherwise OA settings will be set upon first use */
+			if (intel_context_is_pinned(ce))
+				err = gen8_modify_context(ce, oa_config);
 
-			i915_gem_object_unpin_map(ce->state->obj);
+			intel_context_unlock_pinned(ce);
+			if (err)
+				break;
 		}
 		i915_gem_context_unlock_engines(ctx);
+		if (err)
+			return err;
 	}
 
 	/*
-	 * Apply the configuration by doing one context restore of the edited
-	 * context image.
+	 * After updating all other contexts, we need to modify ourselves.
+	 * If we don't modify the kernel_context, we do not get events while
+	 * idle.
 	 */
-	rq = i915_request_create(dev_priv->engine[RCS0]->kernel_context);
-	if (IS_ERR(rq))
-		return PTR_ERR(rq);
+	for_each_engine(engine, i915, id) {
+		if (engine->class != RENDER_CLASS)
+			continue;
 
-	i915_request_add(rq);
+		err = gen8_modify_self(engine->kernel_context, oa_config);
+		if (err)
+			return err;
+	}
 
 	return 0;
 }