Message ID | 20190708111944.20095-1-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [1/2] drm/i915/oa: Reconfigure contexts on the fly | expand |
On 08/07/2019 12:19, Chris Wilson wrote: > Avoid a global idle barrier by reconfiguring each context by rewriting > them with MI_STORE_DWORD from the kernel context. > > v2: We only need to determine the desired register values once, they are > the same for all contexts. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> > --- > drivers/gpu/drm/i915/gem/i915_gem_context.c | 2 + > drivers/gpu/drm/i915/gt/intel_lrc.c | 7 +- > drivers/gpu/drm/i915/i915_perf.c | 248 +++++++++++++++----- > 3 files changed, 195 insertions(+), 62 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c > index e367dce2a696..1f0d10bb88c1 100644 > --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c > +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c > @@ -624,7 +624,9 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio) > ctx->sched.priority = I915_USER_PRIORITY(prio); > ctx->ring_size = PAGE_SIZE; > > + /* Isolate the kernel context from prying eyes and sticky fingers */ > GEM_BUG_ON(!i915_gem_context_is_kernel(ctx)); > + list_del_init(&ctx->link); Why exactly? Any repercussions for i915_sysfs/i915_l3_write? debugfs I gather you won't care about? Should adding to i915->contexts.list be done from gem_context_register? What remains not a kernel context, but on a list? Won't preempt context be missed in re-configuration? Regards, Tvrtko > > return ctx; > } > diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c > index e1ae1399c72b..9cc5374401e1 100644 > --- a/drivers/gpu/drm/i915/gt/intel_lrc.c > +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c > @@ -1552,9 +1552,12 @@ __execlists_update_reg_state(struct intel_context *ce, > regs[CTX_RING_TAIL + 1] = ring->tail; > > /* RPCS */ > - if (engine->class == RENDER_CLASS) > + if (engine->class == RENDER_CLASS) { > regs[CTX_R_PWR_CLK_STATE + 1] = > intel_sseu_make_rpcs(engine->i915, &ce->sseu); > + > + i915_oa_init_reg_state(engine, ce, regs); > + } > } > > static int > @@ -2966,8 +2969,6 @@ static void execlists_init_reg_state(u32 *regs, > if (rcs) { > regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1); > CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0); > - > - i915_oa_init_reg_state(engine, ce, regs); > } > > regs[CTX_END] = MI_BATCH_BUFFER_END; > diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c > index 357e63beb373..25c34a0b521e 100644 > --- a/drivers/gpu/drm/i915/i915_perf.c > +++ b/drivers/gpu/drm/i915/i915_perf.c > @@ -1630,6 +1630,27 @@ static void hsw_disable_metric_set(struct drm_i915_private *dev_priv) > ~GT_NOA_ENABLE)); > } > > +static u32 oa_config_flex_reg(const struct i915_oa_config *oa_config, > + i915_reg_t reg) > +{ > + u32 mmio = i915_mmio_reg_offset(reg); > + int i; > + > + /* > + * This arbitrary default will select the 'EU FPU0 Pipeline > + * Active' event. In the future it's anticipated that there > + * will be an explicit 'No Event' we can select, but not yet... > + */ > + if (!oa_config) > + return 0; > + > + for (i = 0; i < oa_config->flex_regs_len; i++) { > + if (i915_mmio_reg_offset(oa_config->flex_regs[i].addr) == mmio) > + return oa_config->flex_regs[i].value; > + } > + > + return 0; > +} > /* > * NB: It must always remain pointer safe to run this even if the OA unit > * has been disabled. > @@ -1663,28 +1684,8 @@ gen8_update_reg_state_unlocked(struct intel_context *ce, > GEN8_OA_COUNTER_RESUME); > > for (i = 0; i < ARRAY_SIZE(flex_regs); i++) { > - u32 state_offset = ctx_flexeu0 + i * 2; > - u32 mmio = i915_mmio_reg_offset(flex_regs[i]); > - > - /* > - * This arbitrary default will select the 'EU FPU0 Pipeline > - * Active' event. In the future it's anticipated that there > - * will be an explicit 'No Event' we can select, but not yet... > - */ > - u32 value = 0; > - > - if (oa_config) { > - u32 j; > - > - for (j = 0; j < oa_config->flex_regs_len; j++) { > - if (i915_mmio_reg_offset(oa_config->flex_regs[j].addr) == mmio) { > - value = oa_config->flex_regs[j].value; > - break; > - } > - } > - } > - > - CTX_REG(reg_state, state_offset, flex_regs[i], value); > + CTX_REG(reg_state, ctx_flexeu0 + i * 2, flex_regs[i], > + oa_config_flex_reg(oa_config, flex_regs[i])); > } > > CTX_REG(reg_state, > @@ -1692,6 +1693,107 @@ gen8_update_reg_state_unlocked(struct intel_context *ce, > intel_sseu_make_rpcs(i915, &ce->sseu)); > } > > +struct flex { > + i915_reg_t reg; > + u32 offset; > + u32 value; > +}; > + > +static int > +gen8_store_flex(struct i915_request *rq, > + struct intel_context *ce, > + const struct flex *flex, unsigned int count) > +{ > + u32 offset; > + u32 *cs; > + > + cs = intel_ring_begin(rq, 4 * count); > + if (IS_ERR(cs)) > + return PTR_ERR(cs); > + > + offset = i915_ggtt_offset(ce->state) + LRC_STATE_PN * PAGE_SIZE; > + do { > + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; > + *cs++ = offset + (flex->offset + 1) * sizeof(u32); > + *cs++ = 0; > + *cs++ = flex->value; > + } while (flex++, --count); > + > + intel_ring_advance(rq, cs); > + > + return 0; > +} > + > +static int > +gen8_load_flex(struct i915_request *rq, > + struct intel_context *ce, > + const struct flex *flex, unsigned int count) > +{ > + u32 *cs; > + > + GEM_BUG_ON(!count || count > 63); > + > + cs = intel_ring_begin(rq, 2 * count + 2); > + if (IS_ERR(cs)) > + return PTR_ERR(cs); > + > + *cs++ = MI_LOAD_REGISTER_IMM(count); > + do { > + *cs++ = i915_mmio_reg_offset(flex->reg); > + *cs++ = flex->value; > + } while (flex++, --count); > + *cs++ = MI_NOOP; > + > + intel_ring_advance(rq, cs); > + > + return 0; > +} > + > +static int gen8_modify_context(struct intel_context *ce, > + const struct flex *flex, unsigned int count) > +{ > + struct i915_request *rq; > + int err; > + > + lockdep_assert_held(&ce->pin_mutex); > + > + rq = i915_request_create(ce->engine->kernel_context); > + if (IS_ERR(rq)) > + return PTR_ERR(rq); > + > + /* Serialise with the remote context */ > + err = i915_active_request_set(&ce->ring->timeline->last_request, rq); > + if (err) > + goto out_add; > + > + /* Keep the remote context alive until after we finish editing */ > + err = i915_active_ref(&ce->active, rq->fence.context, rq); > + if (err) > + goto out_add; > + > + err = gen8_store_flex(rq, ce, flex, count); > + > +out_add: > + i915_request_add(rq); > + return err; > +} > + > +static int gen8_modify_self(struct intel_context *ce, > + const struct flex *flex, unsigned int count) > +{ > + struct i915_request *rq; > + int err; > + > + rq = i915_request_create(ce); > + if (IS_ERR(rq)) > + return PTR_ERR(rq); > + > + err = gen8_load_flex(rq, ce, flex, count); > + > + i915_request_add(rq); > + return err; > +} > + > /* > * Manages updating the per-context aspects of the OA stream > * configuration across all contexts. > @@ -1716,15 +1818,43 @@ gen8_update_reg_state_unlocked(struct intel_context *ce, > * > * Note: it's only the RCS/Render context that has any OA state. > */ > -static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv, > +static int gen8_configure_all_contexts(struct drm_i915_private *i915, > const struct i915_oa_config *oa_config) > { > - unsigned int map_type = i915_coherent_map_type(dev_priv); > + /* The MMIO offsets for Flex EU registers aren't contiguous */ > + const u32 ctx_flexeu0 = i915->perf.oa.ctx_flexeu0_offset; > +#define ctx_flexeuN(N) (ctx_flexeu0 + 2 * (N)) > + struct flex regs[] = { > + { > + GEN8_R_PWR_CLK_STATE, > + CTX_R_PWR_CLK_STATE, > + }, > + { > + GEN8_OACTXCONTROL, > + i915->perf.oa.ctx_oactxctrl_offset, > + ((i915->perf.oa.period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) | > + (i915->perf.oa.periodic ? GEN8_OA_TIMER_ENABLE : 0) | > + GEN8_OA_COUNTER_RESUME) > + }, > + { EU_PERF_CNTL0, ctx_flexeuN(0) }, > + { EU_PERF_CNTL1, ctx_flexeuN(1) }, > + { EU_PERF_CNTL2, ctx_flexeuN(2) }, > + { EU_PERF_CNTL3, ctx_flexeuN(3) }, > + { EU_PERF_CNTL4, ctx_flexeuN(4) }, > + { EU_PERF_CNTL5, ctx_flexeuN(5) }, > + { EU_PERF_CNTL6, ctx_flexeuN(6) }, > + }; > +#undef ctx_flexeuN > + struct intel_engine_cs *engine; > struct i915_gem_context *ctx; > - struct i915_request *rq; > - int ret; > + enum intel_engine_id id; > + int err; > + int i; > + > + for (i = 2; i < ARRAY_SIZE(regs); i++) > + regs[i].value = oa_config_flex_reg(oa_config, regs[i].reg); > > - lockdep_assert_held(&dev_priv->drm.struct_mutex); > + lockdep_assert_held(&i915->drm.struct_mutex); > > /* > * The OA register config is setup through the context image. This image > @@ -1736,58 +1866,58 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv, > * this might leave small interval of time where the OA unit is > * configured at an invalid sampling period. > * > - * So far the best way to work around this issue seems to be draining > - * the GPU from any submitted work. > + * Note that since we emit all requests from a single ring, there > + * is still an implicit global barrier here that may cause a high > + * priority context to wait for an otherwise independent low priority > + * context. Contexts idle at the time of reconfiguration are not > + * trapped behind the barrier. > */ > - ret = i915_gem_wait_for_idle(dev_priv, > - I915_WAIT_LOCKED, > - MAX_SCHEDULE_TIMEOUT); > - if (ret) > - return ret; > - > - /* Update all contexts now that we've stalled the submission. */ > - list_for_each_entry(ctx, &dev_priv->contexts.list, link) { > + list_for_each_entry(ctx, &i915->contexts.list, link) { > struct i915_gem_engines_iter it; > struct intel_context *ce; > > for_each_gem_engine(ce, > i915_gem_context_lock_engines(ctx), > it) { > - u32 *regs; > - > if (ce->engine->class != RENDER_CLASS) > continue; > > - /* OA settings will be set upon first use */ > - if (!ce->state) > - continue; > - > - regs = i915_gem_object_pin_map(ce->state->obj, > - map_type); > - if (IS_ERR(regs)) { > - i915_gem_context_unlock_engines(ctx); > - return PTR_ERR(regs); > - } > + err = intel_context_lock_pinned(ce); > + if (err) > + break; > > - ce->state->obj->mm.dirty = true; > - regs += LRC_STATE_PN * PAGE_SIZE / sizeof(*regs); > + regs[0].value = intel_sseu_make_rpcs(i915, &ce->sseu); > > - gen8_update_reg_state_unlocked(ce, regs, oa_config); > + /* Otherwise OA settings will be set upon first use */ > + if (intel_context_is_pinned(ce)) > + err = gen8_modify_context(ce, regs, ARRAY_SIZE(regs)); > > - i915_gem_object_unpin_map(ce->state->obj); > + intel_context_unlock_pinned(ce); > + if (err) > + break; > } > i915_gem_context_unlock_engines(ctx); > + if (err) > + return err; > } > > /* > - * Apply the configuration by doing one context restore of the edited > - * context image. > + * After updating all other contexts, we need to modify ourselves. > + * If we don't modify the kernel_context, we do not get events while > + * idle. > */ > - rq = i915_request_create(dev_priv->engine[RCS0]->kernel_context); > - if (IS_ERR(rq)) > - return PTR_ERR(rq); > + for_each_engine(engine, i915, id) { > + struct intel_context *ce = engine->kernel_context; > > - i915_request_add(rq); > + if (engine->class != RENDER_CLASS) > + continue; > + > + regs[0].value = intel_sseu_make_rpcs(i915, &ce->sseu); > + > + err = gen8_modify_self(ce, regs, ARRAY_SIZE(regs)); > + if (err) > + return err; > + } > > return 0; > } >
Quoting Tvrtko Ursulin (2019-07-08 13:10:34) > > On 08/07/2019 12:19, Chris Wilson wrote: > > Avoid a global idle barrier by reconfiguring each context by rewriting > > them with MI_STORE_DWORD from the kernel context. > > > > v2: We only need to determine the desired register values once, they are > > the same for all contexts. > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com> > > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> > > --- > > drivers/gpu/drm/i915/gem/i915_gem_context.c | 2 + > > drivers/gpu/drm/i915/gt/intel_lrc.c | 7 +- > > drivers/gpu/drm/i915/i915_perf.c | 248 +++++++++++++++----- > > 3 files changed, 195 insertions(+), 62 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c > > index e367dce2a696..1f0d10bb88c1 100644 > > --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c > > +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c > > @@ -624,7 +624,9 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio) > > ctx->sched.priority = I915_USER_PRIORITY(prio); > > ctx->ring_size = PAGE_SIZE; > > > > + /* Isolate the kernel context from prying eyes and sticky fingers */ > > GEM_BUG_ON(!i915_gem_context_is_kernel(ctx)); > > + list_del_init(&ctx->link); > > Why exactly? Otherwise we recursively try to modify the context. > Any repercussions for i915_sysfs/i915_l3_write? debugfs I gather you > won't care about? No, because what matters for those is user contexts. > Should adding to i915->contexts.list be done from gem_context_register? > What remains not a kernel context, but on a list? And I also plan to pull it under its own mutex. > Won't preempt context be missed in re-configuration? What preempt-context? :-p And I would skip kernel_context if I could, but for whatever reason oa events are being sent to userspace even while the GPU is idle and igt expects the regular tick. -Chris
On 08/07/2019 13:16, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2019-07-08 13:10:34) >> >> On 08/07/2019 12:19, Chris Wilson wrote: >>> Avoid a global idle barrier by reconfiguring each context by rewriting >>> them with MI_STORE_DWORD from the kernel context. >>> >>> v2: We only need to determine the desired register values once, they are >>> the same for all contexts. >>> >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> >>> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com> >>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >>> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> >>> --- >>> drivers/gpu/drm/i915/gem/i915_gem_context.c | 2 + >>> drivers/gpu/drm/i915/gt/intel_lrc.c | 7 +- >>> drivers/gpu/drm/i915/i915_perf.c | 248 +++++++++++++++----- >>> 3 files changed, 195 insertions(+), 62 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c >>> index e367dce2a696..1f0d10bb88c1 100644 >>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c >>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c >>> @@ -624,7 +624,9 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio) >>> ctx->sched.priority = I915_USER_PRIORITY(prio); >>> ctx->ring_size = PAGE_SIZE; >>> >>> + /* Isolate the kernel context from prying eyes and sticky fingers */ >>> GEM_BUG_ON(!i915_gem_context_is_kernel(ctx)); >>> + list_del_init(&ctx->link); >> >> Why exactly? > > Otherwise we recursively try to modify the context. From gen8_configure_all_contexts, twice, or really recursively? If former isn't that solvable by simply skipping kernel contexts in the first loop? >> Any repercussions for i915_sysfs/i915_l3_write? debugfs I gather you >> won't care about? > > No, because what matters for those is user contexts. There isn't some time cost associated with l3_remap calls when switching contexts? > >> Should adding to i915->contexts.list be done from gem_context_register? >> What remains not a kernel context, but on a list? > > And I also plan to pull it under its own mutex. > >> Won't preempt context be missed in re-configuration? > > What preempt-context? :-p And I would skip kernel_context if I could, but > for whatever reason oa events are being sent to userspace even while the > GPU is idle and igt expects the regular tick. My tree still has it? Is it out of date? Solution with keeping it on the list and skipping sounds more future proof if doable. Regards, Tvrtko
Quoting Tvrtko Ursulin (2019-07-08 13:24:39) > > On 08/07/2019 13:16, Chris Wilson wrote: > > Quoting Tvrtko Ursulin (2019-07-08 13:10:34) > >> > >> On 08/07/2019 12:19, Chris Wilson wrote: > >>> Avoid a global idle barrier by reconfiguring each context by rewriting > >>> them with MI_STORE_DWORD from the kernel context. > >>> > >>> v2: We only need to determine the desired register values once, they are > >>> the same for all contexts. > >>> > >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > >>> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com> > >>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > >>> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> > >>> --- > >>> drivers/gpu/drm/i915/gem/i915_gem_context.c | 2 + > >>> drivers/gpu/drm/i915/gt/intel_lrc.c | 7 +- > >>> drivers/gpu/drm/i915/i915_perf.c | 248 +++++++++++++++----- > >>> 3 files changed, 195 insertions(+), 62 deletions(-) > >>> > >>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c > >>> index e367dce2a696..1f0d10bb88c1 100644 > >>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c > >>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c > >>> @@ -624,7 +624,9 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio) > >>> ctx->sched.priority = I915_USER_PRIORITY(prio); > >>> ctx->ring_size = PAGE_SIZE; > >>> > >>> + /* Isolate the kernel context from prying eyes and sticky fingers */ > >>> GEM_BUG_ON(!i915_gem_context_is_kernel(ctx)); > >>> + list_del_init(&ctx->link); > >> > >> Why exactly? > > > > Otherwise we recursively try to modify the context. > > From gen8_configure_all_contexts, twice, or really recursively? If > former isn't that solvable by simply skipping kernel contexts in the > first loop? > > >> Any repercussions for i915_sysfs/i915_l3_write? debugfs I gather you > >> won't care about? > > > > No, because what matters for those is user contexts. > > There isn't some time cost associated with l3_remap calls when switching > contexts? No, it's even weirder than that as it is not a context register (at least on hsw where we support it). I guess, we should just unlazy and emit a request from the sysfs. > >> Should adding to i915->contexts.list be done from gem_context_register? > >> What remains not a kernel context, but on a list? > > > > And I also plan to pull it under its own mutex. > > > >> Won't preempt context be missed in re-configuration? > > > > What preempt-context? :-p And I would skip kernel_context if I could, but > > for whatever reason oa events are being sent to userspace even while the > > GPU is idle and igt expects the regular tick. > > My tree still has it? Is it out of date? Solution with keeping it on the > list and skipping sounds more future proof if doable. It's only used by guc for the guc command to preempt-to-idle. It should not be recorded as an oa event but an inter-context blip (which is exactly what it is). Similarly, I also think oa should not prying into the kernel context (even though it's not used for much at present). -Chris
On 08/07/2019 14:39, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2019-07-08 13:24:39) >> >> On 08/07/2019 13:16, Chris Wilson wrote: >>> Quoting Tvrtko Ursulin (2019-07-08 13:10:34) >>>> >>>> On 08/07/2019 12:19, Chris Wilson wrote: >>>>> Avoid a global idle barrier by reconfiguring each context by rewriting >>>>> them with MI_STORE_DWORD from the kernel context. >>>>> >>>>> v2: We only need to determine the desired register values once, they are >>>>> the same for all contexts. >>>>> >>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> >>>>> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com> >>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >>>>> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> >>>>> --- >>>>> drivers/gpu/drm/i915/gem/i915_gem_context.c | 2 + >>>>> drivers/gpu/drm/i915/gt/intel_lrc.c | 7 +- >>>>> drivers/gpu/drm/i915/i915_perf.c | 248 +++++++++++++++----- >>>>> 3 files changed, 195 insertions(+), 62 deletions(-) >>>>> >>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c >>>>> index e367dce2a696..1f0d10bb88c1 100644 >>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c >>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c >>>>> @@ -624,7 +624,9 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio) >>>>> ctx->sched.priority = I915_USER_PRIORITY(prio); >>>>> ctx->ring_size = PAGE_SIZE; >>>>> >>>>> + /* Isolate the kernel context from prying eyes and sticky fingers */ >>>>> GEM_BUG_ON(!i915_gem_context_is_kernel(ctx)); >>>>> + list_del_init(&ctx->link); >>>> >>>> Why exactly? >>> >>> Otherwise we recursively try to modify the context. >> >> From gen8_configure_all_contexts, twice, or really recursively? If >> former isn't that solvable by simply skipping kernel contexts in the >> first loop? >> >>>> Any repercussions for i915_sysfs/i915_l3_write? debugfs I gather you >>>> won't care about? >>> >>> No, because what matters for those is user contexts. >> >> There isn't some time cost associated with l3_remap calls when switching >> contexts? > > No, it's even weirder than that as it is not a context register (at > least on hsw where we support it). I guess, we should just unlazy and emit > a request from the sysfs. > >>>> Should adding to i915->contexts.list be done from gem_context_register? >>>> What remains not a kernel context, but on a list? >>> >>> And I also plan to pull it under its own mutex. >>> >>>> Won't preempt context be missed in re-configuration? >>> >>> What preempt-context? :-p And I would skip kernel_context if I could, but >>> for whatever reason oa events are being sent to userspace even while the >>> GPU is idle and igt expects the regular tick. >> >> My tree still has it? Is it out of date? Solution with keeping it on the >> list and skipping sounds more future proof if doable. > > It's only used by guc for the guc command to preempt-to-idle. It should > not be recorded as an oa event but an inter-context blip (which is > exactly what it is). > > Similarly, I also think oa should not prying into the kernel context > (even though it's not used for much at present). Will there be a separate blitter context? OA can handle the changing config between kernel and other contexts? Regards, Tvrtko
Quoting Tvrtko Ursulin (2019-07-11 08:23:37) > > On 08/07/2019 14:39, Chris Wilson wrote: > > Quoting Tvrtko Ursulin (2019-07-08 13:24:39) > >> > >> On 08/07/2019 13:16, Chris Wilson wrote: > >>> Quoting Tvrtko Ursulin (2019-07-08 13:10:34) > >>>> > >>>> On 08/07/2019 12:19, Chris Wilson wrote: > >>>>> Avoid a global idle barrier by reconfiguring each context by rewriting > >>>>> them with MI_STORE_DWORD from the kernel context. > >>>>> > >>>>> v2: We only need to determine the desired register values once, they are > >>>>> the same for all contexts. > >>>>> > >>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > >>>>> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com> > >>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > >>>>> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> > >>>>> --- > >>>>> drivers/gpu/drm/i915/gem/i915_gem_context.c | 2 + > >>>>> drivers/gpu/drm/i915/gt/intel_lrc.c | 7 +- > >>>>> drivers/gpu/drm/i915/i915_perf.c | 248 +++++++++++++++----- > >>>>> 3 files changed, 195 insertions(+), 62 deletions(-) > >>>>> > >>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c > >>>>> index e367dce2a696..1f0d10bb88c1 100644 > >>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c > >>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c > >>>>> @@ -624,7 +624,9 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio) > >>>>> ctx->sched.priority = I915_USER_PRIORITY(prio); > >>>>> ctx->ring_size = PAGE_SIZE; > >>>>> > >>>>> + /* Isolate the kernel context from prying eyes and sticky fingers */ > >>>>> GEM_BUG_ON(!i915_gem_context_is_kernel(ctx)); > >>>>> + list_del_init(&ctx->link); > >>>> > >>>> Why exactly? > >>> > >>> Otherwise we recursively try to modify the context. > >> > >> From gen8_configure_all_contexts, twice, or really recursively? If > >> former isn't that solvable by simply skipping kernel contexts in the > >> first loop? > >> > >>>> Any repercussions for i915_sysfs/i915_l3_write? debugfs I gather you > >>>> won't care about? > >>> > >>> No, because what matters for those is user contexts. > >> > >> There isn't some time cost associated with l3_remap calls when switching > >> contexts? > > > > No, it's even weirder than that as it is not a context register (at > > least on hsw where we support it). I guess, we should just unlazy and emit > > a request from the sysfs. > > > >>>> Should adding to i915->contexts.list be done from gem_context_register? > >>>> What remains not a kernel context, but on a list? > >>> > >>> And I also plan to pull it under its own mutex. > >>> > >>>> Won't preempt context be missed in re-configuration? > >>> > >>> What preempt-context? :-p And I would skip kernel_context if I could, but > >>> for whatever reason oa events are being sent to userspace even while the > >>> GPU is idle and igt expects the regular tick. > >> > >> My tree still has it? Is it out of date? Solution with keeping it on the > >> list and skipping sounds more future proof if doable. > > > > It's only used by guc for the guc command to preempt-to-idle. It should > > not be recorded as an oa event but an inter-context blip (which is > > exactly what it is). > > > > Similarly, I also think oa should not prying into the kernel context > > (even though it's not used for much at present). > > Will there be a separate blitter context? Yes, no. I will try to push as much of the migration into the culprit, but there will always be a need for kswapd/kcompactd for background cleanup and a get-out-of-jail. That should be the kernel_context as far as I am concerned. > OA can handle the changing config between kernel and other contexts? Imo, the less spying on things it is not privileged to spy on the better. I have no idea how stable it is, the code gives a very underwhelming impression. -Chris
Quoting Chris Wilson (2019-07-08 14:39:35) > Quoting Tvrtko Ursulin (2019-07-08 13:24:39) > > > > On 08/07/2019 13:16, Chris Wilson wrote: > > > Quoting Tvrtko Ursulin (2019-07-08 13:10:34) > > >> > > >> On 08/07/2019 12:19, Chris Wilson wrote: > > >>> Avoid a global idle barrier by reconfiguring each context by rewriting > > >>> them with MI_STORE_DWORD from the kernel context. > > >>> > > >>> v2: We only need to determine the desired register values once, they are > > >>> the same for all contexts. > > >>> > > >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > >>> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com> > > >>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > >>> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> > > >>> --- > > >>> drivers/gpu/drm/i915/gem/i915_gem_context.c | 2 + > > >>> drivers/gpu/drm/i915/gt/intel_lrc.c | 7 +- > > >>> drivers/gpu/drm/i915/i915_perf.c | 248 +++++++++++++++----- > > >>> 3 files changed, 195 insertions(+), 62 deletions(-) > > >>> > > >>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c > > >>> index e367dce2a696..1f0d10bb88c1 100644 > > >>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c > > >>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c > > >>> @@ -624,7 +624,9 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio) > > >>> ctx->sched.priority = I915_USER_PRIORITY(prio); > > >>> ctx->ring_size = PAGE_SIZE; > > >>> > > >>> + /* Isolate the kernel context from prying eyes and sticky fingers */ > > >>> GEM_BUG_ON(!i915_gem_context_is_kernel(ctx)); > > >>> + list_del_init(&ctx->link); > > >> > > >> Why exactly? > > > > > > Otherwise we recursively try to modify the context. > > > > From gen8_configure_all_contexts, twice, or really recursively? If > > former isn't that solvable by simply skipping kernel contexts in the > > first loop? > > > > >> Any repercussions for i915_sysfs/i915_l3_write? debugfs I gather you > > >> won't care about? > > > > > > No, because what matters for those is user contexts. > > > > There isn't some time cost associated with l3_remap calls when switching > > contexts? > > No, it's even weirder than that as it is not a context register (at > least on hsw where we support it). I guess, we should just unlazy and emit > a request from the sysfs. Actually, I am wrong here. The L3 remapping registers do appear to be saved in the context image. At least that is what my testing indicates. I can still argue that they don't apply to the kernel context. But my plan to remove the walk over GEM contexts and the lazy update fails. -Chris
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index e367dce2a696..1f0d10bb88c1 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -624,7 +624,9 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio) ctx->sched.priority = I915_USER_PRIORITY(prio); ctx->ring_size = PAGE_SIZE; + /* Isolate the kernel context from prying eyes and sticky fingers */ GEM_BUG_ON(!i915_gem_context_is_kernel(ctx)); + list_del_init(&ctx->link); return ctx; } diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index e1ae1399c72b..9cc5374401e1 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -1552,9 +1552,12 @@ __execlists_update_reg_state(struct intel_context *ce, regs[CTX_RING_TAIL + 1] = ring->tail; /* RPCS */ - if (engine->class == RENDER_CLASS) + if (engine->class == RENDER_CLASS) { regs[CTX_R_PWR_CLK_STATE + 1] = intel_sseu_make_rpcs(engine->i915, &ce->sseu); + + i915_oa_init_reg_state(engine, ce, regs); + } } static int @@ -2966,8 +2969,6 @@ static void execlists_init_reg_state(u32 *regs, if (rcs) { regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1); CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0); - - i915_oa_init_reg_state(engine, ce, regs); } regs[CTX_END] = MI_BATCH_BUFFER_END; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 357e63beb373..25c34a0b521e 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -1630,6 +1630,27 @@ static void hsw_disable_metric_set(struct drm_i915_private *dev_priv) ~GT_NOA_ENABLE)); } +static u32 oa_config_flex_reg(const struct i915_oa_config *oa_config, + i915_reg_t reg) +{ + u32 mmio = i915_mmio_reg_offset(reg); + int i; + + /* + * This arbitrary default will select the 'EU FPU0 Pipeline + * Active' event. In the future it's anticipated that there + * will be an explicit 'No Event' we can select, but not yet... + */ + if (!oa_config) + return 0; + + for (i = 0; i < oa_config->flex_regs_len; i++) { + if (i915_mmio_reg_offset(oa_config->flex_regs[i].addr) == mmio) + return oa_config->flex_regs[i].value; + } + + return 0; +} /* * NB: It must always remain pointer safe to run this even if the OA unit * has been disabled. @@ -1663,28 +1684,8 @@ gen8_update_reg_state_unlocked(struct intel_context *ce, GEN8_OA_COUNTER_RESUME); for (i = 0; i < ARRAY_SIZE(flex_regs); i++) { - u32 state_offset = ctx_flexeu0 + i * 2; - u32 mmio = i915_mmio_reg_offset(flex_regs[i]); - - /* - * This arbitrary default will select the 'EU FPU0 Pipeline - * Active' event. In the future it's anticipated that there - * will be an explicit 'No Event' we can select, but not yet... - */ - u32 value = 0; - - if (oa_config) { - u32 j; - - for (j = 0; j < oa_config->flex_regs_len; j++) { - if (i915_mmio_reg_offset(oa_config->flex_regs[j].addr) == mmio) { - value = oa_config->flex_regs[j].value; - break; - } - } - } - - CTX_REG(reg_state, state_offset, flex_regs[i], value); + CTX_REG(reg_state, ctx_flexeu0 + i * 2, flex_regs[i], + oa_config_flex_reg(oa_config, flex_regs[i])); } CTX_REG(reg_state, @@ -1692,6 +1693,107 @@ gen8_update_reg_state_unlocked(struct intel_context *ce, intel_sseu_make_rpcs(i915, &ce->sseu)); } +struct flex { + i915_reg_t reg; + u32 offset; + u32 value; +}; + +static int +gen8_store_flex(struct i915_request *rq, + struct intel_context *ce, + const struct flex *flex, unsigned int count) +{ + u32 offset; + u32 *cs; + + cs = intel_ring_begin(rq, 4 * count); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + offset = i915_ggtt_offset(ce->state) + LRC_STATE_PN * PAGE_SIZE; + do { + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = offset + (flex->offset + 1) * sizeof(u32); + *cs++ = 0; + *cs++ = flex->value; + } while (flex++, --count); + + intel_ring_advance(rq, cs); + + return 0; +} + +static int +gen8_load_flex(struct i915_request *rq, + struct intel_context *ce, + const struct flex *flex, unsigned int count) +{ + u32 *cs; + + GEM_BUG_ON(!count || count > 63); + + cs = intel_ring_begin(rq, 2 * count + 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_LOAD_REGISTER_IMM(count); + do { + *cs++ = i915_mmio_reg_offset(flex->reg); + *cs++ = flex->value; + } while (flex++, --count); + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + + return 0; +} + +static int gen8_modify_context(struct intel_context *ce, + const struct flex *flex, unsigned int count) +{ + struct i915_request *rq; + int err; + + lockdep_assert_held(&ce->pin_mutex); + + rq = i915_request_create(ce->engine->kernel_context); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + /* Serialise with the remote context */ + err = i915_active_request_set(&ce->ring->timeline->last_request, rq); + if (err) + goto out_add; + + /* Keep the remote context alive until after we finish editing */ + err = i915_active_ref(&ce->active, rq->fence.context, rq); + if (err) + goto out_add; + + err = gen8_store_flex(rq, ce, flex, count); + +out_add: + i915_request_add(rq); + return err; +} + +static int gen8_modify_self(struct intel_context *ce, + const struct flex *flex, unsigned int count) +{ + struct i915_request *rq; + int err; + + rq = i915_request_create(ce); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + err = gen8_load_flex(rq, ce, flex, count); + + i915_request_add(rq); + return err; +} + /* * Manages updating the per-context aspects of the OA stream * configuration across all contexts. @@ -1716,15 +1818,43 @@ gen8_update_reg_state_unlocked(struct intel_context *ce, * * Note: it's only the RCS/Render context that has any OA state. */ -static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv, +static int gen8_configure_all_contexts(struct drm_i915_private *i915, const struct i915_oa_config *oa_config) { - unsigned int map_type = i915_coherent_map_type(dev_priv); + /* The MMIO offsets for Flex EU registers aren't contiguous */ + const u32 ctx_flexeu0 = i915->perf.oa.ctx_flexeu0_offset; +#define ctx_flexeuN(N) (ctx_flexeu0 + 2 * (N)) + struct flex regs[] = { + { + GEN8_R_PWR_CLK_STATE, + CTX_R_PWR_CLK_STATE, + }, + { + GEN8_OACTXCONTROL, + i915->perf.oa.ctx_oactxctrl_offset, + ((i915->perf.oa.period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) | + (i915->perf.oa.periodic ? GEN8_OA_TIMER_ENABLE : 0) | + GEN8_OA_COUNTER_RESUME) + }, + { EU_PERF_CNTL0, ctx_flexeuN(0) }, + { EU_PERF_CNTL1, ctx_flexeuN(1) }, + { EU_PERF_CNTL2, ctx_flexeuN(2) }, + { EU_PERF_CNTL3, ctx_flexeuN(3) }, + { EU_PERF_CNTL4, ctx_flexeuN(4) }, + { EU_PERF_CNTL5, ctx_flexeuN(5) }, + { EU_PERF_CNTL6, ctx_flexeuN(6) }, + }; +#undef ctx_flexeuN + struct intel_engine_cs *engine; struct i915_gem_context *ctx; - struct i915_request *rq; - int ret; + enum intel_engine_id id; + int err; + int i; + + for (i = 2; i < ARRAY_SIZE(regs); i++) + regs[i].value = oa_config_flex_reg(oa_config, regs[i].reg); - lockdep_assert_held(&dev_priv->drm.struct_mutex); + lockdep_assert_held(&i915->drm.struct_mutex); /* * The OA register config is setup through the context image. This image @@ -1736,58 +1866,58 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv, * this might leave small interval of time where the OA unit is * configured at an invalid sampling period. * - * So far the best way to work around this issue seems to be draining - * the GPU from any submitted work. + * Note that since we emit all requests from a single ring, there + * is still an implicit global barrier here that may cause a high + * priority context to wait for an otherwise independent low priority + * context. Contexts idle at the time of reconfiguration are not + * trapped behind the barrier. */ - ret = i915_gem_wait_for_idle(dev_priv, - I915_WAIT_LOCKED, - MAX_SCHEDULE_TIMEOUT); - if (ret) - return ret; - - /* Update all contexts now that we've stalled the submission. */ - list_for_each_entry(ctx, &dev_priv->contexts.list, link) { + list_for_each_entry(ctx, &i915->contexts.list, link) { struct i915_gem_engines_iter it; struct intel_context *ce; for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) { - u32 *regs; - if (ce->engine->class != RENDER_CLASS) continue; - /* OA settings will be set upon first use */ - if (!ce->state) - continue; - - regs = i915_gem_object_pin_map(ce->state->obj, - map_type); - if (IS_ERR(regs)) { - i915_gem_context_unlock_engines(ctx); - return PTR_ERR(regs); - } + err = intel_context_lock_pinned(ce); + if (err) + break; - ce->state->obj->mm.dirty = true; - regs += LRC_STATE_PN * PAGE_SIZE / sizeof(*regs); + regs[0].value = intel_sseu_make_rpcs(i915, &ce->sseu); - gen8_update_reg_state_unlocked(ce, regs, oa_config); + /* Otherwise OA settings will be set upon first use */ + if (intel_context_is_pinned(ce)) + err = gen8_modify_context(ce, regs, ARRAY_SIZE(regs)); - i915_gem_object_unpin_map(ce->state->obj); + intel_context_unlock_pinned(ce); + if (err) + break; } i915_gem_context_unlock_engines(ctx); + if (err) + return err; } /* - * Apply the configuration by doing one context restore of the edited - * context image. + * After updating all other contexts, we need to modify ourselves. + * If we don't modify the kernel_context, we do not get events while + * idle. */ - rq = i915_request_create(dev_priv->engine[RCS0]->kernel_context); - if (IS_ERR(rq)) - return PTR_ERR(rq); + for_each_engine(engine, i915, id) { + struct intel_context *ce = engine->kernel_context; - i915_request_add(rq); + if (engine->class != RENDER_CLASS) + continue; + + regs[0].value = intel_sseu_make_rpcs(i915, &ce->sseu); + + err = gen8_modify_self(ce, regs, ARRAY_SIZE(regs)); + if (err) + return err; + } return 0; }