diff mbox

[v3,4/6] drm/i915: reprogram NOA muxes on context switch when using perf

Message ID 20180508180347.32080-5-lionel.g.landwerlin@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Lionel Landwerlin May 8, 2018, 6:03 p.m. UTC
If some of the contexts submitting workloads to the GPU have been
configured to shutdown slices/subslices, we might lose the NOA
configurations written in the NOA muxes. We need to reprogram them
when we detect a powergating configuration change.

In this change i915/perf is responsible for setting up a reprogramming
batchbuffer which we execute just before the userspace submitted
batchbuffer. We do this while preemption is still disabled, only if
needed. The decision to execute this reprogramming batchbuffer is made
when we assign a request to an execlist port.

v2: Only reprogram when detecting configuration changes (Chris/Lionel)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h           |   6 ++
 drivers/gpu/drm/i915/i915_perf.c          | 108 ++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_request.h       |   6 ++
 drivers/gpu/drm/i915/intel_gpu_commands.h |   1 +
 drivers/gpu/drm/i915/intel_lrc.c          |  59 +++++++++++-
 drivers/gpu/drm/i915/intel_ringbuffer.c   |   2 +
 drivers/gpu/drm/i915/intel_ringbuffer.h   |   2 +
 7 files changed, 183 insertions(+), 1 deletion(-)

Comments

Chris Wilson May 9, 2018, 8:59 a.m. UTC | #1
Quoting Lionel Landwerlin (2018-05-08 19:03:45)
> If some of the contexts submitting workloads to the GPU have been
> configured to shutdown slices/subslices, we might loose the NOA
> configurations written in the NOA muxes. We need to reprogram them
> when we detect a powergating configuration change.
> 
> In this change i915/perf is responsible for setting up a reprogramming
> batchbuffer which we execute just before the userspace submitted
> batchbuffer. We do this while preemption is still disable, only if
> needed. The decision to execute this reprogramming batchbuffer is made
> when we assign a request to an execlist port.
> 
> v2: Only reprogram when detecting configuration changes (Chris/Lionel)
> 
> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h           |   6 ++
>  drivers/gpu/drm/i915/i915_perf.c          | 108 ++++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_request.h       |   6 ++
>  drivers/gpu/drm/i915/intel_gpu_commands.h |   1 +
>  drivers/gpu/drm/i915/intel_lrc.c          |  59 +++++++++++-
>  drivers/gpu/drm/i915/intel_ringbuffer.c   |   2 +
>  drivers/gpu/drm/i915/intel_ringbuffer.h   |   2 +
>  7 files changed, 183 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 04e27806e581..07cdbfb4ad7a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1461,6 +1461,12 @@ struct i915_perf_stream {
>          * @oa_config: The OA configuration used by the stream.
>          */
>         struct i915_oa_config *oa_config;
> +
> +       /**
> +        * @noa_reprogram_vma: A batchbuffer reprogramming the NOA muxes, used
> +        * after switching powergating configurations.
> +        */
> +       struct i915_vma *noa_reprogram_vma;
>  };
>  
>  /**
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index 5b279a82445a..1b9e3d6a53a2 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -1691,6 +1691,96 @@ static int gen8_emit_oa_config(struct i915_request *rq,
>         return 0;
>  }
>  
> +#define MAX_LRI_SIZE (125U)
> +
> +static u32 noa_reprogram_bb_size(struct i915_oa_config *oa_config)
> +{
> +       u32 n_lri;
> +
> +       /* Very unlikely but possible that we have no muxes to configure. */
> +       if (!oa_config->mux_regs_len)
> +               return 0;
> +
> +       n_lri = (oa_config->mux_regs_len / MAX_LRI_SIZE) +
> +               (oa_config->mux_regs_len % MAX_LRI_SIZE) != 0;
> +
> +       return n_lri * 4 + oa_config->mux_regs_len * 8 + /* MI_LOAD_REGISTER_IMMs */
> +               6 * 4 + /* PIPE_CONTROL */
> +               4; /* MI_BATCH_BUFFER_END */
> +}
> +
> +static int
> +alloc_noa_reprogram_bo(struct i915_perf_stream *stream)
> +{
> +       struct drm_i915_private *dev_priv = stream->dev_priv;
> +       struct i915_oa_config *oa_config = stream->oa_config;
> +       struct drm_i915_gem_object *bo;
> +       struct i915_vma *vma;
> +       u32 buffer_size;
> +       u32 *cs;
> +       int i, ret, n_loaded_regs;
> +
> +       buffer_size = ALIGN(noa_reprogram_bb_size(oa_config), PAGE_SIZE);
> +       if (buffer_size == 0)
> +               return 0;
> +
> +       bo = i915_gem_object_create(dev_priv, buffer_size);
> +       if (IS_ERR(bo)) {
> +               DRM_ERROR("Failed to allocate NOA reprogramming buffer\n");
> +               return PTR_ERR(bo);
> +       }
> +
> +       cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
> +       if (IS_ERR(cs)) {
> +               ret = PTR_ERR(cs);
> +               goto err_unref_bo;
> +       }
> +
> +       n_loaded_regs = 0;
> +       for (i = 0; i < oa_config->mux_regs_len; i++) {
> +               if ((n_loaded_regs % MAX_LRI_SIZE) == 0) {
> +                       u32 n_lri = min(oa_config->mux_regs_len - n_loaded_regs,
> +                                       MAX_LRI_SIZE);
> +                       *cs++ = MI_LOAD_REGISTER_IMM(n_lri);
> +               }
> +
> +               *cs++ = i915_mmio_reg_offset(oa_config->mux_regs[i].addr);
> +               *cs++ = oa_config->mux_regs[i].value;
> +               n_loaded_regs++;
> +       }
> +
> +       cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_MMIO_WRITE, 0);

What's the pc for? Isn't it a bit dangerous to request a mmio write
to reg 0?

> +
> +       *cs++ = MI_BATCH_BUFFER_END;
> +
> +       i915_gem_object_unpin_map(bo);
> +
> +       ret = i915_gem_object_set_to_gtt_domain(bo, false);
> +       if (ret)
> +               goto err_unref_bo;
> +
> +       vma = i915_vma_instance(bo, &dev_priv->ggtt.base, NULL);
> +       if (IS_ERR(vma)) {
> +               ret = PTR_ERR(vma);
> +               goto err_unref_bo;
> +       }
> +
> +       ret = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
> +       if (ret)
> +               goto err_unref_vma;

No GGTT access, so just PIN_USER for GPU access.

> +       stream->noa_reprogram_vma = vma;
> +
> +       return 0;
> +
> +err_unref_vma:
> +       i915_vma_put(vma);
> +err_unref_bo:
> +       i915_gem_object_put(bo);
> +
> +       return ret;
> +}
> +
>  static int gen8_switch_to_updated_kernel_context(struct drm_i915_private *dev_priv,
>                                                  const struct i915_oa_config *oa_config)
>  {
> @@ -2102,6 +2192,12 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
>                 goto err_config;
>         }
>  
> +       ret = alloc_noa_reprogram_bo(stream);
> +       if (ret) {
> +               DRM_DEBUG("Enable to alloc NOA reprogramming BO\n");
> +               goto err_config;
> +       }
> +
>         /* PRM - observability performance counters:
>          *
>          *   OACONTROL, performance counter enable, note:
> @@ -2136,6 +2232,13 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
>  
>         dev_priv->perf.oa.exclusive_stream = stream;
>  
> +       ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
> +                                                     stream->oa_config);
> +       if (ret)
> +               goto err_enable;
> +
> +       stream->ops = &i915_oa_stream_ops;
> +
>         mutex_unlock(&dev_priv->drm.struct_mutex);
>  
>         return 0;
> @@ -2497,6 +2600,11 @@ static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
>         if (stream->ctx)
>                 i915_gem_context_put(stream->ctx);
>  
> +       if (stream->noa_reprogram_vma) {
> +               i915_vma_unpin(stream->noa_reprogram_vma);
> +               i915_gem_object_put(stream->noa_reprogram_vma->obj);

This will be unhappy due to the lack of active tracking.
i915_vma_unpin_and_release(&stream->noa_reprogram_vma).

Hmm, worse than that. It's not being tracked at all, so expect it to be
freed while still in use.

> +       }
> +
>         kfree(stream);
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index eddbd4245cb3..3357ceb4bdb1 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -149,6 +149,12 @@ struct i915_request {
>         /** Preallocate space in the ring for the emitting the request */
>         u32 reserved_space;
>  
> +       /*
> +        * Pointer in the batchbuffer to where the i915/perf NOA reprogramming
> +        * can be inserted just before HW submission.
> +        */
> +       u32 *perf_prog_start;

Just u32 offset, no need for the extra space of a pointer here.
(Probably should just limit rings to 64k and pack the offsets into u16.)
Or have them as offset from head, and u8 dwords? Hmm.

I have cold shivers from the thought of more special locations inside
the batch. Not the first, and not the last, so I feel like there should
be a better way (or at least more consistent).

> +
>         /** Batch buffer related to this request if any (used for
>          * error state dump only).
>          */
> diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h b/drivers/gpu/drm/i915/intel_gpu_commands.h
> index 105e2a9e874a..09b0fded8b17 100644
> --- a/drivers/gpu/drm/i915/intel_gpu_commands.h
> +++ b/drivers/gpu/drm/i915/intel_gpu_commands.h
> @@ -146,6 +146,7 @@
>  #define   MI_BATCH_GTT             (2<<6) /* aliased with (1<<7) on gen4 */
>  #define MI_BATCH_BUFFER_START_GEN8     MI_INSTR(0x31, 1)
>  #define   MI_BATCH_RESOURCE_STREAMER (1<<10)
> +#define   MI_BATCH_SECOND_LEVEL      (1<<22)
>  
>  /*
>   * 3D instructions used by the kernel
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 1c08bd2be6c3..df4a7bb07eae 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -510,6 +510,38 @@ static bool can_merge_ctx(const struct i915_gem_context *prev,
>         return true;
>  }
>  
> +static void maybe_enable_noa_repgoram(struct i915_request *rq)
> +{
> +       struct intel_engine_cs *engine = rq->engine;
> +       struct drm_i915_private *dev_priv = engine->i915;
> +       struct i915_perf_stream *stream = dev_priv->perf.oa.exclusive_stream;

Could there be any more pointer chasing? </chandler>

> +       struct intel_context *ce;
> +       u32 *cs = rq->perf_prog_start;
> +
> +       /* Slice/subslice/EU powergating only matters on the RCS. */
> +       if (engine->id != RCS)
> +               return;
> +
> +       /* Nothing need to reprogram when perf isn't enabled. */
> +       if (!stream)
> +               return;
> +
> +       ce = &rq->ctx->__engine[engine->id];

rq->hw_context; but to_intel_context() until that patch lands.

> +
> +       /*
> +        * If the powergating configuration doesn't change, no need to
> +        * reprogram.
> +        */
> +       if (engine->last_sseu.value == ce->sseu.value)
> +               return;
> +
> +       *cs++ = MI_BATCH_BUFFER_START_GEN8 | MI_BATCH_SECOND_LEVEL;

You are not a second level batch. You are calling from the ring to a
global address of _0_.

> +       *cs++ = 0;

low 32bits = 0

> +       *cs++ = i915_ggtt_offset(stream->noa_reprogram_vma);

high 32bits = address you wanted. (order reversed)

> +
> +       engine->last_sseu = ce->sseu;
> +}
> +
>  static void port_assign(struct execlist_port *port, struct i915_request *rq)
>  {
>         GEM_BUG_ON(rq == port_request(port));
> @@ -517,6 +549,8 @@ static void port_assign(struct execlist_port *port, struct i915_request *rq)
>         if (port_isset(port))
>                 i915_request_put(port_request(port));
>  
> +       maybe_enable_noa_repgoram(rq);

How should this work for the guc? How should this work for amalgamated
contexts? Should this only be done for the first request in the sequence
and definitely not done for lite-restore.

> +
>         port_set(port, port_pack(i915_request_get(rq), port_count(port)));
>  }
>  
> @@ -1047,6 +1081,13 @@ static void execlists_submission_tasklet(unsigned long data)
>                             buf[2*head + 1] == execlists->preempt_complete_status) {
>                                 GEM_TRACE("%s preempt-idle\n", engine->name);
>  
> +                               /*
> +                                * Clear out the state of the sseu on the
> +                                * engine, as it's not clear what it will be
> +                                * after preemption.
> +                                */
> +                               memset(&engine->last_sseu, 0, sizeof(engine->last_sseu));

Oh, you are missing complete_preempt_context. But you also need to reset
after reset, so cancel_port_requests is also sensible.

> +
>                                 execlists_cancel_port_requests(execlists);
>                                 execlists_unwind_incomplete_requests(execlists);
>  
> @@ -1937,10 +1978,26 @@ static int gen8_emit_bb_start(struct i915_request *rq,
>                 rq->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
>         }
>  
> -       cs = intel_ring_begin(rq, 6);
> +       cs = intel_ring_begin(rq, rq->engine->id == RCS ? 10 : 6);
>         if (IS_ERR(cs))
>                 return PTR_ERR(cs);
>  
> +       if (rq->engine->id == RCS) {
> +               /*
> +                * Leave some instructions to be written with an
> +                * MI_BATCH_BUFFER_START to the i915/perf NOA reprogramming
> +                * batchbuffer. We only turn those MI_NOOP into
> +                * MI_BATCH_BUFFER_START when we detect a SSEU powergating
> +                * configuration change that might affect NOA. This is only
> +                * for the RCS.
> +                */
> +               rq->perf_prog_start = cs;
> +               *cs++ = MI_NOOP;
> +               *cs++ = MI_NOOP;
> +               *cs++ = MI_NOOP;
> +               *cs++ = MI_NOOP; /* Aligning to 2 dwords */
> +       }
> +
>         /*
>          * WaDisableCtxRestoreArbitration:bdw,chv
>          *
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 8f19349a6055..cde791234397 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -525,6 +525,8 @@ static int init_ring_common(struct intel_engine_cs *engine)
>         if (INTEL_GEN(dev_priv) > 2)
>                 I915_WRITE_MODE(engine, _MASKED_BIT_DISABLE(STOP_RING));
>  
> +       memset(&engine->last_sseu, 0, sizeof(engine->last_sseu));

Why only legacy submission?

> +
>  out:
>         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 010750e8ee44..2f8518e1a49f 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -343,6 +343,8 @@ struct intel_engine_cs {
>  
>         struct drm_i915_gem_object *default_state;
>  
> +       union i915_gem_sseu last_sseu;

The struct here is internal; debatable if it wants the GEM moniker for
being part of the GEM user interface.

Looks like this wanted a sefltest.
-Chris
Chris Wilson May 9, 2018, 9:05 a.m. UTC | #2
Quoting Chris Wilson (2018-05-09 09:59:09)
> Quoting Lionel Landwerlin (2018-05-08 19:03:45)
> > If some of the contexts submitting workloads to the GPU have been
> > configured to shutdown slices/subslices, we might loose the NOA
> > configurations written in the NOA muxes. We need to reprogram them
> > when we detect a powergating configuration change.
> > 
> > In this change i915/perf is responsible for setting up a reprogramming
> > batchbuffer which we execute just before the userspace submitted
> > batchbuffer. We do this while preemption is still disable, only if
> > needed. The decision to execute this reprogramming batchbuffer is made
> > when we assign a request to an execlist port.
> > 
> > v2: Only reprogram when detecting configuration changes (Chris/Lionel)
> > 
> > Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h           |   6 ++
> >  drivers/gpu/drm/i915/i915_perf.c          | 108 ++++++++++++++++++++++
> >  drivers/gpu/drm/i915/i915_request.h       |   6 ++
> >  drivers/gpu/drm/i915/intel_gpu_commands.h |   1 +
> >  drivers/gpu/drm/i915/intel_lrc.c          |  59 +++++++++++-
> >  drivers/gpu/drm/i915/intel_ringbuffer.c   |   2 +
> >  drivers/gpu/drm/i915/intel_ringbuffer.h   |   2 +
> >  7 files changed, 183 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 04e27806e581..07cdbfb4ad7a 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -1461,6 +1461,12 @@ struct i915_perf_stream {
> >          * @oa_config: The OA configuration used by the stream.
> >          */
> >         struct i915_oa_config *oa_config;
> > +
> > +       /**
> > +        * @noa_reprogram_vma: A batchbuffer reprogramming the NOA muxes, used
> > +        * after switching powergating configurations.
> > +        */
> > +       struct i915_vma *noa_reprogram_vma;
> >  };
> >  
> >  /**
> > diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> > index 5b279a82445a..1b9e3d6a53a2 100644
> > --- a/drivers/gpu/drm/i915/i915_perf.c
> > +++ b/drivers/gpu/drm/i915/i915_perf.c
> > @@ -1691,6 +1691,96 @@ static int gen8_emit_oa_config(struct i915_request *rq,
> >         return 0;
> >  }
> >  
> > +#define MAX_LRI_SIZE (125U)
> > +
> > +static u32 noa_reprogram_bb_size(struct i915_oa_config *oa_config)
> > +{
> > +       u32 n_lri;
> > +
> > +       /* Very unlikely but possible that we have no muxes to configure. */
> > +       if (!oa_config->mux_regs_len)
> > +               return 0;
> > +
> > +       n_lri = (oa_config->mux_regs_len / MAX_LRI_SIZE) +
> > +               (oa_config->mux_regs_len % MAX_LRI_SIZE) != 0;
> > +
> > +       return n_lri * 4 + oa_config->mux_regs_len * 8 + /* MI_LOAD_REGISTER_IMMs */
> > +               6 * 4 + /* PIPE_CONTROL */
> > +               4; /* MI_BATCH_BUFFER_END */
> > +}
> > +
> > +static int
> > +alloc_noa_reprogram_bo(struct i915_perf_stream *stream)
> > +{
> > +       struct drm_i915_private *dev_priv = stream->dev_priv;
> > +       struct i915_oa_config *oa_config = stream->oa_config;
> > +       struct drm_i915_gem_object *bo;
> > +       struct i915_vma *vma;
> > +       u32 buffer_size;
> > +       u32 *cs;
> > +       int i, ret, n_loaded_regs;
> > +
> > +       buffer_size = ALIGN(noa_reprogram_bb_size(oa_config), PAGE_SIZE);
> > +       if (buffer_size == 0)
> > +               return 0;
> > +
> > +       bo = i915_gem_object_create(dev_priv, buffer_size);
> > +       if (IS_ERR(bo)) {
> > +               DRM_ERROR("Failed to allocate NOA reprogramming buffer\n");
> > +               return PTR_ERR(bo);
> > +       }
> > +
> > +       cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
> > +       if (IS_ERR(cs)) {
> > +               ret = PTR_ERR(cs);
> > +               goto err_unref_bo;
> > +       }
> > +
> > +       n_loaded_regs = 0;
> > +       for (i = 0; i < oa_config->mux_regs_len; i++) {
> > +               if ((n_loaded_regs % MAX_LRI_SIZE) == 0) {
> > +                       u32 n_lri = min(oa_config->mux_regs_len - n_loaded_regs,
> > +                                       MAX_LRI_SIZE);
> > +                       *cs++ = MI_LOAD_REGISTER_IMM(n_lri);
> > +               }
> > +
> > +               *cs++ = i915_mmio_reg_offset(oa_config->mux_regs[i].addr);
> > +               *cs++ = oa_config->mux_regs[i].value;
> > +               n_loaded_regs++;
> > +       }
> > +
> > +       cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_MMIO_WRITE, 0);
> 
> What's the pc for? Isn't it a bit dangerous to request a mmio write to
> to reg 0?
> 
> > +
> > +       *cs++ = MI_BATCH_BUFFER_END;
> > +
> > +       i915_gem_object_unpin_map(bo);
> > +
> > +       ret = i915_gem_object_set_to_gtt_domain(bo, false);
> > +       if (ret)
> > +               goto err_unref_bo;
> > +
> > +       vma = i915_vma_instance(bo, &dev_priv->ggtt.base, NULL);
> > +       if (IS_ERR(vma)) {
> > +               ret = PTR_ERR(vma);
> > +               goto err_unref_bo;
> > +       }
> > +
> > +       ret = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
> > +       if (ret)
> > +               goto err_unref_vma;
> 
> No GGTT access, so just PIN_USER for GPU access.
> 
> > +       stream->noa_reprogram_vma = vma;
> > +
> > +       return 0;
> > +
> > +err_unref_vma:
> > +       i915_vma_put(vma);
> > +err_unref_bo:
> > +       i915_gem_object_put(bo);
> > +
> > +       return ret;
> > +}
> > +
> >  static int gen8_switch_to_updated_kernel_context(struct drm_i915_private *dev_priv,
> >                                                  const struct i915_oa_config *oa_config)
> >  {
> > @@ -2102,6 +2192,12 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
> >                 goto err_config;
> >         }
> >  
> > +       ret = alloc_noa_reprogram_bo(stream);
> > +       if (ret) {
> > +               DRM_DEBUG("Enable to alloc NOA reprogramming BO\n");
> > +               goto err_config;
> > +       }
> > +
> >         /* PRM - observability performance counters:
> >          *
> >          *   OACONTROL, performance counter enable, note:
> > @@ -2136,6 +2232,13 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
> >  
> >         dev_priv->perf.oa.exclusive_stream = stream;
> >  
> > +       ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
> > +                                                     stream->oa_config);
> > +       if (ret)
> > +               goto err_enable;
> > +
> > +       stream->ops = &i915_oa_stream_ops;
> > +
> >         mutex_unlock(&dev_priv->drm.struct_mutex);
> >  
> >         return 0;
> > @@ -2497,6 +2600,11 @@ static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
> >         if (stream->ctx)
> >                 i915_gem_context_put(stream->ctx);
> >  
> > +       if (stream->noa_reprogram_vma) {
> > +               i915_vma_unpin(stream->noa_reprogram_vma);
> > +               i915_gem_object_put(stream->noa_reprogram_vma->obj);
> 
> This will be unhappy due to the lack of active tracking.
> i915_vma_unpin_and_release(&stream->noa_reprogram_vma).
> 
> Hmm, worse than that. It's not being tracked at all, so expect it to be
> freed while still in use.
> 
> > +       }
> > +
> >         kfree(stream);
> >  }
> >  
> > diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> > index eddbd4245cb3..3357ceb4bdb1 100644
> > --- a/drivers/gpu/drm/i915/i915_request.h
> > +++ b/drivers/gpu/drm/i915/i915_request.h
> > @@ -149,6 +149,12 @@ struct i915_request {
> >         /** Preallocate space in the ring for the emitting the request */
> >         u32 reserved_space;
> >  
> > +       /*
> > +        * Pointer in the batchbuffer to where the i915/perf NOA reprogramming
> > +        * can be inserted just before HW submission.
> > +        */
> > +       u32 *perf_prog_start;
> 
> Just u32 offset, no need for the extra space of a pointer here.
> (Probably should just limit rings to 64k and pack the offsets into u16.)
> Or have them as offset from head, and u8 dwords? Hmm.
> 
> I have cold shivers from the thought of more special locations inside
> the batch. Not the first, and not the last, so I feel like there should
> be a better way (or at least more consistent).
> 
> > +
> >         /** Batch buffer related to this request if any (used for
> >          * error state dump only).
> >          */
> > diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h b/drivers/gpu/drm/i915/intel_gpu_commands.h
> > index 105e2a9e874a..09b0fded8b17 100644
> > --- a/drivers/gpu/drm/i915/intel_gpu_commands.h
> > +++ b/drivers/gpu/drm/i915/intel_gpu_commands.h
> > @@ -146,6 +146,7 @@
> >  #define   MI_BATCH_GTT             (2<<6) /* aliased with (1<<7) on gen4 */
> >  #define MI_BATCH_BUFFER_START_GEN8     MI_INSTR(0x31, 1)
> >  #define   MI_BATCH_RESOURCE_STREAMER (1<<10)
> > +#define   MI_BATCH_SECOND_LEVEL      (1<<22)
> >  
> >  /*
> >   * 3D instructions used by the kernel
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> > index 1c08bd2be6c3..df4a7bb07eae 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -510,6 +510,38 @@ static bool can_merge_ctx(const struct i915_gem_context *prev,
> >         return true;
> >  }
> >  
> > +static void maybe_enable_noa_repgoram(struct i915_request *rq)
> > +{
> > +       struct intel_engine_cs *engine = rq->engine;
> > +       struct drm_i915_private *dev_priv = engine->i915;
> > +       struct i915_perf_stream *stream = dev_priv->perf.oa.exclusive_stream;
> 
> Could there be any more pointer chasing? </chandler>

Thinking about more about how to make this part cleaner, could we not
store the engine->noa_batch and then this all becomes

vma = engine->noa_batch;
if (vma)
	return;

Locking! Missed it in the first pass, but we are unlocked here... That
also ties into lifetimes.

I'm thinking more like ctx->noa_batch with that being pinned along with
the context. We need something along those lines to track the
locking/lifetime.
-Chris
Lionel Landwerlin May 9, 2018, 9:12 a.m. UTC | #3
On 09/05/18 09:59, Chris Wilson wrote:
>> +
>> +       *cs++ = MI_BATCH_BUFFER_START_GEN8 | MI_BATCH_SECOND_LEVEL;
> You are not a second level batch. You are calling from the ring to a
> global address of _0_.
>
>> +       *cs++ = 0;
> low 32bits = 0
>
>> +       *cs++ = i915_ggtt_offset(stream->noa_reprogram_vma);
> high 32bits = address you wanted. (order reversed)
>
How this is not hanging my machine is beyond me... :(
Lionel Landwerlin May 9, 2018, 9:23 a.m. UTC | #4
On 09/05/18 10:05, Chris Wilson wrote:
>> Could there be any more pointer chasing? </chandler>
> Thinking about more about how to make this part cleaner, could we not
> store the engine->noa_batch and then this all becomes
>
> vma = engine->noa_batch;
> if (vma)
> 	return;
>
> Locking! Missed it in the first pass, but we are unlocked here... That
> also ties into lifetimes.
>
> I'm thinking more like ctx->noa_batch with that being pinned along with
> the context. We need something along those lines to track the
> locking/lifetime.
> -Chris
>
In another series tracking pid of submitted requests, I used "empty" 
requests to change the state of the engine.
I could use that approach to message-pass what needs to be used in the 
engine submission.

-
Lionel
Lionel Landwerlin May 9, 2018, 3:31 p.m. UTC | #5
On 09/05/18 09:59, Chris Wilson wrote:
> Quoting Lionel Landwerlin (2018-05-08 19:03:45)
>> If some of the contexts submitting workloads to the GPU have been
>> configured to shutdown slices/subslices, we might loose the NOA
>> configurations written in the NOA muxes. We need to reprogram them
>> when we detect a powergating configuration change.
>>
>> In this change i915/perf is responsible for setting up a reprogramming
>> batchbuffer which we execute just before the userspace submitted
>> batchbuffer. We do this while preemption is still disable, only if
>> needed. The decision to execute this reprogramming batchbuffer is made
>> when we assign a request to an execlist port.
>>
>> v2: Only reprogram when detecting configuration changes (Chris/Lionel)
>>
>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h           |   6 ++
>>   drivers/gpu/drm/i915/i915_perf.c          | 108 ++++++++++++++++++++++
>>   drivers/gpu/drm/i915/i915_request.h       |   6 ++
>>   drivers/gpu/drm/i915/intel_gpu_commands.h |   1 +
>>   drivers/gpu/drm/i915/intel_lrc.c          |  59 +++++++++++-
>>   drivers/gpu/drm/i915/intel_ringbuffer.c   |   2 +
>>   drivers/gpu/drm/i915/intel_ringbuffer.h   |   2 +
>>   7 files changed, 183 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index 04e27806e581..07cdbfb4ad7a 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -1461,6 +1461,12 @@ struct i915_perf_stream {
>>           * @oa_config: The OA configuration used by the stream.
>>           */
>>          struct i915_oa_config *oa_config;
>> +
>> +       /**
>> +        * @noa_reprogram_vma: A batchbuffer reprogramming the NOA muxes, used
>> +        * after switching powergating configurations.
>> +        */
>> +       struct i915_vma *noa_reprogram_vma;
>>   };
>>   
>>   /**
>> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>> index 5b279a82445a..1b9e3d6a53a2 100644
>> --- a/drivers/gpu/drm/i915/i915_perf.c
>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>> @@ -1691,6 +1691,96 @@ static int gen8_emit_oa_config(struct i915_request *rq,
>>          return 0;
>>   }
>>   
>> +#define MAX_LRI_SIZE (125U)
>> +
>> +static u32 noa_reprogram_bb_size(struct i915_oa_config *oa_config)
>> +{
>> +       u32 n_lri;
>> +
>> +       /* Very unlikely but possible that we have no muxes to configure. */
>> +       if (!oa_config->mux_regs_len)
>> +               return 0;
>> +
>> +       n_lri = (oa_config->mux_regs_len / MAX_LRI_SIZE) +
>> +               (oa_config->mux_regs_len % MAX_LRI_SIZE) != 0;
>> +
>> +       return n_lri * 4 + oa_config->mux_regs_len * 8 + /* MI_LOAD_REGISTER_IMMs */
>> +               6 * 4 + /* PIPE_CONTROL */
>> +               4; /* MI_BATCH_BUFFER_END */
>> +}
>> +
>> +static int
>> +alloc_noa_reprogram_bo(struct i915_perf_stream *stream)
>> +{
>> +       struct drm_i915_private *dev_priv = stream->dev_priv;
>> +       struct i915_oa_config *oa_config = stream->oa_config;
>> +       struct drm_i915_gem_object *bo;
>> +       struct i915_vma *vma;
>> +       u32 buffer_size;
>> +       u32 *cs;
>> +       int i, ret, n_loaded_regs;
>> +
>> +       buffer_size = ALIGN(noa_reprogram_bb_size(oa_config), PAGE_SIZE);
>> +       if (buffer_size == 0)
>> +               return 0;
>> +
>> +       bo = i915_gem_object_create(dev_priv, buffer_size);
>> +       if (IS_ERR(bo)) {
>> +               DRM_ERROR("Failed to allocate NOA reprogramming buffer\n");
>> +               return PTR_ERR(bo);
>> +       }
>> +
>> +       cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
>> +       if (IS_ERR(cs)) {
>> +               ret = PTR_ERR(cs);
>> +               goto err_unref_bo;
>> +       }
>> +
>> +       n_loaded_regs = 0;
>> +       for (i = 0; i < oa_config->mux_regs_len; i++) {
>> +               if ((n_loaded_regs % MAX_LRI_SIZE) == 0) {
>> +                       u32 n_lri = min(oa_config->mux_regs_len - n_loaded_regs,
>> +                                       MAX_LRI_SIZE);
>> +                       *cs++ = MI_LOAD_REGISTER_IMM(n_lri);
>> +               }
>> +
>> +               *cs++ = i915_mmio_reg_offset(oa_config->mux_regs[i].addr);
>> +               *cs++ = oa_config->mux_regs[i].value;
>> +               n_loaded_regs++;
>> +       }
>> +
>> +       cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_MMIO_WRITE, 0);
> What's the pc for? Isn't it a bit dangerous to request a mmio write to
> to reg 0?

Oops, I've been using this wrong :(
I thought it would flush pending MMIO writes.

>
>> +
>> +       *cs++ = MI_BATCH_BUFFER_END;
>> +
>> +       i915_gem_object_unpin_map(bo);
>> +
>> +       ret = i915_gem_object_set_to_gtt_domain(bo, false);
>> +       if (ret)
>> +               goto err_unref_bo;
>> +
>> +       vma = i915_vma_instance(bo, &dev_priv->ggtt.base, NULL);
>> +       if (IS_ERR(vma)) {
>> +               ret = PTR_ERR(vma);
>> +               goto err_unref_bo;
>> +       }
>> +
>> +       ret = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
>> +       if (ret)
>> +               goto err_unref_vma;
> No GGTT access, so just PIN_USER for GPU access.

Okay. I just don't seem to understand any of the vma stuff...

>
>> +       stream->noa_reprogram_vma = vma;
>> +
>> +       return 0;
>> +
>> +err_unref_vma:
>> +       i915_vma_put(vma);
>> +err_unref_bo:
>> +       i915_gem_object_put(bo);
>> +
>> +       return ret;
>> +}
>> +
>>   static int gen8_switch_to_updated_kernel_context(struct drm_i915_private *dev_priv,
>>                                                   const struct i915_oa_config *oa_config)
>>   {
>> @@ -2102,6 +2192,12 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
>>                  goto err_config;
>>          }
>>   
>> +       ret = alloc_noa_reprogram_bo(stream);
>> +       if (ret) {
>> +               DRM_DEBUG("Enable to alloc NOA reprogramming BO\n");
>> +               goto err_config;
>> +       }
>> +
>>          /* PRM - observability performance counters:
>>           *
>>           *   OACONTROL, performance counter enable, note:
>> @@ -2136,6 +2232,13 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
>>   
>>          dev_priv->perf.oa.exclusive_stream = stream;
>>   
>> +       ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
>> +                                                     stream->oa_config);
>> +       if (ret)
>> +               goto err_enable;
>> +
>> +       stream->ops = &i915_oa_stream_ops;
>> +
>>          mutex_unlock(&dev_priv->drm.struct_mutex);
>>   
>>          return 0;
>> @@ -2497,6 +2600,11 @@ static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
>>          if (stream->ctx)
>>                  i915_gem_context_put(stream->ctx);
>>   
>> +       if (stream->noa_reprogram_vma) {
>> +               i915_vma_unpin(stream->noa_reprogram_vma);
>> +               i915_gem_object_put(stream->noa_reprogram_vma->obj);
> This will be unhappy due to the lack of active tracking.
> i915_vma_unpin_and_release(&stream->noa_reprogram_vma).
>
> Hmm, worse than that. It's not being tracked at all, so expect it to be
> freed while still in use.

Moving to engine->noa_reprogram_vma.

I should be able to set this from i915/perf, as there is a window where
it idles the GPU before making the change.

>
>> +       }
>> +
>>          kfree(stream);
>>   }
>>   
>> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
>> index eddbd4245cb3..3357ceb4bdb1 100644
>> --- a/drivers/gpu/drm/i915/i915_request.h
>> +++ b/drivers/gpu/drm/i915/i915_request.h
>> @@ -149,6 +149,12 @@ struct i915_request {
>>          /** Preallocate space in the ring for the emitting the request */
>>          u32 reserved_space;
>>   
>> +       /*
>> +        * Pointer in the batchbuffer to where the i915/perf NOA reprogramming
>> +        * can be inserted just before HW submission.
>> +        */
>> +       u32 *perf_prog_start;
> Just u32 offset, no need for the extra space of a pointer here.
> (Probably should just limit rings to 64k and pack the offsets into u16.)
> Or have them as offset from head, and u8 dwords? Hmm.
>
> I have cold shivers from the thought of more special locations inside
> the batch. Not the first, and not the last, so I feel like there should
> be a better way (or at least more consistent).

Switching to an offset.

>
>> +
>>          /** Batch buffer related to this request if any (used for
>>           * error state dump only).
>>           */
>> diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h b/drivers/gpu/drm/i915/intel_gpu_commands.h
>> index 105e2a9e874a..09b0fded8b17 100644
>> --- a/drivers/gpu/drm/i915/intel_gpu_commands.h
>> +++ b/drivers/gpu/drm/i915/intel_gpu_commands.h
>> @@ -146,6 +146,7 @@
>>   #define   MI_BATCH_GTT             (2<<6) /* aliased with (1<<7) on gen4 */
>>   #define MI_BATCH_BUFFER_START_GEN8     MI_INSTR(0x31, 1)
>>   #define   MI_BATCH_RESOURCE_STREAMER (1<<10)
>> +#define   MI_BATCH_SECOND_LEVEL      (1<<22)
>>   
>>   /*
>>    * 3D instructions used by the kernel
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index 1c08bd2be6c3..df4a7bb07eae 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -510,6 +510,38 @@ static bool can_merge_ctx(const struct i915_gem_context *prev,
>>          return true;
>>   }
>>   
>> +static void maybe_enable_noa_repgoram(struct i915_request *rq)
>> +{
>> +       struct intel_engine_cs *engine = rq->engine;
>> +       struct drm_i915_private *dev_priv = engine->i915;
>> +       struct i915_perf_stream *stream = dev_priv->perf.oa.exclusive_stream;
> Could there be any more pointer chasing? </chandler>

Putting this into the engine.

>
>> +       struct intel_context *ce;
>> +       u32 *cs = rq->perf_prog_start;
>> +
>> +       /* Slice/subslice/EU powergating only matters on the RCS. */
>> +       if (engine->id != RCS)
>> +               return;
>> +
>> +       /* Nothing need to reprogram when perf isn't enabled. */
>> +       if (!stream)
>> +               return;
>> +
>> +       ce = &rq->ctx->__engine[engine->id];
> rq->hw_context; but to_intel_context() until that patch lands.

Moving sseu to the request so I can do the rpcs config through MI_LRI.

>
>> +
>> +       /*
>> +        * If the powergating configuration doesn't change, no need to
>> +        * reprogram.
>> +        */
>> +       if (engine->last_sseu.value == ce->sseu.value)
>> +               return;
>> +
>> +       *cs++ = MI_BATCH_BUFFER_START_GEN8 | MI_BATCH_SECOND_LEVEL;
> You are not a second level batch. You are calling from the ring to a
> global address of _0_.
>
>> +       *cs++ = 0;
> low 32bits = 0
>
>> +       *cs++ = i915_ggtt_offset(stream->noa_reprogram_vma);
> high 32bits = address you wanted. (order reversed)

Fixing....

>
>> +
>> +       engine->last_sseu = ce->sseu;
>> +}
>> +
>>   static void port_assign(struct execlist_port *port, struct i915_request *rq)
>>   {
>>          GEM_BUG_ON(rq == port_request(port));
>> @@ -517,6 +549,8 @@ static void port_assign(struct execlist_port *port, struct i915_request *rq)
>>          if (port_isset(port))
>>                  i915_request_put(port_request(port));
>>   
>> +       maybe_enable_noa_repgoram(rq);
> How should this work for the guc? How should this work for amalgamated
> contexts? Should this only be done for the first request in the sequence
> and definitely not done for lite-restore.

I'm assuming that GuC does not reorder requests submitted by i915.

>> +
>>          port_set(port, port_pack(i915_request_get(rq), port_count(port)));
>>   }
>>   
>> @@ -1047,6 +1081,13 @@ static void execlists_submission_tasklet(unsigned long data)
>>                              buf[2*head + 1] == execlists->preempt_complete_status) {
>>                                  GEM_TRACE("%s preempt-idle\n", engine->name);
>>   
>> +                               /*
>> +                                * Clear out the state of the sseu on the
>> +                                * engine, as it's not clear what it will be
>> +                                * after preemption.
>> +                                */
>> +                               memset(&engine->last_sseu, 0, sizeof(engine->last_sseu));
> Oh, you are missing complete_preempt_context. But you also need to reset
> after reset, so cancel_port_requests is also sensisble.

Thanks, putting it there.

>
>> +
>>                                  execlists_cancel_port_requests(execlists);
>>                                  execlists_unwind_incomplete_requests(execlists);
>>   
>> @@ -1937,10 +1978,26 @@ static int gen8_emit_bb_start(struct i915_request *rq,
>>                  rq->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
>>          }
>>   
>> -       cs = intel_ring_begin(rq, 6);
>> +       cs = intel_ring_begin(rq, rq->engine->id == RCS ? 10 : 6);
>>          if (IS_ERR(cs))
>>                  return PTR_ERR(cs);
>>   
>> +       if (rq->engine->id == RCS) {
>> +               /*
>> +                * Leave some instructions to be written with an
>> +                * MI_BATCH_BUFFER_START to the i915/perf NOA reprogramming
>> +                * batchbuffer. We only turn those MI_NOOP into
>> +                * MI_BATCH_BUFFER_START when we detect a SSEU powergating
>> +                * configuration change that might affect NOA. This is only
>> +                * for the RCS.
>> +                */
>> +               rq->perf_prog_start = cs;
>> +               *cs++ = MI_NOOP;
>> +               *cs++ = MI_NOOP;
>> +               *cs++ = MI_NOOP;
>> +               *cs++ = MI_NOOP; /* Aligning to 2 dwords */
>> +       }
>> +
>>          /*
>>           * WaDisableCtxRestoreArbitration:bdw,chv
>>           *
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> index 8f19349a6055..cde791234397 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> @@ -525,6 +525,8 @@ static int init_ring_common(struct intel_engine_cs *engine)
>>          if (INTEL_GEN(dev_priv) > 2)
>>                  I915_WRITE_MODE(engine, _MASKED_BIT_DISABLE(STOP_RING));
>>   
>> +       memset(&engine->last_sseu, 0, sizeof(engine->last_sseu));
> Why only legacy submission?

Fixing...

>
>> +
>>   out:
>>          intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
>>   
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> index 010750e8ee44..2f8518e1a49f 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> @@ -343,6 +343,8 @@ struct intel_engine_cs {
>>   
>>          struct drm_i915_gem_object *default_state;
>>   
>> +       union i915_gem_sseu last_sseu;
> The struct here is internal; debatable if it wants the GEM moniker for
> being part of the GEM user interface.
>
> Looks like this wanted a sefltest.
> -Chris
>
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 04e27806e581..07cdbfb4ad7a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1461,6 +1461,12 @@  struct i915_perf_stream {
 	 * @oa_config: The OA configuration used by the stream.
 	 */
 	struct i915_oa_config *oa_config;
+
+	/**
+	 * @noa_reprogram_vma: A batchbuffer reprogramming the NOA muxes, used
+	 * after switching powergating configurations.
+	 */
+	struct i915_vma *noa_reprogram_vma;
 };
 
 /**
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 5b279a82445a..1b9e3d6a53a2 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1691,6 +1691,96 @@  static int gen8_emit_oa_config(struct i915_request *rq,
 	return 0;
 }
 
+#define MAX_LRI_SIZE (125U)
+
+static u32 noa_reprogram_bb_size(struct i915_oa_config *oa_config)
+{
+	u32 n_lri;
+
+	/* Very unlikely but possible that we have no muxes to configure. */
+	if (!oa_config->mux_regs_len)
+		return 0;
+
+	n_lri = (oa_config->mux_regs_len / MAX_LRI_SIZE) +
+		((oa_config->mux_regs_len % MAX_LRI_SIZE) != 0);
+
+	return n_lri * 4 + oa_config->mux_regs_len * 8 + /* MI_LOAD_REGISTER_IMMs */
+		6 * 4 + /* PIPE_CONTROL */
+		4; /* MI_BATCH_BUFFER_END */
+}
+
+static int
+alloc_noa_reprogram_bo(struct i915_perf_stream *stream)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct i915_oa_config *oa_config = stream->oa_config;
+	struct drm_i915_gem_object *bo;
+	struct i915_vma *vma;
+	u32 buffer_size;
+	u32 *cs;
+	int i, ret, n_loaded_regs;
+
+	buffer_size = ALIGN(noa_reprogram_bb_size(oa_config), PAGE_SIZE);
+	if (buffer_size == 0)
+		return 0;
+
+	bo = i915_gem_object_create(dev_priv, buffer_size);
+	if (IS_ERR(bo)) {
+		DRM_ERROR("Failed to allocate NOA reprogramming buffer\n");
+		return PTR_ERR(bo);
+	}
+
+	cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
+	if (IS_ERR(cs)) {
+		ret = PTR_ERR(cs);
+		goto err_unref_bo;
+	}
+
+	n_loaded_regs = 0;
+	for (i = 0; i < oa_config->mux_regs_len; i++) {
+		if ((n_loaded_regs % MAX_LRI_SIZE) == 0) {
+			u32 n_lri = min(oa_config->mux_regs_len - n_loaded_regs,
+					MAX_LRI_SIZE);
+			*cs++ = MI_LOAD_REGISTER_IMM(n_lri);
+		}
+
+		*cs++ = i915_mmio_reg_offset(oa_config->mux_regs[i].addr);
+		*cs++ = oa_config->mux_regs[i].value;
+		n_loaded_regs++;
+	}
+
+	cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_MMIO_WRITE, 0);
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	i915_gem_object_unpin_map(bo);
+
+	ret = i915_gem_object_set_to_gtt_domain(bo, false);
+	if (ret)
+		goto err_unref_bo;
+
+	vma = i915_vma_instance(bo, &dev_priv->ggtt.base, NULL);
+	if (IS_ERR(vma)) {
+		ret = PTR_ERR(vma);
+		goto err_unref_bo;
+	}
+
+	ret = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
+	if (ret)
+		goto err_unref_vma;
+
+	stream->noa_reprogram_vma = vma;
+
+	return 0;
+
+err_unref_vma:
+	i915_vma_put(vma);
+err_unref_bo:
+	i915_gem_object_put(bo);
+
+	return ret;
+}
+
 static int gen8_switch_to_updated_kernel_context(struct drm_i915_private *dev_priv,
 						 const struct i915_oa_config *oa_config)
 {
@@ -2102,6 +2192,12 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 		goto err_config;
 	}
 
+	ret = alloc_noa_reprogram_bo(stream);
+	if (ret) {
+		DRM_DEBUG("Enable to alloc NOA reprogramming BO\n");
+		goto err_config;
+	}
+
 	/* PRM - observability performance counters:
 	 *
 	 *   OACONTROL, performance counter enable, note:
@@ -2136,6 +2232,13 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 
 	dev_priv->perf.oa.exclusive_stream = stream;
 
+	ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
+						      stream->oa_config);
+	if (ret)
+		goto err_enable;
+
+	stream->ops = &i915_oa_stream_ops;
+
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
 	return 0;
@@ -2497,6 +2600,11 @@  static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
 	if (stream->ctx)
 		i915_gem_context_put(stream->ctx);
 
+	if (stream->noa_reprogram_vma) {
+		i915_vma_unpin(stream->noa_reprogram_vma);
+		i915_gem_object_put(stream->noa_reprogram_vma->obj);
+	}
+
 	kfree(stream);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index eddbd4245cb3..3357ceb4bdb1 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -149,6 +149,12 @@  struct i915_request {
 	/** Preallocate space in the ring for the emitting the request */
 	u32 reserved_space;
 
+	/*
+	 * Pointer in the batchbuffer to where the i915/perf NOA reprogramming
+	 * can be inserted just before HW submission.
+	 */
+	u32 *perf_prog_start;
+
 	/** Batch buffer related to this request if any (used for
 	 * error state dump only).
 	 */
diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h b/drivers/gpu/drm/i915/intel_gpu_commands.h
index 105e2a9e874a..09b0fded8b17 100644
--- a/drivers/gpu/drm/i915/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/intel_gpu_commands.h
@@ -146,6 +146,7 @@ 
 #define   MI_BATCH_GTT		    (2<<6) /* aliased with (1<<7) on gen4 */
 #define MI_BATCH_BUFFER_START_GEN8	MI_INSTR(0x31, 1)
 #define   MI_BATCH_RESOURCE_STREAMER (1<<10)
+#define   MI_BATCH_SECOND_LEVEL      (1<<22)
 
 /*
  * 3D instructions used by the kernel
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 1c08bd2be6c3..df4a7bb07eae 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -510,6 +510,38 @@  static bool can_merge_ctx(const struct i915_gem_context *prev,
 	return true;
 }
 
+static void maybe_enable_noa_repgoram(struct i915_request *rq)
+{
+	struct intel_engine_cs *engine = rq->engine;
+	struct drm_i915_private *dev_priv = engine->i915;
+	struct i915_perf_stream *stream = dev_priv->perf.oa.exclusive_stream;
+	struct intel_context *ce;
+	u32 *cs = rq->perf_prog_start;
+
+	/* Slice/subslice/EU powergating only matters on the RCS. */
+	if (engine->id != RCS)
+		return;
+
+	/* Nothing needs reprogramming when perf isn't enabled. */
+	if (!stream)
+		return;
+
+	ce = &rq->ctx->__engine[engine->id];
+
+	/*
+	 * If the powergating configuration doesn't change, no need to
+	 * reprogram.
+	 */
+	if (engine->last_sseu.value == ce->sseu.value)
+		return;
+
+	/* First-level chain from the ring: low address dword, then high. */
+	*cs++ = MI_BATCH_BUFFER_START_GEN8;
+	*cs++ = i915_ggtt_offset(stream->noa_reprogram_vma);
+	*cs++ = 0;
+	engine->last_sseu = ce->sseu;
+}
+
 static void port_assign(struct execlist_port *port, struct i915_request *rq)
 {
 	GEM_BUG_ON(rq == port_request(port));
@@ -517,6 +549,8 @@  static void port_assign(struct execlist_port *port, struct i915_request *rq)
 	if (port_isset(port))
 		i915_request_put(port_request(port));
 
+	maybe_enable_noa_repgoram(rq);
+
 	port_set(port, port_pack(i915_request_get(rq), port_count(port)));
 }
 
@@ -1047,6 +1081,13 @@  static void execlists_submission_tasklet(unsigned long data)
 			    buf[2*head + 1] == execlists->preempt_complete_status) {
 				GEM_TRACE("%s preempt-idle\n", engine->name);
 
+				/*
+				 * Clear out the state of the sseu on the
+				 * engine, as it's not clear what it will be
+				 * after preemption.
+				 */
+				memset(&engine->last_sseu, 0, sizeof(engine->last_sseu));
+
 				execlists_cancel_port_requests(execlists);
 				execlists_unwind_incomplete_requests(execlists);
 
@@ -1937,10 +1978,26 @@  static int gen8_emit_bb_start(struct i915_request *rq,
 		rq->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
 	}
 
-	cs = intel_ring_begin(rq, 6);
+	cs = intel_ring_begin(rq, rq->engine->id == RCS ? 10 : 6);
 	if (IS_ERR(cs))
 		return PTR_ERR(cs);
 
+	if (rq->engine->id == RCS) {
+		/*
+		 * Leave some instructions to be written with an
+		 * MI_BATCH_BUFFER_START to the i915/perf NOA reprogramming
+		 * batchbuffer. We only turn those MI_NOOP into
+		 * MI_BATCH_BUFFER_START when we detect a SSEU powergating
+		 * configuration change that might affect NOA. This is only
+		 * for the RCS.
+		 */
+		rq->perf_prog_start = cs;
+		*cs++ = MI_NOOP;
+		*cs++ = MI_NOOP;
+		*cs++ = MI_NOOP;
+		*cs++ = MI_NOOP; /* Aligning to 2 dwords */
+	}
+
 	/*
 	 * WaDisableCtxRestoreArbitration:bdw,chv
 	 *
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 8f19349a6055..cde791234397 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -525,6 +525,8 @@  static int init_ring_common(struct intel_engine_cs *engine)
 	if (INTEL_GEN(dev_priv) > 2)
 		I915_WRITE_MODE(engine, _MASKED_BIT_DISABLE(STOP_RING));
 
+	memset(&engine->last_sseu, 0, sizeof(engine->last_sseu));
+
 out:
 	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 010750e8ee44..2f8518e1a49f 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -343,6 +343,8 @@  struct intel_engine_cs {
 
 	struct drm_i915_gem_object *default_state;
 
+	union i915_gem_sseu last_sseu;
+
 	atomic_t irq_count;
 	unsigned long irq_posted;
 #define ENGINE_IRQ_BREADCRUMB 0