diff mbox

[3/3] drm/i915: Store a pointer to intel_context in i915_request

Message ID 20180419071746.15996-3-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson April 19, 2018, 7:17 a.m. UTC
To ease the frequent and ugly pointer dance of
&request->gem_context->engine[request->engine->id] during request
submission, store that pointer as request->hw_context. One major
advantage that we will exploit later is that this decouples the logical
context state from the engine itself.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gvt/mmio_context.c       |   6 +-
 drivers/gpu/drm/i915/gvt/mmio_context.h       |   2 +-
 drivers/gpu/drm/i915/gvt/scheduler.c          | 141 ++++++----------
 drivers/gpu/drm/i915/gvt/scheduler.h          |   1 -
 drivers/gpu/drm/i915/i915_debugfs.c           |  20 ++-
 drivers/gpu/drm/i915/i915_drv.h               |   1 +
 drivers/gpu/drm/i915/i915_gem.c               |  14 +-
 drivers/gpu/drm/i915/i915_gem_context.c       |  19 +--
 drivers/gpu/drm/i915/i915_gem_context.h       |  27 +++-
 drivers/gpu/drm/i915/i915_gpu_error.c         |   2 +-
 drivers/gpu/drm/i915/i915_perf.c              |  28 ++--
 drivers/gpu/drm/i915/i915_request.c           |  23 ++-
 drivers/gpu/drm/i915/i915_request.h           |   3 +-
 drivers/gpu/drm/i915/intel_engine_cs.c        |  61 ++++---
 drivers/gpu/drm/i915/intel_guc_ads.c          |   2 +-
 drivers/gpu/drm/i915/intel_guc_submission.c   |  15 +-
 drivers/gpu/drm/i915/intel_lrc.c              | 151 +++++++++++-------
 drivers/gpu/drm/i915/intel_lrc.h              |   7 -
 drivers/gpu/drm/i915/intel_ringbuffer.c       | 104 +++++++-----
 drivers/gpu/drm/i915/intel_ringbuffer.h       |   9 +-
 drivers/gpu/drm/i915/selftests/mock_context.c |   7 +
 drivers/gpu/drm/i915/selftests/mock_engine.c  |  41 +++--
 .../gpu/drm/i915/selftests/mock_gem_device.c  |   5 +
 23 files changed, 381 insertions(+), 308 deletions(-)

Comments

Tvrtko Ursulin April 19, 2018, 10:40 a.m. UTC | #1
On 19/04/2018 08:17, Chris Wilson wrote:
> To ease the frequent and ugly pointer dance of
> &request->gem_context->engine[request->engine->id] during request
> submission, store that pointer as request->hw_context. One major
> advantage that we will exploit later is that this decouples the logical
> context state from the engine itself.

I can see merit in making all the places which work on engines or 
requests more logically and elegantly grab the engine specific context.

Authors of current unmerged work will probably be not so thrilled though.

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/gvt/mmio_context.c       |   6 +-
>   drivers/gpu/drm/i915/gvt/mmio_context.h       |   2 +-
>   drivers/gpu/drm/i915/gvt/scheduler.c          | 141 ++++++----------
>   drivers/gpu/drm/i915/gvt/scheduler.h          |   1 -
>   drivers/gpu/drm/i915/i915_debugfs.c           |  20 ++-
>   drivers/gpu/drm/i915/i915_drv.h               |   1 +
>   drivers/gpu/drm/i915/i915_gem.c               |  14 +-
>   drivers/gpu/drm/i915/i915_gem_context.c       |  19 +--
>   drivers/gpu/drm/i915/i915_gem_context.h       |  27 +++-
>   drivers/gpu/drm/i915/i915_gpu_error.c         |   2 +-
>   drivers/gpu/drm/i915/i915_perf.c              |  28 ++--
>   drivers/gpu/drm/i915/i915_request.c           |  23 ++-
>   drivers/gpu/drm/i915/i915_request.h           |   3 +-
>   drivers/gpu/drm/i915/intel_engine_cs.c        |  61 ++++---
>   drivers/gpu/drm/i915/intel_guc_ads.c          |   2 +-
>   drivers/gpu/drm/i915/intel_guc_submission.c   |  15 +-
>   drivers/gpu/drm/i915/intel_lrc.c              | 151 +++++++++++-------
>   drivers/gpu/drm/i915/intel_lrc.h              |   7 -
>   drivers/gpu/drm/i915/intel_ringbuffer.c       | 104 +++++++-----
>   drivers/gpu/drm/i915/intel_ringbuffer.h       |   9 +-
>   drivers/gpu/drm/i915/selftests/mock_context.c |   7 +
>   drivers/gpu/drm/i915/selftests/mock_engine.c  |  41 +++--
>   .../gpu/drm/i915/selftests/mock_gem_device.c  |   5 +
>   23 files changed, 381 insertions(+), 308 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gvt/mmio_context.c b/drivers/gpu/drm/i915/gvt/mmio_context.c
> index a5bac83d53a9..708170e61625 100644
> --- a/drivers/gpu/drm/i915/gvt/mmio_context.c
> +++ b/drivers/gpu/drm/i915/gvt/mmio_context.c
> @@ -446,9 +446,9 @@ static void switch_mocs(struct intel_vgpu *pre, struct intel_vgpu *next,
>   
>   #define CTX_CONTEXT_CONTROL_VAL	0x03
>   
> -bool is_inhibit_context(struct i915_gem_context *ctx, int ring_id)
> +bool is_inhibit_context(struct intel_context *ce)
>   {
> -	u32 *reg_state = ctx->engine[ring_id].lrc_reg_state;
> +	const u32 *reg_state = ce->lrc_reg_state;
>   	u32 inhibit_mask =
>   		_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
>   
> @@ -501,7 +501,7 @@ static void switch_mmio(struct intel_vgpu *pre,
>   			 * itself.
>   			 */
>   			if (mmio->in_context &&
> -			    !is_inhibit_context(s->shadow_ctx, ring_id))
> +			    !is_inhibit_context(&s->shadow_ctx->__engine[ring_id]))
>   				continue;
>   
>   			if (mmio->mask)
> diff --git a/drivers/gpu/drm/i915/gvt/mmio_context.h b/drivers/gpu/drm/i915/gvt/mmio_context.h
> index 0439eb8057a8..5c3b9ff9f96a 100644
> --- a/drivers/gpu/drm/i915/gvt/mmio_context.h
> +++ b/drivers/gpu/drm/i915/gvt/mmio_context.h
> @@ -49,7 +49,7 @@ void intel_gvt_switch_mmio(struct intel_vgpu *pre,
>   
>   void intel_gvt_init_engine_mmio_context(struct intel_gvt *gvt);
>   
> -bool is_inhibit_context(struct i915_gem_context *ctx, int ring_id);
> +bool is_inhibit_context(struct intel_context *ce);
>   
>   int intel_vgpu_restore_inhibit_context(struct intel_vgpu *vgpu,
>   				       struct i915_request *req);
> diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c
> index f64cccb2e793..c2e440200b0c 100644
> --- a/drivers/gpu/drm/i915/gvt/scheduler.c
> +++ b/drivers/gpu/drm/i915/gvt/scheduler.c
> @@ -54,11 +54,8 @@ static void set_context_pdp_root_pointer(
>   
>   static void update_shadow_pdps(struct intel_vgpu_workload *workload)
>   {
> -	struct intel_vgpu *vgpu = workload->vgpu;
> -	int ring_id = workload->ring_id;
> -	struct i915_gem_context *shadow_ctx = vgpu->submission.shadow_ctx;
>   	struct drm_i915_gem_object *ctx_obj =
> -		shadow_ctx->engine[ring_id].state->obj;
> +		workload->req->hw_context->state->obj;
>   	struct execlist_ring_context *shadow_ring_context;
>   	struct page *page;
>   
> @@ -128,9 +125,8 @@ static int populate_shadow_context(struct intel_vgpu_workload *workload)
>   	struct intel_vgpu *vgpu = workload->vgpu;
>   	struct intel_gvt *gvt = vgpu->gvt;
>   	int ring_id = workload->ring_id;
> -	struct i915_gem_context *shadow_ctx = vgpu->submission.shadow_ctx;
>   	struct drm_i915_gem_object *ctx_obj =
> -		shadow_ctx->engine[ring_id].state->obj;
> +		workload->req->hw_context->state->obj;
>   	struct execlist_ring_context *shadow_ring_context;
>   	struct page *page;
>   	void *dst;
> @@ -280,10 +276,8 @@ static int shadow_context_status_change(struct notifier_block *nb,
>   	return NOTIFY_OK;
>   }
>   
> -static void shadow_context_descriptor_update(struct i915_gem_context *ctx,
> -		struct intel_engine_cs *engine)
> +static void shadow_context_descriptor_update(struct intel_context *ce)
>   {
> -	struct intel_context *ce = &ctx->engine[engine->id];
>   	u64 desc = 0;
>   
>   	desc = ce->lrc_desc;
> @@ -292,7 +286,7 @@ static void shadow_context_descriptor_update(struct i915_gem_context *ctx,
>   	 * like GEN8_CTX_* cached in desc_template
>   	 */
>   	desc &= U64_MAX << 12;
> -	desc |= ctx->desc_template & ((1ULL << 12) - 1);
> +	desc |= ce->gem_context->desc_template & ((1ULL << 12) - 1);
>   
>   	ce->lrc_desc = desc;
>   }
> @@ -300,12 +294,11 @@ static void shadow_context_descriptor_update(struct i915_gem_context *ctx,
>   static int copy_workload_to_ring_buffer(struct intel_vgpu_workload *workload)
>   {
>   	struct intel_vgpu *vgpu = workload->vgpu;
> +	struct i915_request *req = workload->req;
>   	void *shadow_ring_buffer_va;
>   	u32 *cs;
> -	struct i915_request *req = workload->req;
>   
> -	if (IS_KABYLAKE(req->i915) &&
> -	    is_inhibit_context(req->gem_context, req->engine->id))
> +	if (IS_KABYLAKE(req->i915) && is_inhibit_context(req->hw_context))
>   		intel_vgpu_restore_inhibit_context(vgpu, req);
>   
>   	/* allocate shadow ring buffer */
> @@ -353,60 +346,56 @@ int intel_gvt_scan_and_shadow_workload(struct intel_vgpu_workload *workload)
>   	struct intel_vgpu_submission *s = &vgpu->submission;
>   	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
>   	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
> -	int ring_id = workload->ring_id;
> -	struct intel_engine_cs *engine = dev_priv->engine[ring_id];
> -	struct intel_ring *ring;
> +	struct intel_engine_cs *engine = dev_priv->engine[workload->ring_id];
> +	struct intel_context *ce;
>   	int ret;
>   
>   	lockdep_assert_held(&dev_priv->drm.struct_mutex);
>   
> -	if (workload->shadowed)
> +	if (workload->req)
>   		return 0;

GVT team will need to look at this hunk/function.

>   
> +	/* pin shadow context by gvt even the shadow context will be pinned
> +	 * when i915 alloc request. That is because gvt will update the guest
> +	 * context from shadow context when workload is completed, and at that
> +	 * moment, i915 may already unpined the shadow context to make the
> +	 * shadow_ctx pages invalid. So gvt need to pin itself. After update
> +	 * the guest context, gvt can unpin the shadow_ctx safely.
> +	 */
> +	ce = intel_context_pin(shadow_ctx, engine);
> +	if (IS_ERR(ce)) {
> +		gvt_vgpu_err("fail to pin shadow context\n");
> +		return PTR_ERR(ce);
> +	}
> +
>   	shadow_ctx->desc_template &= ~(0x3 << GEN8_CTX_ADDRESSING_MODE_SHIFT);
>   	shadow_ctx->desc_template |= workload->ctx_desc.addressing_mode <<
>   				    GEN8_CTX_ADDRESSING_MODE_SHIFT;
>   
> -	if (!test_and_set_bit(ring_id, s->shadow_ctx_desc_updated))
> -		shadow_context_descriptor_update(shadow_ctx,
> -					dev_priv->engine[ring_id]);
> +	if (!test_and_set_bit(workload->ring_id, s->shadow_ctx_desc_updated))
> +		shadow_context_descriptor_update(ce);
>   
>   	ret = intel_gvt_scan_and_shadow_ringbuffer(workload);
>   	if (ret)
> -		goto err_scan;
> +		goto err_unpin;
>   
>   	if ((workload->ring_id == RCS) &&
>   	    (workload->wa_ctx.indirect_ctx.size != 0)) {
>   		ret = intel_gvt_scan_and_shadow_wa_ctx(&workload->wa_ctx);
>   		if (ret)
> -			goto err_scan;
> -	}
> -
> -	/* pin shadow context by gvt even the shadow context will be pinned
> -	 * when i915 alloc request. That is because gvt will update the guest
> -	 * context from shadow context when workload is completed, and at that
> -	 * moment, i915 may already unpined the shadow context to make the
> -	 * shadow_ctx pages invalid. So gvt need to pin itself. After update
> -	 * the guest context, gvt can unpin the shadow_ctx safely.
> -	 */
> -	ring = engine->context_pin(engine, shadow_ctx);
> -	if (IS_ERR(ring)) {
> -		ret = PTR_ERR(ring);
> -		gvt_vgpu_err("fail to pin shadow context\n");
> -		goto err_shadow;
> +			goto err_shadow;
>   	}
>   
>   	ret = populate_shadow_context(workload);
>   	if (ret)
> -		goto err_unpin;
> -	workload->shadowed = true;
> +		goto err_shadow;
> +
>   	return 0;
>   
> -err_unpin:
> -	engine->context_unpin(engine, shadow_ctx);
>   err_shadow:
>   	release_shadow_wa_ctx(&workload->wa_ctx);
> -err_scan:
> +err_unpin:
> +	intel_context_unpin(ce);
>   	return ret;
>   }
>   
> @@ -414,7 +403,6 @@ static int intel_gvt_generate_request(struct intel_vgpu_workload *workload)
>   {
>   	int ring_id = workload->ring_id;
>   	struct drm_i915_private *dev_priv = workload->vgpu->gvt->dev_priv;
> -	struct intel_engine_cs *engine = dev_priv->engine[ring_id];
>   	struct i915_request *rq;
>   	struct intel_vgpu *vgpu = workload->vgpu;
>   	struct intel_vgpu_submission *s = &vgpu->submission;
> @@ -437,7 +425,6 @@ static int intel_gvt_generate_request(struct intel_vgpu_workload *workload)
>   	return 0;
>   
>   err_unpin:
> -	engine->context_unpin(engine, shadow_ctx);

And this one.

>   	release_shadow_wa_ctx(&workload->wa_ctx);
>   	return ret;
>   }
> @@ -495,21 +482,13 @@ static int prepare_shadow_batch_buffer(struct intel_vgpu_workload *workload)
>   	return ret;
>   }
>   
> -static int update_wa_ctx_2_shadow_ctx(struct intel_shadow_wa_ctx *wa_ctx)
> +static void update_wa_ctx_2_shadow_ctx(struct intel_shadow_wa_ctx *wa_ctx)
>   {
> -	struct intel_vgpu_workload *workload = container_of(wa_ctx,
> -					struct intel_vgpu_workload,
> -					wa_ctx);
> -	int ring_id = workload->ring_id;
> -	struct intel_vgpu_submission *s = &workload->vgpu->submission;
> -	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
> -	struct drm_i915_gem_object *ctx_obj =
> -		shadow_ctx->engine[ring_id].state->obj;
> -	struct execlist_ring_context *shadow_ring_context;
> -	struct page *page;
> -
> -	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
> -	shadow_ring_context = kmap_atomic(page);
> +	struct intel_vgpu_workload *workload =
> +		container_of(wa_ctx, struct intel_vgpu_workload, wa_ctx);
> +	struct i915_request *rq = workload->req;
> +	struct execlist_ring_context *shadow_ring_context =
> +		(struct execlist_ring_context *)rq->hw_context->lrc_reg_state;
>   
>   	shadow_ring_context->bb_per_ctx_ptr.val =
>   		(shadow_ring_context->bb_per_ctx_ptr.val &
> @@ -517,9 +496,6 @@ static int update_wa_ctx_2_shadow_ctx(struct intel_shadow_wa_ctx *wa_ctx)
>   	shadow_ring_context->rcs_indirect_ctx.val =
>   		(shadow_ring_context->rcs_indirect_ctx.val &
>   		(~INDIRECT_CTX_ADDR_MASK)) | wa_ctx->indirect_ctx.shadow_gma;
> -
> -	kunmap_atomic(shadow_ring_context);

And this one.

At which point I skip over all GVT and continue further down. :)

> -	return 0;
>   }
>   
>   static int prepare_shadow_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx)
> @@ -648,12 +624,9 @@ static int prepare_workload(struct intel_vgpu_workload *workload)
>   static int dispatch_workload(struct intel_vgpu_workload *workload)
>   {
>   	struct intel_vgpu *vgpu = workload->vgpu;
> -	struct intel_vgpu_submission *s = &vgpu->submission;
> -	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
>   	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
>   	int ring_id = workload->ring_id;
> -	struct intel_engine_cs *engine = dev_priv->engine[ring_id];
> -	int ret = 0;
> +	int ret;
>   
>   	gvt_dbg_sched("ring id %d prepare to dispatch workload %p\n",
>   		ring_id, workload);
> @@ -665,10 +638,6 @@ static int dispatch_workload(struct intel_vgpu_workload *workload)
>   		goto out;
>   
>   	ret = prepare_workload(workload);
> -	if (ret) {
> -		engine->context_unpin(engine, shadow_ctx);
> -		goto out;
> -	}
>   
>   out:
>   	if (ret)
> @@ -743,27 +712,23 @@ static struct intel_vgpu_workload *pick_next_workload(
>   
>   static void update_guest_context(struct intel_vgpu_workload *workload)
>   {
> +	struct i915_request *rq = workload->req;
>   	struct intel_vgpu *vgpu = workload->vgpu;
>   	struct intel_gvt *gvt = vgpu->gvt;
> -	struct intel_vgpu_submission *s = &vgpu->submission;
> -	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
> -	int ring_id = workload->ring_id;
> -	struct drm_i915_gem_object *ctx_obj =
> -		shadow_ctx->engine[ring_id].state->obj;
> +	struct drm_i915_gem_object *ctx_obj = rq->hw_context->state->obj;
>   	struct execlist_ring_context *shadow_ring_context;
>   	struct page *page;
>   	void *src;
>   	unsigned long context_gpa, context_page_num;
>   	int i;
>   
> -	gvt_dbg_sched("ring id %d workload lrca %x\n", ring_id,
> -			workload->ctx_desc.lrca);
> -
> -	context_page_num = gvt->dev_priv->engine[ring_id]->context_size;
> +	gvt_dbg_sched("ring id %d workload lrca %x\n", rq->engine->id,
> +		      workload->ctx_desc.lrca);
>   
> +	context_page_num = rq->engine->context_size;
>   	context_page_num = context_page_num >> PAGE_SHIFT;
>   
> -	if (IS_BROADWELL(gvt->dev_priv) && ring_id == RCS)
> +	if (IS_BROADWELL(gvt->dev_priv) && rq->engine->id == RCS)
>   		context_page_num = 19;
>   
>   	i = 2;
> @@ -836,6 +801,7 @@ static void complete_current_workload(struct intel_gvt *gvt, int ring_id)
>   		scheduler->current_workload[ring_id];
>   	struct intel_vgpu *vgpu = workload->vgpu;
>   	struct intel_vgpu_submission *s = &vgpu->submission;
> +	struct i915_request *rq;
>   	int event;
>   
>   	mutex_lock(&gvt->lock);
> @@ -844,11 +810,8 @@ static void complete_current_workload(struct intel_gvt *gvt, int ring_id)
>   	 * switch to make sure request is completed.
>   	 * For the workload w/o request, directly complete the workload.
>   	 */
> -	if (workload->req) {
> -		struct drm_i915_private *dev_priv =
> -			workload->vgpu->gvt->dev_priv;
> -		struct intel_engine_cs *engine =
> -			dev_priv->engine[workload->ring_id];
> +	rq = fetch_and_zero(&workload->req);
> +	if (rq) {
>   		wait_event(workload->shadow_ctx_status_wq,
>   			   !atomic_read(&workload->shadow_ctx_active));
>   
> @@ -864,8 +827,6 @@ static void complete_current_workload(struct intel_gvt *gvt, int ring_id)
>   				workload->status = 0;
>   		}
>   
> -		i915_request_put(fetch_and_zero(&workload->req));
> -
>   		if (!workload->status && !(vgpu->resetting_eng &
>   					   ENGINE_MASK(ring_id))) {
>   			update_guest_context(workload);
> @@ -874,10 +835,13 @@ static void complete_current_workload(struct intel_gvt *gvt, int ring_id)
>   					 INTEL_GVT_EVENT_MAX)
>   				intel_vgpu_trigger_virtual_event(vgpu, event);
>   		}
> -		mutex_lock(&dev_priv->drm.struct_mutex);
> +
>   		/* unpin shadow ctx as the shadow_ctx update is done */
> -		engine->context_unpin(engine, s->shadow_ctx);
> -		mutex_unlock(&dev_priv->drm.struct_mutex);
> +		mutex_lock(&rq->i915->drm.struct_mutex);
> +		intel_context_unpin(rq->hw_context);
> +		mutex_unlock(&rq->i915->drm.struct_mutex);
> +
> +		i915_request_put(rq);
>   	}
>   
>   	gvt_dbg_sched("ring id %d complete workload %p status %d\n",
> @@ -1251,7 +1215,6 @@ alloc_workload(struct intel_vgpu *vgpu)
>   	atomic_set(&workload->shadow_ctx_active, 0);
>   
>   	workload->status = -EINPROGRESS;
> -	workload->shadowed = false;
>   	workload->vgpu = vgpu;
>   
>   	return workload;
> diff --git a/drivers/gpu/drm/i915/gvt/scheduler.h b/drivers/gpu/drm/i915/gvt/scheduler.h
> index 486ed57a4ad1..b0e04017d7a2 100644
> --- a/drivers/gpu/drm/i915/gvt/scheduler.h
> +++ b/drivers/gpu/drm/i915/gvt/scheduler.h
> @@ -83,7 +83,6 @@ struct intel_vgpu_workload {
>   	struct i915_request *req;
>   	/* if this workload has been dispatched to i915? */
>   	bool dispatched;
> -	bool shadowed;
>   	int status;
>   
>   	struct intel_vgpu_mm *shadow_mm;
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 792f69e44ba5..fd202185d6e5 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -377,16 +377,19 @@ static void print_batch_pool_stats(struct seq_file *m,
>   	print_file_stats(m, "[k]batch pool", stats);
>   }
>   
> -static int per_file_ctx_stats(int id, void *ptr, void *data)
> +static int per_file_ctx_stats(int idx, void *ptr, void *data)
>   {
>   	struct i915_gem_context *ctx = ptr;
> -	int n;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	for_each_engine(engine, ctx->i915, id) {
> +		struct intel_context *ce = to_intel_context(ctx, engine);
>   
> -	for (n = 0; n < ARRAY_SIZE(ctx->engine); n++) {
> -		if (ctx->engine[n].state)
> -			per_file_stats(0, ctx->engine[n].state->obj, data);
> -		if (ctx->engine[n].ring)
> -			per_file_stats(0, ctx->engine[n].ring->vma->obj, data);
> +		if (ce->state)
> +			per_file_stats(0, ce->state->obj, data);
> +		if (ce->ring)
> +			per_file_stats(0, ce->ring->vma->obj, data);
>   	}
>   
>   	return 0;
> @@ -1960,7 +1963,8 @@ static int i915_context_status(struct seq_file *m, void *unused)
>   		seq_putc(m, '\n');
>   
>   		for_each_engine(engine, dev_priv, id) {
> -			struct intel_context *ce = &ctx->engine[engine->id];
> +			struct intel_context *ce =
> +				to_intel_context(ctx, engine);
>   
>   			seq_printf(m, "%s: ", engine->name);
>   			if (ce->state)
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 028691108125..8907f71b900a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1945,6 +1945,7 @@ struct drm_i915_private {
>   			 */
>   			struct i915_perf_stream *exclusive_stream;
>   
> +			struct intel_context *pinned_ctx;
>   			u32 specific_ctx_id;
>   
>   			struct hrtimer poll_check_timer;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 4dba735505d4..e186e9194fad 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3227,14 +3227,14 @@ void i915_gem_reset(struct drm_i915_private *dev_priv,
>   	i915_retire_requests(dev_priv);
>   
>   	for_each_engine(engine, dev_priv, id) {
> -		struct i915_gem_context *ctx;
> +		struct intel_context *ce;
>   
>   		i915_gem_reset_engine(engine,
>   				      engine->hangcheck.active_request,
>   				      stalled_mask & ENGINE_MASK(id));
> -		ctx = fetch_and_zero(&engine->last_retired_context);
> -		if (ctx)
> -			engine->context_unpin(engine, ctx);
> +		ce = fetch_and_zero(&engine->last_retired_context);
> +		if (ce)
> +			intel_context_unpin(ce);
>   
>   		/*
>   		 * Ostensibily, we always want a context loaded for powersaving,
> @@ -4948,13 +4948,13 @@ void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
>   
>   static void assert_kernel_context_is_current(struct drm_i915_private *i915)
>   {
> -	struct i915_gem_context *kernel_context = i915->kernel_context;
> +	struct i915_gem_context *kctx = i915->kernel_context;
>   	struct intel_engine_cs *engine;
>   	enum intel_engine_id id;
>   
>   	for_each_engine(engine, i915, id) {
>   		GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline->last_request));
> -		GEM_BUG_ON(engine->last_retired_context != kernel_context);
> +		GEM_BUG_ON(engine->last_retired_context->gem_context != kctx);
>   	}
>   }
>   
> @@ -5291,7 +5291,7 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
>   	for_each_engine(engine, i915, id) {
>   		struct i915_vma *state;
>   
> -		state = ctx->engine[id].state;
> +		state = to_intel_context(ctx, engine)->state;
>   		if (!state)
>   			continue;
>   
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 2fe779cab298..ed1c591ee866 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -125,16 +125,10 @@ static void i915_gem_context_free(struct i915_gem_context *ctx)
>   	i915_ppgtt_put(ctx->ppgtt);
>   
>   	for (i = 0; i < I915_NUM_ENGINES; i++) {
> -		struct intel_context *ce = &ctx->engine[i];
> +		struct intel_context *ce = &ctx->__engine[i];
>   
> -		if (!ce->state)
> -			continue;
> -
> -		WARN_ON(ce->pin_count);
> -		if (ce->ring)
> -			intel_ring_free(ce->ring);
> -
> -		__i915_gem_object_release_unless_active(ce->state->obj);
> +		if (ce->ops)

Is ce->ops now the gate which was "if (!ce->state) continue" before, to 
avoid GEM_BUG_ON in execlists_context_destroy? I am wondering if this 
loop should not just be for_each_engine. Places which walk until 
I915_NUM_ENGINES should be very special and this one doesn't feel like 
one. Anyways, nothing is broken so this is just an observation.

> +			ce->ops->destroy(ce);
>   	}
>   
>   	kfree(ctx->name);
> @@ -266,6 +260,7 @@ __create_hw_context(struct drm_i915_private *dev_priv,
>   		    struct drm_i915_file_private *file_priv)
>   {
>   	struct i915_gem_context *ctx;
> +	unsigned int n;
>   	int ret;
>   
>   	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
> @@ -283,6 +278,12 @@ __create_hw_context(struct drm_i915_private *dev_priv,
>   	ctx->i915 = dev_priv;
>   	ctx->sched.priority = I915_PRIORITY_NORMAL;
>   
> +	for (n = 0; n < ARRAY_SIZE(ctx->__engine); n++) {
> +		struct intel_context *ce = &ctx->__engine[n];
> +
> +		ce->gem_context = ctx;
> +	}

I was just saying above. :)

> +
>   	INIT_RADIX_TREE(&ctx->handles_vma, GFP_KERNEL);
>   	INIT_LIST_HEAD(&ctx->handles_list);
>   
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index b12a8a8c5af9..61bff6d37c6a 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -45,6 +45,11 @@ struct intel_ring;
>   
>   #define DEFAULT_CONTEXT_HANDLE 0
>   
> +struct intel_context_ops {
> +	void (*unpin)(struct intel_context *ce);
> +	void (*destroy)(struct intel_context *ce);
> +};
> +
>   /**
>    * struct i915_gem_context - client state
>    *
> @@ -144,12 +149,15 @@ struct i915_gem_context {
>   
>   	/** engine: per-engine logical HW state */
>   	struct intel_context {
> +		struct i915_gem_context *gem_context;
>   		struct i915_vma *state;
>   		struct intel_ring *ring;
>   		u32 *lrc_reg_state;
>   		u64 lrc_desc;
>   		int pin_count;
> -	} engine[I915_NUM_ENGINES];
> +
> +		const struct intel_context_ops *ops;
> +	} __engine[I915_NUM_ENGINES];
>   
>   	/** ring_size: size for allocating the per-engine ring buffer */
>   	u32 ring_size;
> @@ -256,6 +264,23 @@ static inline bool i915_gem_context_is_kernel(struct i915_gem_context *ctx)
>   	return !ctx->file_priv;
>   }
>   
> +static inline struct intel_context *
> +to_intel_context(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
> +{
> +	return &ctx->__engine[engine->id];
> +}
> +
> +static inline struct intel_context *
> +intel_context_pin(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
> +{
> +	return engine->context_pin(engine, ctx);
> +}
> +
> +static inline void intel_context_unpin(struct intel_context *ce)
> +{
> +	ce->ops->unpin(ce);
> +}
> +
>   /* i915_gem_context.c */
>   int __must_check i915_gem_contexts_init(struct drm_i915_private *dev_priv);
>   void i915_gem_contexts_lost(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 269574b7254c..4801047d4d6a 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1474,7 +1474,7 @@ static void gem_record_rings(struct i915_gpu_state *error)
>   
>   			ee->ctx =
>   				i915_error_object_create(i915,
> -							 ctx->engine[i].state);
> +							 request->hw_context->state);
>   
>   			error->simulated |=
>   				i915_gem_context_no_error_capture(ctx);
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index bfc906cd4e5e..11afab651050 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -1221,7 +1221,7 @@ static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
>   		dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
>   	} else {
>   		struct intel_engine_cs *engine = dev_priv->engine[RCS];
> -		struct intel_ring *ring;
> +		struct intel_context *ce;
>   		int ret;
>   
>   		ret = i915_mutex_lock_interruptible(&dev_priv->drm);
> @@ -1234,19 +1234,19 @@ static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
>   		 *
>   		 * NB: implied RCS engine...
>   		 */
> -		ring = engine->context_pin(engine, stream->ctx);
> +		ce = intel_context_pin(stream->ctx, engine);
>   		mutex_unlock(&dev_priv->drm.struct_mutex);
> -		if (IS_ERR(ring))
> -			return PTR_ERR(ring);
> +		if (IS_ERR(ce))
> +			return PTR_ERR(ce);
>   
> +		dev_priv->perf.oa.pinned_ctx = ce;
>   
>   		/*
>   		 * Explicitly track the ID (instead of calling
>   		 * i915_ggtt_offset() on the fly) considering the difference
>   		 * with gen8+ and execlists
>   		 */
> -		dev_priv->perf.oa.specific_ctx_id =
> -			i915_ggtt_offset(stream->ctx->engine[engine->id].state);
> +		dev_priv->perf.oa.specific_ctx_id = i915_ggtt_offset(ce->state);
>   	}
>   
>   	return 0;
> @@ -1262,17 +1262,14 @@ static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
>   static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
>   {
>   	struct drm_i915_private *dev_priv = stream->dev_priv;
> +	struct intel_context *ce;
>   
> -	if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
> -		dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
> -	} else {
> -		struct intel_engine_cs *engine = dev_priv->engine[RCS];
> +	dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
>   
> +	ce = fetch_and_zero(&dev_priv->perf.oa.pinned_ctx);
> +	if (ce) {
>   		mutex_lock(&dev_priv->drm.struct_mutex);
> -
> -		dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
> -		engine->context_unpin(engine, stream->ctx);
> -
> +		intel_context_unpin(ce);
>   		mutex_unlock(&dev_priv->drm.struct_mutex);
>   	}
>   }
> @@ -1759,6 +1756,7 @@ static int gen8_switch_to_updated_kernel_context(struct drm_i915_private *dev_pr
>   static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
>   				       const struct i915_oa_config *oa_config)
>   {
> +	struct intel_engine_cs *engine = dev_priv->engine[RCS];
>   	struct i915_gem_context *ctx;
>   	int ret;
>   	unsigned int wait_flags = I915_WAIT_LOCKED;
> @@ -1789,7 +1787,7 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
>   
>   	/* Update all contexts now that we've stalled the submission. */
>   	list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
> -		struct intel_context *ce = &ctx->engine[RCS];
> +		struct intel_context *ce = to_intel_context(ctx, engine);
>   		u32 *regs;
>   
>   		/* OA settings will be set upon first use */
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index f913e56604ea..da97ce4e18a9 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -410,8 +410,8 @@ static void i915_request_retire(struct i915_request *request)
>   	 * the subsequent request.
>   	 */
>   	if (engine->last_retired_context)
> -		engine->context_unpin(engine, engine->last_retired_context);
> -	engine->last_retired_context = request->gem_context;
> +		intel_context_unpin(engine->last_retired_context);
> +	engine->last_retired_context = request->hw_context;
>   
>   	spin_lock_irq(&request->lock);
>   	if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags))
> @@ -613,7 +613,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   {
>   	struct drm_i915_private *i915 = engine->i915;
>   	struct i915_request *rq;
> -	struct intel_ring *ring;
> +	struct intel_context *ce;
>   	int ret;
>   
>   	lockdep_assert_held(&i915->drm.struct_mutex);
> @@ -637,16 +637,15 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	 * GGTT space, so do this first before we reserve a seqno for
>   	 * ourselves.
>   	 */
> -	ring = engine->context_pin(engine, ctx);
> -	if (IS_ERR(ring))
> -		return ERR_CAST(ring);
> -	GEM_BUG_ON(!ring);
> +	ce = intel_context_pin(ctx, engine);
> +	if (IS_ERR(ce))
> +		return ERR_CAST(ce);
>   
>   	ret = reserve_engine(engine);
>   	if (ret)
>   		goto err_unpin;
>   
> -	ret = intel_ring_wait_for_space(ring, MIN_SPACE_FOR_ADD_REQUEST);
> +	ret = intel_ring_wait_for_space(ce->ring, MIN_SPACE_FOR_ADD_REQUEST);
>   	if (ret)
>   		goto err_unreserve;
>   
> @@ -733,7 +732,8 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	rq->i915 = i915;
>   	rq->engine = engine;
>   	rq->gem_context = ctx;
> -	rq->ring = ring;
> +	rq->hw_context = ce;
> +	rq->ring = ce->ring;
>   
>   	/* No zalloc, must clear what we need by hand */
>   	rq->global_seqno = 0;
> @@ -786,7 +786,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   err_unreserve:
>   	unreserve_engine(engine);
>   err_unpin:
> -	engine->context_unpin(engine, ctx);
> +	intel_context_unpin(ce);
>   	return ERR_PTR(ret);
>   }
>   
> @@ -972,7 +972,6 @@ i915_request_await_object(struct i915_request *to,
>   void __i915_request_add(struct i915_request *request, bool flush_caches)
>   {
>   	struct intel_engine_cs *engine = request->engine;
> -	struct intel_ring *ring = request->ring;
>   	struct intel_timeline *timeline = request->timeline;
>   	struct i915_request *prev;
>   	u32 *cs;
> @@ -1048,7 +1047,7 @@ void __i915_request_add(struct i915_request *request, bool flush_caches)
>   	GEM_BUG_ON(timeline->seqno != request->fence.seqno);
>   	i915_gem_active_set(&timeline->last_request, request);
>   
> -	list_add_tail(&request->ring_link, &ring->request_list);
> +	list_add_tail(&request->ring_link, &request->ring->request_list);
>   	request->emitted_jiffies = jiffies;
>   
>   	/*
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 6a029b3f0a88..7ea939f974ea 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -92,8 +92,9 @@ struct i915_request {
>   	 * i915_request_free() will then decrement the refcount on the
>   	 * context.
>   	 */
> -	struct i915_gem_context *gem_context;
>   	struct intel_engine_cs *engine;
> +	struct i915_gem_context *gem_context;
> +	struct intel_context *hw_context;
>   	struct intel_ring *ring;
>   	struct intel_timeline *timeline;
>   	struct intel_signal_node signaling;
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 4749426f9cad..ca8453ff4a47 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -665,6 +665,12 @@ static int init_phys_status_page(struct intel_engine_cs *engine)
>   	return 0;
>   }
>   
> +static void __intel_context_unpin(struct i915_gem_context *ctx,
> +				  struct intel_engine_cs *engine)
> +{
> +	intel_context_unpin(to_intel_context(ctx, engine));
> +}
> +
>   /**
>    * intel_engines_init_common - initialize cengine state which might require hw access
>    * @engine: Engine to initialize.
> @@ -678,7 +684,8 @@ static int init_phys_status_page(struct intel_engine_cs *engine)
>    */
>   int intel_engine_init_common(struct intel_engine_cs *engine)
>   {
> -	struct intel_ring *ring;
> +	struct drm_i915_private *i915 = engine->i915;
> +	struct intel_context *ce;
>   	int ret;
>   
>   	engine->set_default_submission(engine);
> @@ -690,19 +697,18 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
>   	 * be available. To avoid this we always pin the default
>   	 * context.
>   	 */
> -	ring = engine->context_pin(engine, engine->i915->kernel_context);
> -	if (IS_ERR(ring))
> -		return PTR_ERR(ring);
> +	ce = intel_context_pin(i915->kernel_context, engine);
> +	if (IS_ERR(ce))
> +		return PTR_ERR(ce);
>   
>   	/*
>   	 * Similarly the preempt context must always be available so that
>   	 * we can interrupt the engine at any time.
>   	 */
> -	if (engine->i915->preempt_context) {
> -		ring = engine->context_pin(engine,
> -					   engine->i915->preempt_context);
> -		if (IS_ERR(ring)) {
> -			ret = PTR_ERR(ring);
> +	if (i915->preempt_context) {
> +		ce = intel_context_pin(i915->preempt_context, engine);
> +		if (IS_ERR(ce)) {
> +			ret = PTR_ERR(ce);
>   			goto err_unpin_kernel;
>   		}
>   	}
> @@ -711,7 +717,7 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
>   	if (ret)
>   		goto err_unpin_preempt;
>   
> -	if (HWS_NEEDS_PHYSICAL(engine->i915))
> +	if (HWS_NEEDS_PHYSICAL(i915))
>   		ret = init_phys_status_page(engine);
>   	else
>   		ret = init_status_page(engine);
> @@ -723,10 +729,11 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
>   err_breadcrumbs:
>   	intel_engine_fini_breadcrumbs(engine);
>   err_unpin_preempt:
> -	if (engine->i915->preempt_context)
> -		engine->context_unpin(engine, engine->i915->preempt_context);
> +	if (i915->preempt_context)
> +		__intel_context_unpin(i915->preempt_context, engine);
> +
>   err_unpin_kernel:
> -	engine->context_unpin(engine, engine->i915->kernel_context);
> +	__intel_context_unpin(i915->kernel_context, engine);
>   	return ret;
>   }
>   
> @@ -739,6 +746,8 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
>    */
>   void intel_engine_cleanup_common(struct intel_engine_cs *engine)
>   {
> +	struct drm_i915_private *i915 = engine->i915;
> +
>   	intel_engine_cleanup_scratch(engine);
>   
>   	if (HWS_NEEDS_PHYSICAL(engine->i915))
> @@ -753,9 +762,9 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
>   	if (engine->default_state)
>   		i915_gem_object_put(engine->default_state);
>   
> -	if (engine->i915->preempt_context)
> -		engine->context_unpin(engine, engine->i915->preempt_context);
> -	engine->context_unpin(engine, engine->i915->kernel_context);
> +	if (i915->preempt_context)
> +		__intel_context_unpin(i915->preempt_context, engine);
> +	__intel_context_unpin(i915->kernel_context, engine);
>   }
>   
>   u64 intel_engine_get_active_head(const struct intel_engine_cs *engine)
> @@ -997,8 +1006,7 @@ bool intel_engines_are_idle(struct drm_i915_private *dev_priv)
>    */
>   bool intel_engine_has_kernel_context(const struct intel_engine_cs *engine)
>   {
> -	const struct i915_gem_context * const kernel_context =
> -		engine->i915->kernel_context;
> +	const struct i915_gem_context *kctx = engine->i915->kernel_context;
>   	struct i915_request *rq;
>   
>   	lockdep_assert_held(&engine->i915->drm.struct_mutex);
> @@ -1010,9 +1018,12 @@ bool intel_engine_has_kernel_context(const struct intel_engine_cs *engine)
>   	 */
>   	rq = __i915_gem_active_peek(&engine->timeline->last_request);
>   	if (rq)
> -		return rq->gem_context == kernel_context;
> -	else
> -		return engine->last_retired_context == kernel_context;
> +		return rq->gem_context == kctx;
> +
> +	if (!engine->last_retired_context)
> +		return false;
> +
> +	return engine->last_retired_context->gem_context == kctx;

You thought this will be a bit more readable/obvious?

>   }
>   
>   void intel_engines_reset_default_submission(struct drm_i915_private *i915)
> @@ -1096,16 +1107,16 @@ void intel_engines_unpark(struct drm_i915_private *i915)
>    */
>   void intel_engine_lost_context(struct intel_engine_cs *engine)
>   {
> -	struct i915_gem_context *ctx;
> +	struct intel_context *ce;
>   
>   	lockdep_assert_held(&engine->i915->drm.struct_mutex);
>   
>   	engine->legacy_active_context = NULL;
>   	engine->legacy_active_ppgtt = NULL;
>   
> -	ctx = fetch_and_zero(&engine->last_retired_context);
> -	if (ctx)
> -		engine->context_unpin(engine, ctx);
> +	ce = fetch_and_zero(&engine->last_retired_context);
> +	if (ce)
> +		intel_context_unpin(ce);
>   }
>   
>   bool intel_engine_can_store_dword(struct intel_engine_cs *engine)
> diff --git a/drivers/gpu/drm/i915/intel_guc_ads.c b/drivers/gpu/drm/i915/intel_guc_ads.c
> index 334cb5202e1c..b8b4d346dd4d 100644
> --- a/drivers/gpu/drm/i915/intel_guc_ads.c
> +++ b/drivers/gpu/drm/i915/intel_guc_ads.c
> @@ -121,7 +121,7 @@ int intel_guc_ads_create(struct intel_guc *guc)
>   	 * to find it. Note that we have to skip our header (1 page),
>   	 * because our GuC shared data is there.
>   	 */
> -	kernel_ctx_vma = dev_priv->kernel_context->engine[RCS].state;
> +	kernel_ctx_vma = dev_priv->kernel_context->__engine[RCS].state;
>   	blob->ads.golden_context_lrca =
>   		intel_guc_ggtt_offset(guc, kernel_ctx_vma) + skipped_offset;
>   
> diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
> index 8527fa1f5c3e..2fe3f9daa56d 100644
> --- a/drivers/gpu/drm/i915/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
> @@ -362,7 +362,7 @@ static void guc_stage_desc_init(struct intel_guc *guc,
>   	desc->db_id = client->doorbell_id;
>   
>   	for_each_engine_masked(engine, dev_priv, client->engines, tmp) {
> -		struct intel_context *ce = &ctx->engine[engine->id];
> +		struct intel_context *ce = to_intel_context(ctx, engine);
>   		u32 guc_engine_id = engine->guc_id;
>   		struct guc_execlist_context *lrc = &desc->lrc[guc_engine_id];
>   
> @@ -511,9 +511,7 @@ static void guc_add_request(struct intel_guc *guc, struct i915_request *rq)
>   {
>   	struct intel_guc_client *client = guc->execbuf_client;
>   	struct intel_engine_cs *engine = rq->engine;
> -	u32 ctx_desc =
> -		lower_32_bits(intel_lr_context_descriptor(rq->gem_context,
> -							  engine));
> +	u32 ctx_desc = lower_32_bits(rq->hw_context->lrc_desc);

Ha...

>   	u32 ring_tail = intel_ring_set_tail(rq->ring, rq->tail) / sizeof(u64);
>   
>   	spin_lock(&client->wq_lock);
> @@ -551,8 +549,8 @@ static void inject_preempt_context(struct work_struct *work)
>   					     preempt_work[engine->id]);
>   	struct intel_guc_client *client = guc->preempt_client;
>   	struct guc_stage_desc *stage_desc = __get_stage_desc(client);
> -	u32 ctx_desc = lower_32_bits(intel_lr_context_descriptor(client->owner,
> -								 engine));
> +	u32 ctx_desc = lower_32_bits(to_intel_context(client->owner,
> +						      engine)->lrc_desc);

..hum.

>   	u32 data[7];
>   
>   	/*
> @@ -708,7 +706,7 @@ static void guc_dequeue(struct intel_engine_cs *engine)
>   		struct i915_request *rq, *rn;
>   
>   		list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
> -			if (last && rq->gem_context != last->gem_context) {
> +			if (last && rq->hw_context != last->hw_context) {

Same thing really since this is already per engine.

>   				if (port == last_port) {
>   					__list_del_many(&p->requests,
>   							&rq->sched.link);
> @@ -991,7 +989,8 @@ static void guc_fill_preempt_context(struct intel_guc *guc)
>   	enum intel_engine_id id;
>   
>   	for_each_engine(engine, dev_priv, id) {
> -		struct intel_context *ce = &client->owner->engine[id];
> +		struct intel_context *ce =
> +			to_intel_context(client->owner, engine);
>   		u32 addr = intel_hws_preempt_done_address(engine);
>   		u32 *cs;
>   
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 0777226e65a6..e6725690b5b6 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -164,7 +164,8 @@
>   #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
>   
>   static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
> -					    struct intel_engine_cs *engine);
> +					    struct intel_engine_cs *engine,
> +					    struct intel_context *ce);

Feels a bit redundant to pass in everything but OK.

>   static void execlists_init_reg_state(u32 *reg_state,
>   				     struct i915_gem_context *ctx,
>   				     struct intel_engine_cs *engine,
> @@ -221,9 +222,9 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
>    */
>   static void
>   intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
> -				   struct intel_engine_cs *engine)
> +				   struct intel_engine_cs *engine,
> +				   struct intel_context *ce)
>   {
> -	struct intel_context *ce = &ctx->engine[engine->id];
>   	u64 desc;
>   
>   	BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH)));
> @@ -414,7 +415,7 @@ execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
>   
>   static u64 execlists_update_context(struct i915_request *rq)
>   {
> -	struct intel_context *ce = &rq->gem_context->engine[rq->engine->id];
> +	struct intel_context *ce = rq->hw_context;
>   	struct i915_hw_ppgtt *ppgtt =
>   		rq->gem_context->ppgtt ?: rq->i915->mm.aliasing_ppgtt;
>   	u32 *reg_state = ce->lrc_reg_state;
> @@ -491,14 +492,14 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>   	execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
>   }
>   
> -static bool ctx_single_port_submission(const struct i915_gem_context *ctx)
> +static bool ctx_single_port_submission(const struct intel_context *ctx)

Should be ce, not ctx.

>   {
>   	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
> -		i915_gem_context_force_single_submission(ctx));
> +		i915_gem_context_force_single_submission(ctx->gem_context));
>   }

But the whole function change is again not needed I think.

>   
> -static bool can_merge_ctx(const struct i915_gem_context *prev,
> -			  const struct i915_gem_context *next)
> +static bool can_merge_ctx(const struct intel_context *prev,
> +			  const struct intel_context *next)

This one neither.

>   {
>   	if (prev != next)
>   		return false;
> @@ -523,7 +524,7 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
>   {
>   	struct intel_engine_execlists *execlists = &engine->execlists;
>   	struct intel_context *ce =
> -		&engine->i915->preempt_context->engine[engine->id];
> +		to_intel_context(engine->i915->preempt_context, engine);
>   	unsigned int n;
>   
>   	GEM_BUG_ON(execlists->preempt_complete_status !=
> @@ -666,8 +667,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   			 * second request, and so we never need to tell the
>   			 * hardware about the first.
>   			 */
> -			if (last && !can_merge_ctx(rq->gem_context,
> -						   last->gem_context)) {
> +			if (last &&
> +			    !can_merge_ctx(rq->hw_context, last->hw_context)) {
>   				/*
>   				 * If we are on the second port and cannot
>   				 * combine this request with the last, then we
> @@ -686,14 +687,14 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   				 * the same context (even though a different
>   				 * request) to the second port.
>   				 */
> -				if (ctx_single_port_submission(last->gem_context) ||
> -				    ctx_single_port_submission(rq->gem_context)) {
> +				if (ctx_single_port_submission(last->hw_context) ||
> +				    ctx_single_port_submission(rq->hw_context)) {
>   					__list_del_many(&p->requests,
>   							&rq->sched.link);
>   					goto done;
>   				}
>   
> -				GEM_BUG_ON(last->gem_context == rq->gem_context);
> +				GEM_BUG_ON(last->hw_context == rq->hw_context);

And if you agree with the above then these two hunks do not have to be 
in the patch.

>   
>   				if (submit)
>   					port_assign(port, last);
> @@ -1277,6 +1278,37 @@ static void execlists_schedule(struct i915_request *request,
>   	spin_unlock_irq(&engine->timeline->lock);
>   }
>   
> +static void execlists_context_destroy(struct intel_context *ce)
> +{
> +	GEM_BUG_ON(!ce->state);
> +	GEM_BUG_ON(ce->pin_count);
> +
> +	intel_ring_free(ce->ring);
> +	__i915_gem_object_release_unless_active(ce->state->obj);
> +}
> +
> +static void __execlists_context_unpin(struct intel_context *ce)
> +{
> +	intel_ring_unpin(ce->ring);
> +
> +	ce->state->obj->pin_global--;
> +	i915_gem_object_unpin_map(ce->state->obj);
> +	i915_vma_unpin(ce->state);
> +
> +	i915_gem_context_put(ce->gem_context);
> +}
> +
> +static void execlists_context_unpin(struct intel_context *ce)
> +{
> +	lockdep_assert_held(&ce->gem_context->i915->drm.struct_mutex);
> +	GEM_BUG_ON(ce->pin_count == 0);
> +
> +	if (--ce->pin_count)
> +		return;
> +
> +	__execlists_context_unpin(ce);
> +}
> +
>   static int __context_pin(struct i915_gem_context *ctx, struct i915_vma *vma)
>   {
>   	unsigned int flags;
> @@ -1300,21 +1332,15 @@ static int __context_pin(struct i915_gem_context *ctx, struct i915_vma *vma)
>   	return i915_vma_pin(vma, 0, GEN8_LR_CONTEXT_ALIGN, flags);
>   }
>   
> -static struct intel_ring *
> -execlists_context_pin(struct intel_engine_cs *engine,
> -		      struct i915_gem_context *ctx)
> +static struct intel_context *
> +__execlists_context_pin(struct intel_engine_cs *engine,
> +			struct i915_gem_context *ctx,
> +			struct intel_context *ce)
>   {
> -	struct intel_context *ce = &ctx->engine[engine->id];
>   	void *vaddr;
>   	int ret;
>   
> -	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
> -
> -	if (likely(ce->pin_count++))
> -		goto out;
> -	GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
> -
> -	ret = execlists_context_deferred_alloc(ctx, engine);
> +	ret = execlists_context_deferred_alloc(ctx, engine, ce);
>   	if (ret)
>   		goto err;
>   	GEM_BUG_ON(!ce->state);
> @@ -1333,7 +1359,7 @@ execlists_context_pin(struct intel_engine_cs *engine,
>   	if (ret)
>   		goto unpin_map;
>   
> -	intel_lr_context_descriptor_update(ctx, engine);
> +	intel_lr_context_descriptor_update(ctx, engine, ce);
>   
>   	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
>   	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
> @@ -1342,8 +1368,7 @@ execlists_context_pin(struct intel_engine_cs *engine,
>   
>   	ce->state->obj->pin_global++;
>   	i915_gem_context_get(ctx);
> -out:
> -	return ce->ring;
> +	return ce;
>   
>   unpin_map:
>   	i915_gem_object_unpin_map(ce->state->obj);
> @@ -1354,33 +1379,33 @@ execlists_context_pin(struct intel_engine_cs *engine,
>   	return ERR_PTR(ret);
>   }
>   
> -static void execlists_context_unpin(struct intel_engine_cs *engine,
> -				    struct i915_gem_context *ctx)
> +static const struct intel_context_ops execlists_context_ops = {
> +	.unpin = execlists_context_unpin,
> +	.destroy = execlists_context_destroy,
> +};
> +
> +static struct intel_context *
> +execlists_context_pin(struct intel_engine_cs *engine,
> +		      struct i915_gem_context *ctx)
>   {
> -	struct intel_context *ce = &ctx->engine[engine->id];
> +	struct intel_context *ce = to_intel_context(ctx, engine);
>   
>   	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
> -	GEM_BUG_ON(ce->pin_count == 0);
> -
> -	if (--ce->pin_count)
> -		return;
>   
> -	intel_ring_unpin(ce->ring);
> +	if (likely(ce->pin_count++))
> +		return ce;
> +	GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
>   
> -	ce->state->obj->pin_global--;
> -	i915_gem_object_unpin_map(ce->state->obj);
> -	i915_vma_unpin(ce->state);
> +	ce->ops = &execlists_context_ops;

If ops are set on pin it would be good to unset them on destroy. Hm, or 
even set them not on pin but on creation. Not a huge deal but it is a 
bit unbalanced, and also if ce->ops is also a guard to distinguish 
possible from impossible engines then it is a bit more strange.

>   
> -	i915_gem_context_put(ctx);
> +	return __execlists_context_pin(engine, ctx, ce);
>   }
>   
>   static int execlists_request_alloc(struct i915_request *request)
>   {
> -	struct intel_engine_cs *engine = request->engine;
> -	struct intel_context *ce = &request->gem_context->engine[engine->id];
> -	int ret;
> +	int err;
>   
> -	GEM_BUG_ON(!ce->pin_count);
> +	GEM_BUG_ON(!request->hw_context->pin_count);
>   
>   	/* Flush enough space to reduce the likelihood of waiting after
>   	 * we start building the request - in which case we will just
> @@ -1388,9 +1413,9 @@ static int execlists_request_alloc(struct i915_request *request)
>   	 */
>   	request->reserved_space += EXECLISTS_REQUEST_SIZE;
>   
> -	ret = intel_ring_wait_for_space(request->ring, request->reserved_space);
> -	if (ret)
> -		return ret;
> +	err = intel_ring_wait_for_space(request->ring, request->reserved_space);
> +	if (err)
> +		return err;

Rename of ret to err for a smaller diff is avoidable.

>   
>   	/* Note that after this point, we have committed to using
>   	 * this request as it is being used to both track the
> @@ -1780,8 +1805,8 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>   			      struct i915_request *request)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
> -	struct intel_context *ce;
>   	unsigned long flags;
> +	u32 *regs;
>   
>   	GEM_TRACE("%s request global=%x, current=%d\n",
>   		  engine->name, request ? request->global_seqno : 0,
> @@ -1831,14 +1856,13 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>   	 * future request will be after userspace has had the opportunity
>   	 * to recreate its own state.
>   	 */
> -	ce = &request->gem_context->engine[engine->id];
> -	execlists_init_reg_state(ce->lrc_reg_state,
> -				 request->gem_context, engine, ce->ring);
> +	regs = request->hw_context->lrc_reg_state;
> +	execlists_init_reg_state(regs,
> +				 request->gem_context, engine, request->ring);
>   
>   	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
> -	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
> -		i915_ggtt_offset(ce->ring->vma);
> -	ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;
> +	regs[CTX_RING_BUFFER_START+1] = i915_ggtt_offset(request->ring->vma);
> +	regs[CTX_RING_HEAD+1] = request->postfix;

Does it now fit in 80-chars to have spaces around the plus operator? (To 
make checkpatch happier.)

>   
>   	request->ring->head = request->postfix;
>   	intel_ring_update_space(request->ring);
> @@ -2176,8 +2200,6 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>   	engine->reset_hw = reset_common_ring;
>   
>   	engine->context_pin = execlists_context_pin;
> -	engine->context_unpin = execlists_context_unpin;
> -
>   	engine->request_alloc = execlists_request_alloc;
>   
>   	engine->emit_flush = gen8_emit_flush;
> @@ -2272,9 +2294,13 @@ static int logical_ring_init(struct intel_engine_cs *engine)
>   	}
>   
>   	engine->execlists.preempt_complete_status = ~0u;
> -	if (engine->i915->preempt_context)
> +	if (engine->i915->preempt_context) {
> +		struct intel_context *ce =
> +			to_intel_context(engine->i915->preempt_context, engine);
> +
>   		engine->execlists.preempt_complete_status =
> -			upper_32_bits(engine->i915->preempt_context->engine[engine->id].lrc_desc);
> +			upper_32_bits(ce->lrc_desc);
> +	}
>   
>   	return 0;
>   
> @@ -2408,7 +2434,7 @@ static void execlists_init_reg_state(u32 *regs,
>   	struct drm_i915_private *dev_priv = engine->i915;
>   	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: dev_priv->mm.aliasing_ppgtt;
>   	u32 base = engine->mmio_base;
> -	bool rcs = engine->id == RCS;
> +	bool rcs = engine->class == RENDER_CLASS;

Doesn't belong in this patch.

>   
>   	/* A context is actually a big batch buffer with several
>   	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
> @@ -2553,10 +2579,10 @@ populate_lr_context(struct i915_gem_context *ctx,
>   }
>   
>   static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
> -					    struct intel_engine_cs *engine)
> +					    struct intel_engine_cs *engine,
> +					    struct intel_context *ce)
>   {
>   	struct drm_i915_gem_object *ctx_obj;
> -	struct intel_context *ce = &ctx->engine[engine->id];
>   	struct i915_vma *vma;
>   	uint32_t context_size;
>   	struct intel_ring *ring;
> @@ -2627,7 +2653,8 @@ void intel_lr_context_resume(struct drm_i915_private *dev_priv)
>   	 */
>   	list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
>   		for_each_engine(engine, dev_priv, id) {
> -			struct intel_context *ce = &ctx->engine[engine->id];
> +			struct intel_context *ce =
> +				to_intel_context(ctx, engine);
>   			u32 *reg;
>   
>   			if (!ce->state)
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index 59d7b86012e9..1593194e930c 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -104,11 +104,4 @@ struct i915_gem_context;
>   
>   void intel_lr_context_resume(struct drm_i915_private *dev_priv);
>   
> -static inline uint64_t
> -intel_lr_context_descriptor(struct i915_gem_context *ctx,
> -			    struct intel_engine_cs *engine)
> -{
> -	return ctx->engine[engine->id].lrc_desc;
> -}
> -
>   #endif /* _INTEL_LRC_H_ */
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 3ea8eb5d49f5..9f526afa11ed 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -558,8 +558,7 @@ static void reset_ring_common(struct intel_engine_cs *engine,
>   	 */
>   	if (request) {
>   		struct drm_i915_private *dev_priv = request->i915;
> -		struct intel_context *ce =
> -			&request->gem_context->engine[engine->id];
> +		struct intel_context *ce = request->hw_context;
>   		struct i915_hw_ppgtt *ppgtt;
>   
>   		if (ce->state) {
> @@ -1164,9 +1163,33 @@ intel_ring_free(struct intel_ring *ring)
>   	kfree(ring);
>   }
>   
> -static int context_pin(struct i915_gem_context *ctx)
> +static void intel_ring_context_destroy(struct intel_context *ce)
>   {
> -	struct i915_vma *vma = ctx->engine[RCS].state;
> +	GEM_BUG_ON(ce->pin_count);
> +
> +	if (ce->state)
> +		__i915_gem_object_release_unless_active(ce->state->obj);
> +}
> +
> +static void intel_ring_context_unpin(struct intel_context *ce)
> +{
> +	lockdep_assert_held(&ce->gem_context->i915->drm.struct_mutex);
> +	GEM_BUG_ON(ce->pin_count == 0);
> +
> +	if (--ce->pin_count)
> +		return;
> +
> +	if (ce->state) {
> +		ce->state->obj->pin_global--;
> +		i915_vma_unpin(ce->state);
> +	}
> +
> +	i915_gem_context_put(ce->gem_context);
> +}
> +
> +static int __context_pin(struct intel_context *ce)
> +{
> +	struct i915_vma *vma = ce->state;
>   	int ret;
>   
>   	/*
> @@ -1253,25 +1276,19 @@ alloc_context_vma(struct intel_engine_cs *engine)
>   	return ERR_PTR(err);
>   }
>   
> -static struct intel_ring *
> -intel_ring_context_pin(struct intel_engine_cs *engine,
> -		       struct i915_gem_context *ctx)
> +static struct intel_context *
> +__ring_context_pin(struct intel_engine_cs *engine,
> +		   struct i915_gem_context *ctx,
> +		   struct intel_context *ce)
>   {
> -	struct intel_context *ce = &ctx->engine[engine->id];
> -	int ret;
> -
> -	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
> -
> -	if (likely(ce->pin_count++))
> -		goto out;
> -	GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
> +	int err;
>   
>   	if (!ce->state && engine->context_size) {
>   		struct i915_vma *vma;
>   
>   		vma = alloc_context_vma(engine);
>   		if (IS_ERR(vma)) {
> -			ret = PTR_ERR(vma);
> +			err = PTR_ERR(vma);
>   			goto err;
>   		}
>   
> @@ -1279,8 +1296,8 @@ intel_ring_context_pin(struct intel_engine_cs *engine,
>   	}
>   
>   	if (ce->state) {
> -		ret = context_pin(ctx);
> -		if (ret)
> +		err = __context_pin(ce);
> +		if (err)
>   			goto err;
>   
>   		ce->state->obj->pin_global++;
> @@ -1288,32 +1305,37 @@ intel_ring_context_pin(struct intel_engine_cs *engine,
>   
>   	i915_gem_context_get(ctx);
>   
> -out:
>   	/* One ringbuffer to rule them all */
> -	return engine->buffer;
> +	GEM_BUG_ON(!engine->buffer);
> +	ce->ring = engine->buffer;
> +
> +	return ce;
>   
>   err:
>   	ce->pin_count = 0;

Strictly speaking this should be done in the caller.

> -	return ERR_PTR(ret);
> +	return ERR_PTR(err);
>   }
>   
> -static void intel_ring_context_unpin(struct intel_engine_cs *engine,
> -				     struct i915_gem_context *ctx)
> +static const struct intel_context_ops ring_context_ops = {
> +	.unpin = intel_ring_context_unpin,
> +	.destroy = intel_ring_context_destroy,
> +};
> +
> +static struct intel_context *
> +intel_ring_context_pin(struct intel_engine_cs *engine,
> +		       struct i915_gem_context *ctx)
>   {
> -	struct intel_context *ce = &ctx->engine[engine->id];
> +	struct intel_context *ce = to_intel_context(ctx, engine);
>   
>   	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
> -	GEM_BUG_ON(ce->pin_count == 0);
>   
> -	if (--ce->pin_count)
> -		return;
> +	if (likely(ce->pin_count++))
> +		return ce;
> +	GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
>   
> -	if (ce->state) {
> -		ce->state->obj->pin_global--;
> -		i915_vma_unpin(ce->state);
> -	}
> +	ce->ops = &ring_context_ops;
>   
> -	i915_gem_context_put(ctx);
> +	return __ring_context_pin(engine, ctx, ce);
>   }

Pin/unpin bits were a nightmare diff but phew, looks ok.

>   
>   static int intel_init_ring_buffer(struct intel_engine_cs *engine)
> @@ -1323,10 +1345,6 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
>   
>   	intel_engine_setup_common(engine);
>   
> -	err = intel_engine_init_common(engine);
> -	if (err)
> -		goto err;
> -
>   	ring = intel_engine_create_ring(engine, 32 * PAGE_SIZE);
>   	if (IS_ERR(ring)) {
>   		err = PTR_ERR(ring);
> @@ -1341,8 +1359,14 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
>   	GEM_BUG_ON(engine->buffer);
>   	engine->buffer = ring;
>   
> +	err = intel_engine_init_common(engine);
> +	if (err)
> +		goto err_unpin;
> +

This ordering change doesn't look like it has to be done in this patch.

>   	return 0;
>   
> +err_unpin:
> +	intel_ring_unpin(ring);
>   err_ring:
>   	intel_ring_free(ring);
>   err:
> @@ -1428,7 +1452,7 @@ static inline int mi_set_context(struct i915_request *rq, u32 flags)
>   
>   	*cs++ = MI_NOOP;
>   	*cs++ = MI_SET_CONTEXT;
> -	*cs++ = i915_ggtt_offset(rq->gem_context->engine[RCS].state) | flags;
> +	*cs++ = i915_ggtt_offset(rq->hw_context->state) | flags;
>   	/*
>   	 * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP
>   	 * WaMiSetContext_Hang:snb,ivb,vlv
> @@ -1519,7 +1543,7 @@ static int switch_context(struct i915_request *rq)
>   		hw_flags = MI_FORCE_RESTORE;
>   	}
>   
> -	if (to_ctx->engine[engine->id].state &&
> +	if (rq->hw_context->state &&
>   	    (to_ctx != from_ctx || hw_flags & MI_FORCE_RESTORE)) {
>   		GEM_BUG_ON(engine->id != RCS);
>   
> @@ -1567,7 +1591,7 @@ static int ring_request_alloc(struct i915_request *request)
>   {
>   	int ret;
>   
> -	GEM_BUG_ON(!request->gem_context->engine[request->engine->id].pin_count);
> +	GEM_BUG_ON(!request->hw_context->pin_count);
>   
>   	/* Flush enough space to reduce the likelihood of waiting after
>   	 * we start building the request - in which case we will just
> @@ -1994,8 +2018,6 @@ static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv,
>   	engine->reset_hw = reset_ring_common;
>   
>   	engine->context_pin = intel_ring_context_pin;
> -	engine->context_unpin = intel_ring_context_unpin;
> -
>   	engine->request_alloc = ring_request_alloc;
>   
>   	engine->emit_breadcrumb = i9xx_emit_breadcrumb;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 1231695fb4da..005d68e67d2e 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -427,10 +427,9 @@ struct intel_engine_cs {
>   
>   	void		(*set_default_submission)(struct intel_engine_cs *engine);
>   
> -	struct intel_ring *(*context_pin)(struct intel_engine_cs *engine,
> -					  struct i915_gem_context *ctx);
> -	void		(*context_unpin)(struct intel_engine_cs *engine,
> -					 struct i915_gem_context *ctx);
> +	struct intel_context *(*context_pin)(struct intel_engine_cs *engine,
> +					     struct i915_gem_context *ctx);
> +
>   	int		(*request_alloc)(struct i915_request *rq);
>   	int		(*init_context)(struct i915_request *rq);
>   
> @@ -546,7 +545,7 @@ struct intel_engine_cs {
>   	 * to the kernel context and trash it as the save may not happen
>   	 * before the hardware is powered down.
>   	 */
> -	struct i915_gem_context *last_retired_context;
> +	struct intel_context *last_retired_context;
>   
>   	/* We track the current MI_SET_CONTEXT in order to eliminate
>   	 * redudant context switches. This presumes that requests are not
> diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
> index 501becc47c0c..8904f1ce64e3 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_context.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_context.c
> @@ -30,6 +30,7 @@ mock_context(struct drm_i915_private *i915,
>   	     const char *name)
>   {
>   	struct i915_gem_context *ctx;
> +	unsigned int n;
>   	int ret;
>   
>   	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
> @@ -43,6 +44,12 @@ mock_context(struct drm_i915_private *i915,
>   	INIT_RADIX_TREE(&ctx->handles_vma, GFP_KERNEL);
>   	INIT_LIST_HEAD(&ctx->handles_list);
>   
> +	for (n = 0; n < ARRAY_SIZE(ctx->__engine); n++) {
> +		struct intel_context *ce = &ctx->__engine[n];
> +
> +		ce->gem_context = ctx;
> +	}
> +
>   	ret = ida_simple_get(&i915->contexts.hw_ida,
>   			     0, MAX_CONTEXT_HW_ID, GFP_KERNEL);
>   	if (ret < 0)
> diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
> index 78a89efa1119..612c48123808 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_engine.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
> @@ -67,18 +67,36 @@ static void hw_delay_complete(struct timer_list *t)
>   	spin_unlock(&engine->hw_lock);
>   }
>   
> -static struct intel_ring *
> -mock_context_pin(struct intel_engine_cs *engine,
> -		 struct i915_gem_context *ctx)
> +static void mock_context_unpin(struct intel_context *ce)
> +{
> +	if (--ce->pin_count)
> +		return;
> +
> +	i915_gem_context_put(ce->gem_context);
> +}
> +
> +static void mock_context_destroy(struct intel_context *ce)
>   {
> -	i915_gem_context_get(ctx);
> -	return engine->buffer;
> +	GEM_BUG_ON(ce->pin_count);
>   }
>   
> -static void mock_context_unpin(struct intel_engine_cs *engine,
> -			       struct i915_gem_context *ctx)
> +static const struct intel_context_ops mock_context_ops = {
> +	.unpin = mock_context_unpin,
> +	.destroy = mock_context_destroy,
> +};
> +
> +static struct intel_context *
> +mock_context_pin(struct intel_engine_cs *engine,
> +		 struct i915_gem_context *ctx)
>   {
> -	i915_gem_context_put(ctx);
> +	struct intel_context *ce = to_intel_context(ctx, engine);
> +
> +	if (!ce->pin_count++) {
> +		i915_gem_context_get(ctx);
> +		ce->ring = engine->buffer;
> +	}
> +
> +	return ce;
>   }
>   
>   static int mock_request_alloc(struct i915_request *request)
> @@ -168,7 +186,6 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>   	engine->base.status_page.page_addr = (void *)(engine + 1);
>   
>   	engine->base.context_pin = mock_context_pin;
> -	engine->base.context_unpin = mock_context_unpin;
>   	engine->base.request_alloc = mock_request_alloc;
>   	engine->base.emit_flush = mock_emit_flush;
>   	engine->base.emit_breadcrumb = mock_emit_breadcrumb;
> @@ -213,11 +230,13 @@ void mock_engine_free(struct intel_engine_cs *engine)
>   {
>   	struct mock_engine *mock =
>   		container_of(engine, typeof(*mock), base);
> +	struct intel_context *ce;
>   
>   	GEM_BUG_ON(timer_pending(&mock->hw_delay));
>   
> -	if (engine->last_retired_context)
> -		engine->context_unpin(engine, engine->last_retired_context);
> +	ce = fetch_and_zero(&engine->last_retired_context);
> +	if (ce)
> +		intel_context_unpin(ce);
>   
>   	intel_engine_fini_breadcrumbs(engine);
>   
> diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> index e6d4b882599a..47a9de741c5f 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> @@ -243,6 +243,11 @@ struct drm_i915_private *mock_gem_device(void)
>   	if (!i915->kernel_context)
>   		goto err_engine;
>   
> +	/* Fake the initial load for the powersaving context */
> +	i915->engine[RCS]->last_retired_context =
> +		intel_context_pin(i915->kernel_context, i915->engine[RCS]);
> +	GEM_BUG_ON(IS_ERR_OR_NULL(i915->engine[RCS]->last_retired_context));

What is this?

> +
>   	WARN_ON(i915_gemfs_init(i915));
>   
>   	return i915;
> 

Regards,

Tvrtko
Chris Wilson April 19, 2018, 11:10 a.m. UTC | #2
Quoting Tvrtko Ursulin (2018-04-19 11:40:06)
> 
> On 19/04/2018 08:17, Chris Wilson wrote:
> > To ease the frequent and ugly pointer dance of
> > &request->gem_context->engine[request->engine->id] during request
> > submission, store that pointer as request->hw_context. One major
> > advantage that we will exploit later is that this decouples the logical
> > context state from the engine itself.
> 
> I can see merit in making all the places which work on engines or 
> requests more logically and elegantly grab the engine specific context.
> 
> Authors of current unmerged work will probably be not so thrilled though.

> > @@ -353,60 +346,56 @@ int intel_gvt_scan_and_shadow_workload(struct intel_vgpu_workload *workload)
> >       struct intel_vgpu_submission *s = &vgpu->submission;
> >       struct i915_gem_context *shadow_ctx = s->shadow_ctx;
> >       struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
> > -     int ring_id = workload->ring_id;
> > -     struct intel_engine_cs *engine = dev_priv->engine[ring_id];
> > -     struct intel_ring *ring;
> > +     struct intel_engine_cs *engine = dev_priv->engine[workload->ring_id];
> > +     struct intel_context *ce;
> >       int ret;
> >   
> >       lockdep_assert_held(&dev_priv->drm.struct_mutex);
> >   
> > -     if (workload->shadowed)
> > +     if (workload->req)
> >               return 0;
> 
> GVT team will need to look at this hunk/function.
...
> At which point I skip over all GVT and continue further down. :)

That code doesn't merit your attention (or ours really until it is
improved, it really is staging/ quality).

> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> > index 2fe779cab298..ed1c591ee866 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > @@ -125,16 +125,10 @@ static void i915_gem_context_free(struct i915_gem_context *ctx)
> >       i915_ppgtt_put(ctx->ppgtt);
> >   
> >       for (i = 0; i < I915_NUM_ENGINES; i++) {
> > -             struct intel_context *ce = &ctx->engine[i];
> > +             struct intel_context *ce = &ctx->__engine[i];
> >   
> > -             if (!ce->state)
> > -                     continue;
> > -
> > -             WARN_ON(ce->pin_count);
> > -             if (ce->ring)
> > -                     intel_ring_free(ce->ring);
> > -
> > -             __i915_gem_object_release_unless_active(ce->state->obj);
> > +             if (ce->ops)
> 
> Is ce->ops now the gate which was "if (!ce->state) continue" before, to 
> avoid GEM_BUG_ON in execlists_context_destroy? I am wondering if this 
> loop should not just be for_each_engine. Places which walk until 
> I915_NUM_ENGINES should be very special and this one doesn't feel like 
> one.

s/I915_NUM_ENGINES/ARRAY_SIZE(ctx->__engine)/ That's the semantic intent
here. That we aren't just thinking about engines, but the contents of
the context struct. We teardown the struct.

> > @@ -1010,9 +1018,12 @@ bool intel_engine_has_kernel_context(const struct intel_engine_cs *engine)
> >        */
> >       rq = __i915_gem_active_peek(&engine->timeline->last_request);
> >       if (rq)
> > -             return rq->gem_context == kernel_context;
> > -     else
> > -             return engine->last_retired_context == kernel_context;
> > +             return rq->gem_context == kctx;
> > +
> > +     if (!engine->last_retired_context)
> > +             return false;
> > +
> > +     return engine->last_retired_context->gem_context == kctx;
> 
> You thought this will be a bit more readable/obvious?

Hmm, I wrote this before I wrote to_intel_context(). If we use that, it
looks like

        const struct intel_context *kernel_context =
                to_intel_context(engine->i915->kernel_context, engine);
        struct i915_request *rq;

        rq = __i915_gem_active_peek(&engine->timeline->last_request);
        if (rq)
                return rq->hw_context == kernel_context;
        else
                return engine->last_retired_context == kernel_context;

which isn't as bad as the first pass was...

> > @@ -511,9 +511,7 @@ static void guc_add_request(struct intel_guc *guc, struct i915_request *rq)
> >   {
> >       struct intel_guc_client *client = guc->execbuf_client;
> >       struct intel_engine_cs *engine = rq->engine;
> > -     u32 ctx_desc =
> > -             lower_32_bits(intel_lr_context_descriptor(rq->gem_context,
> > -                                                       engine));
> > +     u32 ctx_desc = lower_32_bits(rq->hw_context->lrc_desc);
> 
> Ha...
> 
> >       u32 ring_tail = intel_ring_set_tail(rq->ring, rq->tail) / sizeof(u64);
> >   
> >       spin_lock(&client->wq_lock);
> > @@ -551,8 +549,8 @@ static void inject_preempt_context(struct work_struct *work)
> >                                            preempt_work[engine->id]);
> >       struct intel_guc_client *client = guc->preempt_client;
> >       struct guc_stage_desc *stage_desc = __get_stage_desc(client);
> > -     u32 ctx_desc = lower_32_bits(intel_lr_context_descriptor(client->owner,
> > -                                                              engine));
> > +     u32 ctx_desc = lower_32_bits(to_intel_context(client->owner,
> > +                                                   engine)->lrc_desc);
> 
> ..hum.

Just pointing out that this code is horrible ;)

One of the goals of having rq->hw_context was to avoid the pointer
dance, such as exemplified by intel_lr_context_descriptor(). But here we
should just stick the ctx_desc in the intel_guc_client.

> > @@ -708,7 +706,7 @@ static void guc_dequeue(struct intel_engine_cs *engine)
> >               struct i915_request *rq, *rn;
> >   
> >               list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
> > -                     if (last && rq->gem_context != last->gem_context) {
> > +                     if (last && rq->hw_context != last->hw_context) {
> 
> Same thing really since this is already per engine.

Not always...

> >                               if (port == last_port) {
> >                                       __list_del_many(&p->requests,
> >                                                       &rq->sched.link);
> > @@ -991,7 +989,8 @@ static void guc_fill_preempt_context(struct intel_guc *guc)
> >       enum intel_engine_id id;
> >   
> >       for_each_engine(engine, dev_priv, id) {
> > -             struct intel_context *ce = &client->owner->engine[id];
> > +             struct intel_context *ce =
> > +                     to_intel_context(client->owner, engine);
> >               u32 addr = intel_hws_preempt_done_address(engine);
> >               u32 *cs;
> >   
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> > index 0777226e65a6..e6725690b5b6 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -164,7 +164,8 @@
> >   #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
> >   
> >   static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
> > -                                         struct intel_engine_cs *engine);
> > +                                         struct intel_engine_cs *engine,
> > +                                         struct intel_context *ce);
> 
> Feels a bit redundant to pass in everything but OK.

You do remember which patch this was split out of ;)

> >   {
> >       return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
> > -             i915_gem_context_force_single_submission(ctx));
> > +             i915_gem_context_force_single_submission(ctx->gem_context));
> >   }
> 
> But the whole function change is again not needed I think.
> 
> >   
> > -static bool can_merge_ctx(const struct i915_gem_context *prev,
> > -                       const struct i915_gem_context *next)
> > +static bool can_merge_ctx(const struct intel_context *prev,
> > +                       const struct intel_context *next)
> 
> This one neither.

Consider what it means. The rule is that we are not allowed to submit
the same lrc descriptor to subsequent ports; that corresponds to
the hw context, not gem_context. The payoff, aside from the better
semantics now, is in subsequent patches.

> > @@ -686,14 +687,14 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> >                                * the same context (even though a different
> >                                * request) to the second port.
> >                                */
> > -                             if (ctx_single_port_submission(last->gem_context) ||
> > -                                 ctx_single_port_submission(rq->gem_context)) {
> > +                             if (ctx_single_port_submission(last->hw_context) ||
> > +                                 ctx_single_port_submission(rq->hw_context)) {
> >                                       __list_del_many(&p->requests,
> >                                                       &rq->sched.link);
> >                                       goto done;
> >                               }
> >   
> > -                             GEM_BUG_ON(last->gem_context == rq->gem_context);
> > +                             GEM_BUG_ON(last->hw_context == rq->hw_context);
> 
> And if you agree with the above then these two hunks do not have to be 
> in the patch.

Respectfully disagree as this is ground work for subsequent logical
contexts that float between engines.

> > +static struct intel_context *
> > +execlists_context_pin(struct intel_engine_cs *engine,
> > +                   struct i915_gem_context *ctx)
> >   {
> > -     struct intel_context *ce = &ctx->engine[engine->id];
> > +     struct intel_context *ce = to_intel_context(ctx, engine);
> >   
> >       lockdep_assert_held(&ctx->i915->drm.struct_mutex);
> > -     GEM_BUG_ON(ce->pin_count == 0);
> > -
> > -     if (--ce->pin_count)
> > -             return;
> >   
> > -     intel_ring_unpin(ce->ring);
> > +     if (likely(ce->pin_count++))
> > +             return ce;
> > +     GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
> >   
> > -     ce->state->obj->pin_global--;
> > -     i915_gem_object_unpin_map(ce->state->obj);
> > -     i915_vma_unpin(ce->state);
> > +     ce->ops = &execlists_context_ops;
> 
> If ops are set on pin it would be good to unset them on destroy.

No point on destroy, since it's being destroyed.

> Hm, or even set them not on pin but on creation.

engine->context_pin() is our intel_context factory, pin is the creation
function.

Definitely a bit asymmetrical, the key bit was having ce->unpin() so that
we didn't try to release the intel_context via a stale engine reference.

What I have been considering is passing intel_context to
i915_request_alloc() instead and trying to avoid that asymmetry by
managing the creation in the caller and separating it from pin().

> Not a huge deal but it is a 
> bit unbalanced, and also if ce->ops is also a guard to distinguish 
> possible from impossible engines then it is a bit more strange.

Not impossible engines in this regard, unused contexts.

> >   
> > -     i915_gem_context_put(ctx);
> > +     return __execlists_context_pin(engine, ctx, ce);
> >   }
> >   
> >   static int execlists_request_alloc(struct i915_request *request)
> >   {
> > -     struct intel_engine_cs *engine = request->engine;
> > -     struct intel_context *ce = &request->gem_context->engine[engine->id];
> > -     int ret;
> > +     int err;
> >   
> > -     GEM_BUG_ON(!ce->pin_count);
> > +     GEM_BUG_ON(!request->hw_context->pin_count);
> >   
> >       /* Flush enough space to reduce the likelihood of waiting after
> >        * we start building the request - in which case we will just
> > @@ -1388,9 +1413,9 @@ static int execlists_request_alloc(struct i915_request *request)
> >        */
> >       request->reserved_space += EXECLISTS_REQUEST_SIZE;
> >   
> > -     ret = intel_ring_wait_for_space(request->ring, request->reserved_space);
> > -     if (ret)
> > -             return ret;
> > +     err = intel_ring_wait_for_space(request->ring, request->reserved_space);
> > +     if (err)
> > +             return err;
> 
> Rename of ret to err for a smaller diff is avoidable.

A couple of lines for trying to harmonise between ret and err.

> > @@ -1288,32 +1305,37 @@ intel_ring_context_pin(struct intel_engine_cs *engine,
> >   
> >       i915_gem_context_get(ctx);
> >   
> > -out:
> >       /* One ringbuffer to rule them all */
> > -     return engine->buffer;
> > +     GEM_BUG_ON(!engine->buffer);
> > +     ce->ring = engine->buffer;
> > +
> > +     return ce;
> >   
> >   err:
> >       ce->pin_count = 0;
> 
> Strictly speaking this should be done in the caller.

Sssh. Tail call, think of it as a single contiguous function ;)

> > @@ -1323,10 +1345,6 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
> >   
> >       intel_engine_setup_common(engine);
> >   
> > -     err = intel_engine_init_common(engine);
> > -     if (err)
> > -             goto err;
> > -
> >       ring = intel_engine_create_ring(engine, 32 * PAGE_SIZE);
> >       if (IS_ERR(ring)) {
> >               err = PTR_ERR(ring);
> > @@ -1341,8 +1359,14 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
> >       GEM_BUG_ON(engine->buffer);
> >       engine->buffer = ring;
> >   
> > +     err = intel_engine_init_common(engine);
> > +     if (err)
> > +             goto err_unpin;
> > +
> 
> This ordering change doesn't look like it has to be done in this patch.

It does. Do you want to take another look ;)

The problem here is the legacy ringbuffer submission uses a common
engine->buffer, and the kernel contexts are pinned in init_common; so we
have to create that buffer first.

> > diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> > index e6d4b882599a..47a9de741c5f 100644
> > --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> > +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> > @@ -243,6 +243,11 @@ struct drm_i915_private *mock_gem_device(void)
> >       if (!i915->kernel_context)
> >               goto err_engine;
> >   
> > +     /* Fake the initial load for the powersaving context */
> > +     i915->engine[RCS]->last_retired_context =
> > +             intel_context_pin(i915->kernel_context, i915->engine[RCS]);
> > +     GEM_BUG_ON(IS_ERR_OR_NULL(i915->engine[RCS]->last_retired_context));
> 
> What is this?

Keeping selftests happy. I was asserting engine->last_retired_context was
set chasing ce->ops and found an instance where it wasn't which caused
this patch to blow up. And I had I written
intel_engine_has_kernel_context() differently, this chunk wouldn't have
been required.
-Chris
Tvrtko Ursulin April 19, 2018, 12:11 p.m. UTC | #3
On 19/04/2018 12:10, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-04-19 11:40:06)
>>
>> On 19/04/2018 08:17, Chris Wilson wrote:
>>> To ease the frequent and ugly pointer dance of
>>> &request->gem_context->engine[request->engine->id] during request
>>> submission, store that pointer as request->hw_context. One major
>>> advantage that we will exploit later is that this decouples the logical
>>> context state from the engine itself.
>>
>> I can see merit in making all the places which work on engines or
>> requests more logically and elegantly grab the engine specific context.
>>
>> Authors of current unmerged work will probably be not so thrilled though.
> 
>>> @@ -353,60 +346,56 @@ int intel_gvt_scan_and_shadow_workload(struct intel_vgpu_workload *workload)
>>>        struct intel_vgpu_submission *s = &vgpu->submission;
>>>        struct i915_gem_context *shadow_ctx = s->shadow_ctx;
>>>        struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
>>> -     int ring_id = workload->ring_id;
>>> -     struct intel_engine_cs *engine = dev_priv->engine[ring_id];
>>> -     struct intel_ring *ring;
>>> +     struct intel_engine_cs *engine = dev_priv->engine[workload->ring_id];
>>> +     struct intel_context *ce;
>>>        int ret;
>>>    
>>>        lockdep_assert_held(&dev_priv->drm.struct_mutex);
>>>    
>>> -     if (workload->shadowed)
>>> +     if (workload->req)
>>>                return 0;
>>
>> GVT team will need to look at this hunk/function.
> ...
>> At which point I skip over all GVT and continue further down. :)
> 
> That code doesn't merit your attention (or ours really until it is
> improved, it really is staging/ quality).

Still, best to ask GVT team to r-b their parts.

[snip]

>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>>> index 0777226e65a6..e6725690b5b6 100644
>>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>>> @@ -164,7 +164,8 @@
>>>    #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
>>>    
>>>    static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
>>> -                                         struct intel_engine_cs *engine);
>>> +                                         struct intel_engine_cs *engine,
>>> +                                         struct intel_context *ce);
>>
>> Feels a bit redundant to pass in everything but OK.
> 
> You do remember which patch this was split out of ;)
> 
>>>    {
>>>        return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
>>> -             i915_gem_context_force_single_submission(ctx));
>>> +             i915_gem_context_force_single_submission(ctx->gem_context));
>>>    }
>>
>> But the whole function change is again not needed I think.
>>
>>>    
>>> -static bool can_merge_ctx(const struct i915_gem_context *prev,
>>> -                       const struct i915_gem_context *next)
>>> +static bool can_merge_ctx(const struct intel_context *prev,
>>> +                       const struct intel_context *next)
>>
>> This one neither.
> 
> Consider what it means. The rule is that we are not allowed to submit
> the same lrc descriptor to subsequent ports; that corresponds to
> the hw context, not gem_context. The payoff, aside from the better
> semantics now, is in subsequent patches.

Yes all fine, just saying it did not have to be in _this_ series but 
could have came later.

>>> +static struct intel_context *
>>> +execlists_context_pin(struct intel_engine_cs *engine,
>>> +                   struct i915_gem_context *ctx)
>>>    {
>>> -     struct intel_context *ce = &ctx->engine[engine->id];
>>> +     struct intel_context *ce = to_intel_context(ctx, engine);
>>>    
>>>        lockdep_assert_held(&ctx->i915->drm.struct_mutex);
>>> -     GEM_BUG_ON(ce->pin_count == 0);
>>> -
>>> -     if (--ce->pin_count)
>>> -             return;
>>>    
>>> -     intel_ring_unpin(ce->ring);
>>> +     if (likely(ce->pin_count++))
>>> +             return ce;
>>> +     GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
>>>    
>>> -     ce->state->obj->pin_global--;
>>> -     i915_gem_object_unpin_map(ce->state->obj);
>>> -     i915_vma_unpin(ce->state);
>>> +     ce->ops = &execlists_context_ops;
>>
>> If ops are set on pin it would be good to unset them on destroy.
> 
> No point on destroy, since it's being destroyed.
> 
>> Hm, or even set them not on pin but on creation.
> 
> engine->context_pin() is our intel_context factory, pin is the creation
> function.
> 
> Definitely a bit asymmetrical, the key bit was having ce->unpin() so that
> we didn't try to release the intel_context via a stale engine reference.
> 
> What I have been considering is passing intel_context to
> i915_request_alloc() instead and trying to avoid that asymmetry by
> managing the creation in the caller and separating it from pin().
> 
>> Not a huge deal but it is a
>> bit unbalanced, and also if ce->ops is also a guard to distinguish
>> possible from impossible engines then it is a bit more strange.
> 
> Not impossible engines in this regard, unused contexts.

Both then, since impossible engines will have unused contexts. :)

>>> @@ -1323,10 +1345,6 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
>>>    
>>>        intel_engine_setup_common(engine);
>>>    
>>> -     err = intel_engine_init_common(engine);
>>> -     if (err)
>>> -             goto err;
>>> -
>>>        ring = intel_engine_create_ring(engine, 32 * PAGE_SIZE);
>>>        if (IS_ERR(ring)) {
>>>                err = PTR_ERR(ring);
>>> @@ -1341,8 +1359,14 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
>>>        GEM_BUG_ON(engine->buffer);
>>>        engine->buffer = ring;
>>>    
>>> +     err = intel_engine_init_common(engine);
>>> +     if (err)
>>> +             goto err_unpin;
>>> +
>>
>> This ordering change doesn't look like it has to be done in this patch.
> 
> It does. Do you want to take another look ;)
> 
> The problem here is the legacy ringbuffer submission uses a common
> engine->buffer, and the kernel contexts are pinned in init_common; so we
> have to create that buffer first.

So it looks like an pre-existing bug. intel_ring_context_pin returns 
engine->buffer which hasn't been created yet. Oh well.. can you inject a 
patch which solves only that before this one?

Regards,

Tvrtko
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/gvt/mmio_context.c b/drivers/gpu/drm/i915/gvt/mmio_context.c
index a5bac83d53a9..708170e61625 100644
--- a/drivers/gpu/drm/i915/gvt/mmio_context.c
+++ b/drivers/gpu/drm/i915/gvt/mmio_context.c
@@ -446,9 +446,9 @@  static void switch_mocs(struct intel_vgpu *pre, struct intel_vgpu *next,
 
 #define CTX_CONTEXT_CONTROL_VAL	0x03
 
-bool is_inhibit_context(struct i915_gem_context *ctx, int ring_id)
+bool is_inhibit_context(struct intel_context *ce)
 {
-	u32 *reg_state = ctx->engine[ring_id].lrc_reg_state;
+	const u32 *reg_state = ce->lrc_reg_state;
 	u32 inhibit_mask =
 		_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
 
@@ -501,7 +501,7 @@  static void switch_mmio(struct intel_vgpu *pre,
 			 * itself.
 			 */
 			if (mmio->in_context &&
-			    !is_inhibit_context(s->shadow_ctx, ring_id))
+			    !is_inhibit_context(&s->shadow_ctx->__engine[ring_id]))
 				continue;
 
 			if (mmio->mask)
diff --git a/drivers/gpu/drm/i915/gvt/mmio_context.h b/drivers/gpu/drm/i915/gvt/mmio_context.h
index 0439eb8057a8..5c3b9ff9f96a 100644
--- a/drivers/gpu/drm/i915/gvt/mmio_context.h
+++ b/drivers/gpu/drm/i915/gvt/mmio_context.h
@@ -49,7 +49,7 @@  void intel_gvt_switch_mmio(struct intel_vgpu *pre,
 
 void intel_gvt_init_engine_mmio_context(struct intel_gvt *gvt);
 
-bool is_inhibit_context(struct i915_gem_context *ctx, int ring_id);
+bool is_inhibit_context(struct intel_context *ce);
 
 int intel_vgpu_restore_inhibit_context(struct intel_vgpu *vgpu,
 				       struct i915_request *req);
diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c
index f64cccb2e793..c2e440200b0c 100644
--- a/drivers/gpu/drm/i915/gvt/scheduler.c
+++ b/drivers/gpu/drm/i915/gvt/scheduler.c
@@ -54,11 +54,8 @@  static void set_context_pdp_root_pointer(
 
 static void update_shadow_pdps(struct intel_vgpu_workload *workload)
 {
-	struct intel_vgpu *vgpu = workload->vgpu;
-	int ring_id = workload->ring_id;
-	struct i915_gem_context *shadow_ctx = vgpu->submission.shadow_ctx;
 	struct drm_i915_gem_object *ctx_obj =
-		shadow_ctx->engine[ring_id].state->obj;
+		workload->req->hw_context->state->obj;
 	struct execlist_ring_context *shadow_ring_context;
 	struct page *page;
 
@@ -128,9 +125,8 @@  static int populate_shadow_context(struct intel_vgpu_workload *workload)
 	struct intel_vgpu *vgpu = workload->vgpu;
 	struct intel_gvt *gvt = vgpu->gvt;
 	int ring_id = workload->ring_id;
-	struct i915_gem_context *shadow_ctx = vgpu->submission.shadow_ctx;
 	struct drm_i915_gem_object *ctx_obj =
-		shadow_ctx->engine[ring_id].state->obj;
+		workload->req->hw_context->state->obj;
 	struct execlist_ring_context *shadow_ring_context;
 	struct page *page;
 	void *dst;
@@ -280,10 +276,8 @@  static int shadow_context_status_change(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
-static void shadow_context_descriptor_update(struct i915_gem_context *ctx,
-		struct intel_engine_cs *engine)
+static void shadow_context_descriptor_update(struct intel_context *ce)
 {
-	struct intel_context *ce = &ctx->engine[engine->id];
 	u64 desc = 0;
 
 	desc = ce->lrc_desc;
@@ -292,7 +286,7 @@  static void shadow_context_descriptor_update(struct i915_gem_context *ctx,
 	 * like GEN8_CTX_* cached in desc_template
 	 */
 	desc &= U64_MAX << 12;
-	desc |= ctx->desc_template & ((1ULL << 12) - 1);
+	desc |= ce->gem_context->desc_template & ((1ULL << 12) - 1);
 
 	ce->lrc_desc = desc;
 }
@@ -300,12 +294,11 @@  static void shadow_context_descriptor_update(struct i915_gem_context *ctx,
 static int copy_workload_to_ring_buffer(struct intel_vgpu_workload *workload)
 {
 	struct intel_vgpu *vgpu = workload->vgpu;
+	struct i915_request *req = workload->req;
 	void *shadow_ring_buffer_va;
 	u32 *cs;
-	struct i915_request *req = workload->req;
 
-	if (IS_KABYLAKE(req->i915) &&
-	    is_inhibit_context(req->gem_context, req->engine->id))
+	if (IS_KABYLAKE(req->i915) && is_inhibit_context(req->hw_context))
 		intel_vgpu_restore_inhibit_context(vgpu, req);
 
 	/* allocate shadow ring buffer */
@@ -353,60 +346,56 @@  int intel_gvt_scan_and_shadow_workload(struct intel_vgpu_workload *workload)
 	struct intel_vgpu_submission *s = &vgpu->submission;
 	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
-	int ring_id = workload->ring_id;
-	struct intel_engine_cs *engine = dev_priv->engine[ring_id];
-	struct intel_ring *ring;
+	struct intel_engine_cs *engine = dev_priv->engine[workload->ring_id];
+	struct intel_context *ce;
 	int ret;
 
 	lockdep_assert_held(&dev_priv->drm.struct_mutex);
 
-	if (workload->shadowed)
+	if (workload->req)
 		return 0;
 
+	/* pin shadow context by gvt even the shadow context will be pinned
+	 * when i915 alloc request. That is because gvt will update the guest
+	 * context from shadow context when workload is completed, and at that
+	 * moment, i915 may already unpined the shadow context to make the
+	 * shadow_ctx pages invalid. So gvt need to pin itself. After update
+	 * the guest context, gvt can unpin the shadow_ctx safely.
+	 */
+	ce = intel_context_pin(shadow_ctx, engine);
+	if (IS_ERR(ce)) {
+		gvt_vgpu_err("fail to pin shadow context\n");
+		return PTR_ERR(ce);
+	}
+
 	shadow_ctx->desc_template &= ~(0x3 << GEN8_CTX_ADDRESSING_MODE_SHIFT);
 	shadow_ctx->desc_template |= workload->ctx_desc.addressing_mode <<
 				    GEN8_CTX_ADDRESSING_MODE_SHIFT;
 
-	if (!test_and_set_bit(ring_id, s->shadow_ctx_desc_updated))
-		shadow_context_descriptor_update(shadow_ctx,
-					dev_priv->engine[ring_id]);
+	if (!test_and_set_bit(workload->ring_id, s->shadow_ctx_desc_updated))
+		shadow_context_descriptor_update(ce);
 
 	ret = intel_gvt_scan_and_shadow_ringbuffer(workload);
 	if (ret)
-		goto err_scan;
+		goto err_unpin;
 
 	if ((workload->ring_id == RCS) &&
 	    (workload->wa_ctx.indirect_ctx.size != 0)) {
 		ret = intel_gvt_scan_and_shadow_wa_ctx(&workload->wa_ctx);
 		if (ret)
-			goto err_scan;
-	}
-
-	/* pin shadow context by gvt even the shadow context will be pinned
-	 * when i915 alloc request. That is because gvt will update the guest
-	 * context from shadow context when workload is completed, and at that
-	 * moment, i915 may already unpined the shadow context to make the
-	 * shadow_ctx pages invalid. So gvt need to pin itself. After update
-	 * the guest context, gvt can unpin the shadow_ctx safely.
-	 */
-	ring = engine->context_pin(engine, shadow_ctx);
-	if (IS_ERR(ring)) {
-		ret = PTR_ERR(ring);
-		gvt_vgpu_err("fail to pin shadow context\n");
-		goto err_shadow;
+			goto err_shadow;
 	}
 
 	ret = populate_shadow_context(workload);
 	if (ret)
-		goto err_unpin;
-	workload->shadowed = true;
+		goto err_shadow;
+
 	return 0;
 
-err_unpin:
-	engine->context_unpin(engine, shadow_ctx);
 err_shadow:
 	release_shadow_wa_ctx(&workload->wa_ctx);
-err_scan:
+err_unpin:
+	intel_context_unpin(ce);
 	return ret;
 }
 
@@ -414,7 +403,6 @@  static int intel_gvt_generate_request(struct intel_vgpu_workload *workload)
 {
 	int ring_id = workload->ring_id;
 	struct drm_i915_private *dev_priv = workload->vgpu->gvt->dev_priv;
-	struct intel_engine_cs *engine = dev_priv->engine[ring_id];
 	struct i915_request *rq;
 	struct intel_vgpu *vgpu = workload->vgpu;
 	struct intel_vgpu_submission *s = &vgpu->submission;
@@ -437,7 +425,6 @@  static int intel_gvt_generate_request(struct intel_vgpu_workload *workload)
 	return 0;
 
 err_unpin:
-	engine->context_unpin(engine, shadow_ctx);
 	release_shadow_wa_ctx(&workload->wa_ctx);
 	return ret;
 }
@@ -495,21 +482,13 @@  static int prepare_shadow_batch_buffer(struct intel_vgpu_workload *workload)
 	return ret;
 }
 
-static int update_wa_ctx_2_shadow_ctx(struct intel_shadow_wa_ctx *wa_ctx)
+static void update_wa_ctx_2_shadow_ctx(struct intel_shadow_wa_ctx *wa_ctx)
 {
-	struct intel_vgpu_workload *workload = container_of(wa_ctx,
-					struct intel_vgpu_workload,
-					wa_ctx);
-	int ring_id = workload->ring_id;
-	struct intel_vgpu_submission *s = &workload->vgpu->submission;
-	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
-	struct drm_i915_gem_object *ctx_obj =
-		shadow_ctx->engine[ring_id].state->obj;
-	struct execlist_ring_context *shadow_ring_context;
-	struct page *page;
-
-	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
-	shadow_ring_context = kmap_atomic(page);
+	struct intel_vgpu_workload *workload =
+		container_of(wa_ctx, struct intel_vgpu_workload, wa_ctx);
+	struct i915_request *rq = workload->req;
+	struct execlist_ring_context *shadow_ring_context =
+		(struct execlist_ring_context *)rq->hw_context->lrc_reg_state;
 
 	shadow_ring_context->bb_per_ctx_ptr.val =
 		(shadow_ring_context->bb_per_ctx_ptr.val &
@@ -517,9 +496,6 @@  static int update_wa_ctx_2_shadow_ctx(struct intel_shadow_wa_ctx *wa_ctx)
 	shadow_ring_context->rcs_indirect_ctx.val =
 		(shadow_ring_context->rcs_indirect_ctx.val &
 		(~INDIRECT_CTX_ADDR_MASK)) | wa_ctx->indirect_ctx.shadow_gma;
-
-	kunmap_atomic(shadow_ring_context);
-	return 0;
 }
 
 static int prepare_shadow_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx)
@@ -648,12 +624,9 @@  static int prepare_workload(struct intel_vgpu_workload *workload)
 static int dispatch_workload(struct intel_vgpu_workload *workload)
 {
 	struct intel_vgpu *vgpu = workload->vgpu;
-	struct intel_vgpu_submission *s = &vgpu->submission;
-	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
 	int ring_id = workload->ring_id;
-	struct intel_engine_cs *engine = dev_priv->engine[ring_id];
-	int ret = 0;
+	int ret;
 
 	gvt_dbg_sched("ring id %d prepare to dispatch workload %p\n",
 		ring_id, workload);
@@ -665,10 +638,6 @@  static int dispatch_workload(struct intel_vgpu_workload *workload)
 		goto out;
 
 	ret = prepare_workload(workload);
-	if (ret) {
-		engine->context_unpin(engine, shadow_ctx);
-		goto out;
-	}
 
 out:
 	if (ret)
@@ -743,27 +712,23 @@  static struct intel_vgpu_workload *pick_next_workload(
 
 static void update_guest_context(struct intel_vgpu_workload *workload)
 {
+	struct i915_request *rq = workload->req;
 	struct intel_vgpu *vgpu = workload->vgpu;
 	struct intel_gvt *gvt = vgpu->gvt;
-	struct intel_vgpu_submission *s = &vgpu->submission;
-	struct i915_gem_context *shadow_ctx = s->shadow_ctx;
-	int ring_id = workload->ring_id;
-	struct drm_i915_gem_object *ctx_obj =
-		shadow_ctx->engine[ring_id].state->obj;
+	struct drm_i915_gem_object *ctx_obj = rq->hw_context->state->obj;
 	struct execlist_ring_context *shadow_ring_context;
 	struct page *page;
 	void *src;
 	unsigned long context_gpa, context_page_num;
 	int i;
 
-	gvt_dbg_sched("ring id %d workload lrca %x\n", ring_id,
-			workload->ctx_desc.lrca);
-
-	context_page_num = gvt->dev_priv->engine[ring_id]->context_size;
+	gvt_dbg_sched("ring id %d workload lrca %x\n", rq->engine->id,
+		      workload->ctx_desc.lrca);
 
+	context_page_num = rq->engine->context_size;
 	context_page_num = context_page_num >> PAGE_SHIFT;
 
-	if (IS_BROADWELL(gvt->dev_priv) && ring_id == RCS)
+	if (IS_BROADWELL(gvt->dev_priv) && rq->engine->id == RCS)
 		context_page_num = 19;
 
 	i = 2;
@@ -836,6 +801,7 @@  static void complete_current_workload(struct intel_gvt *gvt, int ring_id)
 		scheduler->current_workload[ring_id];
 	struct intel_vgpu *vgpu = workload->vgpu;
 	struct intel_vgpu_submission *s = &vgpu->submission;
+	struct i915_request *rq;
 	int event;
 
 	mutex_lock(&gvt->lock);
@@ -844,11 +810,8 @@  static void complete_current_workload(struct intel_gvt *gvt, int ring_id)
 	 * switch to make sure request is completed.
 	 * For the workload w/o request, directly complete the workload.
 	 */
-	if (workload->req) {
-		struct drm_i915_private *dev_priv =
-			workload->vgpu->gvt->dev_priv;
-		struct intel_engine_cs *engine =
-			dev_priv->engine[workload->ring_id];
+	rq = fetch_and_zero(&workload->req);
+	if (rq) {
 		wait_event(workload->shadow_ctx_status_wq,
 			   !atomic_read(&workload->shadow_ctx_active));
 
@@ -864,8 +827,6 @@  static void complete_current_workload(struct intel_gvt *gvt, int ring_id)
 				workload->status = 0;
 		}
 
-		i915_request_put(fetch_and_zero(&workload->req));
-
 		if (!workload->status && !(vgpu->resetting_eng &
 					   ENGINE_MASK(ring_id))) {
 			update_guest_context(workload);
@@ -874,10 +835,13 @@  static void complete_current_workload(struct intel_gvt *gvt, int ring_id)
 					 INTEL_GVT_EVENT_MAX)
 				intel_vgpu_trigger_virtual_event(vgpu, event);
 		}
-		mutex_lock(&dev_priv->drm.struct_mutex);
+
 		/* unpin shadow ctx as the shadow_ctx update is done */
-		engine->context_unpin(engine, s->shadow_ctx);
-		mutex_unlock(&dev_priv->drm.struct_mutex);
+		mutex_lock(&rq->i915->drm.struct_mutex);
+		intel_context_unpin(rq->hw_context);
+		mutex_unlock(&rq->i915->drm.struct_mutex);
+
+		i915_request_put(rq);
 	}
 
 	gvt_dbg_sched("ring id %d complete workload %p status %d\n",
@@ -1251,7 +1215,6 @@  alloc_workload(struct intel_vgpu *vgpu)
 	atomic_set(&workload->shadow_ctx_active, 0);
 
 	workload->status = -EINPROGRESS;
-	workload->shadowed = false;
 	workload->vgpu = vgpu;
 
 	return workload;
diff --git a/drivers/gpu/drm/i915/gvt/scheduler.h b/drivers/gpu/drm/i915/gvt/scheduler.h
index 486ed57a4ad1..b0e04017d7a2 100644
--- a/drivers/gpu/drm/i915/gvt/scheduler.h
+++ b/drivers/gpu/drm/i915/gvt/scheduler.h
@@ -83,7 +83,6 @@  struct intel_vgpu_workload {
 	struct i915_request *req;
 	/* if this workload has been dispatched to i915? */
 	bool dispatched;
-	bool shadowed;
 	int status;
 
 	struct intel_vgpu_mm *shadow_mm;
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 792f69e44ba5..fd202185d6e5 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -377,16 +377,19 @@  static void print_batch_pool_stats(struct seq_file *m,
 	print_file_stats(m, "[k]batch pool", stats);
 }
 
-static int per_file_ctx_stats(int id, void *ptr, void *data)
+static int per_file_ctx_stats(int idx, void *ptr, void *data)
 {
 	struct i915_gem_context *ctx = ptr;
-	int n;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	for_each_engine(engine, ctx->i915, id) {
+		struct intel_context *ce = to_intel_context(ctx, engine);
 
-	for (n = 0; n < ARRAY_SIZE(ctx->engine); n++) {
-		if (ctx->engine[n].state)
-			per_file_stats(0, ctx->engine[n].state->obj, data);
-		if (ctx->engine[n].ring)
-			per_file_stats(0, ctx->engine[n].ring->vma->obj, data);
+		if (ce->state)
+			per_file_stats(0, ce->state->obj, data);
+		if (ce->ring)
+			per_file_stats(0, ce->ring->vma->obj, data);
 	}
 
 	return 0;
@@ -1960,7 +1963,8 @@  static int i915_context_status(struct seq_file *m, void *unused)
 		seq_putc(m, '\n');
 
 		for_each_engine(engine, dev_priv, id) {
-			struct intel_context *ce = &ctx->engine[engine->id];
+			struct intel_context *ce =
+				to_intel_context(ctx, engine);
 
 			seq_printf(m, "%s: ", engine->name);
 			if (ce->state)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 028691108125..8907f71b900a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1945,6 +1945,7 @@  struct drm_i915_private {
 			 */
 			struct i915_perf_stream *exclusive_stream;
 
+			struct intel_context *pinned_ctx;
 			u32 specific_ctx_id;
 
 			struct hrtimer poll_check_timer;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 4dba735505d4..e186e9194fad 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3227,14 +3227,14 @@  void i915_gem_reset(struct drm_i915_private *dev_priv,
 	i915_retire_requests(dev_priv);
 
 	for_each_engine(engine, dev_priv, id) {
-		struct i915_gem_context *ctx;
+		struct intel_context *ce;
 
 		i915_gem_reset_engine(engine,
 				      engine->hangcheck.active_request,
 				      stalled_mask & ENGINE_MASK(id));
-		ctx = fetch_and_zero(&engine->last_retired_context);
-		if (ctx)
-			engine->context_unpin(engine, ctx);
+		ce = fetch_and_zero(&engine->last_retired_context);
+		if (ce)
+			intel_context_unpin(ce);
 
 		/*
 		 * Ostensibily, we always want a context loaded for powersaving,
@@ -4948,13 +4948,13 @@  void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
 
 static void assert_kernel_context_is_current(struct drm_i915_private *i915)
 {
-	struct i915_gem_context *kernel_context = i915->kernel_context;
+	struct i915_gem_context *kctx = i915->kernel_context;
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
 	for_each_engine(engine, i915, id) {
 		GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline->last_request));
-		GEM_BUG_ON(engine->last_retired_context != kernel_context);
+		GEM_BUG_ON(engine->last_retired_context->gem_context != kctx);
 	}
 }
 
@@ -5291,7 +5291,7 @@  static int __intel_engines_record_defaults(struct drm_i915_private *i915)
 	for_each_engine(engine, i915, id) {
 		struct i915_vma *state;
 
-		state = ctx->engine[id].state;
+		state = to_intel_context(ctx, engine)->state;
 		if (!state)
 			continue;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 2fe779cab298..ed1c591ee866 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -125,16 +125,10 @@  static void i915_gem_context_free(struct i915_gem_context *ctx)
 	i915_ppgtt_put(ctx->ppgtt);
 
 	for (i = 0; i < I915_NUM_ENGINES; i++) {
-		struct intel_context *ce = &ctx->engine[i];
+		struct intel_context *ce = &ctx->__engine[i];
 
-		if (!ce->state)
-			continue;
-
-		WARN_ON(ce->pin_count);
-		if (ce->ring)
-			intel_ring_free(ce->ring);
-
-		__i915_gem_object_release_unless_active(ce->state->obj);
+		if (ce->ops)
+			ce->ops->destroy(ce);
 	}
 
 	kfree(ctx->name);
@@ -266,6 +260,7 @@  __create_hw_context(struct drm_i915_private *dev_priv,
 		    struct drm_i915_file_private *file_priv)
 {
 	struct i915_gem_context *ctx;
+	unsigned int n;
 	int ret;
 
 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -283,6 +278,12 @@  __create_hw_context(struct drm_i915_private *dev_priv,
 	ctx->i915 = dev_priv;
 	ctx->sched.priority = I915_PRIORITY_NORMAL;
 
+	for (n = 0; n < ARRAY_SIZE(ctx->__engine); n++) {
+		struct intel_context *ce = &ctx->__engine[n];
+
+		ce->gem_context = ctx;
+	}
+
 	INIT_RADIX_TREE(&ctx->handles_vma, GFP_KERNEL);
 	INIT_LIST_HEAD(&ctx->handles_list);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index b12a8a8c5af9..61bff6d37c6a 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -45,6 +45,11 @@  struct intel_ring;
 
 #define DEFAULT_CONTEXT_HANDLE 0
 
+struct intel_context_ops {
+	void (*unpin)(struct intel_context *ce);
+	void (*destroy)(struct intel_context *ce);
+};
+
 /**
  * struct i915_gem_context - client state
  *
@@ -144,12 +149,15 @@  struct i915_gem_context {
 
 	/** engine: per-engine logical HW state */
 	struct intel_context {
+		struct i915_gem_context *gem_context;
 		struct i915_vma *state;
 		struct intel_ring *ring;
 		u32 *lrc_reg_state;
 		u64 lrc_desc;
 		int pin_count;
-	} engine[I915_NUM_ENGINES];
+
+		const struct intel_context_ops *ops;
+	} __engine[I915_NUM_ENGINES];
 
 	/** ring_size: size for allocating the per-engine ring buffer */
 	u32 ring_size;
@@ -256,6 +264,23 @@  static inline bool i915_gem_context_is_kernel(struct i915_gem_context *ctx)
 	return !ctx->file_priv;
 }
 
+static inline struct intel_context *
+to_intel_context(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
+{
+	return &ctx->__engine[engine->id];
+}
+
+static inline struct intel_context *
+intel_context_pin(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
+{
+	return engine->context_pin(engine, ctx);
+}
+
+static inline void intel_context_unpin(struct intel_context *ce)
+{
+	ce->ops->unpin(ce);
+}
+
 /* i915_gem_context.c */
 int __must_check i915_gem_contexts_init(struct drm_i915_private *dev_priv);
 void i915_gem_contexts_lost(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 269574b7254c..4801047d4d6a 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1474,7 +1474,7 @@  static void gem_record_rings(struct i915_gpu_state *error)
 
 			ee->ctx =
 				i915_error_object_create(i915,
-							 ctx->engine[i].state);
+							 request->hw_context->state);
 
 			error->simulated |=
 				i915_gem_context_no_error_capture(ctx);
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index bfc906cd4e5e..11afab651050 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1221,7 +1221,7 @@  static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
 		dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
 	} else {
 		struct intel_engine_cs *engine = dev_priv->engine[RCS];
-		struct intel_ring *ring;
+		struct intel_context *ce;
 		int ret;
 
 		ret = i915_mutex_lock_interruptible(&dev_priv->drm);
@@ -1234,19 +1234,19 @@  static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
 		 *
 		 * NB: implied RCS engine...
 		 */
-		ring = engine->context_pin(engine, stream->ctx);
+		ce = intel_context_pin(stream->ctx, engine);
 		mutex_unlock(&dev_priv->drm.struct_mutex);
-		if (IS_ERR(ring))
-			return PTR_ERR(ring);
+		if (IS_ERR(ce))
+			return PTR_ERR(ce);
 
+		dev_priv->perf.oa.pinned_ctx = ce;
 
 		/*
 		 * Explicitly track the ID (instead of calling
 		 * i915_ggtt_offset() on the fly) considering the difference
 		 * with gen8+ and execlists
 		 */
-		dev_priv->perf.oa.specific_ctx_id =
-			i915_ggtt_offset(stream->ctx->engine[engine->id].state);
+		dev_priv->perf.oa.specific_ctx_id = i915_ggtt_offset(ce->state);
 	}
 
 	return 0;
@@ -1262,17 +1262,14 @@  static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
 static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct intel_context *ce;
 
-	if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
-		dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
-	} else {
-		struct intel_engine_cs *engine = dev_priv->engine[RCS];
+	dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
 
+	ce = fetch_and_zero(&dev_priv->perf.oa.pinned_ctx);
+	if (ce) {
 		mutex_lock(&dev_priv->drm.struct_mutex);
-
-		dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
-		engine->context_unpin(engine, stream->ctx);
-
+		intel_context_unpin(ce);
 		mutex_unlock(&dev_priv->drm.struct_mutex);
 	}
 }
@@ -1759,6 +1756,7 @@  static int gen8_switch_to_updated_kernel_context(struct drm_i915_private *dev_pr
 static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 				       const struct i915_oa_config *oa_config)
 {
+	struct intel_engine_cs *engine = dev_priv->engine[RCS];
 	struct i915_gem_context *ctx;
 	int ret;
 	unsigned int wait_flags = I915_WAIT_LOCKED;
@@ -1789,7 +1787,7 @@  static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 
 	/* Update all contexts now that we've stalled the submission. */
 	list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
-		struct intel_context *ce = &ctx->engine[RCS];
+		struct intel_context *ce = to_intel_context(ctx, engine);
 		u32 *regs;
 
 		/* OA settings will be set upon first use */
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index f913e56604ea..da97ce4e18a9 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -410,8 +410,8 @@  static void i915_request_retire(struct i915_request *request)
 	 * the subsequent request.
 	 */
 	if (engine->last_retired_context)
-		engine->context_unpin(engine, engine->last_retired_context);
-	engine->last_retired_context = request->gem_context;
+		intel_context_unpin(engine->last_retired_context);
+	engine->last_retired_context = request->hw_context;
 
 	spin_lock_irq(&request->lock);
 	if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags))
@@ -613,7 +613,7 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 {
 	struct drm_i915_private *i915 = engine->i915;
 	struct i915_request *rq;
-	struct intel_ring *ring;
+	struct intel_context *ce;
 	int ret;
 
 	lockdep_assert_held(&i915->drm.struct_mutex);
@@ -637,16 +637,15 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	 * GGTT space, so do this first before we reserve a seqno for
 	 * ourselves.
 	 */
-	ring = engine->context_pin(engine, ctx);
-	if (IS_ERR(ring))
-		return ERR_CAST(ring);
-	GEM_BUG_ON(!ring);
+	ce = intel_context_pin(ctx, engine);
+	if (IS_ERR(ce))
+		return ERR_CAST(ce);
 
 	ret = reserve_engine(engine);
 	if (ret)
 		goto err_unpin;
 
-	ret = intel_ring_wait_for_space(ring, MIN_SPACE_FOR_ADD_REQUEST);
+	ret = intel_ring_wait_for_space(ce->ring, MIN_SPACE_FOR_ADD_REQUEST);
 	if (ret)
 		goto err_unreserve;
 
@@ -733,7 +732,8 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	rq->i915 = i915;
 	rq->engine = engine;
 	rq->gem_context = ctx;
-	rq->ring = ring;
+	rq->hw_context = ce;
+	rq->ring = ce->ring;
 
 	/* No zalloc, must clear what we need by hand */
 	rq->global_seqno = 0;
@@ -786,7 +786,7 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 err_unreserve:
 	unreserve_engine(engine);
 err_unpin:
-	engine->context_unpin(engine, ctx);
+	intel_context_unpin(ce);
 	return ERR_PTR(ret);
 }
 
@@ -972,7 +972,6 @@  i915_request_await_object(struct i915_request *to,
 void __i915_request_add(struct i915_request *request, bool flush_caches)
 {
 	struct intel_engine_cs *engine = request->engine;
-	struct intel_ring *ring = request->ring;
 	struct intel_timeline *timeline = request->timeline;
 	struct i915_request *prev;
 	u32 *cs;
@@ -1048,7 +1047,7 @@  void __i915_request_add(struct i915_request *request, bool flush_caches)
 	GEM_BUG_ON(timeline->seqno != request->fence.seqno);
 	i915_gem_active_set(&timeline->last_request, request);
 
-	list_add_tail(&request->ring_link, &ring->request_list);
+	list_add_tail(&request->ring_link, &request->ring->request_list);
 	request->emitted_jiffies = jiffies;
 
 	/*
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 6a029b3f0a88..7ea939f974ea 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -92,8 +92,9 @@  struct i915_request {
 	 * i915_request_free() will then decrement the refcount on the
 	 * context.
 	 */
-	struct i915_gem_context *gem_context;
 	struct intel_engine_cs *engine;
+	struct i915_gem_context *gem_context;
+	struct intel_context *hw_context;
 	struct intel_ring *ring;
 	struct intel_timeline *timeline;
 	struct intel_signal_node signaling;
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 4749426f9cad..ca8453ff4a47 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -665,6 +665,12 @@  static int init_phys_status_page(struct intel_engine_cs *engine)
 	return 0;
 }
 
+static void __intel_context_unpin(struct i915_gem_context *ctx,
+				  struct intel_engine_cs *engine)
+{
+	intel_context_unpin(to_intel_context(ctx, engine));
+}
+
 /**
  * intel_engines_init_common - initialize cengine state which might require hw access
  * @engine: Engine to initialize.
@@ -678,7 +684,8 @@  static int init_phys_status_page(struct intel_engine_cs *engine)
  */
 int intel_engine_init_common(struct intel_engine_cs *engine)
 {
-	struct intel_ring *ring;
+	struct drm_i915_private *i915 = engine->i915;
+	struct intel_context *ce;
 	int ret;
 
 	engine->set_default_submission(engine);
@@ -690,19 +697,18 @@  int intel_engine_init_common(struct intel_engine_cs *engine)
 	 * be available. To avoid this we always pin the default
 	 * context.
 	 */
-	ring = engine->context_pin(engine, engine->i915->kernel_context);
-	if (IS_ERR(ring))
-		return PTR_ERR(ring);
+	ce = intel_context_pin(i915->kernel_context, engine);
+	if (IS_ERR(ce))
+		return PTR_ERR(ce);
 
 	/*
 	 * Similarly the preempt context must always be available so that
 	 * we can interrupt the engine at any time.
 	 */
-	if (engine->i915->preempt_context) {
-		ring = engine->context_pin(engine,
-					   engine->i915->preempt_context);
-		if (IS_ERR(ring)) {
-			ret = PTR_ERR(ring);
+	if (i915->preempt_context) {
+		ce = intel_context_pin(i915->preempt_context, engine);
+		if (IS_ERR(ce)) {
+			ret = PTR_ERR(ce);
 			goto err_unpin_kernel;
 		}
 	}
@@ -711,7 +717,7 @@  int intel_engine_init_common(struct intel_engine_cs *engine)
 	if (ret)
 		goto err_unpin_preempt;
 
-	if (HWS_NEEDS_PHYSICAL(engine->i915))
+	if (HWS_NEEDS_PHYSICAL(i915))
 		ret = init_phys_status_page(engine);
 	else
 		ret = init_status_page(engine);
@@ -723,10 +729,11 @@  int intel_engine_init_common(struct intel_engine_cs *engine)
 err_breadcrumbs:
 	intel_engine_fini_breadcrumbs(engine);
 err_unpin_preempt:
-	if (engine->i915->preempt_context)
-		engine->context_unpin(engine, engine->i915->preempt_context);
+	if (i915->preempt_context)
+		__intel_context_unpin(i915->preempt_context, engine);
+
 err_unpin_kernel:
-	engine->context_unpin(engine, engine->i915->kernel_context);
+	__intel_context_unpin(i915->kernel_context, engine);
 	return ret;
 }
 
@@ -739,6 +746,8 @@  int intel_engine_init_common(struct intel_engine_cs *engine)
  */
 void intel_engine_cleanup_common(struct intel_engine_cs *engine)
 {
+	struct drm_i915_private *i915 = engine->i915;
+
 	intel_engine_cleanup_scratch(engine);
 
 	if (HWS_NEEDS_PHYSICAL(engine->i915))
@@ -753,9 +762,9 @@  void intel_engine_cleanup_common(struct intel_engine_cs *engine)
 	if (engine->default_state)
 		i915_gem_object_put(engine->default_state);
 
-	if (engine->i915->preempt_context)
-		engine->context_unpin(engine, engine->i915->preempt_context);
-	engine->context_unpin(engine, engine->i915->kernel_context);
+	if (i915->preempt_context)
+		__intel_context_unpin(i915->preempt_context, engine);
+	__intel_context_unpin(i915->kernel_context, engine);
 }
 
 u64 intel_engine_get_active_head(const struct intel_engine_cs *engine)
@@ -997,8 +1006,7 @@  bool intel_engines_are_idle(struct drm_i915_private *dev_priv)
  */
 bool intel_engine_has_kernel_context(const struct intel_engine_cs *engine)
 {
-	const struct i915_gem_context * const kernel_context =
-		engine->i915->kernel_context;
+	const struct i915_gem_context *kctx = engine->i915->kernel_context;
 	struct i915_request *rq;
 
 	lockdep_assert_held(&engine->i915->drm.struct_mutex);
@@ -1010,9 +1018,12 @@  bool intel_engine_has_kernel_context(const struct intel_engine_cs *engine)
 	 */
 	rq = __i915_gem_active_peek(&engine->timeline->last_request);
 	if (rq)
-		return rq->gem_context == kernel_context;
-	else
-		return engine->last_retired_context == kernel_context;
+		return rq->gem_context == kctx;
+
+	if (!engine->last_retired_context)
+		return false;
+
+	return engine->last_retired_context->gem_context == kctx;
 }
 
 void intel_engines_reset_default_submission(struct drm_i915_private *i915)
@@ -1096,16 +1107,16 @@  void intel_engines_unpark(struct drm_i915_private *i915)
  */
 void intel_engine_lost_context(struct intel_engine_cs *engine)
 {
-	struct i915_gem_context *ctx;
+	struct intel_context *ce;
 
 	lockdep_assert_held(&engine->i915->drm.struct_mutex);
 
 	engine->legacy_active_context = NULL;
 	engine->legacy_active_ppgtt = NULL;
 
-	ctx = fetch_and_zero(&engine->last_retired_context);
-	if (ctx)
-		engine->context_unpin(engine, ctx);
+	ce = fetch_and_zero(&engine->last_retired_context);
+	if (ce)
+		intel_context_unpin(ce);
 }
 
 bool intel_engine_can_store_dword(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/intel_guc_ads.c b/drivers/gpu/drm/i915/intel_guc_ads.c
index 334cb5202e1c..b8b4d346dd4d 100644
--- a/drivers/gpu/drm/i915/intel_guc_ads.c
+++ b/drivers/gpu/drm/i915/intel_guc_ads.c
@@ -121,7 +121,7 @@  int intel_guc_ads_create(struct intel_guc *guc)
 	 * to find it. Note that we have to skip our header (1 page),
 	 * because our GuC shared data is there.
 	 */
-	kernel_ctx_vma = dev_priv->kernel_context->engine[RCS].state;
+	kernel_ctx_vma = dev_priv->kernel_context->__engine[RCS].state;
 	blob->ads.golden_context_lrca =
 		intel_guc_ggtt_offset(guc, kernel_ctx_vma) + skipped_offset;
 
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index 8527fa1f5c3e..2fe3f9daa56d 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -362,7 +362,7 @@  static void guc_stage_desc_init(struct intel_guc *guc,
 	desc->db_id = client->doorbell_id;
 
 	for_each_engine_masked(engine, dev_priv, client->engines, tmp) {
-		struct intel_context *ce = &ctx->engine[engine->id];
+		struct intel_context *ce = to_intel_context(ctx, engine);
 		u32 guc_engine_id = engine->guc_id;
 		struct guc_execlist_context *lrc = &desc->lrc[guc_engine_id];
 
@@ -511,9 +511,7 @@  static void guc_add_request(struct intel_guc *guc, struct i915_request *rq)
 {
 	struct intel_guc_client *client = guc->execbuf_client;
 	struct intel_engine_cs *engine = rq->engine;
-	u32 ctx_desc =
-		lower_32_bits(intel_lr_context_descriptor(rq->gem_context,
-							  engine));
+	u32 ctx_desc = lower_32_bits(rq->hw_context->lrc_desc);
 	u32 ring_tail = intel_ring_set_tail(rq->ring, rq->tail) / sizeof(u64);
 
 	spin_lock(&client->wq_lock);
@@ -551,8 +549,8 @@  static void inject_preempt_context(struct work_struct *work)
 					     preempt_work[engine->id]);
 	struct intel_guc_client *client = guc->preempt_client;
 	struct guc_stage_desc *stage_desc = __get_stage_desc(client);
-	u32 ctx_desc = lower_32_bits(intel_lr_context_descriptor(client->owner,
-								 engine));
+	u32 ctx_desc = lower_32_bits(to_intel_context(client->owner,
+						      engine)->lrc_desc);
 	u32 data[7];
 
 	/*
@@ -708,7 +706,7 @@  static void guc_dequeue(struct intel_engine_cs *engine)
 		struct i915_request *rq, *rn;
 
 		list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
-			if (last && rq->gem_context != last->gem_context) {
+			if (last && rq->hw_context != last->hw_context) {
 				if (port == last_port) {
 					__list_del_many(&p->requests,
 							&rq->sched.link);
@@ -991,7 +989,8 @@  static void guc_fill_preempt_context(struct intel_guc *guc)
 	enum intel_engine_id id;
 
 	for_each_engine(engine, dev_priv, id) {
-		struct intel_context *ce = &client->owner->engine[id];
+		struct intel_context *ce =
+			to_intel_context(client->owner, engine);
 		u32 addr = intel_hws_preempt_done_address(engine);
 		u32 *cs;
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 0777226e65a6..e6725690b5b6 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -164,7 +164,8 @@ 
 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
 
 static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
-					    struct intel_engine_cs *engine);
+					    struct intel_engine_cs *engine,
+					    struct intel_context *ce);
 static void execlists_init_reg_state(u32 *reg_state,
 				     struct i915_gem_context *ctx,
 				     struct intel_engine_cs *engine,
@@ -221,9 +222,9 @@  static inline bool need_preempt(const struct intel_engine_cs *engine,
  */
 static void
 intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
-				   struct intel_engine_cs *engine)
+				   struct intel_engine_cs *engine,
+				   struct intel_context *ce)
 {
-	struct intel_context *ce = &ctx->engine[engine->id];
 	u64 desc;
 
 	BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH)));
@@ -414,7 +415,7 @@  execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
 
 static u64 execlists_update_context(struct i915_request *rq)
 {
-	struct intel_context *ce = &rq->gem_context->engine[rq->engine->id];
+	struct intel_context *ce = rq->hw_context;
 	struct i915_hw_ppgtt *ppgtt =
 		rq->gem_context->ppgtt ?: rq->i915->mm.aliasing_ppgtt;
 	u32 *reg_state = ce->lrc_reg_state;
@@ -491,14 +492,14 @@  static void execlists_submit_ports(struct intel_engine_cs *engine)
 	execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
 }
 
-static bool ctx_single_port_submission(const struct i915_gem_context *ctx)
+static bool ctx_single_port_submission(const struct intel_context *ctx)
 {
 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
-		i915_gem_context_force_single_submission(ctx));
+		i915_gem_context_force_single_submission(ctx->gem_context));
 }
 
-static bool can_merge_ctx(const struct i915_gem_context *prev,
-			  const struct i915_gem_context *next)
+static bool can_merge_ctx(const struct intel_context *prev,
+			  const struct intel_context *next)
 {
 	if (prev != next)
 		return false;
@@ -523,7 +524,7 @@  static void inject_preempt_context(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists *execlists = &engine->execlists;
 	struct intel_context *ce =
-		&engine->i915->preempt_context->engine[engine->id];
+		to_intel_context(engine->i915->preempt_context, engine);
 	unsigned int n;
 
 	GEM_BUG_ON(execlists->preempt_complete_status !=
@@ -666,8 +667,8 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 			 * second request, and so we never need to tell the
 			 * hardware about the first.
 			 */
-			if (last && !can_merge_ctx(rq->gem_context,
-						   last->gem_context)) {
+			if (last &&
+			    !can_merge_ctx(rq->hw_context, last->hw_context)) {
 				/*
 				 * If we are on the second port and cannot
 				 * combine this request with the last, then we
@@ -686,14 +687,14 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 				 * the same context (even though a different
 				 * request) to the second port.
 				 */
-				if (ctx_single_port_submission(last->gem_context) ||
-				    ctx_single_port_submission(rq->gem_context)) {
+				if (ctx_single_port_submission(last->hw_context) ||
+				    ctx_single_port_submission(rq->hw_context)) {
 					__list_del_many(&p->requests,
 							&rq->sched.link);
 					goto done;
 				}
 
-				GEM_BUG_ON(last->gem_context == rq->gem_context);
+				GEM_BUG_ON(last->hw_context == rq->hw_context);
 
 				if (submit)
 					port_assign(port, last);
@@ -1277,6 +1278,37 @@  static void execlists_schedule(struct i915_request *request,
 	spin_unlock_irq(&engine->timeline->lock);
 }
 
+static void execlists_context_destroy(struct intel_context *ce)
+{
+	GEM_BUG_ON(!ce->state);
+	GEM_BUG_ON(ce->pin_count);
+
+	intel_ring_free(ce->ring);
+	__i915_gem_object_release_unless_active(ce->state->obj);
+}
+
+static void __execlists_context_unpin(struct intel_context *ce)
+{
+	intel_ring_unpin(ce->ring);
+
+	ce->state->obj->pin_global--;
+	i915_gem_object_unpin_map(ce->state->obj);
+	i915_vma_unpin(ce->state);
+
+	i915_gem_context_put(ce->gem_context);
+}
+
+static void execlists_context_unpin(struct intel_context *ce)
+{
+	lockdep_assert_held(&ce->gem_context->i915->drm.struct_mutex);
+	GEM_BUG_ON(ce->pin_count == 0);
+
+	if (--ce->pin_count)
+		return;
+
+	__execlists_context_unpin(ce);
+}
+
 static int __context_pin(struct i915_gem_context *ctx, struct i915_vma *vma)
 {
 	unsigned int flags;
@@ -1300,21 +1332,15 @@  static int __context_pin(struct i915_gem_context *ctx, struct i915_vma *vma)
 	return i915_vma_pin(vma, 0, GEN8_LR_CONTEXT_ALIGN, flags);
 }
 
-static struct intel_ring *
-execlists_context_pin(struct intel_engine_cs *engine,
-		      struct i915_gem_context *ctx)
+static struct intel_context *
+__execlists_context_pin(struct intel_engine_cs *engine,
+			struct i915_gem_context *ctx,
+			struct intel_context *ce)
 {
-	struct intel_context *ce = &ctx->engine[engine->id];
 	void *vaddr;
 	int ret;
 
-	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
-
-	if (likely(ce->pin_count++))
-		goto out;
-	GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
-
-	ret = execlists_context_deferred_alloc(ctx, engine);
+	ret = execlists_context_deferred_alloc(ctx, engine, ce);
 	if (ret)
 		goto err;
 	GEM_BUG_ON(!ce->state);
@@ -1333,7 +1359,7 @@  execlists_context_pin(struct intel_engine_cs *engine,
 	if (ret)
 		goto unpin_map;
 
-	intel_lr_context_descriptor_update(ctx, engine);
+	intel_lr_context_descriptor_update(ctx, engine, ce);
 
 	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
 	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
@@ -1342,8 +1368,7 @@  execlists_context_pin(struct intel_engine_cs *engine,
 
 	ce->state->obj->pin_global++;
 	i915_gem_context_get(ctx);
-out:
-	return ce->ring;
+	return ce;
 
 unpin_map:
 	i915_gem_object_unpin_map(ce->state->obj);
@@ -1354,33 +1379,33 @@  execlists_context_pin(struct intel_engine_cs *engine,
 	return ERR_PTR(ret);
 }
 
-static void execlists_context_unpin(struct intel_engine_cs *engine,
-				    struct i915_gem_context *ctx)
+static const struct intel_context_ops execlists_context_ops = {
+	.unpin = execlists_context_unpin,
+	.destroy = execlists_context_destroy,
+};
+
+static struct intel_context *
+execlists_context_pin(struct intel_engine_cs *engine,
+		      struct i915_gem_context *ctx)
 {
-	struct intel_context *ce = &ctx->engine[engine->id];
+	struct intel_context *ce = to_intel_context(ctx, engine);
 
 	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
-	GEM_BUG_ON(ce->pin_count == 0);
-
-	if (--ce->pin_count)
-		return;
 
-	intel_ring_unpin(ce->ring);
+	if (likely(ce->pin_count++))
+		return ce;
+	GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
 
-	ce->state->obj->pin_global--;
-	i915_gem_object_unpin_map(ce->state->obj);
-	i915_vma_unpin(ce->state);
+	ce->ops = &execlists_context_ops;
 
-	i915_gem_context_put(ctx);
+	return __execlists_context_pin(engine, ctx, ce);
 }
 
 static int execlists_request_alloc(struct i915_request *request)
 {
-	struct intel_engine_cs *engine = request->engine;
-	struct intel_context *ce = &request->gem_context->engine[engine->id];
-	int ret;
+	int err;
 
-	GEM_BUG_ON(!ce->pin_count);
+	GEM_BUG_ON(!request->hw_context->pin_count);
 
 	/* Flush enough space to reduce the likelihood of waiting after
 	 * we start building the request - in which case we will just
@@ -1388,9 +1413,9 @@  static int execlists_request_alloc(struct i915_request *request)
 	 */
 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
 
-	ret = intel_ring_wait_for_space(request->ring, request->reserved_space);
-	if (ret)
-		return ret;
+	err = intel_ring_wait_for_space(request->ring, request->reserved_space);
+	if (err)
+		return err;
 
 	/* Note that after this point, we have committed to using
 	 * this request as it is being used to both track the
@@ -1780,8 +1805,8 @@  static void reset_common_ring(struct intel_engine_cs *engine,
 			      struct i915_request *request)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
-	struct intel_context *ce;
 	unsigned long flags;
+	u32 *regs;
 
 	GEM_TRACE("%s request global=%x, current=%d\n",
 		  engine->name, request ? request->global_seqno : 0,
@@ -1831,14 +1856,13 @@  static void reset_common_ring(struct intel_engine_cs *engine,
 	 * future request will be after userspace has had the opportunity
 	 * to recreate its own state.
 	 */
-	ce = &request->gem_context->engine[engine->id];
-	execlists_init_reg_state(ce->lrc_reg_state,
-				 request->gem_context, engine, ce->ring);
+	regs = request->hw_context->lrc_reg_state;
+	execlists_init_reg_state(regs,
+				 request->gem_context, engine, request->ring);
 
 	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
-	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
-		i915_ggtt_offset(ce->ring->vma);
-	ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;
+	regs[CTX_RING_BUFFER_START+1] = i915_ggtt_offset(request->ring->vma);
+	regs[CTX_RING_HEAD+1] = request->postfix;
 
 	request->ring->head = request->postfix;
 	intel_ring_update_space(request->ring);
@@ -2176,8 +2200,6 @@  logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 	engine->reset_hw = reset_common_ring;
 
 	engine->context_pin = execlists_context_pin;
-	engine->context_unpin = execlists_context_unpin;
-
 	engine->request_alloc = execlists_request_alloc;
 
 	engine->emit_flush = gen8_emit_flush;
@@ -2272,9 +2294,13 @@  static int logical_ring_init(struct intel_engine_cs *engine)
 	}
 
 	engine->execlists.preempt_complete_status = ~0u;
-	if (engine->i915->preempt_context)
+	if (engine->i915->preempt_context) {
+		struct intel_context *ce =
+			to_intel_context(engine->i915->preempt_context, engine);
+
 		engine->execlists.preempt_complete_status =
-			upper_32_bits(engine->i915->preempt_context->engine[engine->id].lrc_desc);
+			upper_32_bits(ce->lrc_desc);
+	}
 
 	return 0;
 
@@ -2408,7 +2434,7 @@  static void execlists_init_reg_state(u32 *regs,
 	struct drm_i915_private *dev_priv = engine->i915;
 	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: dev_priv->mm.aliasing_ppgtt;
 	u32 base = engine->mmio_base;
-	bool rcs = engine->id == RCS;
+	bool rcs = engine->class == RENDER_CLASS;
 
 	/* A context is actually a big batch buffer with several
 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
@@ -2553,10 +2579,10 @@  populate_lr_context(struct i915_gem_context *ctx,
 }
 
 static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
-					    struct intel_engine_cs *engine)
+					    struct intel_engine_cs *engine,
+					    struct intel_context *ce)
 {
 	struct drm_i915_gem_object *ctx_obj;
-	struct intel_context *ce = &ctx->engine[engine->id];
 	struct i915_vma *vma;
 	uint32_t context_size;
 	struct intel_ring *ring;
@@ -2627,7 +2653,8 @@  void intel_lr_context_resume(struct drm_i915_private *dev_priv)
 	 */
 	list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
 		for_each_engine(engine, dev_priv, id) {
-			struct intel_context *ce = &ctx->engine[engine->id];
+			struct intel_context *ce =
+				to_intel_context(ctx, engine);
 			u32 *reg;
 
 			if (!ce->state)
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 59d7b86012e9..1593194e930c 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -104,11 +104,4 @@  struct i915_gem_context;
 
 void intel_lr_context_resume(struct drm_i915_private *dev_priv);
 
-static inline uint64_t
-intel_lr_context_descriptor(struct i915_gem_context *ctx,
-			    struct intel_engine_cs *engine)
-{
-	return ctx->engine[engine->id].lrc_desc;
-}
-
 #endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 3ea8eb5d49f5..9f526afa11ed 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -558,8 +558,7 @@  static void reset_ring_common(struct intel_engine_cs *engine,
 	 */
 	if (request) {
 		struct drm_i915_private *dev_priv = request->i915;
-		struct intel_context *ce =
-			&request->gem_context->engine[engine->id];
+		struct intel_context *ce = request->hw_context;
 		struct i915_hw_ppgtt *ppgtt;
 
 		if (ce->state) {
@@ -1164,9 +1163,33 @@  intel_ring_free(struct intel_ring *ring)
 	kfree(ring);
 }
 
-static int context_pin(struct i915_gem_context *ctx)
+static void intel_ring_context_destroy(struct intel_context *ce)
 {
-	struct i915_vma *vma = ctx->engine[RCS].state;
+	GEM_BUG_ON(ce->pin_count);
+
+	if (ce->state)
+		__i915_gem_object_release_unless_active(ce->state->obj);
+}
+
+static void intel_ring_context_unpin(struct intel_context *ce)
+{
+	lockdep_assert_held(&ce->gem_context->i915->drm.struct_mutex);
+	GEM_BUG_ON(ce->pin_count == 0);
+
+	if (--ce->pin_count)
+		return;
+
+	if (ce->state) {
+		ce->state->obj->pin_global--;
+		i915_vma_unpin(ce->state);
+	}
+
+	i915_gem_context_put(ce->gem_context);
+}
+
+static int __context_pin(struct intel_context *ce)
+{
+	struct i915_vma *vma = ce->state;
 	int ret;
 
 	/*
@@ -1253,25 +1276,19 @@  alloc_context_vma(struct intel_engine_cs *engine)
 	return ERR_PTR(err);
 }
 
-static struct intel_ring *
-intel_ring_context_pin(struct intel_engine_cs *engine,
-		       struct i915_gem_context *ctx)
+static struct intel_context *
+__ring_context_pin(struct intel_engine_cs *engine,
+		   struct i915_gem_context *ctx,
+		   struct intel_context *ce)
 {
-	struct intel_context *ce = &ctx->engine[engine->id];
-	int ret;
-
-	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
-
-	if (likely(ce->pin_count++))
-		goto out;
-	GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
+	int err;
 
 	if (!ce->state && engine->context_size) {
 		struct i915_vma *vma;
 
 		vma = alloc_context_vma(engine);
 		if (IS_ERR(vma)) {
-			ret = PTR_ERR(vma);
+			err = PTR_ERR(vma);
 			goto err;
 		}
 
@@ -1279,8 +1296,8 @@  intel_ring_context_pin(struct intel_engine_cs *engine,
 	}
 
 	if (ce->state) {
-		ret = context_pin(ctx);
-		if (ret)
+		err = __context_pin(ce);
+		if (err)
 			goto err;
 
 		ce->state->obj->pin_global++;
@@ -1288,32 +1305,37 @@  intel_ring_context_pin(struct intel_engine_cs *engine,
 
 	i915_gem_context_get(ctx);
 
-out:
 	/* One ringbuffer to rule them all */
-	return engine->buffer;
+	GEM_BUG_ON(!engine->buffer);
+	ce->ring = engine->buffer;
+
+	return ce;
 
 err:
 	ce->pin_count = 0;
-	return ERR_PTR(ret);
+	return ERR_PTR(err);
 }
 
-static void intel_ring_context_unpin(struct intel_engine_cs *engine,
-				     struct i915_gem_context *ctx)
+static const struct intel_context_ops ring_context_ops = {
+	.unpin = intel_ring_context_unpin,
+	.destroy = intel_ring_context_destroy,
+};
+
+static struct intel_context *
+intel_ring_context_pin(struct intel_engine_cs *engine,
+		       struct i915_gem_context *ctx)
 {
-	struct intel_context *ce = &ctx->engine[engine->id];
+	struct intel_context *ce = to_intel_context(ctx, engine);
 
 	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
-	GEM_BUG_ON(ce->pin_count == 0);
 
-	if (--ce->pin_count)
-		return;
+	if (likely(ce->pin_count++))
+		return ce;
+	GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
 
-	if (ce->state) {
-		ce->state->obj->pin_global--;
-		i915_vma_unpin(ce->state);
-	}
+	ce->ops = &ring_context_ops;
 
-	i915_gem_context_put(ctx);
+	return __ring_context_pin(engine, ctx, ce);
 }
 
 static int intel_init_ring_buffer(struct intel_engine_cs *engine)
@@ -1323,10 +1345,6 @@  static int intel_init_ring_buffer(struct intel_engine_cs *engine)
 
 	intel_engine_setup_common(engine);
 
-	err = intel_engine_init_common(engine);
-	if (err)
-		goto err;
-
 	ring = intel_engine_create_ring(engine, 32 * PAGE_SIZE);
 	if (IS_ERR(ring)) {
 		err = PTR_ERR(ring);
@@ -1341,8 +1359,14 @@  static int intel_init_ring_buffer(struct intel_engine_cs *engine)
 	GEM_BUG_ON(engine->buffer);
 	engine->buffer = ring;
 
+	err = intel_engine_init_common(engine);
+	if (err)
+		goto err_unpin;
+
 	return 0;
 
+err_unpin:
+	intel_ring_unpin(ring);
 err_ring:
 	intel_ring_free(ring);
 err:
@@ -1428,7 +1452,7 @@  static inline int mi_set_context(struct i915_request *rq, u32 flags)
 
 	*cs++ = MI_NOOP;
 	*cs++ = MI_SET_CONTEXT;
-	*cs++ = i915_ggtt_offset(rq->gem_context->engine[RCS].state) | flags;
+	*cs++ = i915_ggtt_offset(rq->hw_context->state) | flags;
 	/*
 	 * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP
 	 * WaMiSetContext_Hang:snb,ivb,vlv
@@ -1519,7 +1543,7 @@  static int switch_context(struct i915_request *rq)
 		hw_flags = MI_FORCE_RESTORE;
 	}
 
-	if (to_ctx->engine[engine->id].state &&
+	if (rq->hw_context->state &&
 	    (to_ctx != from_ctx || hw_flags & MI_FORCE_RESTORE)) {
 		GEM_BUG_ON(engine->id != RCS);
 
@@ -1567,7 +1591,7 @@  static int ring_request_alloc(struct i915_request *request)
 {
 	int ret;
 
-	GEM_BUG_ON(!request->gem_context->engine[request->engine->id].pin_count);
+	GEM_BUG_ON(!request->hw_context->pin_count);
 
 	/* Flush enough space to reduce the likelihood of waiting after
 	 * we start building the request - in which case we will just
@@ -1994,8 +2018,6 @@  static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv,
 	engine->reset_hw = reset_ring_common;
 
 	engine->context_pin = intel_ring_context_pin;
-	engine->context_unpin = intel_ring_context_unpin;
-
 	engine->request_alloc = ring_request_alloc;
 
 	engine->emit_breadcrumb = i9xx_emit_breadcrumb;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 1231695fb4da..005d68e67d2e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -427,10 +427,9 @@  struct intel_engine_cs {
 
 	void		(*set_default_submission)(struct intel_engine_cs *engine);
 
-	struct intel_ring *(*context_pin)(struct intel_engine_cs *engine,
-					  struct i915_gem_context *ctx);
-	void		(*context_unpin)(struct intel_engine_cs *engine,
-					 struct i915_gem_context *ctx);
+	struct intel_context *(*context_pin)(struct intel_engine_cs *engine,
+					     struct i915_gem_context *ctx);
+
 	int		(*request_alloc)(struct i915_request *rq);
 	int		(*init_context)(struct i915_request *rq);
 
@@ -546,7 +545,7 @@  struct intel_engine_cs {
 	 * to the kernel context and trash it as the save may not happen
 	 * before the hardware is powered down.
 	 */
-	struct i915_gem_context *last_retired_context;
+	struct intel_context *last_retired_context;
 
 	/* We track the current MI_SET_CONTEXT in order to eliminate
 	 * redudant context switches. This presumes that requests are not
diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
index 501becc47c0c..8904f1ce64e3 100644
--- a/drivers/gpu/drm/i915/selftests/mock_context.c
+++ b/drivers/gpu/drm/i915/selftests/mock_context.c
@@ -30,6 +30,7 @@  mock_context(struct drm_i915_private *i915,
 	     const char *name)
 {
 	struct i915_gem_context *ctx;
+	unsigned int n;
 	int ret;
 
 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -43,6 +44,12 @@  mock_context(struct drm_i915_private *i915,
 	INIT_RADIX_TREE(&ctx->handles_vma, GFP_KERNEL);
 	INIT_LIST_HEAD(&ctx->handles_list);
 
+	for (n = 0; n < ARRAY_SIZE(ctx->__engine); n++) {
+		struct intel_context *ce = &ctx->__engine[n];
+
+		ce->gem_context = ctx;
+	}
+
 	ret = ida_simple_get(&i915->contexts.hw_ida,
 			     0, MAX_CONTEXT_HW_ID, GFP_KERNEL);
 	if (ret < 0)
diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
index 78a89efa1119..612c48123808 100644
--- a/drivers/gpu/drm/i915/selftests/mock_engine.c
+++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
@@ -67,18 +67,36 @@  static void hw_delay_complete(struct timer_list *t)
 	spin_unlock(&engine->hw_lock);
 }
 
-static struct intel_ring *
-mock_context_pin(struct intel_engine_cs *engine,
-		 struct i915_gem_context *ctx)
+static void mock_context_unpin(struct intel_context *ce)
+{
+	if (--ce->pin_count)
+		return;
+
+	i915_gem_context_put(ce->gem_context);
+}
+
+static void mock_context_destroy(struct intel_context *ce)
 {
-	i915_gem_context_get(ctx);
-	return engine->buffer;
+	GEM_BUG_ON(ce->pin_count);
 }
 
-static void mock_context_unpin(struct intel_engine_cs *engine,
-			       struct i915_gem_context *ctx)
+static const struct intel_context_ops mock_context_ops = {
+	.unpin = mock_context_unpin,
+	.destroy = mock_context_destroy,
+};
+
+static struct intel_context *
+mock_context_pin(struct intel_engine_cs *engine,
+		 struct i915_gem_context *ctx)
 {
-	i915_gem_context_put(ctx);
+	struct intel_context *ce = to_intel_context(ctx, engine);
+
+	if (!ce->pin_count++) {
+		i915_gem_context_get(ctx);
+		ce->ring = engine->buffer;
+	}
+
+	return ce;
 }
 
 static int mock_request_alloc(struct i915_request *request)
@@ -168,7 +186,6 @@  struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
 	engine->base.status_page.page_addr = (void *)(engine + 1);
 
 	engine->base.context_pin = mock_context_pin;
-	engine->base.context_unpin = mock_context_unpin;
 	engine->base.request_alloc = mock_request_alloc;
 	engine->base.emit_flush = mock_emit_flush;
 	engine->base.emit_breadcrumb = mock_emit_breadcrumb;
@@ -213,11 +230,13 @@  void mock_engine_free(struct intel_engine_cs *engine)
 {
 	struct mock_engine *mock =
 		container_of(engine, typeof(*mock), base);
+	struct intel_context *ce;
 
 	GEM_BUG_ON(timer_pending(&mock->hw_delay));
 
-	if (engine->last_retired_context)
-		engine->context_unpin(engine, engine->last_retired_context);
+	ce = fetch_and_zero(&engine->last_retired_context);
+	if (ce)
+		intel_context_unpin(ce);
 
 	intel_engine_fini_breadcrumbs(engine);
 
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index e6d4b882599a..47a9de741c5f 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -243,6 +243,11 @@  struct drm_i915_private *mock_gem_device(void)
 	if (!i915->kernel_context)
 		goto err_engine;
 
+	/* Fake the initial load for the powersaving context */
+	i915->engine[RCS]->last_retired_context =
+		intel_context_pin(i915->kernel_context, i915->engine[RCS]);
+	GEM_BUG_ON(IS_ERR_OR_NULL(i915->engine[RCS]->last_retired_context));
+
 	WARN_ON(i915_gemfs_init(i915));
 
 	return i915;