diff mbox series

[v10,5/9] drm/i915: vgpu context submission pv optimization

Message ID 1568699301-2799-6-git-send-email-xiaolin.zhang@intel.com (mailing list archive)
State New, archived
Headers show
Series i915 vgpu PV to improve vgpu performance | expand

Commit Message

Xiaolin Zhang Sept. 17, 2019, 5:48 a.m. UTC
It is performance optimization to override the actual submisison backend
in order to eliminate execlists csb process and reduce mmio trap numbers
for workload submission without context switch interrupt by talking with
GVT via PV submisison notification mechanism between guest and GVT.

Use PV_SUBMISSION to control this level of pv optimization.

v0: RFC.
v1: rebase.
v2: added pv ops for pv context submission. to maximize code resuse,
introduced 2 more ops (submit_ports & preempt_context) instead of 1 op
(set_default_submission) in engine structure. pv version of
submit_ports and preempt_context implemented.
v3:
1. to reduce more code duplication, code refactor and replaced 2 ops
"submit_ports & preempt_contex" from v2 by 1 ops "write_desc"
in engine structure. pv version of write_des implemented.
2. added VGT_G2V_ELSP_SUBMIT for g2v pv notification.
v4: implemented pv elsp submission tasklet as the backend workload
submisison by talking to GVT with PV notificaiton mechanism and renamed
VGT_G2V_ELSP_SUBMIT to VGT_G2V_PV_SUBMISIION.
v5: addressed v4 comments from Chris, intel_pv_submission.c added.
v6: addressed v5 comments from Chris, replaced engine id by hw_id.
v7: rebase.
v8: addressed v7 comments from Chris.

Signed-off-by: Xiaolin Zhang <xiaolin.zhang@intel.com>
---
 drivers/gpu/drm/i915/Makefile              |   2 +-
 drivers/gpu/drm/i915/gt/intel_lrc.c        |  12 +-
 drivers/gpu/drm/i915/i915_vgpu.c           |  18 +-
 drivers/gpu/drm/i915/i915_vgpu.h           |  15 ++
 drivers/gpu/drm/i915/intel_pv_submission.c | 300 +++++++++++++++++++++++++++++
 5 files changed, 341 insertions(+), 6 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/intel_pv_submission.c

Comments

Chris Wilson Sept. 17, 2019, 9:54 a.m. UTC | #1
Quoting Xiaolin Zhang (2019-09-17 06:48:16)
> It is performance optimization to override the actual submisison backend
> in order to eliminate execlists csb process and reduce mmio trap numbers
> for workload submission without context switch interrupt by talking with
> GVT via PV submisison notification mechanism between guest and GVT.
> 
> Use PV_SUBMISSION to control this level of pv optimization.
> 
> v0: RFC.
> v1: rebase.
> v2: added pv ops for pv context submission. to maximize code resuse,
> introduced 2 more ops (submit_ports & preempt_context) instead of 1 op
> (set_default_submission) in engine structure. pv version of
> submit_ports and preempt_context implemented.
> v3:
> 1. to reduce more code duplication, code refactor and replaced 2 ops
> "submit_ports & preempt_contex" from v2 by 1 ops "write_desc"
> in engine structure. pv version of write_des implemented.
> 2. added VGT_G2V_ELSP_SUBMIT for g2v pv notification.
> v4: implemented pv elsp submission tasklet as the backend workload
> submisison by talking to GVT with PV notificaiton mechanism and renamed
> VGT_G2V_ELSP_SUBMIT to VGT_G2V_PV_SUBMISIION.
> v5: addressed v4 comments from Chris, intel_pv_submission.c added.
> v6: addressed v5 comments from Chris, replaced engine id by hw_id.
> v7: rebase.
> v8: addressed v7 comments from Chris.
> 
> Signed-off-by: Xiaolin Zhang <xiaolin.zhang@intel.com>
> ---
>  drivers/gpu/drm/i915/Makefile              |   2 +-
>  drivers/gpu/drm/i915/gt/intel_lrc.c        |  12 +-
>  drivers/gpu/drm/i915/i915_vgpu.c           |  18 +-
>  drivers/gpu/drm/i915/i915_vgpu.h           |  15 ++
>  drivers/gpu/drm/i915/intel_pv_submission.c | 300 +++++++++++++++++++++++++++++
>  5 files changed, 341 insertions(+), 6 deletions(-)
>  create mode 100644 drivers/gpu/drm/i915/intel_pv_submission.c
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 658b930..f500cf1 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -250,7 +250,7 @@ i915-$(CONFIG_DRM_I915_SELFTEST) += \
>         selftests/igt_spinner.o
>  
>  # virtual gpu code
> -i915-y += i915_vgpu.o
> +i915-y += i915_vgpu.o intel_pv_submission.o
>  
>  ifeq ($(CONFIG_DRM_I915_GVT),y)
>  i915-y += intel_gvt.o
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index a3f0e49..5587aff 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -2933,10 +2933,14 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
>         engine->unpark = NULL;
>  
>         engine->flags |= I915_ENGINE_SUPPORTS_STATS;
> -       if (!intel_vgpu_active(engine->i915)) {
> -               engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
> -               if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
> -                       engine->flags |= I915_ENGINE_HAS_PREEMPTION;
> +       engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
> +       if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
> +               engine->flags |= I915_ENGINE_HAS_PREEMPTION;
> +
> +       if (intel_vgpu_active(engine->i915)) {
> +               engine->flags &= ~I915_ENGINE_HAS_SEMAPHORES;
> +               engine->flags &= ~I915_ENGINE_HAS_PREEMPTION;
> +               intel_vgpu_config_pv_caps(engine->i915, PV_SUBMISSION, engine);
>         }
>  
>         if (INTEL_GEN(engine->i915) >= 12) /* XXX disabled for debugging */
> diff --git a/drivers/gpu/drm/i915/i915_vgpu.c b/drivers/gpu/drm/i915/i915_vgpu.c
> index e458892..a488b68 100644
> --- a/drivers/gpu/drm/i915/i915_vgpu.c
> +++ b/drivers/gpu/drm/i915/i915_vgpu.c
> @@ -97,7 +97,7 @@ void i915_detect_vgpu(struct drm_i915_private *dev_priv)
>         mutex_init(&dev_priv->vgpu.lock);
>  
>         /* guest driver PV capability */
> -       dev_priv->vgpu.pv_caps = PV_PPGTT_UPDATE;
> +       dev_priv->vgpu.pv_caps = PV_PPGTT_UPDATE | PV_SUBMISSION;
>  
>         if (!intel_vgpu_check_pv_caps(dev_priv, shared_area)) {
>                 DRM_INFO("Virtual GPU for Intel GVT-g detected.\n");
> @@ -389,6 +389,7 @@ void intel_vgpu_config_pv_caps(struct drm_i915_private *dev_priv,
>                 enum pv_caps cap, void *data)
>  {
>         struct i915_ppgtt *ppgtt;
> +       struct intel_engine_cs *engine;
>  
>         if (!intel_vgpu_enabled_pv_caps(dev_priv, cap))
>                 return;
> @@ -399,6 +400,11 @@ void intel_vgpu_config_pv_caps(struct drm_i915_private *dev_priv,
>                 ppgtt->vm.insert_entries = gen8_ppgtt_insert_4lvl_pv;
>                 ppgtt->vm.clear_range = gen8_ppgtt_clear_4lvl_pv;
>         }
> +
> +       if (cap == PV_SUBMISSION) {
> +               engine = (struct intel_engine_cs *)data;
> +               vgpu_set_pv_submission(engine);
> +       }
>  }
>  
>  /**
> @@ -594,6 +600,8 @@ static int intel_vgpu_setup_shared_page(struct drm_i915_private *dev_priv,
>         u64 gpa;
>         u16 ver_maj, ver_min;
>         int ret = 0;
> +       int i;
> +       u32 size;
>  
>         /* We allocate 1 page shared between guest and GVT for data exchange.
>          *       ___________.....................
> @@ -666,6 +674,14 @@ static int intel_vgpu_setup_shared_page(struct drm_i915_private *dev_priv,
>         pv->notify = intel_vgpu_pv_notify_mmio;
>         mutex_init(&pv->send_mutex);
>  
> +       /* setup PV per engine data exchange structure */
> +       size = sizeof(struct pv_submission);
> +       for (i = 0; i < PV_MAX_ENGINES_NUM; i++) {
> +               pv->pv_elsp[i] = (void *)base + PV_ELSP_OFF +  size * i;
> +               pv->pv_elsp[i]->submitted = false;
> +               spin_lock_init(&pv->pv_elsp[i]->lock);
> +       }
> +
>         return ret;
>  err:
>         __free_page(virt_to_page(base));
> diff --git a/drivers/gpu/drm/i915/i915_vgpu.h b/drivers/gpu/drm/i915/i915_vgpu.h
> index b0fee5b..c33cb05 100644
> --- a/drivers/gpu/drm/i915/i915_vgpu.h
> +++ b/drivers/gpu/drm/i915/i915_vgpu.h
> @@ -29,6 +29,8 @@
>  
>  #define PV_MAJOR               1
>  #define PV_MINOR               0
> +#define PV_MAX_ENGINES_NUM     (VECS1_HW + 1)
> +#define PV_ELSP_OFF            (PAGE_SIZE/8)
>  #define PV_DESC_OFF            (PAGE_SIZE/4)
>  #define PV_CMD_OFF             (PAGE_SIZE/2)
>  
> @@ -37,6 +39,7 @@
>   */
>  enum pv_caps {
>         PV_PPGTT_UPDATE = 0x1,
> +       PV_SUBMISSION = 0x2,
>  };
>  
>  /* PV actions */
> @@ -45,6 +48,7 @@ enum intel_vgpu_pv_action {
>         PV_ACTION_PPGTT_L4_ALLOC,
>         PV_ACTION_PPGTT_L4_CLEAR,
>         PV_ACTION_PPGTT_L4_INSERT,
> +       PV_ACTION_ELSP_SUBMISSION,
>  };
>  
>  /*
> @@ -56,6 +60,13 @@ struct gvt_shared_page {
>         u16 ver_minor;
>  };
>  
> +/* workload submission pv support data structure */
> +struct pv_submission {
> +       u64 descs[EXECLIST_MAX_PORTS];
> +       bool submitted;
> +       spinlock_t lock;
> +};
> +
>  /*
>   * Definition of the command transport message header (DW0)
>   *
> @@ -100,6 +111,9 @@ struct i915_virtual_gpu_pv {
>         struct gvt_shared_page *shared_page;
>         bool enabled;
>  
> +       /* per engine PV workload submission data */
> +       struct pv_submission *pv_elsp[PV_MAX_ENGINES_NUM];
> +
>         /* PV command buffer support */
>         struct vgpu_pv_ct_buffer ctb;
>         u32 next_fence;
> @@ -164,4 +178,5 @@ bool intel_vgpu_check_pv_caps(struct drm_i915_private *dev_priv,
>                 void __iomem *shared_area);
>  void intel_vgpu_config_pv_caps(struct drm_i915_private *dev_priv,
>                 enum pv_caps cap, void *data);
> +void vgpu_set_pv_submission(struct intel_engine_cs *engine);
>  #endif /* _I915_VGPU_H_ */
> diff --git a/drivers/gpu/drm/i915/intel_pv_submission.c b/drivers/gpu/drm/i915/intel_pv_submission.c
> new file mode 100644
> index 0000000..1083d56
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/intel_pv_submission.c
> @@ -0,0 +1,300 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2018 Intel Corporation
> + */
> +
> +#include "i915_vgpu.h"
> +#include "gt/intel_lrc_reg.h"
> +#include "gt/intel_gt_pm.h"
> +#include "i915_trace.h"
> +
> +#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
> +
> +static u64 execlists_update_context(struct i915_request *rq)
> +{
> +       struct intel_context *ce = rq->hw_context;
> +       u32 *reg_state = ce->lrc_reg_state;
> +
> +       reg_state[CTX_RING_TAIL + 1] = intel_ring_set_tail(rq->ring, rq->tail);
> +
> +       return ce->lrc_desc;
> +}
> +
> +static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +{
> +       return rb_entry(rb, struct i915_priolist, node);
> +}
> +
> +static void pv_submit(struct intel_engine_cs *engine,
> +              struct i915_request **out,
> +              struct i915_request **end)
> +{
> +       struct intel_engine_execlists * const execlists = &engine->execlists;
> +       struct i915_virtual_gpu_pv *pv = engine->i915->vgpu.pv;
> +       struct pv_submission *pv_elsp = pv->pv_elsp[engine->hw_id];
> +       struct i915_request *rq;
> +       int n, err;
> +
> +       memset(pv_elsp->descs, 0, sizeof(pv_elsp->descs));
> +       n = 0;
> +
> +       do {
> +               rq = *out++;
> +               pv_elsp->descs[n++] = execlists_update_context(rq);
> +       } while (out != end);
> +
> +       spin_lock(&pv_elsp->lock);
> +       pv_elsp->submitted = true;
> +       writel(PV_ACTION_ELSP_SUBMISSION, execlists->submit_reg);
> +
> +#define done (READ_ONCE(pv_elsp->submitted) == false)
> +       err = wait_for_atomic_us(done, 1000);

Please tell me you have a better plan than adding a 1ms wait with
interrupts off.

> +#undef done
> +       spin_unlock(&pv_elsp->lock);
> +
> +       if (unlikely(err))
> +               DRM_ERROR("PV (%s) workload submission failed\n", engine->name);
> +
> +}


> +void vgpu_set_pv_submission(struct intel_engine_cs *engine)
> +{
> +       /*
> +        * We inherit a bunch of functions from execlists that we'd like
> +        * to keep using:
> +        *
> +        *    engine->submit_request = execlists_submit_request;
> +        *    engine->cancel_requests = execlists_cancel_requests;
> +        *    engine->schedule = execlists_schedule;
> +        *
> +        * But we need to override the actual submission backend in order
> +        * to talk to the GVT with PV notification message.
> +        */
> +
> +       engine->execlists.tasklet.func = vgpu_pv_submission_tasklet;
> +
> +       /* do not use execlists park/unpark */
> +       engine->park = engine->unpark = NULL;

But this was used to enable the interrupts so that your tasklet was
called. guc dropped it because they only plan to run on HW that always
sends the interrupts; your supported HW needs the interrupts enabled on
demand.

Please go and add a test to demonstrate this.
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 658b930..f500cf1 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -250,7 +250,7 @@  i915-$(CONFIG_DRM_I915_SELFTEST) += \
 	selftests/igt_spinner.o
 
 # virtual gpu code
-i915-y += i915_vgpu.o
+i915-y += i915_vgpu.o intel_pv_submission.o
 
 ifeq ($(CONFIG_DRM_I915_GVT),y)
 i915-y += intel_gvt.o
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index a3f0e49..5587aff 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2933,10 +2933,14 @@  void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
 	engine->unpark = NULL;
 
 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
-	if (!intel_vgpu_active(engine->i915)) {
-		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
-		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
-			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
+	engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
+	if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
+		engine->flags |= I915_ENGINE_HAS_PREEMPTION;
+
+	if (intel_vgpu_active(engine->i915)) {
+		engine->flags &= ~I915_ENGINE_HAS_SEMAPHORES;
+		engine->flags &= ~I915_ENGINE_HAS_PREEMPTION;
+		intel_vgpu_config_pv_caps(engine->i915,	PV_SUBMISSION, engine);
 	}
 
 	if (INTEL_GEN(engine->i915) >= 12) /* XXX disabled for debugging */
diff --git a/drivers/gpu/drm/i915/i915_vgpu.c b/drivers/gpu/drm/i915/i915_vgpu.c
index e458892..a488b68 100644
--- a/drivers/gpu/drm/i915/i915_vgpu.c
+++ b/drivers/gpu/drm/i915/i915_vgpu.c
@@ -97,7 +97,7 @@  void i915_detect_vgpu(struct drm_i915_private *dev_priv)
 	mutex_init(&dev_priv->vgpu.lock);
 
 	/* guest driver PV capability */
-	dev_priv->vgpu.pv_caps = PV_PPGTT_UPDATE;
+	dev_priv->vgpu.pv_caps = PV_PPGTT_UPDATE | PV_SUBMISSION;
 
 	if (!intel_vgpu_check_pv_caps(dev_priv, shared_area)) {
 		DRM_INFO("Virtual GPU for Intel GVT-g detected.\n");
@@ -389,6 +389,7 @@  void intel_vgpu_config_pv_caps(struct drm_i915_private *dev_priv,
 		enum pv_caps cap, void *data)
 {
 	struct i915_ppgtt *ppgtt;
+	struct intel_engine_cs *engine;
 
 	if (!intel_vgpu_enabled_pv_caps(dev_priv, cap))
 		return;
@@ -399,6 +400,11 @@  void intel_vgpu_config_pv_caps(struct drm_i915_private *dev_priv,
 		ppgtt->vm.insert_entries = gen8_ppgtt_insert_4lvl_pv;
 		ppgtt->vm.clear_range = gen8_ppgtt_clear_4lvl_pv;
 	}
+
+	if (cap == PV_SUBMISSION) {
+		engine = (struct intel_engine_cs *)data;
+		vgpu_set_pv_submission(engine);
+	}
 }
 
 /**
@@ -594,6 +600,8 @@  static int intel_vgpu_setup_shared_page(struct drm_i915_private *dev_priv,
 	u64 gpa;
 	u16 ver_maj, ver_min;
 	int ret = 0;
+	int i;
+	u32 size;
 
 	/* We allocate 1 page shared between guest and GVT for data exchange.
 	 *       ___________.....................
@@ -666,6 +674,14 @@  static int intel_vgpu_setup_shared_page(struct drm_i915_private *dev_priv,
 	pv->notify = intel_vgpu_pv_notify_mmio;
 	mutex_init(&pv->send_mutex);
 
+	/* setup PV per engine data exchange structure */
+	size = sizeof(struct pv_submission);
+	for (i = 0; i < PV_MAX_ENGINES_NUM; i++) {
+		pv->pv_elsp[i] = (void *)base + PV_ELSP_OFF +  size * i;
+		pv->pv_elsp[i]->submitted = false;
+		spin_lock_init(&pv->pv_elsp[i]->lock);
+	}
+
 	return ret;
 err:
 	__free_page(virt_to_page(base));
diff --git a/drivers/gpu/drm/i915/i915_vgpu.h b/drivers/gpu/drm/i915/i915_vgpu.h
index b0fee5b..c33cb05 100644
--- a/drivers/gpu/drm/i915/i915_vgpu.h
+++ b/drivers/gpu/drm/i915/i915_vgpu.h
@@ -29,6 +29,8 @@ 
 
 #define PV_MAJOR		1
 #define PV_MINOR		0
+#define PV_MAX_ENGINES_NUM	(VECS1_HW + 1)
+#define PV_ELSP_OFF		(PAGE_SIZE/8)
 #define PV_DESC_OFF		(PAGE_SIZE/4)
 #define PV_CMD_OFF		(PAGE_SIZE/2)
 
@@ -37,6 +39,7 @@ 
  */
 enum pv_caps {
 	PV_PPGTT_UPDATE = 0x1,
+	PV_SUBMISSION = 0x2,
 };
 
 /* PV actions */
@@ -45,6 +48,7 @@  enum intel_vgpu_pv_action {
 	PV_ACTION_PPGTT_L4_ALLOC,
 	PV_ACTION_PPGTT_L4_CLEAR,
 	PV_ACTION_PPGTT_L4_INSERT,
+	PV_ACTION_ELSP_SUBMISSION,
 };
 
 /*
@@ -56,6 +60,13 @@  struct gvt_shared_page {
 	u16 ver_minor;
 };
 
+/* workload submission pv support data structure */
+struct pv_submission {
+	u64 descs[EXECLIST_MAX_PORTS];
+	bool submitted;
+	spinlock_t lock;
+};
+
 /*
  * Definition of the command transport message header (DW0)
  *
@@ -100,6 +111,9 @@  struct i915_virtual_gpu_pv {
 	struct gvt_shared_page *shared_page;
 	bool enabled;
 
+	/* per engine PV workload submission data */
+	struct pv_submission *pv_elsp[PV_MAX_ENGINES_NUM];
+
 	/* PV command buffer support */
 	struct vgpu_pv_ct_buffer ctb;
 	u32 next_fence;
@@ -164,4 +178,5 @@  bool intel_vgpu_check_pv_caps(struct drm_i915_private *dev_priv,
 		void __iomem *shared_area);
 void intel_vgpu_config_pv_caps(struct drm_i915_private *dev_priv,
 		enum pv_caps cap, void *data);
+void vgpu_set_pv_submission(struct intel_engine_cs *engine);
 #endif /* _I915_VGPU_H_ */
diff --git a/drivers/gpu/drm/i915/intel_pv_submission.c b/drivers/gpu/drm/i915/intel_pv_submission.c
new file mode 100644
index 0000000..1083d56
--- /dev/null
+++ b/drivers/gpu/drm/i915/intel_pv_submission.c
@@ -0,0 +1,300 @@ 
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2018 Intel Corporation
+ */
+
+#include "i915_vgpu.h"
+#include "gt/intel_lrc_reg.h"
+#include "gt/intel_gt_pm.h"
+#include "i915_trace.h"
+
+#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
+
+static u64 execlists_update_context(struct i915_request *rq)
+{
+	struct intel_context *ce = rq->hw_context;
+	u32 *reg_state = ce->lrc_reg_state;
+
+	reg_state[CTX_RING_TAIL + 1] = intel_ring_set_tail(rq->ring, rq->tail);
+
+	return ce->lrc_desc;
+}
+
+static inline struct i915_priolist *to_priolist(struct rb_node *rb)
+{
+	return rb_entry(rb, struct i915_priolist, node);
+}
+
+static void pv_submit(struct intel_engine_cs *engine,
+	       struct i915_request **out,
+	       struct i915_request **end)
+{
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct i915_virtual_gpu_pv *pv = engine->i915->vgpu.pv;
+	struct pv_submission *pv_elsp = pv->pv_elsp[engine->hw_id];
+	struct i915_request *rq;
+	int n, err;
+
+	memset(pv_elsp->descs, 0, sizeof(pv_elsp->descs));
+	n = 0;
+
+	do {
+		rq = *out++;
+		pv_elsp->descs[n++] = execlists_update_context(rq);
+	} while (out != end);
+
+	spin_lock(&pv_elsp->lock);
+	pv_elsp->submitted = true;
+	writel(PV_ACTION_ELSP_SUBMISSION, execlists->submit_reg);
+
+#define done (READ_ONCE(pv_elsp->submitted) == false)
+	err = wait_for_atomic_us(done, 1000);
+#undef done
+	spin_unlock(&pv_elsp->lock);
+
+	if (unlikely(err))
+		DRM_ERROR("PV (%s) workload submission failed\n", engine->name);
+
+}
+
+static struct i915_request *schedule_in(struct i915_request *rq, int idx)
+{
+	trace_i915_request_in(rq, idx);
+
+	intel_gt_pm_get(rq->engine->gt);
+	return i915_request_get(rq);
+}
+
+static void pv_dequeue(struct intel_engine_cs *engine)
+{
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct i915_request **first = execlists->inflight;
+	struct i915_request ** const last_port = first + execlists->port_mask;
+	struct i915_request *last = first[0];
+	struct i915_request **port;
+	bool submit = false;
+	struct rb_node *rb;
+
+	lockdep_assert_held(&engine->active.lock);
+
+	if (last) {
+		if (*++first)
+			return;
+		last = NULL;
+	}
+
+	port = first;
+	while ((rb = rb_first_cached(&execlists->queue))) {
+		struct i915_priolist *p = to_priolist(rb);
+		struct i915_request *rq, *rn;
+		int i;
+
+		priolist_for_each_request_consume(rq, rn, p, i) {
+			if (last && rq->hw_context != last->hw_context) {
+				if (port == last_port)
+					goto done;
+
+				*port = schedule_in(last,
+						    port - execlists->inflight);
+				port++;
+			}
+
+
+			list_del_init(&rq->sched.link);
+			__i915_request_submit(rq);
+			submit = true;
+			last = rq;
+		}
+
+		rb_erase_cached(&p->node, &execlists->queue);
+		i915_priolist_free(p);
+	}
+done:
+	execlists->queue_priority_hint =
+		rb ? to_priolist(rb)->priority : INT_MIN;
+	if (submit) {
+		*port = schedule_in(last, port - execlists->inflight);
+		*++port = NULL;
+		pv_submit(engine, first, port);
+	}
+	execlists->active = execlists->inflight;
+}
+
+static void schedule_out(struct i915_request *rq)
+{
+	trace_i915_request_out(rq);
+
+	intel_gt_pm_put(rq->engine->gt);
+	i915_request_put(rq);
+}
+
+static void vgpu_pv_submission_tasklet(unsigned long data)
+{
+	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct i915_request **port, *rq;
+	unsigned long flags;
+	struct i915_virtual_gpu_pv *pv = engine->i915->vgpu.pv;
+	struct pv_submission *pv_elsp = pv->pv_elsp[engine->hw_id];
+
+	spin_lock_irqsave(&engine->active.lock, flags);
+
+	for (port = execlists->inflight; (rq = *port); port++) {
+		if (!i915_request_completed(rq))
+			break;
+
+		schedule_out(rq);
+	}
+
+	if (port != execlists->inflight) {
+		int idx = port - execlists->inflight;
+		int rem = ARRAY_SIZE(execlists->inflight) - idx;
+
+		memmove(execlists->inflight, port, rem * sizeof(*port));
+	}
+
+	if (!pv_elsp->submitted)
+		pv_dequeue(engine);
+
+	spin_unlock_irqrestore(&engine->active.lock, flags);
+}
+
+static void pv_reset_prepare(struct intel_engine_cs *engine)
+{
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+
+	GEM_TRACE("%s\n", engine->name);
+
+	/*
+	 * Prevent request submission to the hardware until we have
+	 * completed the reset in i915_gem_reset_finish(). If a request
+	 * is completed by one engine, it may then queue a request
+	 * to a second via its execlists->tasklet *just* as we are
+	 * calling engine->init_hw() and also writing the ELSP.
+	 * Turning off the execlists->tasklet until the reset is over
+	 * prevents the race.
+	 */
+	__tasklet_disable_sync_once(&execlists->tasklet);
+}
+
+static void
+cancel_port_requests(struct intel_engine_execlists * const execlists)
+{
+	struct i915_request * const *port, *rq;
+
+	/* Note we are only using the inflight and not the pending queue */
+	for (port = execlists->active; (rq = *port); port++)
+		schedule_out(rq);
+	execlists->active =
+		memset(execlists->inflight, 0, sizeof(execlists->inflight));
+}
+
+static void pv_reset(struct intel_engine_cs *engine, bool stalled)
+{
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct i915_request *rq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&engine->active.lock, flags);
+
+	cancel_port_requests(execlists);
+
+	/* Push back any incomplete requests for replay after the reset. */
+	rq = execlists_unwind_incomplete_requests(execlists);
+	if (!rq)
+		goto out_unlock;
+
+	if (!i915_request_started(rq))
+		stalled = false;
+
+	__i915_request_reset(rq, stalled);
+	intel_lr_context_reset(engine, rq->hw_context, rq->head, stalled);
+
+out_unlock:
+	spin_unlock_irqrestore(&engine->active.lock, flags);
+}
+
+static void pv_reset_finish(struct intel_engine_cs *engine)
+{
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+
+	if (__tasklet_enable(&execlists->tasklet))
+		/* And kick in case we missed a new request submission. */
+		tasklet_hi_schedule(&execlists->tasklet);
+
+	GEM_TRACE("%s: depth->%d\n", engine->name,
+		  atomic_read(&execlists->tasklet.count));
+}
+
+static void pv_cancel_requests(struct intel_engine_cs *engine)
+{
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct i915_request *rq, *rn;
+	struct rb_node *rb;
+	unsigned long flags;
+
+	GEM_TRACE("%s\n", engine->name);
+
+	spin_lock_irqsave(&engine->active.lock, flags);
+
+	/* Cancel the requests on the HW and clear the ELSP tracker. */
+	cancel_port_requests(execlists);
+
+	/* Mark all executing requests as skipped. */
+	list_for_each_entry(rq, &engine->active.requests, sched.link) {
+		if (!i915_request_signaled(rq))
+			dma_fence_set_error(&rq->fence, -EIO);
+
+		i915_request_mark_complete(rq);
+	}
+
+	/* Flush the queued requests to the timeline list (for retiring). */
+	while ((rb = rb_first_cached(&execlists->queue))) {
+		struct i915_priolist *p = to_priolist(rb);
+		int i;
+
+		priolist_for_each_request_consume(rq, rn, p, i) {
+			list_del_init(&rq->sched.link);
+			__i915_request_submit(rq);
+			dma_fence_set_error(&rq->fence, -EIO);
+			i915_request_mark_complete(rq);
+		}
+
+		rb_erase_cached(&p->node, &execlists->queue);
+		i915_priolist_free(p);
+	}
+
+	execlists->queue_priority_hint = INT_MIN;
+	execlists->queue = RB_ROOT_CACHED;
+
+	spin_unlock_irqrestore(&engine->active.lock, flags);
+}
+
+void vgpu_set_pv_submission(struct intel_engine_cs *engine)
+{
+	/*
+	 * We inherit a bunch of functions from execlists that we'd like
+	 * to keep using:
+	 *
+	 *    engine->submit_request = execlists_submit_request;
+	 *    engine->cancel_requests = execlists_cancel_requests;
+	 *    engine->schedule = execlists_schedule;
+	 *
+	 * But we need to override the actual submission backend in order
+	 * to talk to the GVT with PV notification message.
+	 */
+
+	engine->execlists.tasklet.func = vgpu_pv_submission_tasklet;
+
+	/* do not use execlists park/unpark */
+	engine->park = engine->unpark = NULL;
+
+	engine->reset.prepare = pv_reset_prepare;
+	engine->reset.reset = pv_reset;
+	engine->reset.finish = pv_reset_finish;
+
+	engine->cancel_requests = pv_cancel_requests;
+
+	engine->flags &= ~I915_ENGINE_SUPPORTS_STATS;
+	engine->flags |= I915_ENGINE_NEEDS_BREADCRUMB_TASKLET;
+}