diff mbox series

[v5,4/8] drm/i915: vgpu context submission pv optimization

Message ID 1556507458-24684-5-git-send-email-xiaolin.zhang@intel.com (mailing list archive)
State New, archived
Headers show
Series i915 vgpu PV to improve vgpu performance | expand

Commit Message

Xiaolin Zhang April 29, 2019, 3:10 a.m. UTC
It is a performance optimization to override the actual submission backend
in order to eliminate execlists CSB processing and reduce the number of MMIO
traps for workload submission, avoiding the context switch interrupt by
talking to GVT via a PV submission notification mechanism between guest and GVT.

Use PV_SUBMISSION to control this level of pv optimization.

v0: RFC
v1: rebase
v2: added pv ops for pv context submission. to maximize code reuse,
introduced 2 more ops (submit_ports & preempt_context) instead of 1 op
(set_default_submission) in engine structure. pv version of
submit_ports and preempt_context implemented.
v3:
1. to reduce more code duplication, code refactor and replaced the 2 ops
"submit_ports & preempt_context" from v2 by 1 op "write_desc"
in the engine structure. pv version of write_desc implemented.
2. added VGT_G2V_ELSP_SUBMIT for g2v pv notification.
v4: implemented pv elsp submission tasklet as the backend workload
submission by talking to GVT with the PV notification mechanism and renamed
VGT_G2V_ELSP_SUBMIT to VGT_G2V_PV_SUBMISSION.
v5: addressed v4 comments from Chris, intel_pv_submission.c added.

Signed-off-by: Xiaolin Zhang <xiaolin.zhang@intel.com>
---
 drivers/gpu/drm/i915/Makefile              |   2 +-
 drivers/gpu/drm/i915/gt/intel_lrc.c        |   8 +-
 drivers/gpu/drm/i915/i915_pvinfo.h         |   1 +
 drivers/gpu/drm/i915/i915_vgpu.c           |   8 +-
 drivers/gpu/drm/i915/i915_vgpu.h           |   3 +
 drivers/gpu/drm/i915/intel_pv_submission.c | 177 +++++++++++++++++++++++++++++
 6 files changed, 195 insertions(+), 4 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/intel_pv_submission.c

Comments

Chris Wilson April 29, 2019, 10:02 a.m. UTC | #1
Quoting Xiaolin Zhang (2019-04-29 04:10:54)
> +static void pv_submit(struct intel_engine_cs *engine)
> +{
> +       struct intel_engine_execlists * const execlists = &engine->execlists;
> +       struct execlist_port *port = execlists->port;
> +       unsigned int n;
> +       struct gvt_shared_page *shared_page = engine->i915->vgpu.shared_page;
> +       u64 descs[2];
> +
> +       for (n = 0; n < execlists_num_ports(execlists); n++) {
> +               struct i915_request *rq;
> +               unsigned int count = 0;
> +
> +               descs[n] = 0;
> +               rq = port_unpack(&port[n], &count);
> +               if (rq && count == 0) {
> +                       port_set(&port[n], port_pack(rq, ++count));
> +                       descs[n] = execlists_update_context(rq);
> +               }
> +       }
> +
> +       spin_lock(&engine->i915->vgpu.shared_page_lock[engine->id]);

Only one engine at a time now accesses that portion of pv_elsp, so the
spin lock is not required per-se, aiui.

The problem is that there is no coordination between pv_submit and the
other side of the pipe, as far as I can see. So it seems very possible
for us to overwrite entries before they have been read. I expect to see
some ack mechanism for VGT_G2V_PV_SUBMISSION.

> +       for (n = 0; n < execlists_num_ports(execlists); n++)
> +               shared_page->pv_elsp[engine->id].descs[n] = descs[n];

I'd recommend not using engine->id, that's just our internal id and may
vary between host/guest. Use engine->hw_id? Or negotiate using pv setup
and store pv_id?

But starting to look more solid over all. We just need to finish
splitting out the various similar-but-quite-different-really submission
backends. :)
-Chris
Xiaolin Zhang May 21, 2019, 8:26 a.m. UTC | #2
On 04/29/2019 06:03 PM, Chris Wilson wrote:
> Quoting Xiaolin Zhang (2019-04-29 04:10:54)
>> +static void pv_submit(struct intel_engine_cs *engine)
>> +{
>> +       struct intel_engine_execlists * const execlists = &engine->execlists;
>> +       struct execlist_port *port = execlists->port;
>> +       unsigned int n;
>> +       struct gvt_shared_page *shared_page = engine->i915->vgpu.shared_page;
>> +       u64 descs[2];
>> +
>> +       for (n = 0; n < execlists_num_ports(execlists); n++) {
>> +               struct i915_request *rq;
>> +               unsigned int count = 0;
>> +
>> +               descs[n] = 0;
>> +               rq = port_unpack(&port[n], &count);
>> +               if (rq && count == 0) {
>> +                       port_set(&port[n], port_pack(rq, ++count));
>> +                       descs[n] = execlists_update_context(rq);
>> +               }
>> +       }
>> +
>> +       spin_lock(&engine->i915->vgpu.shared_page_lock[engine->id]);
> Only one engine at a time now accesses that portion of pv_elsp, so the
> spin lock is not required per-se, aiui.
>
> The problem is that there is no coordination between pv_submit and the
> other side of the pipe, as far as I can see. So it seems very possible
> for us to overwrite entries before they have been read. I expect to see
> some ack mechanism for VGT_G2V_PV_SUBMISSION.
>
>> +       for (n = 0; n < execlists_num_ports(execlists); n++)
>> +               shared_page->pv_elsp[engine->id].descs[n] = descs[n];
> I'd recommend not using engine->id, that's just our internal id and may
> vary between host/guest. Use engine->hw_id? Or negotiate using pv setup
> and store pv_id?
>
> But starting to look more solid over all. We just need to finish
> splitting out the various similar-but-quite-different-really submission
> backends. :)
> -Chris
>
Chris, thanks for your great comments; they will be addressed in the next version
(will remove the spin lock, use hw_id, add an ack mechanism and refine the patches).
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index dd8d923..27873c7 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -198,7 +198,7 @@  i915-$(CONFIG_DRM_I915_SELFTEST) += \
 	selftests/igt_spinner.o
 
 # virtual gpu code
-i915-y += i915_vgpu.o
+i915-y += i915_vgpu.o intel_pv_submission.o
 
 # perf code
 i915-y += i915_perf.o \
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index d17c08e..d31a98b 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2380,11 +2380,15 @@  void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
 	engine->unpark = NULL;
 
 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
-	if (!intel_vgpu_active(engine->i915))
-		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
+	engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
 	if (engine->preempt_context &&
 	    HAS_LOGICAL_RING_PREEMPTION(engine->i915))
 		engine->flags |= I915_ENGINE_HAS_PREEMPTION;
+
+	if (intel_vgpu_active(engine->i915)) {
+		engine->flags &= ~I915_ENGINE_HAS_SEMAPHORES;
+		intel_vgpu_config_pv_caps(engine->i915,	PV_SUBMISSION, engine);
+	}
 }
 
 static void
diff --git a/drivers/gpu/drm/i915/i915_pvinfo.h b/drivers/gpu/drm/i915/i915_pvinfo.h
index 2408a9d..362d898 100644
--- a/drivers/gpu/drm/i915/i915_pvinfo.h
+++ b/drivers/gpu/drm/i915/i915_pvinfo.h
@@ -50,6 +50,7 @@  enum vgt_g2v_type {
 	VGT_G2V_PPGTT_L4_ALLOC,
 	VGT_G2V_PPGTT_L4_CLEAR,
 	VGT_G2V_PPGTT_L4_INSERT,
+	VGT_G2V_PV_SUBMISSION,
 	VGT_G2V_MAX,
 };
 
diff --git a/drivers/gpu/drm/i915/i915_vgpu.c b/drivers/gpu/drm/i915/i915_vgpu.c
index 23814cd..8dea0b7 100644
--- a/drivers/gpu/drm/i915/i915_vgpu.c
+++ b/drivers/gpu/drm/i915/i915_vgpu.c
@@ -81,7 +81,7 @@  void i915_check_vgpu(struct drm_i915_private *dev_priv)
 	dev_priv->vgpu.active = true;
 
 	/* guest driver PV capability */
-	dev_priv->vgpu.pv_caps = PV_PPGTT_UPDATE;
+	dev_priv->vgpu.pv_caps = PV_PPGTT_UPDATE | PV_SUBMISSION;
 
 	if (!intel_vgpu_check_pv_caps(dev_priv)) {
 		DRM_INFO("Virtual GPU for Intel GVT-g detected.\n");
@@ -361,6 +361,7 @@  void intel_vgpu_config_pv_caps(struct drm_i915_private *dev_priv,
 		enum pv_caps cap, void *data)
 {
 	struct i915_hw_ppgtt *ppgtt;
+	struct intel_engine_cs *engine;
 
 	if (!intel_vgpu_enabled_pv_caps(dev_priv, cap))
 		return;
@@ -371,6 +372,11 @@  void intel_vgpu_config_pv_caps(struct drm_i915_private *dev_priv,
 		ppgtt->vm.insert_entries = gen8_ppgtt_insert_4lvl_pv;
 		ppgtt->vm.clear_range = gen8_ppgtt_clear_4lvl_pv;
 	}
+
+	if (cap == PV_SUBMISSION) {
+		engine = (struct intel_engine_cs *)data;
+		vgpu_set_pv_submission(engine);
+	}
 }
 
 /*
diff --git a/drivers/gpu/drm/i915/i915_vgpu.h b/drivers/gpu/drm/i915/i915_vgpu.h
index 6bce8a5..f4e8a1e 100644
--- a/drivers/gpu/drm/i915/i915_vgpu.h
+++ b/drivers/gpu/drm/i915/i915_vgpu.h
@@ -31,6 +31,7 @@ 
  */
 enum pv_caps {
 	PV_PPGTT_UPDATE = 0x1,
+	PV_SUBMISSION = 0x2,
 };
 
 /*
@@ -90,4 +91,6 @@  void intel_vgt_deballoon(struct drm_i915_private *dev_priv);
 bool intel_vgpu_check_pv_caps(struct drm_i915_private *dev_priv);
 void intel_vgpu_config_pv_caps(struct drm_i915_private *dev_priv,
 		enum pv_caps cap, void *data);
+void vgpu_set_pv_submission(struct intel_engine_cs *engine);
+
 #endif /* _I915_VGPU_H_ */
diff --git a/drivers/gpu/drm/i915/intel_pv_submission.c b/drivers/gpu/drm/i915/intel_pv_submission.c
new file mode 100644
index 0000000..3ba9c82
--- /dev/null
+++ b/drivers/gpu/drm/i915/intel_pv_submission.c
@@ -0,0 +1,177 @@ 
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2018 Intel Corporation
+ */
+
+#include "intel_drv.h"
+#include "i915_vgpu.h"
+#include "gt/intel_lrc_reg.h"
+
+static u64 execlists_update_context(struct i915_request *rq)
+{
+	struct intel_context *ce = rq->hw_context;
+	u32 *reg_state = ce->lrc_reg_state;
+
+	reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail);
+
+	return ce->lrc_desc;
+}
+
+static inline struct i915_priolist *to_priolist(struct rb_node *rb)
+{
+	return rb_entry(rb, struct i915_priolist, node);
+}
+
+static void pv_submit(struct intel_engine_cs *engine)
+{
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct execlist_port *port = execlists->port;
+	unsigned int n;
+	struct gvt_shared_page *shared_page = engine->i915->vgpu.shared_page;
+	u64 descs[2];
+
+	for (n = 0; n < execlists_num_ports(execlists); n++) {
+		struct i915_request *rq;
+		unsigned int count = 0;
+
+		descs[n] = 0;
+		rq = port_unpack(&port[n], &count);
+		if (rq && count == 0) {
+			port_set(&port[n], port_pack(rq, ++count));
+			descs[n] = execlists_update_context(rq);
+		}
+	}
+
+	spin_lock(&engine->i915->vgpu.shared_page_lock[engine->id]);
+	for (n = 0; n < execlists_num_ports(execlists); n++)
+		shared_page->pv_elsp[engine->id].descs[n] = descs[n];
+
+	writel(VGT_G2V_PV_SUBMISSION, execlists->submit_reg);
+	spin_unlock(&engine->i915->vgpu.shared_page_lock[engine->id]);
+}
+
+static inline int rq_prio(const struct i915_request *rq)
+{
+	return rq->sched.attr.priority;
+}
+
+static inline int port_prio(const struct execlist_port *port)
+{
+	return rq_prio(port_request(port)) | __NO_PREEMPTION;
+}
+
+static void pv_dequeue(struct intel_engine_cs *engine)
+{
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct execlist_port *port = execlists->port;
+	struct i915_request *last = NULL;
+	bool submit = false;
+	struct rb_node *rb;
+
+	lockdep_assert_held(&engine->timeline.lock);
+
+	while ((rb = rb_first_cached(&execlists->queue))) {
+		struct i915_priolist *p = to_priolist(rb);
+		struct i915_request *rq, *rn;
+		int i;
+
+		priolist_for_each_request_consume(rq, rn, p, i) {
+			if (last && rq->hw_context != last->hw_context)
+				goto done;
+
+			list_del_init(&rq->sched.link);
+
+			__i915_request_submit(rq);
+			trace_i915_request_in(rq, port_index(port, execlists));
+
+			last = rq;
+			submit = true;
+		}
+
+		rb_erase_cached(&p->node, &execlists->queue);
+		i915_priolist_free(p);
+	}
+done:
+	execlists->queue_priority_hint =
+			rb ? to_priolist(rb)->priority : INT_MIN;
+	if (submit) {
+		port_set(port, i915_request_get(last));
+		pv_submit(engine);
+	}
+	if (last)
+		execlists_user_begin(execlists, execlists->port);
+
+	/* We must always keep the beast fed if we have work piled up */
+	GEM_BUG_ON(port_isset(execlists->port) &&
+		   !execlists_is_active(execlists, EXECLISTS_ACTIVE_USER));
+	GEM_BUG_ON(rb_first_cached(&execlists->queue) &&
+		   !port_isset(execlists->port));
+}
+
+static void vgpu_pv_submission_tasklet(unsigned long data)
+{
+	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct execlist_port *port = execlists->port;
+	struct i915_request *rq;
+	unsigned long flags;
+	bool rq_finished = false;
+
+	spin_lock_irqsave(&engine->timeline.lock, flags);
+
+	rq = port_request(port);
+	while (rq && i915_request_completed(rq)) {
+		trace_i915_request_out(rq);
+		rq_finished = true;
+		i915_request_put(rq);
+
+		port = execlists_port_complete(execlists, port);
+		if (port_isset(port)) {
+			rq_finished = false;
+			execlists_user_begin(execlists, port);
+			rq = port_request(port);
+		} else {
+			execlists_user_end(execlists);
+			rq = NULL;
+		}
+	}
+
+	if (rq_finished || !rq)
+		pv_dequeue(engine);
+
+	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+}
+
+static void vgpu_pv_submission_park(struct intel_engine_cs *engine)
+{
+	intel_engine_unpin_breadcrumbs_irq(engine);
+	engine->flags &= ~I915_ENGINE_NEEDS_BREADCRUMB_TASKLET;
+}
+
+static void vgpu_pv_submission_unpark(struct intel_engine_cs *engine)
+{
+	engine->flags |= I915_ENGINE_NEEDS_BREADCRUMB_TASKLET;
+	intel_engine_pin_breadcrumbs_irq(engine);
+}
+
+void vgpu_set_pv_submission(struct intel_engine_cs *engine)
+{
+	/*
+	 * We inherit a bunch of functions from execlists that we'd like
+	 * to keep using:
+	 *
+	 *    engine->submit_request = execlists_submit_request;
+	 *    engine->cancel_requests = execlists_cancel_requests;
+	 *    engine->schedule = execlists_schedule;
+	 *
+	 * But we need to override the actual submission backend in order
+	 * to talk to the GVT with PV notification message.
+	 */
+
+	engine->execlists.tasklet.func = vgpu_pv_submission_tasklet;
+
+	engine->park = vgpu_pv_submission_park;
+	engine->unpark = vgpu_pv_submission_unpark;
+
+	engine->flags &= ~I915_ENGINE_HAS_PREEMPTION;
+}