diff mbox series

[v1,03/10] drm/i915: context submission pvmmio optimization

Message ID 1539238452-4389-3-git-send-email-xiaolin.zhang@intel.com (mailing list archive)
State New, archived
Headers show
Series i915 pvmmio to improve GVTg performance | expand

Commit Message

Xiaolin Zhang Oct. 11, 2018, 6:14 a.m. UTC
It is performance optimization to reduce mmio trap numbers from 4 to
1 durning ELSP porting writing (context submission).

When context subission, to cache elsp_data[4] values in
the shared page, the last elsp_data[0] port writing will be trapped
to gvt for real context submission.

Use PVMMIO_ELSP_SUBMIT to control this level of pvmmio optimization.

v1: rebase
v0: RFC

Signed-off-by: Xiaolin Zhang <xiaolin.zhang@intel.com>
---
 drivers/gpu/drm/i915/i915_vgpu.c |  2 ++
 drivers/gpu/drm/i915/intel_lrc.c | 37 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 38 insertions(+), 1 deletion(-)

Comments

Chris Wilson Oct. 11, 2018, 9:12 a.m. UTC | #1
Quoting Xiaolin Zhang (2018-10-11 07:14:05)
> It is performance optimization to reduce mmio trap numbers from 4 to
> 1 durning ELSP porting writing (context submission).
> 
> When context subission, to cache elsp_data[4] values in
> the shared page, the last elsp_data[0] port writing will be trapped
> to gvt for real context submission.
> 
> Use PVMMIO_ELSP_SUBMIT to control this level of pvmmio optimization.
> 
> v1: rebase
> v0: RFC
> 
> Signed-off-by: Xiaolin Zhang <xiaolin.zhang@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_vgpu.c |  2 ++
>  drivers/gpu/drm/i915/intel_lrc.c | 37 ++++++++++++++++++++++++++++++++++++-

Hint: intel_vgpu_submission.c and go wild. You do not need to emulate
execlists at all, an async interface along the lines of guc would
strangely enough be more akin to what you want.
-Chris
Xiaolin Zhang Oct. 15, 2018, 2:35 a.m. UTC | #2
On 10/11/2018 05:12 PM, Chris Wilson wrote:
> Quoting Xiaolin Zhang (2018-10-11 07:14:05)
>> It is performance optimization to reduce mmio trap numbers from 4 to
>> 1 durning ELSP porting writing (context submission).
>>
>> When context subission, to cache elsp_data[4] values in
>> the shared page, the last elsp_data[0] port writing will be trapped
>> to gvt for real context submission.
>>
>> Use PVMMIO_ELSP_SUBMIT to control this level of pvmmio optimization.
>>
>> v1: rebase
>> v0: RFC
>>
>> Signed-off-by: Xiaolin Zhang <xiaolin.zhang@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_vgpu.c |  2 ++
>>  drivers/gpu/drm/i915/intel_lrc.c | 37 ++++++++++++++++++++++++++++++++++++-
> Hint: intel_vgpu_submission.c and go wild. You do not need to emulate
> execlists at all, an async interface along the lines of guc would
> strangely enough be more akin to what you want.
> -Chris
>
can't understand your comment very well. so far, vgpu only support
execlist workload submission only, this pv optimization is only valid
for execlist submission and can't support guc.

BRs, Xiaolin
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_vgpu.c b/drivers/gpu/drm/i915/i915_vgpu.c
index 609eefe..84241a7 100644
--- a/drivers/gpu/drm/i915/i915_vgpu.c
+++ b/drivers/gpu/drm/i915/i915_vgpu.c
@@ -66,6 +66,8 @@  void i915_check_vgpu(struct drm_i915_private *dev_priv)
 
 	BUILD_BUG_ON(sizeof(struct vgt_if) != VGT_PVINFO_SIZE);
 
+	dev_priv->vgpu.pv_caps = PVMMIO_ELSP_SUBMIT;
+
 	magic = __raw_i915_read64(dev_priv, vgtif_reg(magic));
 	if (magic != VGT_MAGIC)
 		return;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index d604d8a..1f52633 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -407,6 +407,11 @@  static void execlists_submit_ports(struct intel_engine_cs *engine)
 	struct intel_engine_execlists *execlists = &engine->execlists;
 	struct execlist_port *port = execlists->port;
 	unsigned int n;
+	u32 __iomem *elsp =
+		engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+	u32 *elsp_data;
+	u32 descs[4];
+	int i = 0;
 
 	/*
 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
@@ -449,8 +454,24 @@  static void execlists_submit_ports(struct intel_engine_cs *engine)
 			GEM_BUG_ON(!n);
 			desc = 0;
 		}
+		if (PVMMIO_LEVEL_ENABLE(engine->i915, PVMMIO_ELSP_SUBMIT)) {
+			GEM_BUG_ON(i >= 4);
+			descs[i] = upper_32_bits(desc);
+			descs[i + 1] = lower_32_bits(desc);
+			i += 2;
+		} else {
+			write_desc(execlists, desc, n);
+		}
+	}
 
-		write_desc(execlists, desc, n);
+	if (PVMMIO_LEVEL_ENABLE(engine->i915, PVMMIO_ELSP_SUBMIT)) {
+		spin_lock(&engine->i915->vgpu.shared_page_lock);
+		elsp_data = engine->i915->vgpu.shared_page->elsp_data;
+		*elsp_data = descs[0];
+		*(elsp_data + 1) = descs[1];
+		*(elsp_data + 2) = descs[2];
+		writel(descs[3], elsp);
+		spin_unlock(&engine->i915->vgpu.shared_page_lock);
 	}
 
 	/* we need to manually load the submit queue */
@@ -493,11 +514,25 @@  static void inject_preempt_context(struct intel_engine_cs *engine)
 	struct intel_engine_execlists *execlists = &engine->execlists;
 	struct intel_context *ce =
 		to_intel_context(engine->i915->preempt_context, engine);
+	u32 __iomem *elsp =
+		engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+	u32 *elsp_data;
 	unsigned int n;
 
 	GEM_BUG_ON(execlists->preempt_complete_status !=
 		   upper_32_bits(ce->lrc_desc));
 
+	if (PVMMIO_LEVEL_ENABLE(engine->i915, PVMMIO_ELSP_SUBMIT)) {
+		spin_lock(&engine->i915->vgpu.shared_page_lock);
+		elsp_data = engine->i915->vgpu.shared_page->elsp_data;
+		*elsp_data = 0;
+		*(elsp_data + 1) = 0;
+		*(elsp_data + 2) = upper_32_bits(ce->lrc_desc);
+		writel(lower_32_bits(ce->lrc_desc), elsp);
+		spin_unlock(&engine->i915->vgpu.shared_page_lock);
+		return;
+	}
+
 	/*
 	 * Switch to our empty preempt context so
 	 * the state of the GPU is known (idle).