diff mbox series

[RFC,03/10] drm/i915/gvt: context submission pvmmio optimization

Message ID 1538066275-52932-4-git-send-email-xiaolin.zhang@intel.com (mailing list archive)
State New, archived
Headers show
Series i915 pvmmio to improve GVTg performance | expand

Commit Message

Xiaolin Zhang Sept. 27, 2018, 4:37 p.m. UTC
It is performance optimization to reduce mmio trap numbers from 4 to
1 durning ELSP porting writing (context submission).

When context subission, to cache elsp_data[4] values in
the shared page, the last elsp_data[0] port writing will be trapped
to gvt for real context submission.

Use PVMMIO_ELSP_SUBMIT to control this level of pvmmio optimization.

Signed-off-by: Xiaolin Zhang <xiaolin.zhang@intel.com>
---
 drivers/gpu/drm/i915/i915_params.h |  2 +-
 drivers/gpu/drm/i915/intel_lrc.c   | 37 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 37 insertions(+), 2 deletions(-)

Comments

Chris Wilson Sept. 27, 2018, 7:19 a.m. UTC | #1
Quoting Xiaolin Zhang (2018-09-27 17:37:48)
> It is performance optimization to reduce mmio trap numbers from 4 to
> 1 durning ELSP porting writing (context submission).
> 
> When context subission, to cache elsp_data[4] values in
> the shared page, the last elsp_data[0] port writing will be trapped
> to gvt for real context submission.
> 
> Use PVMMIO_ELSP_SUBMIT to control this level of pvmmio optimization.

> Signed-off-by: Xiaolin Zhang <xiaolin.zhang@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_params.h |  2 +-
>  drivers/gpu/drm/i915/intel_lrc.c   | 37 ++++++++++++++++++++++++++++++++++++-
>  2 files changed, 37 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_params.h b/drivers/gpu/drm/i915/i915_params.h
> index 0f6a38b..6c81c87 100644
> --- a/drivers/gpu/drm/i915/i915_params.h
> +++ b/drivers/gpu/drm/i915/i915_params.h
> @@ -69,7 +69,7 @@
>         param(bool, enable_dp_mst, true) \
>         param(bool, enable_dpcd_backlight, false) \
>         param(bool, enable_gvt, false) \
> -       param(int, enable_pvmmio, 0)
> +       param(int, enable_pvmmio, PVMMIO_ELSP_SUBMIT)
>  
>  #define MEMBER(T, member, ...) T member;
>  struct i915_params {
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 4b28225..cdc713c 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -451,7 +451,12 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>  {
>         struct intel_engine_execlists *execlists = &engine->execlists;
>         struct execlist_port *port = execlists->port;
> +       u32 __iomem *elsp =
> +               engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> +       u32 *elsp_data;
>         unsigned int n;
> +       u32 descs[4];
> +       int i = 0;
>  
>         /*
>          * We can skip acquiring intel_runtime_pm_get() here as it was taken
> @@ -494,8 +499,24 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>                         GEM_BUG_ON(!n);
>                         desc = 0;
>                 }
> +               if (PVMMIO_LEVEL_ENABLE(engine->i915, PVMMIO_ELSP_SUBMIT)) {
> +                       GEM_BUG_ON(i >= 4);
> +                       descs[i] = upper_32_bits(desc);
> +                       descs[i + 1] = lower_32_bits(desc);
> +                       i += 2;
> +               } else {
> +                       write_desc(execlists, desc, n);
> +               }
> +       }
>  
> -               write_desc(execlists, desc, n);
> +       if (PVMMIO_LEVEL_ENABLE(engine->i915, PVMMIO_ELSP_SUBMIT)) {
> +               spin_lock(&engine->i915->shared_page_lock);
> +               elsp_data = engine->i915->shared_page->elsp_data;
> +               *elsp_data = descs[0];
> +               *(elsp_data + 1) = descs[1];
> +               *(elsp_data + 2) = descs[2];
> +               writel(descs[3], elsp);
> +               spin_unlock(&engine->i915->shared_page_lock);
>         }
>  
>         /* we need to manually load the submit queue */
> @@ -538,11 +559,25 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
>         struct intel_engine_execlists *execlists = &engine->execlists;
>         struct intel_context *ce =
>                 to_intel_context(engine->i915->preempt_context, engine);
> +       u32 __iomem *elsp =
> +               engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> +       u32 *elsp_data;
>         unsigned int n;
>  
>         GEM_BUG_ON(execlists->preempt_complete_status !=
>                    upper_32_bits(ce->lrc_desc));
>  
> +       if (PVMMIO_LEVEL_ENABLE(engine->i915, PVMMIO_ELSP_SUBMIT)) {
> +               spin_lock(&engine->i915->shared_page_lock);
> +               elsp_data = engine->i915->shared_page->elsp_data;
> +               *elsp_data = 0;
> +               *(elsp_data + 1) = 0;
> +               *(elsp_data + 2) = upper_32_bits(ce->lrc_desc);
> +               writel(lower_32_bits(ce->lrc_desc), elsp);
> +               spin_unlock(&engine->i915->shared_page_lock);
> +               return;
> +       }
> +

Really?
-Chris
Joonas Lahtinen Sept. 27, 2018, 11:13 a.m. UTC | #2
Quoting Xiaolin Zhang (2018-09-27 19:37:48)
> It is performance optimization to reduce mmio trap numbers from 4 to
> 1 durning ELSP porting writing (context submission).
> 
> When context subission, to cache elsp_data[4] values in
> the shared page, the last elsp_data[0] port writing will be trapped
> to gvt for real context submission.
> 
> Use PVMMIO_ELSP_SUBMIT to control this level of pvmmio optimization.
> 
> Signed-off-by: Xiaolin Zhang <xiaolin.zhang@intel.com>

These changes are somewhat invasive to the ELSP submit path.

We should instead look into providing "pvgpu" (just a suggestion for
name) flavour of virtual functions which do the para-virtualized talking.
We don't want to sprinkle pv code around the driver with ifs.

Regards, Joonas
Xiaolin Zhang Sept. 28, 2018, 5:31 a.m. UTC | #3
On 09/27/2018 03:21 PM, Chris Wilson wrote:
> Quoting Xiaolin Zhang (2018-09-27 17:37:48)
>> It is performance optimization to reduce mmio trap numbers from 4 to
>> 1 durning ELSP porting writing (context submission).
>>
>> When context subission, to cache elsp_data[4] values in
>> the shared page, the last elsp_data[0] port writing will be trapped
>> to gvt for real context submission.
>>
>> Use PVMMIO_ELSP_SUBMIT to control this level of pvmmio optimization.
>> Signed-off-by: Xiaolin Zhang <xiaolin.zhang@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_params.h |  2 +-
>>  drivers/gpu/drm/i915/intel_lrc.c   | 37 ++++++++++++++++++++++++++++++++++++-
>>  2 files changed, 37 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_params.h b/drivers/gpu/drm/i915/i915_params.h
>> index 0f6a38b..6c81c87 100644
>> --- a/drivers/gpu/drm/i915/i915_params.h
>> +++ b/drivers/gpu/drm/i915/i915_params.h
>> @@ -69,7 +69,7 @@
>>         param(bool, enable_dp_mst, true) \
>>         param(bool, enable_dpcd_backlight, false) \
>>         param(bool, enable_gvt, false) \
>> -       param(int, enable_pvmmio, 0)
>> +       param(int, enable_pvmmio, PVMMIO_ELSP_SUBMIT)
>>  
>>  #define MEMBER(T, member, ...) T member;
>>  struct i915_params {
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index 4b28225..cdc713c 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -451,7 +451,12 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>>  {
>>         struct intel_engine_execlists *execlists = &engine->execlists;
>>         struct execlist_port *port = execlists->port;
>> +       u32 __iomem *elsp =
>> +               engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
>> +       u32 *elsp_data;
>>         unsigned int n;
>> +       u32 descs[4];
>> +       int i = 0;
>>  
>>         /*
>>          * We can skip acquiring intel_runtime_pm_get() here as it was taken
>> @@ -494,8 +499,24 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>>                         GEM_BUG_ON(!n);
>>                         desc = 0;
>>                 }
>> +               if (PVMMIO_LEVEL_ENABLE(engine->i915, PVMMIO_ELSP_SUBMIT)) {
>> +                       GEM_BUG_ON(i >= 4);
>> +                       descs[i] = upper_32_bits(desc);
>> +                       descs[i + 1] = lower_32_bits(desc);
>> +                       i += 2;
>> +               } else {
>> +                       write_desc(execlists, desc, n);
>> +               }
>> +       }
>>  
>> -               write_desc(execlists, desc, n);
>> +       if (PVMMIO_LEVEL_ENABLE(engine->i915, PVMMIO_ELSP_SUBMIT)) {
>> +               spin_lock(&engine->i915->shared_page_lock);
>> +               elsp_data = engine->i915->shared_page->elsp_data;
>> +               *elsp_data = descs[0];
>> +               *(elsp_data + 1) = descs[1];
>> +               *(elsp_data + 2) = descs[2];
>> +               writel(descs[3], elsp);
>> +               spin_unlock(&engine->i915->shared_page_lock);
>>         }
>>  
>>         /* we need to manually load the submit queue */
>> @@ -538,11 +559,25 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
>>         struct intel_engine_execlists *execlists = &engine->execlists;
>>         struct intel_context *ce =
>>                 to_intel_context(engine->i915->preempt_context, engine);
>> +       u32 __iomem *elsp =
>> +               engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
>> +       u32 *elsp_data;
>>         unsigned int n;
>>  
>>         GEM_BUG_ON(execlists->preempt_complete_status !=
>>                    upper_32_bits(ce->lrc_desc));
>>  
>> +       if (PVMMIO_LEVEL_ENABLE(engine->i915, PVMMIO_ELSP_SUBMIT)) {
>> +               spin_lock(&engine->i915->shared_page_lock);
>> +               elsp_data = engine->i915->shared_page->elsp_data;
>> +               *elsp_data = 0;
>> +               *(elsp_data + 1) = 0;
>> +               *(elsp_data + 2) = upper_32_bits(ce->lrc_desc);
>> +               writel(lower_32_bits(ce->lrc_desc), elsp);
>> +               spin_unlock(&engine->i915->shared_page_lock);
>> +               return;
>> +       }
>> +
> Really?
> -Chris
in normal case,  write_desc will update port[0] & port[1] descriptor and
in total 4 mmio will be accessed and trapped.
in PVMMIO_ELSP_SUBMIT case,  cache descs[0] ~ descs[4] to elsp_data[4]
in shared_page, just use one mmio access
"writel(descs[3], elsp)" to trap to GVTg for workload submission and in
GVGT the desc[0]~desc[2] will be read from shared_page.
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_params.h b/drivers/gpu/drm/i915/i915_params.h
index 0f6a38b..6c81c87 100644
--- a/drivers/gpu/drm/i915/i915_params.h
+++ b/drivers/gpu/drm/i915/i915_params.h
@@ -69,7 +69,7 @@ 
 	param(bool, enable_dp_mst, true) \
 	param(bool, enable_dpcd_backlight, false) \
 	param(bool, enable_gvt, false) \
-	param(int, enable_pvmmio, 0)
+	param(int, enable_pvmmio, PVMMIO_ELSP_SUBMIT)
 
 #define MEMBER(T, member, ...) T member;
 struct i915_params {
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 4b28225..cdc713c 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -451,7 +451,12 @@  static void execlists_submit_ports(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists *execlists = &engine->execlists;
 	struct execlist_port *port = execlists->port;
+	u32 __iomem *elsp =
+		engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+	u32 *elsp_data;
 	unsigned int n;
+	u32 descs[4];
+	int i = 0;
 
 	/*
 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
@@ -494,8 +499,24 @@  static void execlists_submit_ports(struct intel_engine_cs *engine)
 			GEM_BUG_ON(!n);
 			desc = 0;
 		}
+		if (PVMMIO_LEVEL_ENABLE(engine->i915, PVMMIO_ELSP_SUBMIT)) {
+			GEM_BUG_ON(i >= 4);
+			descs[i] = upper_32_bits(desc);
+			descs[i + 1] = lower_32_bits(desc);
+			i += 2;
+		} else {
+			write_desc(execlists, desc, n);
+		}
+	}
 
-		write_desc(execlists, desc, n);
+	if (PVMMIO_LEVEL_ENABLE(engine->i915, PVMMIO_ELSP_SUBMIT)) {
+		spin_lock(&engine->i915->shared_page_lock);
+		elsp_data = engine->i915->shared_page->elsp_data;
+		*elsp_data = descs[0];
+		*(elsp_data + 1) = descs[1];
+		*(elsp_data + 2) = descs[2];
+		writel(descs[3], elsp);
+		spin_unlock(&engine->i915->shared_page_lock);
 	}
 
 	/* we need to manually load the submit queue */
@@ -538,11 +559,25 @@  static void inject_preempt_context(struct intel_engine_cs *engine)
 	struct intel_engine_execlists *execlists = &engine->execlists;
 	struct intel_context *ce =
 		to_intel_context(engine->i915->preempt_context, engine);
+	u32 __iomem *elsp =
+		engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+	u32 *elsp_data;
 	unsigned int n;
 
 	GEM_BUG_ON(execlists->preempt_complete_status !=
 		   upper_32_bits(ce->lrc_desc));
 
+	if (PVMMIO_LEVEL_ENABLE(engine->i915, PVMMIO_ELSP_SUBMIT)) {
+		spin_lock(&engine->i915->shared_page_lock);
+		elsp_data = engine->i915->shared_page->elsp_data;
+		*elsp_data = 0;
+		*(elsp_data + 1) = 0;
+		*(elsp_data + 2) = upper_32_bits(ce->lrc_desc);
+		writel(lower_32_bits(ce->lrc_desc), elsp);
+		spin_unlock(&engine->i915->shared_page_lock);
+		return;
+	}
+
 	/*
 	 * Switch to our empty preempt context so
 	 * the state of the GPU is known (idle).