[1/7] drm/i915: preallocate pdps for 32 bit vgpu
diff mbox

Message ID 1440056724-26976-2-git-send-email-zhiyuan.lv@intel.com
State New
Headers show

Commit Message

Zhiyuan Lv Aug. 20, 2015, 7:45 a.m. UTC
This is based on Mika Kuoppala's patch below:
http://article.gmane.org/gmane.comp.freedesktop.xorg.drivers.intel/61104/match=workaround+hw+preload

This patch preallocates the page directories for 32-bit PPGTT when
i915 runs inside a virtual machine with Intel GVT-g. With this change,
the root pointers in the EXECLIST context always stay the same.

The change is needed for vGPU because Intel GVT-g does page table
shadowing and needs to track all page table changes made by the guest
i915 driver. However, if the guest PPGTT is modified through GPU commands
such as LRI, it is not possible to trap the operations at the right time,
so it is hard to make the shadow PPGTT work correctly.

Shadow PPGTT becomes much simpler with this change. In addition, the
hypervisor can simply prohibit any attempt to modify the PPGTT
through GPU commands, for security.

The function gen8_preallocate_top_level_pdps() in the patch is from
Mika, with only one change: it sets "used_pdpes" to avoid duplicate
allocation later.

Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Signed-off-by: Zhiyuan Lv <zhiyuan.lv@intel.com>
Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_gtt.c | 33 +++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_lrc.c    |  3 ++-
 2 files changed, 35 insertions(+), 1 deletion(-)

Comments

Joonas Lahtinen Aug. 20, 2015, 10:56 a.m. UTC | #1
Hi,

Added Michel and Dave to CC as well so that they notice this, as they
are listed as CC in the patch.

On to, 2015-08-20 at 15:45 +0800, Zhiyuan Lv wrote:
> This is based on Mika Kuoppala's patch below:
> http://article.gmane.org/gmane.comp.freedesktop.xorg.drivers.intel/61
> 104/match=workaround+hw+preload
> 
> The patch will preallocate the page directories for 32-bit PPGTT when
> i915 runs inside a virtual machine with Intel GVT-g. With this 
> change,
> the root pointers in EXECLIST context will always keep the same.
> 
> The change is needed for vGPU because Intel GVT-g will do page table
> shadowing, and needs to track all the page table changes from guest
> i915 driver. However, if guest PPGTT is modified through GPU commands
> like LRI, it is not possible to trap the operations in the right 
> time,
> so it will be hard to make shadow PPGTT to work correctly.
> 
> Shadow PPGTT could be much simpler with this change. Meanwhile
> hypervisor could simply prohibit any attempt of PPGTT modification
> through GPU command for security.
> 
> The function gen8_preallocate_top_level_pdps() in the patch is from
> Mika, with only one change to set "used_pdpes" to avoid duplicated
> allocation later.
> 
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> Cc: Dave Gordon <david.s.gordon@intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
> Signed-off-by: Zhiyuan Lv <zhiyuan.lv@intel.com>
> Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>
> 

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

I'm just wondering if it's worth keeping the LRI method of updating the
PDPS at all, for the sake of a couple of KBs per PPGTT, now that we
have an occasional need for making them static. So this patch is R-b:d,
but I'd suggest discussion about removing the LRI update method, and
favoring static PDPS always for 32-bit.

Regards, Joonas

> ---
>  drivers/gpu/drm/i915/i915_gem_gtt.c | 33 
> +++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_lrc.c    |  3 ++-
>  2 files changed, 35 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c 
> b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 4a76807..ed10e77 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -1441,6 +1441,33 @@ static void gen8_dump_ppgtt(struct 
> i915_hw_ppgtt *ppgtt, struct seq_file *m)
>  	}
>  }
>  
> +static int gen8_preallocate_top_level_pdps(struct i915_hw_ppgtt 
> *ppgtt)
> +{
> +	unsigned long *new_page_dirs, **new_page_tables;
> +	uint32_t pdpes = I915_PDPES_PER_PDP(dev);
> +	int ret;
> +
> +	/* We allocate temp bitmap for page tables for no gain
> +	 * but as this is for init only, lets keep the things simple
> +	 */
> +	ret = alloc_gen8_temp_bitmaps(&new_page_dirs, 
> &new_page_tables, pdpes);
> +	if (ret)
> +		return ret;
> +
> +	/* Allocate for all pdps regardless of how the ppgtt
> +	 * was defined.
> +	 */
> +	ret = gen8_ppgtt_alloc_page_directories(&ppgtt->base, &ppgtt
> ->pdp,
> +						0, 1ULL << 32,
> +						new_page_dirs);
> +	if (!ret)
> +		*ppgtt->pdp.used_pdpes = *new_page_dirs;
> +
> +	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, 
> pdpes);
> +
> +	return ret;
> +}
> +
>  /*
>   * GEN8 legacy ppgtt programming is accomplished through a max 4 PDP 
> registers
>   * with a net effect resembling a 2-level page table in normal x86 
> terms. Each
> @@ -1484,6 +1511,12 @@ static int gen8_ppgtt_init(struct 
> i915_hw_ppgtt *ppgtt)
>  		trace_i915_page_directory_pointer_entry_alloc(&ppgtt
> ->base,
>  							      0, 0,
>  							     
>  GEN8_PML4E_SHIFT);
> +
> +		if (intel_vgpu_active(ppgtt->base.dev)) {
> +			ret = 
> gen8_preallocate_top_level_pdps(ppgtt);
> +			if (ret)
> +				goto free_scratch;
> +		}
>  	}
>  
>  	return 0;
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c 
> b/drivers/gpu/drm/i915/intel_lrc.c
> index e77b6b0..2dc8709 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1540,7 +1540,8 @@ static int gen8_emit_bb_start(struct 
> drm_i915_gem_request *req,
>  	 * not needed in 48-bit.*/
>  	if (req->ctx->ppgtt &&
>  	    (intel_ring_flag(req->ring) & req->ctx->ppgtt
> ->pd_dirty_rings)) {
> -		if (!USES_FULL_48BIT_PPGTT(req->i915)) {
> +		if (!USES_FULL_48BIT_PPGTT(req->i915) &&
> +		    !intel_vgpu_active(req->i915->dev)) {
>  			ret = intel_logical_ring_emit_pdps(req);
>  			if (ret)
>  				return ret;
Mika Kuoppala Aug. 26, 2015, 1:21 p.m. UTC | #2
Joonas Lahtinen <joonas.lahtinen@linux.intel.com> writes:

> Hi,
>
> Added Michel and Dave as CC too, to notice this, as they are specified
> in the patch as CC.
>
> On to, 2015-08-20 at 15:45 +0800, Zhiyuan Lv wrote:
>> This is based on Mika Kuoppala's patch below:
>> http://article.gmane.org/gmane.comp.freedesktop.xorg.drivers.intel/61
>> 104/match=workaround+hw+preload
>> 
>> The patch will preallocate the page directories for 32-bit PPGTT when
>> i915 runs inside a virtual machine with Intel GVT-g. With this 
>> change,
>> the root pointers in EXECLIST context will always keep the same.
>> 
>> The change is needed for vGPU because Intel GVT-g will do page table
>> shadowing, and needs to track all the page table changes from guest
>> i915 driver. However, if guest PPGTT is modified through GPU commands
>> like LRI, it is not possible to trap the operations in the right 
>> time,
>> so it will be hard to make shadow PPGTT to work correctly.
>> 
>> Shadow PPGTT could be much simpler with this change. Meanwhile
>> hypervisor could simply prohibit any attempt of PPGTT modification
>> through GPU command for security.
>> 
>> The function gen8_preallocate_top_level_pdps() in the patch is from
>> Mika, with only one change to set "used_pdpes" to avoid duplicated
>> allocation later.
>> 
>> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
>> Cc: Dave Gordon <david.s.gordon@intel.com>
>> Cc: Michel Thierry <michel.thierry@intel.com>
>> Signed-off-by: Zhiyuan Lv <zhiyuan.lv@intel.com>
>> Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>
>> 
>
> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
>
> I'm just wondering if it's worth keeping the LRI method of updating the
> PDPS at all, for the sake of a couple of KBs per PPGTT, now that we
> have an occasional need for making them static. So this patch is R-b:d,
> but I'd suggest discussion about removing the LRI update method, and
> favoring static PDPS always for 32-bit.
>

The LRI update doesn't add context creation overhead, but it adds
commands to the command stream whenever the virtual address space grows,
as we flush the dirty status with the LRI update. So I suspect
we flush far more often than we should, and this causes at least
some runtime overhead.

It should be enough to do the LRI update only when the top-level PDPs
change. This could be achieved with more fine-grained dirty
tracking. But as we never shrink the virtual memory, the
number of LRI flushes should diminish, because the client
will eventually reach its maximum allocation pattern.

If we went with static top-level PDPs regardless of
whether vGPU is active, we would waste memory, usually 3
pages' worth, and trade GPU runtime overhead for context
setup overhead (those pages need to be filled to point
to scratch). We would also gain some reduced complexity.

-Mika


> Regards, Joonas
>
>> ---
>>  drivers/gpu/drm/i915/i915_gem_gtt.c | 33 
>> +++++++++++++++++++++++++++++++++
>>  drivers/gpu/drm/i915/intel_lrc.c    |  3 ++-
>>  2 files changed, 35 insertions(+), 1 deletion(-)
>> 
>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c 
>> b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> index 4a76807..ed10e77 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> @@ -1441,6 +1441,33 @@ static void gen8_dump_ppgtt(struct 
>> i915_hw_ppgtt *ppgtt, struct seq_file *m)
>>  	}
>>  }
>>  
>> +static int gen8_preallocate_top_level_pdps(struct i915_hw_ppgtt 
>> *ppgtt)
>> +{
>> +	unsigned long *new_page_dirs, **new_page_tables;
>> +	uint32_t pdpes = I915_PDPES_PER_PDP(dev);
>> +	int ret;
>> +
>> +	/* We allocate temp bitmap for page tables for no gain
>> +	 * but as this is for init only, lets keep the things simple
>> +	 */
>> +	ret = alloc_gen8_temp_bitmaps(&new_page_dirs, 
>> &new_page_tables, pdpes);
>> +	if (ret)
>> +		return ret;
>> +
>> +	/* Allocate for all pdps regardless of how the ppgtt
>> +	 * was defined.
>> +	 */
>> +	ret = gen8_ppgtt_alloc_page_directories(&ppgtt->base, &ppgtt
>> ->pdp,
>> +						0, 1ULL << 32,
>> +						new_page_dirs);
>> +	if (!ret)
>> +		*ppgtt->pdp.used_pdpes = *new_page_dirs;
>> +
>> +	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, 
>> pdpes);
>> +
>> +	return ret;
>> +}
>> +
>>  /*
>>   * GEN8 legacy ppgtt programming is accomplished through a max 4 PDP 
>> registers
>>   * with a net effect resembling a 2-level page table in normal x86 
>> terms. Each
>> @@ -1484,6 +1511,12 @@ static int gen8_ppgtt_init(struct 
>> i915_hw_ppgtt *ppgtt)
>>  		trace_i915_page_directory_pointer_entry_alloc(&ppgtt
>> ->base,
>>  							      0, 0,
>>  							     
>>  GEN8_PML4E_SHIFT);
>> +
>> +		if (intel_vgpu_active(ppgtt->base.dev)) {
>> +			ret = 
>> gen8_preallocate_top_level_pdps(ppgtt);
>> +			if (ret)
>> +				goto free_scratch;
>> +		}
>>  	}
>>  
>>  	return 0;
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c 
>> b/drivers/gpu/drm/i915/intel_lrc.c
>> index e77b6b0..2dc8709 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -1540,7 +1540,8 @@ static int gen8_emit_bb_start(struct 
>> drm_i915_gem_request *req,
>>  	 * not needed in 48-bit.*/
>>  	if (req->ctx->ppgtt &&
>>  	    (intel_ring_flag(req->ring) & req->ctx->ppgtt
>> ->pd_dirty_rings)) {
>> -		if (!USES_FULL_48BIT_PPGTT(req->i915)) {
>> +		if (!USES_FULL_48BIT_PPGTT(req->i915) &&
>> +		    !intel_vgpu_active(req->i915->dev)) {
>>  			ret = intel_logical_ring_emit_pdps(req);
>>  			if (ret)
>>  				return ret;

Patch
diff mbox

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 4a76807..ed10e77 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1441,6 +1441,33 @@  static void gen8_dump_ppgtt(struct i915_hw_ppgtt *ppgtt, struct seq_file *m)
 	}
 }
 
+static int gen8_preallocate_top_level_pdps(struct i915_hw_ppgtt *ppgtt)
+{
+	unsigned long *new_page_dirs, **new_page_tables;
+	uint32_t pdpes = I915_PDPES_PER_PDP(dev);
+	int ret;
+
+	/* We allocate temp bitmap for page tables for no gain
+	 * but as this is for init only, lets keep the things simple
+	 */
+	ret = alloc_gen8_temp_bitmaps(&new_page_dirs, &new_page_tables, pdpes);
+	if (ret)
+		return ret;
+
+	/* Allocate for all pdps regardless of how the ppgtt
+	 * was defined.
+	 */
+	ret = gen8_ppgtt_alloc_page_directories(&ppgtt->base, &ppgtt->pdp,
+						0, 1ULL << 32,
+						new_page_dirs);
+	if (!ret)
+		*ppgtt->pdp.used_pdpes = *new_page_dirs;
+
+	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes);
+
+	return ret;
+}
+
 /*
  * GEN8 legacy ppgtt programming is accomplished through a max 4 PDP registers
  * with a net effect resembling a 2-level page table in normal x86 terms. Each
@@ -1484,6 +1511,12 @@  static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
 		trace_i915_page_directory_pointer_entry_alloc(&ppgtt->base,
 							      0, 0,
 							      GEN8_PML4E_SHIFT);
+
+		if (intel_vgpu_active(ppgtt->base.dev)) {
+			ret = gen8_preallocate_top_level_pdps(ppgtt);
+			if (ret)
+				goto free_scratch;
+		}
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index e77b6b0..2dc8709 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1540,7 +1540,8 @@  static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
 	 * not needed in 48-bit.*/
 	if (req->ctx->ppgtt &&
 	    (intel_ring_flag(req->ring) & req->ctx->ppgtt->pd_dirty_rings)) {
-		if (!USES_FULL_48BIT_PPGTT(req->i915)) {
+		if (!USES_FULL_48BIT_PPGTT(req->i915) &&
+		    !intel_vgpu_active(req->i915->dev)) {
 			ret = intel_logical_ring_emit_pdps(req);
 			if (ret)
 				return ret;