
[02/21] drm/i915/gtt: Workaround for HW preload not flushing pdps

Message ID 1432314314-23530-3-git-send-email-mika.kuoppala@intel.com (mailing list archive)
State New, archived

Commit Message

Mika Kuoppala May 22, 2015, 5:04 p.m. UTC
With BDW/SKL and 32bit addressing mode only, the hardware preloads
pdps. However the TLB invalidation only has effect on levels below
the pdps. This means that if pdps change, hw might access with
stale pdp entry.

To combat this problem, preallocate the top pdps so that hw sees
them as immutable for each context.

Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Rafael Barbalho <rafael.barbalho@intel.com>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_gtt.c | 50 +++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_reg.h     | 17 +++++++++++++
 drivers/gpu/drm/i915/intel_lrc.c    | 15 +----------
 3 files changed, 68 insertions(+), 14 deletions(-)

Comments

Michel Thierry May 29, 2015, 11:05 a.m. UTC | #1
On 5/22/2015 6:04 PM, Mika Kuoppala wrote:
> With BDW/SKL and 32bit addressing mode only, the hardware preloads
> pdps. However the TLB invalidation only has effect on levels below
> the pdps. This means that if pdps change, hw might access with
> stale pdp entry.
>
> To combat this problem, preallocate the top pdps so that hw sees
> them as immutable for each context.
>
> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> Cc: Rafael Barbalho <rafael.barbalho@intel.com>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem_gtt.c | 50 +++++++++++++++++++++++++++++++++++++
>   drivers/gpu/drm/i915/i915_reg.h     | 17 +++++++++++++
>   drivers/gpu/drm/i915/intel_lrc.c    | 15 +----------
>   3 files changed, 68 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 0ffd459..1a5ad4c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -941,6 +941,48 @@ err_out:
>          return ret;
>   }
>
> +/* With some architectures and 32bit legacy mode, hardware pre-loads the
> + * top level pdps but the tlb invalidation only invalidates the lower levels.
> + * This might lead to hw fetching with stale pdp entries if top level
> + * structure changes, ie va space grows with dynamic page tables.
> + */
> +static bool hw_wont_flush_pdp_tlbs(struct i915_hw_ppgtt *ppgtt)
> +{
> +       struct drm_device *dev = ppgtt->base.dev;
> +
> +       if (GEN8_CTX_ADDRESSING_MODE != LEGACY_32B_CONTEXT)
> +               return false;
> +
> +       if (IS_BROADWELL(dev) || IS_SKYLAKE(dev))
> +               return true;
The pd load restriction is also true for chv and bxt.
And to be safe, we can set reg 0x4030 bit14 to '1' (PD load disable). 
Since this register is not part of the context state, it can be added 
with the other platform workarounds in intel_pm.c.
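
For illustration, such a write could sit with the other per-platform
workarounds roughly like this (a sketch only: the register and bit names
below are invented, only the 0x4030 offset and bit 14 come from the note
above, and dev_priv is assumed to be in scope as in the existing workaround
helpers):

	#define GEN8_PD_CTL			0x4030		/* hypothetical name */
	#define   GEN8_PD_LOAD_DISABLE		(1 << 14)	/* "PD load disable" */

	/* e.g. in the bdw/chv/skl/bxt workaround or clock gating init paths */
	I915_WRITE(GEN8_PD_CTL,
		   I915_READ(GEN8_PD_CTL) | GEN8_PD_LOAD_DISABLE);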

> +
> +       return false;
> +}
> +
> +static int gen8_preallocate_top_level_pdps(struct i915_hw_ppgtt *ppgtt)
> +{
> +       unsigned long *new_page_dirs, **new_page_tables;
> +       int ret;
> +
> +       /* We allocate temp bitmap for page tables for no gain
> +        * but as this is for init only, lets keep the things simple
> +        */
> +       ret = alloc_gen8_temp_bitmaps(&new_page_dirs, &new_page_tables);
> +       if (ret)
> +               return ret;
> +
> +       /* Allocate for all pdps regardless of how the ppgtt
> +        * was defined.
> +        */
> +       ret = gen8_ppgtt_alloc_page_directories(ppgtt, &ppgtt->pdp,
> +                                               0, 1ULL << 32,
> +                                               new_page_dirs);
> +
> +       free_gen8_temp_bitmaps(new_page_dirs, new_page_tables);
> +
> +       return ret;
> +}
> +
>   /*
>    * GEN8 legacy ppgtt programming is accomplished through a max 4 PDP registers
>    * with a net effect resembling a 2-level page table in normal x86 terms. Each
> @@ -972,6 +1014,14 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
>
>          ppgtt->switch_mm = gen8_mm_switch;
>
> +       if (hw_wont_flush_pdp_tlbs(ppgtt)) {
> +               /* Avoid the tlb flush bug by preallocating
> +                * whole top level pdp structure so it stays
> +                * static even if our va space grows.
> +                */
> +               return gen8_preallocate_top_level_pdps(ppgtt);
> +       }
> +
>          return 0;
>   }
>
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index 6eeba63..334324b 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -2777,6 +2777,23 @@ enum skl_disp_power_wells {
>   #define VLV_CLK_CTL2                   0x101104
>   #define   CLK_CTL2_CZCOUNT_30NS_SHIFT  28
>
> +/* Context descriptor format bits */
> +#define GEN8_CTX_VALID                 (1<<0)
> +#define GEN8_CTX_FORCE_PD_RESTORE      (1<<1)
> +#define GEN8_CTX_FORCE_RESTORE         (1<<2)
> +#define GEN8_CTX_L3LLC_COHERENT                (1<<5)
> +#define GEN8_CTX_PRIVILEGE             (1<<8)
> +
> +enum {
> +       ADVANCED_CONTEXT = 0,
> +       LEGACY_32B_CONTEXT,
> +       ADVANCED_AD_CONTEXT,
> +       LEGACY_64B_CONTEXT
> +};
> +
> +#define GEN8_CTX_ADDRESSING_MODE_SHIFT 3
> +#define GEN8_CTX_ADDRESSING_MODE       LEGACY_32B_CONTEXT
> +
>   /*
>    * Overlay regs
>    */
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 96ae90a..d793d4e 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -183,12 +183,6 @@
>   #define CTX_R_PWR_CLK_STATE            0x42
>   #define CTX_GPGPU_CSR_BASE_ADDRESS     0x44
>
> -#define GEN8_CTX_VALID (1<<0)
> -#define GEN8_CTX_FORCE_PD_RESTORE (1<<1)
> -#define GEN8_CTX_FORCE_RESTORE (1<<2)
> -#define GEN8_CTX_L3LLC_COHERENT (1<<5)
> -#define GEN8_CTX_PRIVILEGE (1<<8)
> -
>   #define ASSIGN_CTX_PDP(ppgtt, reg_state, n) { \
>          const u64 _addr = test_bit(n, ppgtt->pdp.used_pdpes) ? \
>                  ppgtt->pdp.page_directory[n]->daddr : \
> @@ -198,13 +192,6 @@
>   }
>
>   enum {
> -       ADVANCED_CONTEXT = 0,
> -       LEGACY_CONTEXT,
> -       ADVANCED_AD_CONTEXT,
> -       LEGACY_64B_CONTEXT
> -};
> -#define GEN8_CTX_MODE_SHIFT 3
> -enum {
>          FAULT_AND_HANG = 0,
>          FAULT_AND_HALT, /* Debug only */
>          FAULT_AND_STREAM,
> @@ -273,7 +260,7 @@ static uint64_t execlists_ctx_descriptor(struct intel_engine_cs *ring,
>          WARN_ON(lrca & 0xFFFFFFFF00000FFFULL);
>
>          desc = GEN8_CTX_VALID;
> -       desc |= LEGACY_CONTEXT << GEN8_CTX_MODE_SHIFT;
> +       desc |= GEN8_CTX_ADDRESSING_MODE << GEN8_CTX_ADDRESSING_MODE_SHIFT;
>          if (IS_GEN8(ctx_obj->base.dev))
>                  desc |= GEN8_CTX_L3LLC_COHERENT;
>          desc |= GEN8_CTX_PRIVILEGE;
> --
> 1.9.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Michel Thierry May 29, 2015, 12:53 p.m. UTC | #2
On 5/29/2015 12:05 PM, Michel Thierry wrote:
> On 5/22/2015 6:04 PM, Mika Kuoppala wrote:
>> With BDW/SKL and 32bit addressing mode only, the hardware preloads
>> pdps. However the TLB invalidation only has effect on levels below
>> the pdps. This means that if pdps change, hw might access with
>> stale pdp entry.
>>
>> To combat this problem, preallocate the top pdps so that hw sees
>> them as immutable for each context.
>>
>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
>> Cc: Rafael Barbalho <rafael.barbalho@intel.com>
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_gem_gtt.c | 50 
>> +++++++++++++++++++++++++++++++++++++
>>   drivers/gpu/drm/i915/i915_reg.h     | 17 +++++++++++++
>>   drivers/gpu/drm/i915/intel_lrc.c    | 15 +----------
>>   3 files changed, 68 insertions(+), 14 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c 
>> b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> index 0ffd459..1a5ad4c 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> @@ -941,6 +941,48 @@ err_out:
>>          return ret;
>>   }
>>
>> +/* With some architectures and 32bit legacy mode, hardware pre-loads 
>> the
>> + * top level pdps but the tlb invalidation only invalidates the 
>> lower levels.
>> + * This might lead to hw fetching with stale pdp entries if top level
>> + * structure changes, ie va space grows with dynamic page tables.
>> + */
>> +static bool hw_wont_flush_pdp_tlbs(struct i915_hw_ppgtt *ppgtt)
>> +{
>> +       struct drm_device *dev = ppgtt->base.dev;
>> +
>> +       if (GEN8_CTX_ADDRESSING_MODE != LEGACY_32B_CONTEXT)
>> +               return false;
>> +
>> +       if (IS_BROADWELL(dev) || IS_SKYLAKE(dev))
>> +               return true;
> The pd load restriction is also true for chv and bxt.
> And to be safe, we can set reg 0x4030 bit14 to '1' (PD load disable). 
> Since this register is not part of the context state, it can be added 
> with the other platform workarounds in intel_pm.c.
>
>> +
>> +       return false;
>> +}
>> +
>> +static int gen8_preallocate_top_level_pdps(struct i915_hw_ppgtt *ppgtt)
>> +{
>> +       unsigned long *new_page_dirs, **new_page_tables;
>> +       int ret;
>> +
>> +       /* We allocate temp bitmap for page tables for no gain
>> +        * but as this is for init only, lets keep the things simple
>> +        */
>> +       ret = alloc_gen8_temp_bitmaps(&new_page_dirs, &new_page_tables);
>> +       if (ret)
>> +               return ret;
>> +
>> +       /* Allocate for all pdps regardless of how the ppgtt
>> +        * was defined.
>> +        */
>> +       ret = gen8_ppgtt_alloc_page_directories(ppgtt, &ppgtt->pdp,
>> +                                               0, 1ULL << 32,
>> +                                               new_page_dirs);
>> +
>> +       free_gen8_temp_bitmaps(new_page_dirs, new_page_tables);
>> +
>> +       return ret;
>> +}
>> +
>>   /*
>>    * GEN8 legacy ppgtt programming is accomplished through a max 4 
>> PDP registers
>>    * with a net effect resembling a 2-level page table in normal x86 
>> terms. Each
>> @@ -972,6 +1014,14 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt 
>> *ppgtt)
>>
>>          ppgtt->switch_mm = gen8_mm_switch;
>>
>> +       if (hw_wont_flush_pdp_tlbs(ppgtt)) {
>> +               /* Avoid the tlb flush bug by preallocating
>> +                * whole top level pdp structure so it stays
>> +                * static even if our va space grows.
>> +                */
>> +               return gen8_preallocate_top_level_pdps(ppgtt);
>> +       }
>> +
Also, we will need the same hw_wont_flush check in the cleanup function,
and to iterate over each pdpe (pd) from 0 to 4GiB there (otherwise we will
leak some of the preallocated page dirs).
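
Something along these lines in the gen8 cleanup path would do it (just a
sketch; free_pd_and_tables() is a placeholder for whatever helper the
dynamic page table code uses to release a page directory and its page
tables):

	if (hw_wont_flush_pdp_tlbs(ppgtt)) {
		int i;

		/* Preallocation covered the whole 4GiB range, not just the
		 * entries marked in used_pdpes, so walk all of them. */
		for (i = 0; i < GEN8_LEGACY_PDPES; i++) {
			if (ppgtt->pdp.page_directory[i])
				free_pd_and_tables(ppgtt->pdp.page_directory[i],
						   ppgtt->base.dev);
		}
	}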

>>          return 0;
>>   }
>>
>> diff --git a/drivers/gpu/drm/i915/i915_reg.h 
>> b/drivers/gpu/drm/i915/i915_reg.h
>> index 6eeba63..334324b 100644
>> --- a/drivers/gpu/drm/i915/i915_reg.h
>> +++ b/drivers/gpu/drm/i915/i915_reg.h
>> @@ -2777,6 +2777,23 @@ enum skl_disp_power_wells {
>>   #define VLV_CLK_CTL2                   0x101104
>>   #define   CLK_CTL2_CZCOUNT_30NS_SHIFT  28
>>
>> +/* Context descriptor format bits */
>> +#define GEN8_CTX_VALID                 (1<<0)
>> +#define GEN8_CTX_FORCE_PD_RESTORE      (1<<1)
>> +#define GEN8_CTX_FORCE_RESTORE         (1<<2)
>> +#define GEN8_CTX_L3LLC_COHERENT                (1<<5)
>> +#define GEN8_CTX_PRIVILEGE             (1<<8)
>> +
>> +enum {
>> +       ADVANCED_CONTEXT = 0,
>> +       LEGACY_32B_CONTEXT,
>> +       ADVANCED_AD_CONTEXT,
>> +       LEGACY_64B_CONTEXT
>> +};
>> +
>> +#define GEN8_CTX_ADDRESSING_MODE_SHIFT 3
>> +#define GEN8_CTX_ADDRESSING_MODE       LEGACY_32B_CONTEXT
>> +
>>   /*
>>    * Overlay regs
>>    */
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c 
>> b/drivers/gpu/drm/i915/intel_lrc.c
>> index 96ae90a..d793d4e 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -183,12 +183,6 @@
>>   #define CTX_R_PWR_CLK_STATE            0x42
>>   #define CTX_GPGPU_CSR_BASE_ADDRESS     0x44
>>
>> -#define GEN8_CTX_VALID (1<<0)
>> -#define GEN8_CTX_FORCE_PD_RESTORE (1<<1)
>> -#define GEN8_CTX_FORCE_RESTORE (1<<2)
>> -#define GEN8_CTX_L3LLC_COHERENT (1<<5)
>> -#define GEN8_CTX_PRIVILEGE (1<<8)
>> -
>>   #define ASSIGN_CTX_PDP(ppgtt, reg_state, n) { \
>>          const u64 _addr = test_bit(n, ppgtt->pdp.used_pdpes) ? \
>>                  ppgtt->pdp.page_directory[n]->daddr : \
>> @@ -198,13 +192,6 @@
>>   }
>>
>>   enum {
>> -       ADVANCED_CONTEXT = 0,
>> -       LEGACY_CONTEXT,
>> -       ADVANCED_AD_CONTEXT,
>> -       LEGACY_64B_CONTEXT
>> -};
>> -#define GEN8_CTX_MODE_SHIFT 3
>> -enum {
>>          FAULT_AND_HANG = 0,
>>          FAULT_AND_HALT, /* Debug only */
>>          FAULT_AND_STREAM,
>> @@ -273,7 +260,7 @@ static uint64_t execlists_ctx_descriptor(struct 
>> intel_engine_cs *ring,
>>          WARN_ON(lrca & 0xFFFFFFFF00000FFFULL);
>>
>>          desc = GEN8_CTX_VALID;
>> -       desc |= LEGACY_CONTEXT << GEN8_CTX_MODE_SHIFT;
>> +       desc |= GEN8_CTX_ADDRESSING_MODE << 
>> GEN8_CTX_ADDRESSING_MODE_SHIFT;
>>          if (IS_GEN8(ctx_obj->base.dev))
>>                  desc |= GEN8_CTX_L3LLC_COHERENT;
>>          desc |= GEN8_CTX_PRIVILEGE;
>> -- 
>> 1.9.1
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Michel Thierry June 10, 2015, 11:42 a.m. UTC | #3
On 5/29/2015 1:53 PM, Michel Thierry wrote:
> On 5/29/2015 12:05 PM, Michel Thierry wrote:
>> On 5/22/2015 6:04 PM, Mika Kuoppala wrote:
>>> With BDW/SKL and 32bit addressing mode only, the hardware preloads
>>> pdps. However the TLB invalidation only has effect on levels below
>>> the pdps. This means that if pdps change, hw might access with
>>> stale pdp entry.
>>>
>>> To combat this problem, preallocate the top pdps so that hw sees
>>> them as immutable for each context.
>>>
>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
>>> Cc: Rafael Barbalho <rafael.barbalho@intel.com>
>>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/i915_gem_gtt.c | 50
>>> +++++++++++++++++++++++++++++++++++++
>>>   drivers/gpu/drm/i915/i915_reg.h     | 17 +++++++++++++
>>>   drivers/gpu/drm/i915/intel_lrc.c    | 15 +----------
>>>   3 files changed, 68 insertions(+), 14 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>> b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>> index 0ffd459..1a5ad4c 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>> @@ -941,6 +941,48 @@ err_out:
>>>          return ret;
>>>   }
>>>
>>> +/* With some architectures and 32bit legacy mode, hardware pre-loads
>>> the
>>> + * top level pdps but the tlb invalidation only invalidates the
>>> lower levels.
>>> + * This might lead to hw fetching with stale pdp entries if top level
>>> + * structure changes, ie va space grows with dynamic page tables.
>>> + */
>>> +static bool hw_wont_flush_pdp_tlbs(struct i915_hw_ppgtt *ppgtt)
>>> +{
>>> +       struct drm_device *dev = ppgtt->base.dev;
>>> +
>>> +       if (GEN8_CTX_ADDRESSING_MODE != LEGACY_32B_CONTEXT)
>>> +               return false;
>>> +
>>> +       if (IS_BROADWELL(dev) || IS_SKYLAKE(dev))
>>> +               return true;
>> The pd load restriction is also true for chv and bxt.
>> And to be safe, we can set reg 0x4030 bit14 to '1' (PD load disable).
>> Since this register is not part of the context state, it can be added
>> with the other platform workarounds in intel_pm.c.
>>
>>> +
>>> +       return false;
>>> +}
>>> +
>>> +static int gen8_preallocate_top_level_pdps(struct i915_hw_ppgtt *ppgtt)
>>> +{
>>> +       unsigned long *new_page_dirs, **new_page_tables;
>>> +       int ret;
>>> +
>>> +       /* We allocate temp bitmap for page tables for no gain
>>> +        * but as this is for init only, lets keep the things simple
>>> +        */
>>> +       ret = alloc_gen8_temp_bitmaps(&new_page_dirs, &new_page_tables);
>>> +       if (ret)
>>> +               return ret;
>>> +
>>> +       /* Allocate for all pdps regardless of how the ppgtt
>>> +        * was defined.
>>> +        */
>>> +       ret = gen8_ppgtt_alloc_page_directories(ppgtt, &ppgtt->pdp,
>>> +                                               0, 1ULL << 32,
>>> +                                               new_page_dirs);
>>> +
>>> +       free_gen8_temp_bitmaps(new_page_dirs, new_page_tables);

On second thought, just set the used_pdpes bits instead, and then the
existing cleanup function will free these pdps correctly:

+    /* mark all pdps as used, otherwise won't clean them correctly */
+    bitmap_fill(ppgtt->pdp.used_pdpes, GEN8_LEGACY_PDPES);

>>> +
>>> +       return ret;
>>> +}
>>> +
>>>   /*
>>>    * GEN8 legacy ppgtt programming is accomplished through a max 4
>>> PDP registers
>>>    * with a net effect resembling a 2-level page table in normal x86
>>> terms. Each
>>> @@ -972,6 +1014,14 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt
>>> *ppgtt)
>>>
>>>          ppgtt->switch_mm = gen8_mm_switch;
>>>
>>> +       if (hw_wont_flush_pdp_tlbs(ppgtt)) {
>>> +               /* Avoid the tlb flush bug by preallocating
>>> +                * whole top level pdp structure so it stays
>>> +                * static even if our va space grows.
>>> +                */
>>> +               return gen8_preallocate_top_level_pdps(ppgtt);
>>> +       }
>>> +
> Also, we will need the same hw_wont_flush check in the cleanup function,
> and iterate each_pdpe (pd) from 0 to 4GiB (otherwise we will leak some
> of the preallocated page dirs).
>
>>>          return 0;
>>>   }
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_reg.h
>>> b/drivers/gpu/drm/i915/i915_reg.h
>>> index 6eeba63..334324b 100644
>>> --- a/drivers/gpu/drm/i915/i915_reg.h
>>> +++ b/drivers/gpu/drm/i915/i915_reg.h
>>> @@ -2777,6 +2777,23 @@ enum skl_disp_power_wells {
>>>   #define VLV_CLK_CTL2                   0x101104
>>>   #define   CLK_CTL2_CZCOUNT_30NS_SHIFT  28
>>>
>>> +/* Context descriptor format bits */
>>> +#define GEN8_CTX_VALID                 (1<<0)
>>> +#define GEN8_CTX_FORCE_PD_RESTORE      (1<<1)
>>> +#define GEN8_CTX_FORCE_RESTORE         (1<<2)
>>> +#define GEN8_CTX_L3LLC_COHERENT                (1<<5)
>>> +#define GEN8_CTX_PRIVILEGE             (1<<8)
>>> +
>>> +enum {
>>> +       ADVANCED_CONTEXT = 0,
>>> +       LEGACY_32B_CONTEXT,
>>> +       ADVANCED_AD_CONTEXT,
>>> +       LEGACY_64B_CONTEXT
>>> +};
>>> +
>>> +#define GEN8_CTX_ADDRESSING_MODE_SHIFT 3
>>> +#define GEN8_CTX_ADDRESSING_MODE       LEGACY_32B_CONTEXT
>>> +
>>>   /*
>>>    * Overlay regs
>>>    */
>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c
>>> b/drivers/gpu/drm/i915/intel_lrc.c
>>> index 96ae90a..d793d4e 100644
>>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>>> @@ -183,12 +183,6 @@
>>>   #define CTX_R_PWR_CLK_STATE            0x42
>>>   #define CTX_GPGPU_CSR_BASE_ADDRESS     0x44
>>>
>>> -#define GEN8_CTX_VALID (1<<0)
>>> -#define GEN8_CTX_FORCE_PD_RESTORE (1<<1)
>>> -#define GEN8_CTX_FORCE_RESTORE (1<<2)
>>> -#define GEN8_CTX_L3LLC_COHERENT (1<<5)
>>> -#define GEN8_CTX_PRIVILEGE (1<<8)
>>> -
>>>   #define ASSIGN_CTX_PDP(ppgtt, reg_state, n) { \
>>>          const u64 _addr = test_bit(n, ppgtt->pdp.used_pdpes) ? \
>>>                  ppgtt->pdp.page_directory[n]->daddr : \
>>> @@ -198,13 +192,6 @@
>>>   }
>>>
>>>   enum {
>>> -       ADVANCED_CONTEXT = 0,
>>> -       LEGACY_CONTEXT,
>>> -       ADVANCED_AD_CONTEXT,
>>> -       LEGACY_64B_CONTEXT
>>> -};
>>> -#define GEN8_CTX_MODE_SHIFT 3
>>> -enum {
>>>          FAULT_AND_HANG = 0,
>>>          FAULT_AND_HALT, /* Debug only */
>>>          FAULT_AND_STREAM,
>>> @@ -273,7 +260,7 @@ static uint64_t execlists_ctx_descriptor(struct
>>> intel_engine_cs *ring,
>>>          WARN_ON(lrca & 0xFFFFFFFF00000FFFULL);
>>>
>>>          desc = GEN8_CTX_VALID;
>>> -       desc |= LEGACY_CONTEXT << GEN8_CTX_MODE_SHIFT;
>>> +       desc |= GEN8_CTX_ADDRESSING_MODE <<
>>> GEN8_CTX_ADDRESSING_MODE_SHIFT;
>>>          if (IS_GEN8(ctx_obj->base.dev))
>>>                  desc |= GEN8_CTX_L3LLC_COHERENT;
>>>          desc |= GEN8_CTX_PRIVILEGE;
>>> --
>>> 1.9.1
>>>
>>> _______________________________________________
>>> Intel-gfx mailing list
>>> Intel-gfx@lists.freedesktop.org
>>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Dave Gordon June 11, 2015, 7:31 a.m. UTC | #4
On 10/06/15 12:42, Michel Thierry wrote:
> On 5/29/2015 1:53 PM, Michel Thierry wrote:
>> On 5/29/2015 12:05 PM, Michel Thierry wrote:
>>> On 5/22/2015 6:04 PM, Mika Kuoppala wrote:
>>>> With BDW/SKL and 32bit addressing mode only, the hardware preloads
>>>> pdps. However the TLB invalidation only has effect on levels below
>>>> the pdps. This means that if pdps change, hw might access with
>>>> stale pdp entry.
>>>>
>>>> To combat this problem, preallocate the top pdps so that hw sees
>>>> them as immutable for each context.
>>>>
>>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
>>>> Cc: Rafael Barbalho <rafael.barbalho@intel.com>
>>>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>>>> ---
>>>>   drivers/gpu/drm/i915/i915_gem_gtt.c | 50
>>>> +++++++++++++++++++++++++++++++++++++
>>>>   drivers/gpu/drm/i915/i915_reg.h     | 17 +++++++++++++
>>>>   drivers/gpu/drm/i915/intel_lrc.c    | 15 +----------
>>>>   3 files changed, 68 insertions(+), 14 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>> b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>> index 0ffd459..1a5ad4c 100644
>>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>> @@ -941,6 +941,48 @@ err_out:
>>>>          return ret;
>>>>   }
>>>>
>>>> +/* With some architectures and 32bit legacy mode, hardware pre-loads
>>>> the
>>>> + * top level pdps but the tlb invalidation only invalidates the
>>>> lower levels.
>>>> + * This might lead to hw fetching with stale pdp entries if top level
>>>> + * structure changes, ie va space grows with dynamic page tables.
>>>> + */

Is this still necessary if we reload PDPs via LRI instructions whenever
the address map has changed? That always (AFAICT) causes sufficient
invalidation, so then we might not need to preallocate at all :)

.Dave.
Michel Thierry June 11, 2015, 10:46 a.m. UTC | #5
On 6/11/2015 8:31 AM, Dave Gordon wrote:
> On 10/06/15 12:42, Michel Thierry wrote:
>> On 5/29/2015 1:53 PM, Michel Thierry wrote:
>>> On 5/29/2015 12:05 PM, Michel Thierry wrote:
>>>> On 5/22/2015 6:04 PM, Mika Kuoppala wrote:
>>>>> With BDW/SKL and 32bit addressing mode only, the hardware preloads
>>>>> pdps. However the TLB invalidation only has effect on levels below
>>>>> the pdps. This means that if pdps change, hw might access with
>>>>> stale pdp entry.
>>>>>
>>>>> To combat this problem, preallocate the top pdps so that hw sees
>>>>> them as immutable for each context.
>>>>>
>>>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
>>>>> Cc: Rafael Barbalho <rafael.barbalho@intel.com>
>>>>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>>>>> ---
>>>>>    drivers/gpu/drm/i915/i915_gem_gtt.c | 50
>>>>> +++++++++++++++++++++++++++++++++++++
>>>>>    drivers/gpu/drm/i915/i915_reg.h     | 17 +++++++++++++
>>>>>    drivers/gpu/drm/i915/intel_lrc.c    | 15 +----------
>>>>>    3 files changed, 68 insertions(+), 14 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>> b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>> index 0ffd459..1a5ad4c 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>> @@ -941,6 +941,48 @@ err_out:
>>>>>           return ret;
>>>>>    }
>>>>>
>>>>> +/* With some architectures and 32bit legacy mode, hardware pre-loads
>>>>> the
>>>>> + * top level pdps but the tlb invalidation only invalidates the
>>>>> lower levels.
>>>>> + * This might lead to hw fetching with stale pdp entries if top level
>>>>> + * structure changes, ie va space grows with dynamic page tables.
>>>>> + */
>
> Is this still necessary if we reload PDPs via LRI instructions whenever
> the address map has changed? That always (AFAICT) causes sufficient
> invalidation, so then we might not need to preallocate at all :)

Correct, if we reload PDPs via LRI [1], the preallocation of top pdps is 
not needed.

[1] 1433954816-13787-2-git-send-email-michel.thierry@intel.com
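
For reference, the reload in [1] boils down to one LRI register-write pair
per PDP, emitted into the ring ahead of the batch. A simplified sketch
(assuming all four pdps are populated, as they are with the preallocation;
ring-space reservation and error handling omitted):

	const int num_lri_cmds = GEN8_LEGACY_PDPES * 2;
	int i;

	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(num_lri_cmds));
	for (i = GEN8_LEGACY_PDPES - 1; i >= 0; i--) {
		const dma_addr_t addr = ppgtt->pdp.page_directory[i]->daddr;

		intel_logical_ring_emit(ringbuf, GEN8_RING_PDP_UDW(ring, i));
		intel_logical_ring_emit(ringbuf, upper_32_bits(addr));
		intel_logical_ring_emit(ringbuf, GEN8_RING_PDP_LDW(ring, i));
		intel_logical_ring_emit(ringbuf, lower_32_bits(addr));
	}
	intel_logical_ring_emit(ringbuf, MI_NOOP);
	intel_logical_ring_advance(ringbuf);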
Mika Kuoppala June 11, 2015, 1:57 p.m. UTC | #6
Dave Gordon <david.s.gordon@intel.com> writes:

> On 10/06/15 12:42, Michel Thierry wrote:
>> On 5/29/2015 1:53 PM, Michel Thierry wrote:
>>> On 5/29/2015 12:05 PM, Michel Thierry wrote:
>>>> On 5/22/2015 6:04 PM, Mika Kuoppala wrote:
>>>>> With BDW/SKL and 32bit addressing mode only, the hardware preloads
>>>>> pdps. However the TLB invalidation only has effect on levels below
>>>>> the pdps. This means that if pdps change, hw might access with
>>>>> stale pdp entry.
>>>>>
>>>>> To combat this problem, preallocate the top pdps so that hw sees
>>>>> them as immutable for each context.
>>>>>
>>>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
>>>>> Cc: Rafael Barbalho <rafael.barbalho@intel.com>
>>>>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>>>>> ---
>>>>>   drivers/gpu/drm/i915/i915_gem_gtt.c | 50
>>>>> +++++++++++++++++++++++++++++++++++++
>>>>>   drivers/gpu/drm/i915/i915_reg.h     | 17 +++++++++++++
>>>>>   drivers/gpu/drm/i915/intel_lrc.c    | 15 +----------
>>>>>   3 files changed, 68 insertions(+), 14 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>> b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>> index 0ffd459..1a5ad4c 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>> @@ -941,6 +941,48 @@ err_out:
>>>>>          return ret;
>>>>>   }
>>>>>
>>>>> +/* With some architectures and 32bit legacy mode, hardware pre-loads
>>>>> the
>>>>> + * top level pdps but the tlb invalidation only invalidates the
>>>>> lower levels.
>>>>> + * This might lead to hw fetching with stale pdp entries if top level
>>>>> + * structure changes, ie va space grows with dynamic page tables.
>>>>> + */
>
> Is this still necessary if we reload PDPs via LRI instructions whenever
> the address map has changed? That always (AFAICT) causes sufficient
> invalidation, so then we might not need to preallocate at all :)
>

LRI reload gets my vote. Please ignore this patch.
-Mika

> .Dave.
Zhiyuan Lv Aug. 11, 2015, 5:05 a.m. UTC | #7
Hi Mika/Dave/Michel,

I saw that the patch using LRI for the root pointer updates has been merged
to drm-intel. When we consider running the i915 driver inside a virtual
machine, e.g. with XenGT, we may still need Mika's patch, used like below:

"
        if (intel_vgpu_active(ppgtt->base.dev))
                gen8_preallocate_top_level_pdps(ppgtt);
"

Could you share with us your opinion? Thanks in advance!

The reason is that the LRI commands make a shadow PPGTT implementation
hard. In XenGT, we construct a shadow page table for each PPGTT in the guest
i915 driver, and then track every guest page table change in order to update
the shadow page table accordingly. The problem with page table updates done
through GPU commands is that they cannot be trapped by the hypervisor to do
the shadow page table update. In XenGT, the only chance we have is the
command scan at context submission, but that is not exactly the right time
to do the shadow page table update.

Mika's patch can address the problem nicely. With the preallocation, the
root pointers in the EXECLIST context will always stay the same. Then we can
treat any attempt to change the guest PPGTT with GPU commands as malicious
behavior. Thanks!

Regards,
-Zhiyuan

On Thu, Jun 11, 2015 at 04:57:42PM +0300, Mika Kuoppala wrote:
> Dave Gordon <david.s.gordon@intel.com> writes:
> 
> > On 10/06/15 12:42, Michel Thierry wrote:
> >> On 5/29/2015 1:53 PM, Michel Thierry wrote:
> >>> On 5/29/2015 12:05 PM, Michel Thierry wrote:
> >>>> On 5/22/2015 6:04 PM, Mika Kuoppala wrote:
> >>>>> With BDW/SKL and 32bit addressing mode only, the hardware preloads
> >>>>> pdps. However the TLB invalidation only has effect on levels below
> >>>>> the pdps. This means that if pdps change, hw might access with
> >>>>> stale pdp entry.
> >>>>>
> >>>>> To combat this problem, preallocate the top pdps so that hw sees
> >>>>> them as immutable for each context.
> >>>>>
> >>>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> >>>>> Cc: Rafael Barbalho <rafael.barbalho@intel.com>
> >>>>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> >>>>> ---
> >>>>>   drivers/gpu/drm/i915/i915_gem_gtt.c | 50
> >>>>> +++++++++++++++++++++++++++++++++++++
> >>>>>   drivers/gpu/drm/i915/i915_reg.h     | 17 +++++++++++++
> >>>>>   drivers/gpu/drm/i915/intel_lrc.c    | 15 +----------
> >>>>>   3 files changed, 68 insertions(+), 14 deletions(-)
> >>>>>
> >>>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>> b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>> index 0ffd459..1a5ad4c 100644
> >>>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>> @@ -941,6 +941,48 @@ err_out:
> >>>>>          return ret;
> >>>>>   }
> >>>>>
> >>>>> +/* With some architectures and 32bit legacy mode, hardware pre-loads
> >>>>> the
> >>>>> + * top level pdps but the tlb invalidation only invalidates the
> >>>>> lower levels.
> >>>>> + * This might lead to hw fetching with stale pdp entries if top level
> >>>>> + * structure changes, ie va space grows with dynamic page tables.
> >>>>> + */
> >
> > Is this still necessary if we reload PDPs via LRI instructions whenever
> > the address map has changed? That always (AFAICT) causes sufficient
> > invalidation, so then we might not need to preallocate at all :)
> >
> 
> LRI reload gets my vote. Please ignore this patch.
> -Mika
> 
> > .Dave.
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Michel Thierry Aug. 12, 2015, 7:56 a.m. UTC | #8
On 8/11/2015 1:05 PM, Zhiyuan Lv wrote:
> Hi Mika/Dave/Michel,
>
> I saw the patch of using LRI for root pointer update has been merged to
> drm-intel. When we consider i915 driver to run inside a virtual machine, e.g.
> with XenGT, we may still need Mika's this patch like below:
>
> "
>          if (intel_vgpu_active(ppgtt->base.dev))
>                  gen8_preallocate_top_level_pdps(ppgtt);
> "
>
> Could you share with us your opinion? Thanks in advance!

Hi Zhiyuan,

The change looks ok to me. If you need to preallocate the PDPs,
gen8_ppgtt_init is the right place to do it. You only need to also add a
similar vgpu_active check to disable the LRI updates (in gen8_emit_bb_start).
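
i.e. something like the below in gen8_emit_bb_start (a rough sketch:
pd_dirty_rings comes from the dirty tracking in the merged series, and
emit_pdps_via_lri() is only a placeholder name for its PDP-emit helper):

	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt;	/* full PPGTT case */

	/* Skip the LRI PDP reload when running as a vGPU guest; rely on the
	 * preallocated (and therefore immutable) top level pdps instead. */
	if (ppgtt && !intel_vgpu_active(ring->dev) &&
	    (intel_ring_flag(ring) & ppgtt->pd_dirty_rings)) {
		ret = emit_pdps_via_lri(ringbuf, ctx);	/* placeholder name */
		if (ret)
			return ret;

		ppgtt->pd_dirty_rings &= ~intel_ring_flag(ring);
	}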

>
> The reason behind is that LRI command will make shadow PPGTT implementation
> hard. In XenGT, we construct shadow page table for each PPGTT in guest i915
> driver, and then track every guest page table change in order to update shadow
> page table accordingly. The problem of page table updates with GPU command is
> that they cannot be trapped by hypervisor to finish the shadow page table
> update work. In XenGT, the only change we have is the command scan in context
> submission. But that is not exactly the right time to do shadow page table
> update.
>
> Mika's patch can address the problem nicely. With the preallocation, the root
> pointers in EXECLIST context will always keep the same. Then we can treat any
> attempt to change guest PPGTT with GPU commands as malicious behavior. Thanks!
>
> Regards,
> -Zhiyuan
>
> On Thu, Jun 11, 2015 at 04:57:42PM +0300, Mika Kuoppala wrote:
>> Dave Gordon <david.s.gordon@intel.com> writes:
>>
>>> On 10/06/15 12:42, Michel Thierry wrote:
>>>> On 5/29/2015 1:53 PM, Michel Thierry wrote:
>>>>> On 5/29/2015 12:05 PM, Michel Thierry wrote:
>>>>>> On 5/22/2015 6:04 PM, Mika Kuoppala wrote:
>>>>>>> With BDW/SKL and 32bit addressing mode only, the hardware preloads
>>>>>>> pdps. However the TLB invalidation only has effect on levels below
>>>>>>> the pdps. This means that if pdps change, hw might access with
>>>>>>> stale pdp entry.
>>>>>>>
>>>>>>> To combat this problem, preallocate the top pdps so that hw sees
>>>>>>> them as immutable for each context.
>>>>>>>
>>>>>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
>>>>>>> Cc: Rafael Barbalho <rafael.barbalho@intel.com>
>>>>>>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>>>>>>> ---
>>>>>>>    drivers/gpu/drm/i915/i915_gem_gtt.c | 50
>>>>>>> +++++++++++++++++++++++++++++++++++++
>>>>>>>    drivers/gpu/drm/i915/i915_reg.h     | 17 +++++++++++++
>>>>>>>    drivers/gpu/drm/i915/intel_lrc.c    | 15 +----------
>>>>>>>    3 files changed, 68 insertions(+), 14 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>> b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>> index 0ffd459..1a5ad4c 100644
>>>>>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>> @@ -941,6 +941,48 @@ err_out:
>>>>>>>           return ret;
>>>>>>>    }
>>>>>>>
>>>>>>> +/* With some architectures and 32bit legacy mode, hardware pre-loads
>>>>>>> the
>>>>>>> + * top level pdps but the tlb invalidation only invalidates the
>>>>>>> lower levels.
>>>>>>> + * This might lead to hw fetching with stale pdp entries if top level
>>>>>>> + * structure changes, ie va space grows with dynamic page tables.
>>>>>>> + */
>>>
>>> Is this still necessary if we reload PDPs via LRI instructions whenever
>>> the address map has changed? That always (AFAICT) causes sufficient
>>> invalidation, so then we might not need to preallocate at all :)
>>>
>>
>> LRI reload gets my vote. Please ignore this patch.
>> -Mika
>>
>>> .Dave.
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Dave Gordon Aug. 12, 2015, 3:09 p.m. UTC | #9
On 12/08/15 08:56, Thierry, Michel wrote:
> On 8/11/2015 1:05 PM, Zhiyuan Lv wrote:
>> Hi Mika/Dave/Michel,
>>
>> I saw the patch of using LRI for root pointer update has been merged to
>> drm-intel. When we consider i915 driver to run inside a virtual machine, e.g.
>> with XenGT, we may still need Mika's this patch like below:
>>
>> "
>>           if (intel_vgpu_active(ppgtt->base.dev))
>>                   gen8_preallocate_top_level_pdps(ppgtt);
>> "
>>
>> Could you share with us your opinion? Thanks in advance!
>
> Hi Zhiyuan,
>
> The change looks ok to me. If you need to preallocate the PDPs,
> gen8_ppgtt_init is the right place to do it. Only add a similar
> vgpu_active check to disable the LRI updates (in gen8_emit_bb_start).
>
>> The reason behind is that LRI command will make shadow PPGTT implementation
>> hard. In XenGT, we construct shadow page table for each PPGTT in guest i915
>> driver, and then track every guest page table change in order to update shadow
>> page table accordingly. The problem of page table updates with GPU command is
>> that they cannot be trapped by hypervisor to finish the shadow page table
>> update work. In XenGT, the only change we have is the command scan in context
>> submission. But that is not exactly the right time to do shadow page table
>> update.
>>
>> Mika's patch can address the problem nicely. With the preallocation, the root
>> pointers in EXECLIST context will always keep the same. Then we can treat any
>> attempt to change guest PPGTT with GPU commands as malicious behavior. Thanks!
>>
>> Regards,
>> -Zhiyuan

The bad thing that was happening if we didn't use LRIs was that the CPU 
would try to push the new mappings to the GPU by updating PDP registers 
in the saved context image. This is unsafe if the context is running, as 
switching away from it would result in the CPU-updated values being 
overwritten by the older values in the GPU h/w registers (if the context 
were known to be idle, then it would be safe).

Preallocating the top-level PDPs should mean that the values need never 
change, so there's then no need to update the context image, thus 
avoiding the write hazard :)

.Dave.

>> On Thu, Jun 11, 2015 at 04:57:42PM +0300, Mika Kuoppala wrote:
>>> Dave Gordon <david.s.gordon@intel.com> writes:
>>>
>>>> On 10/06/15 12:42, Michel Thierry wrote:
>>>>> On 5/29/2015 1:53 PM, Michel Thierry wrote:
>>>>>> On 5/29/2015 12:05 PM, Michel Thierry wrote:
>>>>>>> On 5/22/2015 6:04 PM, Mika Kuoppala wrote:
>>>>>>>> With BDW/SKL and 32bit addressing mode only, the hardware preloads
>>>>>>>> pdps. However the TLB invalidation only has effect on levels below
>>>>>>>> the pdps. This means that if pdps change, hw might access with
>>>>>>>> stale pdp entry.
>>>>>>>>
>>>>>>>> To combat this problem, preallocate the top pdps so that hw sees
>>>>>>>> them as immutable for each context.
>>>>>>>>
>>>>>>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
>>>>>>>> Cc: Rafael Barbalho <rafael.barbalho@intel.com>
>>>>>>>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>>>>>>>> ---
>>>>>>>>     drivers/gpu/drm/i915/i915_gem_gtt.c | 50
>>>>>>>> +++++++++++++++++++++++++++++++++++++
>>>>>>>>     drivers/gpu/drm/i915/i915_reg.h     | 17 +++++++++++++
>>>>>>>>     drivers/gpu/drm/i915/intel_lrc.c    | 15 +----------
>>>>>>>>     3 files changed, 68 insertions(+), 14 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>>> b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>>> index 0ffd459..1a5ad4c 100644
>>>>>>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>>> @@ -941,6 +941,48 @@ err_out:
>>>>>>>>            return ret;
>>>>>>>>     }
>>>>>>>>
>>>>>>>> +/* With some architectures and 32bit legacy mode, hardware pre-loads
>>>>>>>> + * the top level pdps but the tlb invalidation only invalidates the
>>>>>>>> + * lower levels.
>>>>>>>> + * This might lead to hw fetching with stale pdp entries if top level
>>>>>>>> + * structure changes, ie va space grows with dynamic page tables.
>>>>>>>> + */
>>>>
>>>> Is this still necessary if we reload PDPs via LRI instructions whenever
>>>> the address map has changed? That always (AFAICT) causes sufficient
>>>> invalidation, so then we might not need to preallocate at all :)
>>>
>>> LRI reload gets my vote. Please ignore this patch.
>>> -Mika
>>>
>>>> .Dave.
>>> _______________________________________________
>>> Intel-gfx mailing list
>>> Intel-gfx@lists.freedesktop.org
>>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Zhiyuan Lv Aug. 13, 2015, 9:08 a.m. UTC | #10
Hi Michel,

Thanks for the reply!

I have yet another question: right now mark_tlbs_dirty() will be called if
any level of the PPGTT tables is changed. But for EXECLIST context
submission, we only need the LRI commands if the top-level (L3) PDP root
pointers change, right? Thanks!

Regards,
-Zhiyuan

On Wed, Aug 12, 2015 at 03:56:49PM +0800, Michel Thierry wrote:
> On 8/11/2015 1:05 PM, Zhiyuan Lv wrote:
> >Hi Mika/Dave/Michel,
> >
> >I saw the patch of using LRI for root pointer update has been merged to
> >drm-intel. When we consider i915 driver to run inside a virtual machine, e.g.
> >with XenGT, we may still need Mika's this patch like below:
> >
> >"
> >         if (intel_vgpu_active(ppgtt->base.dev))
> >                 gen8_preallocate_top_level_pdps(ppgtt);
> >"
> >
> >Could you share with us your opinion? Thanks in advance!
> 
> Hi Zhiyuan,
> 
> The change looks ok to me. If you need to preallocate the PDPs,
> gen8_ppgtt_init is the right place to do it. Only add a similar
> vgpu_active check to disable the LRI updates (in
> gen8_emit_bb_start).
> 
> >
> >The reason behind is that LRI command will make shadow PPGTT implementation
> >hard. In XenGT, we construct shadow page table for each PPGTT in guest i915
> >driver, and then track every guest page table change in order to update shadow
> >page table accordingly. The problem of page table updates with GPU command is
> >that they cannot be trapped by hypervisor to finish the shadow page table
> >update work. In XenGT, the only change we have is the command scan in context
> >submission. But that is not exactly the right time to do shadow page table
> >update.
> >
> >Mika's patch can address the problem nicely. With the preallocation, the root
> >pointers in EXECLIST context will always keep the same. Then we can treat any
> >attempt to change guest PPGTT with GPU commands as malicious behavior. Thanks!
> >
> >Regards,
> >-Zhiyuan
> >
> >On Thu, Jun 11, 2015 at 04:57:42PM +0300, Mika Kuoppala wrote:
> >>Dave Gordon <david.s.gordon@intel.com> writes:
> >>
> >>>On 10/06/15 12:42, Michel Thierry wrote:
> >>>>On 5/29/2015 1:53 PM, Michel Thierry wrote:
> >>>>>On 5/29/2015 12:05 PM, Michel Thierry wrote:
> >>>>>>On 5/22/2015 6:04 PM, Mika Kuoppala wrote:
> >>>>>>>With BDW/SKL and 32bit addressing mode only, the hardware preloads
> >>>>>>>pdps. However the TLB invalidation only has effect on levels below
> >>>>>>>the pdps. This means that if pdps change, hw might access with
> >>>>>>>stale pdp entry.
> >>>>>>>
> >>>>>>>To combat this problem, preallocate the top pdps so that hw sees
> >>>>>>>them as immutable for each context.
> >>>>>>>
> >>>>>>>Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> >>>>>>>Cc: Rafael Barbalho <rafael.barbalho@intel.com>
> >>>>>>>Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> >>>>>>>---
> >>>>>>>   drivers/gpu/drm/i915/i915_gem_gtt.c | 50
> >>>>>>>+++++++++++++++++++++++++++++++++++++
> >>>>>>>   drivers/gpu/drm/i915/i915_reg.h     | 17 +++++++++++++
> >>>>>>>   drivers/gpu/drm/i915/intel_lrc.c    | 15 +----------
> >>>>>>>   3 files changed, 68 insertions(+), 14 deletions(-)
> >>>>>>>
> >>>>>>>diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>>>>b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>>>>index 0ffd459..1a5ad4c 100644
> >>>>>>>--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>>>>+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>>>>@@ -941,6 +941,48 @@ err_out:
> >>>>>>>          return ret;
> >>>>>>>   }
> >>>>>>>
> >>>>>>>+/* With some architectures and 32bit legacy mode, hardware pre-loads
> >>>>>>>the
> >>>>>>>+ * top level pdps but the tlb invalidation only invalidates the
> >>>>>>>lower levels.
> >>>>>>>+ * This might lead to hw fetching with stale pdp entries if top level
> >>>>>>>+ * structure changes, ie va space grows with dynamic page tables.
> >>>>>>>+ */
> >>>
> >>>Is this still necessary if we reload PDPs via LRI instructions whenever
> >>>the address map has changed? That always (AFAICT) causes sufficient
> >>>invalidation, so then we might not need to preallocate at all :)
> >>>
> >>
> >>LRI reload gets my vote. Please ignore this patch.
> >>-Mika
> >>
> >>>.Dave.
> >>_______________________________________________
> >>Intel-gfx mailing list
> >>Intel-gfx@lists.freedesktop.org
> >>http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Zhiyuan Lv Aug. 13, 2015, 9:36 a.m. UTC | #11
Hi Dave,

On Wed, Aug 12, 2015 at 04:09:18PM +0100, Dave Gordon wrote:
> On 12/08/15 08:56, Thierry, Michel wrote:
> >On 8/11/2015 1:05 PM, Zhiyuan Lv wrote:
> >>Hi Mika/Dave/Michel,
> >>
> >>I saw the patch of using LRI for root pointer update has been merged to
> >>drm-intel. When we consider i915 driver to run inside a virtual machine, e.g.
> >>with XenGT, we may still need Mika's this patch like below:
> >>
> >>"
> >>          if (intel_vgpu_active(ppgtt->base.dev))
> >>                  gen8_preallocate_top_level_pdps(ppgtt);
> >>"
> >>
> >>Could you share with us your opinion? Thanks in advance!
> >
> >Hi Zhiyuan,
> >
> >The change looks ok to me. If you need to preallocate the PDPs,
> >gen8_ppgtt_init is the right place to do it. Only add a similar
> >vgpu_active check to disable the LRI updates (in gen8_emit_bb_start).
> >
> >>The reason behind is that LRI command will make shadow PPGTT implementation
> >>hard. In XenGT, we construct shadow page table for each PPGTT in guest i915
> >>driver, and then track every guest page table change in order to update shadow
> >>page table accordingly. The problem of page table updates with GPU command is
> >>that they cannot be trapped by hypervisor to finish the shadow page table
> >>update work. In XenGT, the only change we have is the command scan in context
> >>submission. But that is not exactly the right time to do shadow page table
> >>update.
> >>
> >>Mika's patch can address the problem nicely. With the preallocation, the root
> >>pointers in EXECLIST context will always keep the same. Then we can treat any
> >>attempt to change guest PPGTT with GPU commands as malicious behavior. Thanks!
> >>
> >>Regards,
> >>-Zhiyuan
> 
> The bad thing that was happening if we didn't use LRIs was that the
> CPU would try to push the new mappings to the GPU by updating PDP
> registers in the saved context image. This is unsafe if the context
> is running, as switching away from it would result in the
> CPU-updated values being overwritten by the older values in the GPU
> h/w registers (if the context were known to be idle, then it would
> be safe).

Thank you very much for the detailed explanation! I am curious: if the
root pointer update has no side effect on the currently running context,
for instance only changing a NULL entry to a PD without modifying the
existing pdpes, can we use the "Force PD Restore" bit in the ctx
descriptor?

Regards,
-Zhiyuan

> 
> Preallocating the top-level PDPs should mean that the values need
> never change, so there's then no need to update the context image,
> thus avoiding the write hazard :)
> 
> .Dave.
> 
> >>On Thu, Jun 11, 2015 at 04:57:42PM +0300, Mika Kuoppala wrote:
> >>>Dave Gordon <david.s.gordon@intel.com> writes:
> >>>
> >>>>On 10/06/15 12:42, Michel Thierry wrote:
> >>>>>On 5/29/2015 1:53 PM, Michel Thierry wrote:
> >>>>>>On 5/29/2015 12:05 PM, Michel Thierry wrote:
> >>>>>>>On 5/22/2015 6:04 PM, Mika Kuoppala wrote:
> >>>>>>>>With BDW/SKL and 32bit addressing mode only, the hardware preloads
> >>>>>>>>pdps. However the TLB invalidation only has effect on levels below
> >>>>>>>>the pdps. This means that if pdps change, hw might access with
> >>>>>>>>stale pdp entry.
> >>>>>>>>
> >>>>>>>>To combat this problem, preallocate the top pdps so that hw sees
> >>>>>>>>them as immutable for each context.
> >>>>>>>>
> >>>>>>>>Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> >>>>>>>>Cc: Rafael Barbalho <rafael.barbalho@intel.com>
> >>>>>>>>Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> >>>>>>>>---
> >>>>>>>>    drivers/gpu/drm/i915/i915_gem_gtt.c | 50
> >>>>>>>>+++++++++++++++++++++++++++++++++++++
> >>>>>>>>    drivers/gpu/drm/i915/i915_reg.h     | 17 +++++++++++++
> >>>>>>>>    drivers/gpu/drm/i915/intel_lrc.c    | 15 +----------
> >>>>>>>>    3 files changed, 68 insertions(+), 14 deletions(-)
> >>>>>>>>
> >>>>>>>>diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>>>>>b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>>>>>index 0ffd459..1a5ad4c 100644
> >>>>>>>>--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>>>>>+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >>>>>>>>@@ -941,6 +941,48 @@ err_out:
> >>>>>>>>           return ret;
> >>>>>>>>    }
> >>>>>>>>
> >>>>>>>>+/* With some architectures and 32bit legacy mode, hardware pre-loads
> >>>>>>>>+ * the top level pdps but the tlb invalidation only invalidates the
> >>>>>>>>+ * lower levels.
> >>>>>>>>+ * This might lead to hw fetching with stale pdp entries if top level
> >>>>>>>>+ * structure changes, ie va space grows with dynamic page tables.
> >>>>>>>>+ */
> >>>>
> >>>>Is this still necessary if we reload PDPs via LRI instructions whenever
> >>>>the address map has changed? That always (AFAICT) causes sufficient
> >>>>invalidation, so then we might not need to preallocate at all :)
> >>>
> >>>LRI reload gets my vote. Please ignore this patch.
> >>>-Mika
> >>>
> >>>>.Dave.
> >>>_______________________________________________
> >>>Intel-gfx mailing list
> >>>Intel-gfx@lists.freedesktop.org
> >>>http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Michel Thierry Aug. 13, 2015, 9:54 a.m. UTC | #12
On 8/13/2015 5:36 PM, Zhiyuan Lv wrote:
> Hi Dave,
>
> On Wed, Aug 12, 2015 at 04:09:18PM +0100, Dave Gordon wrote:
>> On 12/08/15 08:56, Thierry, Michel wrote:
>>> On 8/11/2015 1:05 PM, Zhiyuan Lv wrote:
>>>> Hi Mika/Dave/Michel,
>>>>
>>>> I saw the patch of using LRI for root pointer update has been merged to
>>>> drm-intel. When we consider i915 driver to run inside a virtual machine, e.g.
>>>> with XenGT, we may still need Mika's this patch like below:
>>>>
>>>> "
>>>>           if (intel_vgpu_active(ppgtt->base.dev))
>>>>                   gen8_preallocate_top_level_pdps(ppgtt);
>>>> "
>>>>
>>>> Could you share with us your opinion? Thanks in advance!
>>>
>>> Hi Zhiyuan,
>>>
>>> The change looks ok to me. If you need to preallocate the PDPs,
>>> gen8_ppgtt_init is the right place to do it. Only add a similar
>>> vgpu_active check to disable the LRI updates (in gen8_emit_bb_start).
>>>
>>>> The reason behind is that LRI command will make shadow PPGTT implementation
>>>> hard. In XenGT, we construct shadow page table for each PPGTT in guest i915
>>>> driver, and then track every guest page table change in order to update shadow
>>>> page table accordingly. The problem of page table updates with GPU command is
>>>> that they cannot be trapped by hypervisor to finish the shadow page table
>>>> update work. In XenGT, the only change we have is the command scan in context
>>>> submission. But that is not exactly the right time to do shadow page table
>>>> update.
>>>>
>>>> Mika's patch can address the problem nicely. With the preallocation, the root
>>>> pointers in EXECLIST context will always keep the same. Then we can treat any
>>>> attempt to change guest PPGTT with GPU commands as malicious behavior. Thanks!
>>>>
>>>> Regards,
>>>> -Zhiyuan
>>
>> The bad thing that was happening if we didn't use LRIs was that the
>> CPU would try to push the new mappings to the GPU by updating PDP
>> registers in the saved context image. This is unsafe if the context
>> is running, as switching away from it would result in the
>> CPU-updated values being overwritten by the older values in the GPU
>> h/w registers (if the context were known to be idle, then it would
>> be safe).
>
> Thank you very much for the detailed explanation! And I am curious
> that if the root pointers update does not have side effect to the
> current running context, for instance, only changing NULL to PD
> without modifying existing pdpes, can we use "Force PD Restore" bit in
> ctx descriptor?

We've been explicitly asked to not use "Force PD Restore".

>
> Regards,
> -Zhiyuan
>
>>
>> Preallocating the top-level PDPs should mean that the values need
>> never change, so there's then no need to update the context image,
>> thus avoiding the write hazard :)
>>
>> .Dave.
>>
>>>> On Thu, Jun 11, 2015 at 04:57:42PM +0300, Mika Kuoppala wrote:
>>>>> Dave Gordon <david.s.gordon@intel.com> writes:
>>>>>
>>>>>> On 10/06/15 12:42, Michel Thierry wrote:
>>>>>>> On 5/29/2015 1:53 PM, Michel Thierry wrote:
>>>>>>>> On 5/29/2015 12:05 PM, Michel Thierry wrote:
>>>>>>>>> On 5/22/2015 6:04 PM, Mika Kuoppala wrote:
>>>>>>>>>> With BDW/SKL and 32bit addressing mode only, the hardware preloads
>>>>>>>>>> pdps. However the TLB invalidation only has effect on levels below
>>>>>>>>>> the pdps. This means that if pdps change, hw might access with
>>>>>>>>>> stale pdp entry.
>>>>>>>>>>
>>>>>>>>>> To combat this problem, preallocate the top pdps so that hw sees
>>>>>>>>>> them as immutable for each context.
>>>>>>>>>>
>>>>>>>>>> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
>>>>>>>>>> Cc: Rafael Barbalho <rafael.barbalho@intel.com>
>>>>>>>>>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>>>>>>>>>> ---
>>>>>>>>>>     drivers/gpu/drm/i915/i915_gem_gtt.c | 50
>>>>>>>>>> +++++++++++++++++++++++++++++++++++++
>>>>>>>>>>     drivers/gpu/drm/i915/i915_reg.h     | 17 +++++++++++++
>>>>>>>>>>     drivers/gpu/drm/i915/intel_lrc.c    | 15 +----------
>>>>>>>>>>     3 files changed, 68 insertions(+), 14 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>>>>> b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>>>>> index 0ffd459..1a5ad4c 100644
>>>>>>>>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>>>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>>>>>>> @@ -941,6 +941,48 @@ err_out:
>>>>>>>>>>            return ret;
>>>>>>>>>>     }
>>>>>>>>>>
>>>>>>>>>> +/* With some architectures and 32bit legacy mode, hardware pre-loads
>>>>>>>>>> + * the top level pdps but the tlb invalidation only invalidates the
>>>>>>>>>> + * lower levels.
>>>>>>>>>> + * This might lead to hw fetching with stale pdp entries if top level
>>>>>>>>>> + * structure changes, ie va space grows with dynamic page tables.
>>>>>>>>>> + */
>>>>>>
>>>>>> Is this still necessary if we reload PDPs via LRI instructions whenever
>>>>>> the address map has changed? That always (AFAICT) causes sufficient
>>>>>> invalidation, so then we might not need to preallocate at all :)
>>>>>
>>>>> LRI reload gets my vote. Please ignore this patch.
>>>>> -Mika
>>>>>
>>>>>> .Dave.
>>>>> _______________________________________________
>>>>> Intel-gfx mailing list
>>>>> Intel-gfx@lists.freedesktop.org
>>>>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Michel Thierry Aug. 13, 2015, 10:12 a.m. UTC | #13
On 8/13/2015 5:08 PM, Zhiyuan Lv wrote:
> Hi Michel,
>
> Thanks for the reply!
>
> I yet have another question: right now the mark_tlb_dirty() will be
> called if any level of PPGTT table is changed. But for the EXECLIST
> context submission, we only need LRI commands if there are L3 PDP root
> pointer changes right? Thanks!
>

mark_tlbs_dirty is not only for execlists mode; we re-used it since it
was already there.

The update is only required when a PDP is allocated.

-Michel
Dave Gordon Aug. 13, 2015, 11:42 a.m. UTC | #14
On 13/08/15 11:12, Michel Thierry wrote:
> On 8/13/2015 5:08 PM, Zhiyuan Lv wrote:
>> Hi Michel,
>>
>> Thanks for the reply!
>>
>> I yet have another question: right now the mark_tlb_dirty() will be
>> called if any level of PPGTT table is changed. But for the EXECLIST
>> context submission, we only need LRI commands if there are L3 PDP root
>> pointer changes right? Thanks!
>
> mark_tlbs_dirty is not only for execlists mode, we re-used it since it
> was already there.
>
> The update is only required when a PDP is allocated.
>
> -Michel

Doesn't that depend on whether the context is running? The LRI reload 
has the side effect of flushing all current knowledge of mappings, so 
every level of PD gets refreshed from memory.

If we're not updating the top level PDPs, and we know the context isn't 
active, then we *assume* that lower-level PDs will be refreshed when the 
context is next loaded. (This hasn't been true on all hardware, some of 
which cached previously-retrieved PDs across ctx save-and-reload, and 
that's one reason why there's a "Force PD Restore" bit, but we've been 
told not to use it on current h/w). AFAICT, current chips don't cache 
previous PDs and don't need the "Force" bit for this case.

OTOH, if we don't know whether the context is running, then we can't be 
sure when (or whether) any PD updates will be seen. As long as the 
changes of interest are only ever *from* NULL *to* non-NULL, we *expect* 
it to work, because (we *assume*) the GPU won't cache negative results 
from PD lookups; so any lookup that previously hit an invalid mapping 
will be re-fetched next time it's required (and may now be good).

If we don't reload the PDPs with LRIs, then perhaps to be safe we need 
to inject some other instruction that will just force a re-fetch of the 
lower-level PDs from memory, without altering any top-level PDPs? In 
conjunction with preallocating the top-level entries, that ought to 
guarantee that the updates would be seen just before the point where 
they're about to be used?

.Dave.
Dave Gordon Aug. 13, 2015, 12:03 p.m. UTC | #15
On 13/08/15 12:42, Dave Gordon wrote:
> On 13/08/15 11:12, Michel Thierry wrote:
>> On 8/13/2015 5:08 PM, Zhiyuan Lv wrote:
>>> Hi Michel,
>>>
>>> Thanks for the reply!
>>>
>>> I yet have another question: right now the mark_tlb_dirty() will be
>>> called if any level of PPGTT table is changed. But for the EXECLIST
>>> context submission, we only need LRI commands if there are L3 PDP root
>>> pointer changes right? Thanks!
>>
>> mark_tlbs_dirty is not only for execlists mode, we re-used it since it
>> was already there.
>>
>> The update is only required when a PDP is allocated.
>>
>> -Michel
>
> Doesn't that depend on whether the context is running? The LRI reload
> has the side effect of flushing all current knowledge of mappings, so
> every level of PD gets refreshed from memory.
>
> If we're not updating the top level PDPs, and we know the context isn't
> active, then we *assume* that lower-level PDs will be refreshed when the
> context is next loaded. (This hasn't been true on all hardware, some of
> which cached previously-retrieved PDs across ctx save-and-reload, and
> that's one reason why there's a "Force PD Restore" bit, but we've been
> told not to use it on current h/w). AFAICT, current chips don't cache
> previous PDs and don't need the "Force" bit for this case.
>
> OTOH, if we don't know whether the context is running, then we can't be
> sure when (or whether) any PD updates will be seen. As long as the
> changes of interest are only ever *from* NULL *to* non-NULL, we *expect*
> it to work, because (we *assume*) the GPU won't cache negative results
> from PD lookups; so any lookup that previously hit an invalid mapping
> will be re-fetched next time it's required (and may now be good).
>
> If we don't reload the PDPs with LRIs, then perhaps to be safe we need
> to inject some other instruction that will just force a re-fetch of the
> lower-level PDs from memory, without altering any top-level PDPs? In
> conjunction with preallocating the top-level entries, that ought to
> guarantee that the updates would be seen just before the point where
> they're about to be used?
>
> .Dave.

I found the following comment in the BSpec:

"Pre-loading of Page Directory Entries (PD load) for 32b legacy mode is 
not supported from Gen9 onwards.  PD entries are loaded on demand when 
there is a miss in the PDE cache of the corresponding page walker.  Any 
new page additions by the driver are transparent to the HW, and the new 
page translations will be fetched on demand.  However, any removal of 
the pages by the driver should initiate a TLB invalidation to remove the 
stale entries."

So, I think that confirms that we should inject some form of TLB 
invalidation into the ring before the next batch uses any updated PDs. 
Presumably an MI_FLUSH_DW with TLB_INVALIDATE would do?

.Dave.
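
A minimal sketch of the invalidation Dave suggests, loosely following
gen6_bsd_ring_flush() in intel_ringbuffer.c (gen8 encoding, non-render
rings; the render ring would instead want a PIPE_CONTROL with
PIPE_CONTROL_TLB_INVALIDATE). The helper name is illustrative and this has
not been validated as a fix for the problem in this thread:

static int emit_tlb_invalidate(struct intel_engine_cs *ring)
{
	int ret;

	ret = intel_ring_begin(ring, 4);
	if (ret)
		return ret;

	/* Gen8 MI_FLUSH_DW takes an extra address dword, hence the "+ 1" */
	intel_ring_emit(ring, (MI_FLUSH_DW + 1) | MI_INVALIDATE_TLB |
			      MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW);
	intel_ring_emit(ring, I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
	intel_ring_emit(ring, 0); /* upper address dword */
	intel_ring_emit(ring, 0); /* post-sync value */
	intel_ring_advance(ring);

	return 0;
}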
Zhiyuan Lv Aug. 13, 2015, 2:56 p.m. UTC | #16
On Thu, Aug 13, 2015 at 01:03:30PM +0100, Dave Gordon wrote:
> On 13/08/15 12:42, Dave Gordon wrote:
> >On 13/08/15 11:12, Michel Thierry wrote:
> >>On 8/13/2015 5:08 PM, Zhiyuan Lv wrote:
> >>>Hi Michel,
> >>>
> >>>Thanks for the reply!
> >>>
> >>>I yet have another question: right now the mark_tlb_dirty() will be
> >>>called if any level of PPGTT table is changed. But for the EXECLIST
> >>>context submission, we only need LRI commands if there are L3 PDP root
> >>>pointer changes right? Thanks!
> >>
> >>mark_tlbs_dirty is not only for execlists mode, we re-used it since it
> >>was already there.
> >>
> >>The update is only required when a PDP is allocated.
> >>
> >>-Michel
> >
> >Doesn't that depend on whether the context is running? The LRI reload
> >has the side effect of flushing all current knowledge of mappings, so
> >every level of PD gets refreshed from memory.
> >
> >If we're not updating the top level PDPs, and we know the context isn't
> >active, then we *assume* that lower-level PDs will be refreshed when the
> >context is next loaded. (This hasn't been true on all hardware, some of
> >which cached previously-retrieved PDs across ctx save-and-reload, and
> >that's one reason why there's a "Force PD Restore" bit, but we've been
> >told not to use it on current h/w). AFAICT, current chips don't cache
> >previous PDs and don't need the "Force" bit for this case.
> >
> >OTOH, if we don't know whether the context is running, then we can't be
> >sure when (or whether) any PD updates will be seen. As long as the
> >changes of interest are only ever *from* NULL *to* non-NULL, we *expect*
> >it to work, because (we *assume*) the GPU won't cache negative results
> >from PD lookups; so any lookup that previously hit an invalid mapping
> >will be re-fetched next time it's required (and may now be good).
> >
> >If we don't reload the PDPs with LRIs, then perhaps to be safe we need
> >to inject some other instruction that will just force a re-fetch of the
> >lower-level PDs from memory, without altering any top-level PDPs? In
> >conjunction with preallocating the top-level entries, that ought to
> >guarantee that the updates would be seen just before the point where
> >they're about to be used?
> >
> >.Dave.
> 
> I found the following comment in the BSpec:
> 
> "Pre-loading of Page Directory Entries (PD load) for 32b legacy mode
> is not supported from Gen9 onwards.  PD entries are loaded on demand
> when there is a miss in the PDE cache of the corresponding page
> walker.  Any new page additions by the driver are transparent to the
> HW, and the new page translations will be fetched on demand.
> However, any removal of the pages by the driver should initiate a
> TLB invalidation to remove the stale entries."
> 
> So, I think that confirms that we should inject some form of TLB
> invalidation into the ring before the next batch uses any updated
> PDs. Presumably an MI_FLUSH_DW with TLB_INVALIDATE would do?

Hi Dave and Michel,

So the conclusion is still the same: for 32b legacy mode, emit_pdps() is
only needed for PDP changes, and changes to the other page-table levels
can be handled by a TLB_INVALIDATE emitted as ring buffer commands. Is
that correct? Thanks!

Regards,
-Zhiyuan

> 
> .Dave.
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 0ffd459..1a5ad4c 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -941,6 +941,48 @@  err_out:
 	return ret;
 }
 
+/* With some architectures and 32bit legacy mode, hardware pre-loads the
+ * top level pdps but the tlb invalidation only invalidates the lower levels.
+ * This might lead to hw fetching with stale pdp entries if top level
+ * structure changes, ie va space grows with dynamic page tables.
+ */
+static bool hw_wont_flush_pdp_tlbs(struct i915_hw_ppgtt *ppgtt)
+{
+	struct drm_device *dev = ppgtt->base.dev;
+
+	if (GEN8_CTX_ADDRESSING_MODE != LEGACY_32B_CONTEXT)
+		return false;
+
+	if (IS_BROADWELL(dev) || IS_SKYLAKE(dev))
+		return true;
+
+	return false;
+}
+
+static int gen8_preallocate_top_level_pdps(struct i915_hw_ppgtt *ppgtt)
+{
+	unsigned long *new_page_dirs, **new_page_tables;
+	int ret;
+
+	/* We allocate temp bitmap for page tables for no gain
+	 * but as this is for init only, lets keep the things simple
+	 */
+	ret = alloc_gen8_temp_bitmaps(&new_page_dirs, &new_page_tables);
+	if (ret)
+		return ret;
+
+	/* Allocate for all pdps regardless of how the ppgtt
+	 * was defined.
+	 */
+	ret = gen8_ppgtt_alloc_page_directories(ppgtt, &ppgtt->pdp,
+						0, 1ULL << 32,
+						new_page_dirs);
+
+	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables);
+
+	return ret;
+}
+
 /*
  * GEN8 legacy ppgtt programming is accomplished through a max 4 PDP registers
  * with a net effect resembling a 2-level page table in normal x86 terms. Each
@@ -972,6 +1014,14 @@  static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
 
 	ppgtt->switch_mm = gen8_mm_switch;
 
+	if (hw_wont_flush_pdp_tlbs(ppgtt)) {
+		/* Avoid the tlb flush bug by preallocating
+		 * whole top level pdp structure so it stays
+		 * static even if our va space grows.
+		 */
+		return gen8_preallocate_top_level_pdps(ppgtt);
+	}
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 6eeba63..334324b 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -2777,6 +2777,23 @@  enum skl_disp_power_wells {
 #define VLV_CLK_CTL2			0x101104
 #define   CLK_CTL2_CZCOUNT_30NS_SHIFT	28
 
+/* Context descriptor format bits */
+#define GEN8_CTX_VALID			(1<<0)
+#define GEN8_CTX_FORCE_PD_RESTORE	(1<<1)
+#define GEN8_CTX_FORCE_RESTORE		(1<<2)
+#define GEN8_CTX_L3LLC_COHERENT		(1<<5)
+#define GEN8_CTX_PRIVILEGE		(1<<8)
+
+enum {
+	ADVANCED_CONTEXT = 0,
+	LEGACY_32B_CONTEXT,
+	ADVANCED_AD_CONTEXT,
+	LEGACY_64B_CONTEXT
+};
+
+#define GEN8_CTX_ADDRESSING_MODE_SHIFT	3
+#define GEN8_CTX_ADDRESSING_MODE	LEGACY_32B_CONTEXT
+
 /*
  * Overlay regs
  */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 96ae90a..d793d4e 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -183,12 +183,6 @@ 
 #define CTX_R_PWR_CLK_STATE		0x42
 #define CTX_GPGPU_CSR_BASE_ADDRESS	0x44
 
-#define GEN8_CTX_VALID (1<<0)
-#define GEN8_CTX_FORCE_PD_RESTORE (1<<1)
-#define GEN8_CTX_FORCE_RESTORE (1<<2)
-#define GEN8_CTX_L3LLC_COHERENT (1<<5)
-#define GEN8_CTX_PRIVILEGE (1<<8)
-
 #define ASSIGN_CTX_PDP(ppgtt, reg_state, n) { \
 	const u64 _addr = test_bit(n, ppgtt->pdp.used_pdpes) ? \
 		ppgtt->pdp.page_directory[n]->daddr : \
@@ -198,13 +192,6 @@ 
 }
 
 enum {
-	ADVANCED_CONTEXT = 0,
-	LEGACY_CONTEXT,
-	ADVANCED_AD_CONTEXT,
-	LEGACY_64B_CONTEXT
-};
-#define GEN8_CTX_MODE_SHIFT 3
-enum {
 	FAULT_AND_HANG = 0,
 	FAULT_AND_HALT, /* Debug only */
 	FAULT_AND_STREAM,
@@ -273,7 +260,7 @@  static uint64_t execlists_ctx_descriptor(struct intel_engine_cs *ring,
 	WARN_ON(lrca & 0xFFFFFFFF00000FFFULL);
 
 	desc = GEN8_CTX_VALID;
-	desc |= LEGACY_CONTEXT << GEN8_CTX_MODE_SHIFT;
+	desc |= GEN8_CTX_ADDRESSING_MODE << GEN8_CTX_ADDRESSING_MODE_SHIFT;
 	if (IS_GEN8(ctx_obj->base.dev))
 		desc |= GEN8_CTX_L3LLC_COHERENT;
 	desc |= GEN8_CTX_PRIVILEGE;