diff mbox

[RFCv5,2/2] drm/i915: Introduce private PAT management

Message ID 1503993651-25665-2-git-send-email-zhi.a.wang@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Wang, Zhi A Aug. 29, 2017, 8 a.m. UTC
The private PAT management is to support PPAT entry manipulation. Two
APIs are introduced for dynamically managing PPAT entries: intel_ppat_get
and intel_ppat_put.

intel_ppat_get will search for an existing PPAT entry which perfectly
matches the required PPAT value. If not, it will try to allocate or
return a partially matched PPAT entry if there is any available PPAT
indexes or not.

intel_ppat_put will put back the PPAT entry which comes from
intel_ppat_get. If it's dynamically allocated, the reference count will
be decreased. If the reference count turns into zero, the PPAT index is
freed again.

Besides, another two callbacks are introduced to support the private PAT
management framework. One is ppat->update(), which writes the PPAT
configurations in ppat->entries into HW. Another one is ppat->match, which
will return a score to show how two PPAT values match with each other.

v5:

- Add check and warnnings for those platforms which don't have PPAT.

v3:

- Introduce dirty bitmap for PPAT registers. (Chris)
- Change the name of the pointer "dev_priv" to "i915". (Chris)
- intel_ppat_{get, put} returns/takes a const intel_ppat_entry *. (Chris)

v2:

- API re-design. (Chris)

Cc: Ben Widawsky <benjamin.widawsky@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h     |   2 +
 drivers/gpu/drm/i915/i915_gem_gtt.c | 289 ++++++++++++++++++++++++++++++------
 drivers/gpu/drm/i915/i915_gem_gtt.h |  34 +++++
 3 files changed, 276 insertions(+), 49 deletions(-)

Comments

Joonas Lahtinen Aug. 29, 2017, 9:37 a.m. UTC | #1
On Tue, 2017-08-29 at 16:00 +0800, Zhi Wang wrote:
> The private PAT management is to support PPAT entry manipulation. Two
> APIs are introduced for dynamically managing PPAT entries: intel_ppat_get
> and intel_ppat_put.
> 
> intel_ppat_get will search for an existing PPAT entry which perfectly
> matches the required PPAT value. If not, it will try to allocate or
> return a partially matched PPAT entry if there is any available PPAT
> indexes or not.
> 
> intel_ppat_put will put back the PPAT entry which comes from
> intel_ppat_get. If it's dynamically allocated, the reference count will
> be decreased. If the reference count turns into zero, the PPAT index is
> freed again.
> 
> Besides, another two callbacks are introduced to support the private PAT
> management framework. One is ppat->update(), which writes the PPAT
> configurations in ppat->entries into HW. Another one is ppat->match, which
> will return a score to show how two PPAT values match with each other.
> 
> v5:
> 
> - Add check and warnnings for those platforms which don't have PPAT.
> 
> v3:
> 
> - Introduce dirty bitmap for PPAT registers. (Chris)
> - Change the name of the pointer "dev_priv" to "i915". (Chris)
> - intel_ppat_{get, put} returns/takes a const intel_ppat_entry *. (Chris)
> 
> v2:
> 
> - API re-design. (Chris)
> 
> Cc: Ben Widawsky <benjamin.widawsky@intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>

<SNIP>

> -static void cnl_setup_private_ppat(struct drm_i915_private *dev_priv)
> +static struct intel_ppat_entry *alloc_ppat_entry(struct intel_ppat *ppat,
> +						 unsigned int index,
> +						 u8 value)
>  {
> +	struct intel_ppat_entry *entry = &ppat->entries[index];
> +
> +	entry->ppat = ppat;
> +	entry->value = value;
> +	kref_init(&entry->ref_count);
> +	set_bit(index, ppat->used);
> +	set_bit(index, ppat->dirty);
> +
> +	return entry;
> +}
> +
> +static void free_ppat_entry(struct intel_ppat_entry *entry)
> +{
> +	struct intel_ppat *ppat = entry->ppat;
> +	int index = entry - ppat->entries;
> +
> +	entry->value = ppat->dummy_value;
> +	clear_bit(index, ppat->used);
> +	set_bit(index, ppat->dirty);
> +}

Above functions should be __ prefixed as they do no checking if they
override existing data. The suitable names might be __ppat_get and
__ppat_put.

> +
> +/**
> + * intel_ppat_get - get a usable PPAT entry
> + * @i915: i915 device instance
> + * @value: the PPAT value required by the caller
> + *
> + * The function tries to search if there is an existing PPAT entry which
> + * matches with the required value. If perfectly matched, the existing PPAT
> + * entry will be used. If only partially matched, it will try to check if
> + * there is any available PPAT index. If yes, it will allocate a new PPAT
> + * index for the required entry and update the HW. If not, the partially
> + * matched entry will be used.
> + */
> +const struct intel_ppat_entry *intel_ppat_get(struct drm_i915_private *i915,

Maybe split the function type and name to avoid exceeding 80 chars on
next line.

> +					      u8 value)
> +{
> +	struct intel_ppat *ppat = &i915->ppat;
> +	struct intel_ppat_entry *entry;
> +	int i, used;
> +	unsigned int score, best_score;
> +
> +	if (WARN_ON(!ppat->max_entries))
> +		return ERR_PTR(-ENODEV);

No need for extra check like this, this will just lead to ENOSPC.

> +
> +	score = best_score = 0;
> +	used = 0;

This variable behaves more like scanned.

> +
> +	/* First, find a suitable value from available entries */

The next two lines repeat this information, no need to document "what".

> +	for_each_set_bit(i, ppat->used, ppat->max_entries) {

		score could be declared in this scope.

> +		score = ppat->match(ppat->entries[i].value, value);
> +		/* Perfect match */

This comment is literally repeating what code says.

> +		if (score == INTEL_PPAT_PERFECT_MATCH) {
> +			entry = &ppat->entries[i];
> +			kref_get(&entry->ref_count);
> +			return entry;
> +		}
> +
> +		if (score > best_score) {
> +			entry = &ppat->entries[i];

Above could be simplified:

			if (score == INTEL_PPAT_PERFECT_MATCH) {
				kref_get(&entry->ref);
				return entry;
			}

> +			best_score = score;
> +		}
> +		used++;
> +	}
> +
> +	/* No matched entry and we can't allocate a new entry. */

DRM_ERROR replicates this comment's information.

> +	if (!best_score && used == ppat->max_entries) {
> +		DRM_ERROR("Fail to get PPAT entry\n");

DRM_DEBUG_DRIVER at most.

> +		return ERR_PTR(-ENOSPC);
> +	}
> +
> +	/*
> +	 * Found a matched entry which is not perfect,
> +	 * and we can't allocate a new entry.
> +	 */
> +	if (best_score && used == ppat->max_entries) {
> +		kref_get(&entry->ref_count);
> +		return entry;
> +	}

Above code could be combined:

	if (scanned == ppat->max_entries) {
		if(!best_score)
			return ERR_PTR(-ENOSPC);
		kref_get(&entry->ref);
		return entry;
	}

> +
> +	/* Allocate a new entry */

This comment is also just telling "what", which we can see from code.

> +	i = find_first_zero_bit(ppat->used, ppat->max_entries);
> +	entry = alloc_ppat_entry(ppat, i, value);
> +	ppat->update(i915);
> +	return entry;
> +}
> +
> +static void put_ppat(struct kref *kref)

ppat_release might cause less confusion, otherwise there will be 3 put
functions.

> +{
> +	struct intel_ppat_entry *entry =
> +		container_of(kref, struct intel_ppat_entry, ref_count);
> +	struct drm_i915_private *i915 = entry->ppat->i915;
> +
> +	free_ppat_entry(entry);
> +	entry->ppat->update(i915);
> +}
> +
> +/**
> + * intel_ppat_put - put back the PPAT entry got from intel_ppat_get()
> + * @entry: an intel PPAT entry
> + *
> + * Put back the PPAT entry got from intel_ppat_get(). If the PPAT index of the
> + * entry is dynamically allocated, its reference count will be decreased. Once
> + * the reference count becomes into zero, the PPAT index becomes free again.
> + */
> +void intel_ppat_put(const struct intel_ppat_entry *entry)
> +{
> +	struct intel_ppat *ppat = entry->ppat;
> +	int index = entry - ppat->entries;
> +
> +	if (WARN_ON(!ppat->max_entries))
> +		return;

This is clearly a kernel programmer error (and a serious one, so could
be GEM_BUG_ON).

> +
> +	kref_put(&ppat->entries[index].ref_count, put_ppat);
> +}
> +
> +static void cnl_private_pat_update(struct drm_i915_private *dev_priv)
> +{
> +	struct intel_ppat *ppat = &dev_priv->ppat;
> +	int i;
> +
> +	for_each_set_bit(i, ppat->dirty, ppat->max_entries) {
> +		clear_bit(i, ppat->dirty);
> +		I915_WRITE(GEN10_PAT_INDEX(i), ppat->entries[i].value);

Clearing the bit after write is the logical thing to do.

> +	}
> +}
> +
> +static void bdw_private_pat_update(struct drm_i915_private *dev_priv)
> +{
> +	struct intel_ppat *ppat = &dev_priv->ppat;
> +	u64 pat = 0;
> +	int i;
> +
> +	for (i = 0; i < ppat->max_entries; i++)
> +		pat |= GEN8_PPAT(i, ppat->entries[i].value);
> +
> +	bitmap_clear(ppat->dirty, 0, ppat->max_entries);
> +
> +	I915_WRITE(GEN8_PRIVATE_PAT_LO, pat);
> +	I915_WRITE(GEN8_PRIVATE_PAT_HI, pat >> 32);

While moving the code, lower_32_bits and upper_32_bits, it should
really warn without them?

> +}
> +
> +#define gen8_pat_ca(v) ((v) & 0x3)
> +#define gen8_pat_tc(v) (((v) >> 2) & 0x3)
> +#define gen8_pat_age(v) (((v) >> 4) & 0x3)

Magic numbers. I'm also not 100% sure if KMD will want partial matches,
or if GVT should pass in a scoring function.

> +
> +static unsigned int bdw_private_pat_match(u8 src, u8 dst)
> +{
> +	unsigned int score = 0;
> +
> +	/* Cache attribute has to be matched. */
> +	if (gen8_pat_ca(src) != gen8_pat_ca(dst))
> +		return 0;
> +
> +	if (gen8_pat_age(src) == gen8_pat_age(dst))
> +		score += 1;
> +
> +	if (gen8_pat_tc(src) == gen8_pat_tc(dst))
> +		score += 2;

I'd lift this check to have them all in importance order.

> +
> +	if (score == 3)
> +		return INTEL_PPAT_PERFECT_MATCH;
> +
> +	return score;
> +}
> +
> +#define chv_get_snoop(v) (((v) >> 6) & 0x1)

Magic numbers, which need to be in i915_reg.h with #define. Should also
be chv_pat_snoop();

> +
> +static unsigned int chv_private_pat_match(u8 src, u8 dst)
> +{
> +	if (chv_get_snoop(src) == chv_get_snoop(dst))
> +		return INTEL_PPAT_PERFECT_MATCH;
> +
> +	return 0;

	return chv_pat_snoop(src) == chv_pat_snoop(dst) ?
	       INTEL_PPAT_PERFECT_MATCH : 0;

> +}
> +
> +static void cnl_setup_private_ppat(struct intel_ppat *ppat)
> +{
> +	ppat->max_entries = 8;
> +	ppat->update = cnl_private_pat_update;
> +	ppat->match = bdw_private_pat_match;
> +	ppat->dummy_value = GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3);

"dummy_value" is not a very descriptive name.

> +
>  	/* XXX: spec is unclear if this is still needed for CNL+ */
> -	if (!USES_PPGTT(dev_priv)) {
> -		I915_WRITE(GEN10_PAT_INDEX(0), GEN8_PPAT_UC);
> +	if (!USES_PPGTT(ppat->i915)) {
> +		alloc_ppat_entry(ppat, 0, GEN8_PPAT_UC);
>  		return;
>  	}
>  
> -	I915_WRITE(GEN10_PAT_INDEX(0), GEN8_PPAT_WB | GEN8_PPAT_LLC);
> -	I915_WRITE(GEN10_PAT_INDEX(1), GEN8_PPAT_WC | GEN8_PPAT_LLCELLC);
> -	I915_WRITE(GEN10_PAT_INDEX(2), GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);
> -	I915_WRITE(GEN10_PAT_INDEX(3), GEN8_PPAT_UC);
> -	I915_WRITE(GEN10_PAT_INDEX(4), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));
> -	I915_WRITE(GEN10_PAT_INDEX(5), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1));
> -	I915_WRITE(GEN10_PAT_INDEX(6), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2));
> -	I915_WRITE(GEN10_PAT_INDEX(7), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3));
> +	alloc_ppat_entry(ppat, 0, GEN8_PPAT_WB | GEN8_PPAT_LLC);
> +	alloc_ppat_entry(ppat, 2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);
> +	alloc_ppat_entry(ppat, 3, GEN8_PPAT_UC);
> +	alloc_ppat_entry(ppat, 4, GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));

Maybe I'm not following at all, but this seems like quite a change to
me? 0-3 used to be WB, WC, WT, UC, now 1 is unset, like is 5-7.

I'd leave all changes to registers to a later patch and leave this
patch to do what the title says, "Introduce private PAT management".

>  static void setup_private_pat(struct drm_i915_private *dev_priv)
>  {
> +	struct intel_ppat *ppat = &dev_priv->ppat;
> +	int i;
> +
> +	ppat->i915 = dev_priv;
> +
> +	/* Load per-platform PPAT configurations */

This comment is again just taking space.

>  	if (INTEL_GEN(dev_priv) >= 10)
> -		cnl_setup_private_ppat(dev_priv);
> +		cnl_setup_private_ppat(ppat);
>  	else if (IS_CHERRYVIEW(dev_priv) || IS_GEN9_LP(dev_priv))
> -		chv_setup_private_ppat(dev_priv);
> +		chv_setup_private_ppat(ppat);
>  	else
> -		bdw_setup_private_ppat(dev_priv);
> +		bdw_setup_private_ppat(ppat);
> +
> +	GEM_BUG_ON(ppat->max_entries > INTEL_MAX_PPAT_ENTRIES);
> +
> +	if (!ppat->max_entries)
> +		return;

I'm not sure why we have "!max_entries" checks going, we should have
one GEM_BUG_ON at this point and in the rest you can assume it to be
nonzero.

> +
> +	/* Fill unused PPAT entries with dummy PPAT value */
> +	for_each_clear_bit(i, ppat->used, ppat->max_entries) {
> +		ppat->entries[i].value = ppat->dummy_value;
> +		ppat->entries[i].ppat = ppat;
> +		set_bit(i, ppat->dirty);
> +	}
> +
> +	/* Write the HW */

If the callback was named update_hw(), this comment would be
unnecessary.

> +	ppat->update(dev_priv);
>  }

<SNIP>

> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -536,6 +536,40 @@ i915_vm_to_ggtt(struct i915_address_space *vm)
>  	return container_of(vm, struct i915_ggtt, base);
>  }
>  
> +#define INTEL_MAX_PPAT_ENTRIES 8
> +#define INTEL_PPAT_PERFECT_MATCH (~0U)
> +
> +struct intel_ppat;
> +
> +struct intel_ppat_entry {
> +	struct intel_ppat *ppat;
> +	struct kref ref_count;

	Just "ref", like elsewhere in code.

> +	u8 value;
> +};
> +
> +struct intel_ppat {
> +	struct intel_ppat_entry entries[INTEL_MAX_PPAT_ENTRIES];
> +	DECLARE_BITMAP(used, INTEL_MAX_PPAT_ENTRIES);
> +	DECLARE_BITMAP(dirty, INTEL_MAX_PPAT_ENTRIES);
> +

This really goes with the previous group of variable. So no newline.

> +	unsigned int max_entries;
> +
> +	u8 dummy_value;

Better name, like "clear_value" and short description may be useful.

> +	/*
> +	 * Return a score to show how two PPAT values match,
> +	 * a INTEL_PPAT_PERFECT_MATCH indicates a perfect match
> +	 */
> +	unsigned int (*match)(u8 src, u8 dst);
> +	/* Write the PPAT configuration into HW. */
> +	void (*update)(struct drm_i915_private *i915);
> +
> +	struct drm_i915_private *i915;
> +};
> +
> +const struct intel_ppat_entry *intel_ppat_get(struct drm_i915_private *i915,

Same here with type vs name.

> +					      u8 value);
> +void intel_ppat_put(const struct intel_ppat_entry *entry);

Regards, joonas
Chris Wilson Aug. 29, 2017, 10:05 a.m. UTC | #2
Quoting Zhi Wang (2017-08-29 09:00:51)
> +static void cnl_setup_private_ppat(struct intel_ppat *ppat)
> +{
> +       ppat->max_entries = 8;
> +       ppat->update = cnl_private_pat_update;
> +       ppat->match = bdw_private_pat_match;
> +       ppat->dummy_value = GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3);
> +
>         /* XXX: spec is unclear if this is still needed for CNL+ */
> -       if (!USES_PPGTT(dev_priv)) {
> -               I915_WRITE(GEN10_PAT_INDEX(0), GEN8_PPAT_UC);
> +       if (!USES_PPGTT(ppat->i915)) {
> +               alloc_ppat_entry(ppat, 0, GEN8_PPAT_UC);
>                 return;
>         }
>  
> -       I915_WRITE(GEN10_PAT_INDEX(0), GEN8_PPAT_WB | GEN8_PPAT_LLC);
> -       I915_WRITE(GEN10_PAT_INDEX(1), GEN8_PPAT_WC | GEN8_PPAT_LLCELLC);
> -       I915_WRITE(GEN10_PAT_INDEX(2), GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);
> -       I915_WRITE(GEN10_PAT_INDEX(3), GEN8_PPAT_UC);
> -       I915_WRITE(GEN10_PAT_INDEX(4), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));
> -       I915_WRITE(GEN10_PAT_INDEX(5), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1));
> -       I915_WRITE(GEN10_PAT_INDEX(6), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2));
> -       I915_WRITE(GEN10_PAT_INDEX(7), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3));
> +       alloc_ppat_entry(ppat, 0, GEN8_PPAT_WB | GEN8_PPAT_LLC);
> +       alloc_ppat_entry(ppat, 2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);
> +       alloc_ppat_entry(ppat, 3, GEN8_PPAT_UC);
> +       alloc_ppat_entry(ppat, 4, GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));
>  }
>  
>  /* The GGTT and PPGTT need a private PPAT setup in order to handle cacheability
>   * bits. When using advanced contexts each context stores its own PAT, but
>   * writing this data shouldn't be harmful even in those cases. */
> -static void bdw_setup_private_ppat(struct drm_i915_private *dev_priv)
> +static void bdw_setup_private_ppat(struct intel_ppat *ppat)
>  {
> -       u64 pat;
> -
> -       pat = GEN8_PPAT(0, GEN8_PPAT_WB | GEN8_PPAT_LLC)     | /* for normal objects, no eLLC */
> -             GEN8_PPAT(1, GEN8_PPAT_WC | GEN8_PPAT_LLCELLC) | /* for something pointing to ptes? */
> -             GEN8_PPAT(2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC) | /* for scanout with eLLC */
> -             GEN8_PPAT(3, GEN8_PPAT_UC)                     | /* Uncached objects, mostly for scanout */
> -             GEN8_PPAT(4, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0)) |
> -             GEN8_PPAT(5, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1)) |
> -             GEN8_PPAT(6, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2)) |
> -             GEN8_PPAT(7, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3));
> +       ppat->max_entries = 8;
> +       ppat->update = bdw_private_pat_update;
> +       ppat->match = bdw_private_pat_match;
> +       ppat->dummy_value = GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3);
>  
> -       if (!USES_PPGTT(dev_priv))
> +       if (!USES_PPGTT(dev_priv)) {
>                 /* Spec: "For GGTT, there is NO pat_sel[2:0] from the entry,
>                  * so RTL will always use the value corresponding to
>                  * pat_sel = 000".
> @@ -2864,17 +3038,22 @@ static void bdw_setup_private_ppat(struct drm_i915_private *dev_priv)
>                  * So we can still hold onto all our assumptions wrt cpu
>                  * clflushing on LLC machines.
>                  */
> -               pat = GEN8_PPAT(0, GEN8_PPAT_UC);
> +               alloc_ppat_entry(ppat, 0, GEN8_PPAT_UC);
> +               return;
> +       }
>  
> -       /* XXX: spec defines this as 2 distinct registers. It's unclear if a 64b
> -        * write would work. */
> -       I915_WRITE(GEN8_PRIVATE_PAT_LO, pat);
> -       I915_WRITE(GEN8_PRIVATE_PAT_HI, pat >> 32);
> +       alloc_ppat_entry(ppat, 0, GEN8_PPAT_WB | GEN8_PPAT_LLC);     /* for normal objects, no eLLC */
> +       alloc_ppat_entry(ppat, 2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC); /* for scanout with eLLC */
> +       alloc_ppat_entry(ppat, 3, GEN8_PPAT_UC);                     /* Uncached objects, mostly for scanout */
> +       alloc_ppat_entry(ppat, 4, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));
>  }
>  
> -static void chv_setup_private_ppat(struct drm_i915_private *dev_priv)
> +static void chv_setup_private_ppat(struct intel_ppat *ppat)
>  {
> -       u64 pat;
> +       ppat->max_entries = 8;
> +       ppat->update = bdw_private_pat_update;
> +       ppat->match = chv_private_pat_match;
> +       ppat->dummy_value = CHV_PPAT_SNOOP;
>  
>         /*
>          * Map WB on BDW to snooped on CHV.
> @@ -2894,17 +3073,11 @@ static void chv_setup_private_ppat(struct drm_i915_private *dev_priv)
>          * Which means we must set the snoop bit in PAT entry 0
>          * in order to keep the global status page working.
>          */
> -       pat = GEN8_PPAT(0, CHV_PPAT_SNOOP) |
> -             GEN8_PPAT(1, 0) |
> -             GEN8_PPAT(2, 0) |
> -             GEN8_PPAT(3, 0) |
> -             GEN8_PPAT(4, CHV_PPAT_SNOOP) |
> -             GEN8_PPAT(5, CHV_PPAT_SNOOP) |
> -             GEN8_PPAT(6, CHV_PPAT_SNOOP) |
> -             GEN8_PPAT(7, CHV_PPAT_SNOOP);
>  
> -       I915_WRITE(GEN8_PRIVATE_PAT_LO, pat);
> -       I915_WRITE(GEN8_PRIVATE_PAT_HI, pat >> 32);
> +       alloc_ppat_entry(ppat, 0, CHV_PPAT_SNOOP);
> +       alloc_ppat_entry(ppat, 2, 0);
> +       alloc_ppat_entry(ppat, 3, 0);
> +       alloc_ppat_entry(ppat, 4, CHV_PPAT_SNOOP);
>  }

1 is dropped in all cases?

The current ABI is that we reserve (0, 1, 2) for userspace. 1 means
follow-PTE and is especially important.
-Chris
Chris Wilson Aug. 29, 2017, 10:39 a.m. UTC | #3
Quoting Chris Wilson (2017-08-29 11:05:21)
> Quoting Zhi Wang (2017-08-29 09:00:51)
> > +static void cnl_setup_private_ppat(struct intel_ppat *ppat)
> > +{
> > +       ppat->max_entries = 8;
> > +       ppat->update = cnl_private_pat_update;
> > +       ppat->match = bdw_private_pat_match;
> > +       ppat->dummy_value = GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3);
> > +
> >         /* XXX: spec is unclear if this is still needed for CNL+ */
> > -       if (!USES_PPGTT(dev_priv)) {
> > -               I915_WRITE(GEN10_PAT_INDEX(0), GEN8_PPAT_UC);
> > +       if (!USES_PPGTT(ppat->i915)) {
> > +               alloc_ppat_entry(ppat, 0, GEN8_PPAT_UC);
> >                 return;
> >         }
> >  
> > -       I915_WRITE(GEN10_PAT_INDEX(0), GEN8_PPAT_WB | GEN8_PPAT_LLC);
> > -       I915_WRITE(GEN10_PAT_INDEX(1), GEN8_PPAT_WC | GEN8_PPAT_LLCELLC);
> > -       I915_WRITE(GEN10_PAT_INDEX(2), GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);
> > -       I915_WRITE(GEN10_PAT_INDEX(3), GEN8_PPAT_UC);
> > -       I915_WRITE(GEN10_PAT_INDEX(4), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));
> > -       I915_WRITE(GEN10_PAT_INDEX(5), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1));
> > -       I915_WRITE(GEN10_PAT_INDEX(6), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2));
> > -       I915_WRITE(GEN10_PAT_INDEX(7), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3));
> > +       alloc_ppat_entry(ppat, 0, GEN8_PPAT_WB | GEN8_PPAT_LLC);
> > +       alloc_ppat_entry(ppat, 2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);
> > +       alloc_ppat_entry(ppat, 3, GEN8_PPAT_UC);
> > +       alloc_ppat_entry(ppat, 4, GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));
> >  }
> >  
> >  /* The GGTT and PPGTT need a private PPAT setup in order to handle cacheability
> >   * bits. When using advanced contexts each context stores its own PAT, but
> >   * writing this data shouldn't be harmful even in those cases. */
> > -static void bdw_setup_private_ppat(struct drm_i915_private *dev_priv)
> > +static void bdw_setup_private_ppat(struct intel_ppat *ppat)
> >  {
> > -       u64 pat;
> > -
> > -       pat = GEN8_PPAT(0, GEN8_PPAT_WB | GEN8_PPAT_LLC)     | /* for normal objects, no eLLC */
> > -             GEN8_PPAT(1, GEN8_PPAT_WC | GEN8_PPAT_LLCELLC) | /* for something pointing to ptes? */
> > -             GEN8_PPAT(2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC) | /* for scanout with eLLC */
> > -             GEN8_PPAT(3, GEN8_PPAT_UC)                     | /* Uncached objects, mostly for scanout */
> > -             GEN8_PPAT(4, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0)) |
> > -             GEN8_PPAT(5, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1)) |
> > -             GEN8_PPAT(6, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2)) |
> > -             GEN8_PPAT(7, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3));
> > +       ppat->max_entries = 8;
> > +       ppat->update = bdw_private_pat_update;
> > +       ppat->match = bdw_private_pat_match;
> > +       ppat->dummy_value = GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3);
> >  
> > -       if (!USES_PPGTT(dev_priv))
> > +       if (!USES_PPGTT(dev_priv)) {
> >                 /* Spec: "For GGTT, there is NO pat_sel[2:0] from the entry,
> >                  * so RTL will always use the value corresponding to
> >                  * pat_sel = 000".
> > @@ -2864,17 +3038,22 @@ static void bdw_setup_private_ppat(struct drm_i915_private *dev_priv)
> >                  * So we can still hold onto all our assumptions wrt cpu
> >                  * clflushing on LLC machines.
> >                  */
> > -               pat = GEN8_PPAT(0, GEN8_PPAT_UC);
> > +               alloc_ppat_entry(ppat, 0, GEN8_PPAT_UC);
> > +               return;
> > +       }
> >  
> > -       /* XXX: spec defines this as 2 distinct registers. It's unclear if a 64b
> > -        * write would work. */
> > -       I915_WRITE(GEN8_PRIVATE_PAT_LO, pat);
> > -       I915_WRITE(GEN8_PRIVATE_PAT_HI, pat >> 32);
> > +       alloc_ppat_entry(ppat, 0, GEN8_PPAT_WB | GEN8_PPAT_LLC);     /* for normal objects, no eLLC */
> > +       alloc_ppat_entry(ppat, 2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC); /* for scanout with eLLC */
> > +       alloc_ppat_entry(ppat, 3, GEN8_PPAT_UC);                     /* Uncached objects, mostly for scanout */
> > +       alloc_ppat_entry(ppat, 4, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));
> >  }
> >  
> > -static void chv_setup_private_ppat(struct drm_i915_private *dev_priv)
> > +static void chv_setup_private_ppat(struct intel_ppat *ppat)
> >  {
> > -       u64 pat;
> > +       ppat->max_entries = 8;
> > +       ppat->update = bdw_private_pat_update;
> > +       ppat->match = chv_private_pat_match;
> > +       ppat->dummy_value = CHV_PPAT_SNOOP;
> >  
> >         /*
> >          * Map WB on BDW to snooped on CHV.
> > @@ -2894,17 +3073,11 @@ static void chv_setup_private_ppat(struct drm_i915_private *dev_priv)
> >          * Which means we must set the snoop bit in PAT entry 0
> >          * in order to keep the global status page working.
> >          */
> > -       pat = GEN8_PPAT(0, CHV_PPAT_SNOOP) |
> > -             GEN8_PPAT(1, 0) |
> > -             GEN8_PPAT(2, 0) |
> > -             GEN8_PPAT(3, 0) |
> > -             GEN8_PPAT(4, CHV_PPAT_SNOOP) |
> > -             GEN8_PPAT(5, CHV_PPAT_SNOOP) |
> > -             GEN8_PPAT(6, CHV_PPAT_SNOOP) |
> > -             GEN8_PPAT(7, CHV_PPAT_SNOOP);
> >  
> > -       I915_WRITE(GEN8_PRIVATE_PAT_LO, pat);
> > -       I915_WRITE(GEN8_PRIVATE_PAT_HI, pat >> 32);
> > +       alloc_ppat_entry(ppat, 0, CHV_PPAT_SNOOP);
> > +       alloc_ppat_entry(ppat, 2, 0);
> > +       alloc_ppat_entry(ppat, 3, 0);
> > +       alloc_ppat_entry(ppat, 4, CHV_PPAT_SNOOP);
> >  }
> 
> 1 is dropped in all cases?
> 
> The current ABI is that we reserve (0, 1, 2) for userspace. 1 means
> follow-PTE and is especially important.

Ignore the ABI concern, getting my tables confused. But the question
still remains why do we need to reserve these in particular?
-Chris
Wang, Zhi A Aug. 29, 2017, 11:13 a.m. UTC | #4
Hi Chris:
    There is mapping between i915 cache level -> PPAT index. Currently it's:

static gen8_pte_t gen8_pte_encode(dma_addr_t addr,
                                  enum i915_cache_level level)
{
...
        switch (level) {
        case I915_CACHE_NONE:
                pte |= PPAT_UNCACHED_INDEX;
                break;
        case I915_CACHE_WT:
                pte |= PPAT_DISPLAY_ELLC_INDEX;
                break;
        default:
                pte |= PPAT_CACHED_INDEX;
                break;
        }
...

According to bspec, the PPAT index filled in the page table is calculated as:

PPAT index = 4 * PAT + 2 * PCD + PWT

In the i915_gem_gtt.c

#define PPAT_UNCACHED_INDEX             (_PAGE_PWT | _PAGE_PCD)     // PPAT INDEX =  1 + 2 * 1 = 3
#define PPAT_CACHED_PDE_INDEX           0 /* WB LLC */			//  PPAT INDEX = 0
#define PPAT_CACHED_INDEX               _PAGE_PAT /* WB LLCeLLC */	// PPAT INDEX = 4 * 1 = 4
#define PPAT_DISPLAY_ELLC_INDEX         _PAGE_PCD /* WT eLLC */           // PPAT INDEX = 2 * 1 = 2

Actually the PPAT index used by i915 are: 0 2 3 4, which has already been reserved in the RFC patches.

The reason I want to reserve that PPAT entries for host is to keep the mapping between I915_CACHE_* and PPAT_*_INDEX unchanged.

Thanks,
Zhi.

-----Original Message-----
From: Chris Wilson [mailto:chris@chris-wilson.co.uk] 

Sent: Tuesday, August 29, 2017 1:40 PM
To: Wang, Zhi A <zhi.a.wang@intel.com>; intel-gfx@lists.freedesktop.org; intel-gvt-dev@lists.freedesktop.org
Cc: joonas.lahtinen@linux.intel.com; zhenyuw@linux.intel.com; Wang, Zhi A <zhi.a.wang@intel.com>; Widawsky, Benjamin <benjamin.widawsky@intel.com>; Vivi, Rodrigo <rodrigo.vivi@intel.com>
Subject: Re: [RFCv5 2/2] drm/i915: Introduce private PAT management

Quoting Chris Wilson (2017-08-29 11:05:21)
> Quoting Zhi Wang (2017-08-29 09:00:51)

> > +static void cnl_setup_private_ppat(struct intel_ppat *ppat) {

> > +       ppat->max_entries = 8;

> > +       ppat->update = cnl_private_pat_update;

> > +       ppat->match = bdw_private_pat_match;

> > +       ppat->dummy_value = GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3);

> > +

> >         /* XXX: spec is unclear if this is still needed for CNL+ */

> > -       if (!USES_PPGTT(dev_priv)) {

> > -               I915_WRITE(GEN10_PAT_INDEX(0), GEN8_PPAT_UC);

> > +       if (!USES_PPGTT(ppat->i915)) {

> > +               alloc_ppat_entry(ppat, 0, GEN8_PPAT_UC);

> >                 return;

> >         }

> >  

> > -       I915_WRITE(GEN10_PAT_INDEX(0), GEN8_PPAT_WB | GEN8_PPAT_LLC);

> > -       I915_WRITE(GEN10_PAT_INDEX(1), GEN8_PPAT_WC | GEN8_PPAT_LLCELLC);

> > -       I915_WRITE(GEN10_PAT_INDEX(2), GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);

> > -       I915_WRITE(GEN10_PAT_INDEX(3), GEN8_PPAT_UC);

> > -       I915_WRITE(GEN10_PAT_INDEX(4), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));

> > -       I915_WRITE(GEN10_PAT_INDEX(5), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1));

> > -       I915_WRITE(GEN10_PAT_INDEX(6), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2));

> > -       I915_WRITE(GEN10_PAT_INDEX(7), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3));

> > +       alloc_ppat_entry(ppat, 0, GEN8_PPAT_WB | GEN8_PPAT_LLC);

> > +       alloc_ppat_entry(ppat, 2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);

> > +       alloc_ppat_entry(ppat, 3, GEN8_PPAT_UC);

> > +       alloc_ppat_entry(ppat, 4, GEN8_PPAT_LLCELLC | 

> > + GEN8_PPAT_AGE(0));

> >  }

> >  

> >  /* The GGTT and PPGTT need a private PPAT setup in order to handle cacheability

> >   * bits. When using advanced contexts each context stores its own PAT, but

> >   * writing this data shouldn't be harmful even in those cases. */ 

> > -static void bdw_setup_private_ppat(struct drm_i915_private 

> > *dev_priv)

> > +static void bdw_setup_private_ppat(struct intel_ppat *ppat)

> >  {

> > -       u64 pat;

> > -

> > -       pat = GEN8_PPAT(0, GEN8_PPAT_WB | GEN8_PPAT_LLC)     | /* for normal objects, no eLLC */

> > -             GEN8_PPAT(1, GEN8_PPAT_WC | GEN8_PPAT_LLCELLC) | /* for something pointing to ptes? */

> > -             GEN8_PPAT(2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC) | /* for scanout with eLLC */

> > -             GEN8_PPAT(3, GEN8_PPAT_UC)                     | /* Uncached objects, mostly for scanout */

> > -             GEN8_PPAT(4, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0)) |

> > -             GEN8_PPAT(5, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1)) |

> > -             GEN8_PPAT(6, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2)) |

> > -             GEN8_PPAT(7, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3));

> > +       ppat->max_entries = 8;

> > +       ppat->update = bdw_private_pat_update;

> > +       ppat->match = bdw_private_pat_match;

> > +       ppat->dummy_value = GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | 

> > + GEN8_PPAT_AGE(3);

> >  

> > -       if (!USES_PPGTT(dev_priv))

> > +       if (!USES_PPGTT(dev_priv)) {

> >                 /* Spec: "For GGTT, there is NO pat_sel[2:0] from the entry,

> >                  * so RTL will always use the value corresponding to

> >                  * pat_sel = 000".

> > @@ -2864,17 +3038,22 @@ static void bdw_setup_private_ppat(struct drm_i915_private *dev_priv)

> >                  * So we can still hold onto all our assumptions wrt cpu

> >                  * clflushing on LLC machines.

> >                  */

> > -               pat = GEN8_PPAT(0, GEN8_PPAT_UC);

> > +               alloc_ppat_entry(ppat, 0, GEN8_PPAT_UC);

> > +               return;

> > +       }

> >  

> > -       /* XXX: spec defines this as 2 distinct registers. It's unclear if a 64b

> > -        * write would work. */

> > -       I915_WRITE(GEN8_PRIVATE_PAT_LO, pat);

> > -       I915_WRITE(GEN8_PRIVATE_PAT_HI, pat >> 32);

> > +       alloc_ppat_entry(ppat, 0, GEN8_PPAT_WB | GEN8_PPAT_LLC);     /* for normal objects, no eLLC */

> > +       alloc_ppat_entry(ppat, 2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC); /* for scanout with eLLC */

> > +       alloc_ppat_entry(ppat, 3, GEN8_PPAT_UC);                     /* Uncached objects, mostly for scanout */

> > +       alloc_ppat_entry(ppat, 4, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | 

> > + GEN8_PPAT_AGE(0));

> >  }

> >  

> > -static void chv_setup_private_ppat(struct drm_i915_private 

> > *dev_priv)

> > +static void chv_setup_private_ppat(struct intel_ppat *ppat)

> >  {

> > -       u64 pat;

> > +       ppat->max_entries = 8;

> > +       ppat->update = bdw_private_pat_update;

> > +       ppat->match = chv_private_pat_match;

> > +       ppat->dummy_value = CHV_PPAT_SNOOP;

> >  

> >         /*

> >          * Map WB on BDW to snooped on CHV.

> > @@ -2894,17 +3073,11 @@ static void chv_setup_private_ppat(struct drm_i915_private *dev_priv)

> >          * Which means we must set the snoop bit in PAT entry 0

> >          * in order to keep the global status page working.

> >          */

> > -       pat = GEN8_PPAT(0, CHV_PPAT_SNOOP) |

> > -             GEN8_PPAT(1, 0) |

> > -             GEN8_PPAT(2, 0) |

> > -             GEN8_PPAT(3, 0) |

> > -             GEN8_PPAT(4, CHV_PPAT_SNOOP) |

> > -             GEN8_PPAT(5, CHV_PPAT_SNOOP) |

> > -             GEN8_PPAT(6, CHV_PPAT_SNOOP) |

> > -             GEN8_PPAT(7, CHV_PPAT_SNOOP);

> >  

> > -       I915_WRITE(GEN8_PRIVATE_PAT_LO, pat);

> > -       I915_WRITE(GEN8_PRIVATE_PAT_HI, pat >> 32);

> > +       alloc_ppat_entry(ppat, 0, CHV_PPAT_SNOOP);

> > +       alloc_ppat_entry(ppat, 2, 0);

> > +       alloc_ppat_entry(ppat, 3, 0);

> > +       alloc_ppat_entry(ppat, 4, CHV_PPAT_SNOOP);

> >  }

> 

> 1 is dropped in all cases?

> 

> The current ABI is that we reserve (0, 1, 2) for userspace. 1 means 

> follow-PTE and is especially important.


Ignore the ABI concern, getting my tables confused. But the question still remains why do we need to reserve these in particular?
-Chris
Wang, Zhi A Aug. 29, 2017, 11:18 a.m. UTC | #5
Thanks Joonas! :)

-----Original Message-----
From: Joonas Lahtinen [mailto:joonas.lahtinen@linux.intel.com] 

Sent: Tuesday, August 29, 2017 12:37 PM
To: Wang, Zhi A <zhi.a.wang@intel.com>; intel-gfx@lists.freedesktop.org; intel-gvt-dev@lists.freedesktop.org
Cc: chris@chris-wilson.co.uk; zhenyuw@linux.intel.com; Widawsky, Benjamin <benjamin.widawsky@intel.com>; Vivi, Rodrigo <rodrigo.vivi@intel.com>
Subject: Re: [RFCv5 2/2] drm/i915: Introduce private PAT management

On Tue, 2017-08-29 at 16:00 +0800, Zhi Wang wrote:
> The private PAT management is to support PPAT entry manipulation. Two 

> APIs are introduced for dynamically managing PPAT entries: 

> intel_ppat_get and intel_ppat_put.

> 

> intel_ppat_get will search for an existing PPAT entry which perfectly 

> matches the required PPAT value. If not, it will try to allocate or 

> return a partially matched PPAT entry if there is any available PPAT 

> indexes or not.

> 

> intel_ppat_put will put back the PPAT entry which comes from 

> intel_ppat_get. If it's dynamically allocated, the reference count 

> will be decreased. If the reference count turns into zero, the PPAT 

> index is freed again.

> 

> Besides, another two callbacks are introduced to support the private 

> PAT management framework. One is ppat->update(), which writes the PPAT 

> configurations in ppat->entries into HW. Another one is ppat->match, 

> which will return a score to show how two PPAT values match with each other.

> 

> v5:

> 

> - Add check and warnnings for those platforms which don't have PPAT.

> 

> v3:

> 

> - Introduce dirty bitmap for PPAT registers. (Chris)

> - Change the name of the pointer "dev_priv" to "i915". (Chris)

> - intel_ppat_{get, put} returns/takes a const intel_ppat_entry *. 

> (Chris)

> 

> v2:

> 

> - API re-design. (Chris)

> 

> Cc: Ben Widawsky <benjamin.widawsky@intel.com>

> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>

> Cc: Chris Wilson <chris@chris-wilson.co.uk>

> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

> Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>


<SNIP>

> -static void cnl_setup_private_ppat(struct drm_i915_private *dev_priv)

> +static struct intel_ppat_entry *alloc_ppat_entry(struct intel_ppat *ppat,

> +						 unsigned int index,

> +						 u8 value)

>  {

> +	struct intel_ppat_entry *entry = &ppat->entries[index];

> +

> +	entry->ppat = ppat;

> +	entry->value = value;

> +	kref_init(&entry->ref_count);

> +	set_bit(index, ppat->used);

> +	set_bit(index, ppat->dirty);

> +

> +	return entry;

> +}

> +

> +static void free_ppat_entry(struct intel_ppat_entry *entry) {

> +	struct intel_ppat *ppat = entry->ppat;

> +	int index = entry - ppat->entries;

> +

> +	entry->value = ppat->dummy_value;

> +	clear_bit(index, ppat->used);

> +	set_bit(index, ppat->dirty);

> +}


Above functions should be __ prefixed as they do no checking if they override existing data. The suitable names might be __ppat_get and __ppat_put.

> +

> +/**

> + * intel_ppat_get - get a usable PPAT entry

> + * @i915: i915 device instance

> + * @value: the PPAT value required by the caller

> + *

> + * The function tries to search if there is an existing PPAT entry 

> +which

> + * matches with the required value. If perfectly matched, the 

> +existing PPAT

> + * entry will be used. If only partially matched, it will try to 

> +check if

> + * there is any available PPAT index. If yes, it will allocate a new 

> +PPAT

> + * index for the required entry and update the HW. If not, the 

> +partially

> + * matched entry will be used.

> + */

> +const struct intel_ppat_entry *intel_ppat_get(struct drm_i915_private 

> +*i915,


Maybe split the function type and name to avoid exceeding 80 chars on next line.

> +					      u8 value)

> +{

> +	struct intel_ppat *ppat = &i915->ppat;

> +	struct intel_ppat_entry *entry;

> +	int i, used;

> +	unsigned int score, best_score;

> +

> +	if (WARN_ON(!ppat->max_entries))

> +		return ERR_PTR(-ENODEV);


No need for extra check like this, this will just lead to ENOSPC.

> +

> +	score = best_score = 0;

> +	used = 0;


This variable behaves more like scanned.

> +

> +	/* First, find a suitable value from available entries */


The next two lines repeat this information, no need to document "what".

> +	for_each_set_bit(i, ppat->used, ppat->max_entries) {


		score could be declared in this scope.

> +		score = ppat->match(ppat->entries[i].value, value);

> +		/* Perfect match */


This comment is literally repeating what code says.

> +		if (score == INTEL_PPAT_PERFECT_MATCH) {

> +			entry = &ppat->entries[i];

> +			kref_get(&entry->ref_count);

> +			return entry;

> +		}

> +

> +		if (score > best_score) {

> +			entry = &ppat->entries[i];


Above could be simplified:

			if (score == INTEL_PPAT_PERFECT_MATCH) {
				kref_get(&entry->ref);
				return entry;
			}

> +			best_score = score;

> +		}

> +		used++;

> +	}

> +

> +	/* No matched entry and we can't allocate a new entry. */


DRM_ERROR replicates this comment's information.

> +	if (!best_score && used == ppat->max_entries) {

> +		DRM_ERROR("Fail to get PPAT entry\n");


DRM_DEBUG_DRIVER at most.

> +		return ERR_PTR(-ENOSPC);

> +	}

> +

> +	/*

> +	 * Found a matched entry which is not perfect,

> +	 * and we can't allocate a new entry.

> +	 */

> +	if (best_score && used == ppat->max_entries) {

> +		kref_get(&entry->ref_count);

> +		return entry;

> +	}


Above code could be combined:

	if (scanned == ppat->max_entries) {
		if(!best_score)
			return ERR_PTR(-ENOSPC);
		kref_get(&entry->ref);
		return entry;
	}

> +

> +	/* Allocate a new entry */


This comment is also just telling "what", which we can see from code.

> +	i = find_first_zero_bit(ppat->used, ppat->max_entries);

> +	entry = alloc_ppat_entry(ppat, i, value);

> +	ppat->update(i915);

> +	return entry;

> +}

> +

> +static void put_ppat(struct kref *kref)


ppat_release might cause less confusion, otherwise there will be 3 put functions.

> +{

> +	struct intel_ppat_entry *entry =

> +		container_of(kref, struct intel_ppat_entry, ref_count);

> +	struct drm_i915_private *i915 = entry->ppat->i915;

> +

> +	free_ppat_entry(entry);

> +	entry->ppat->update(i915);

> +}

> +

> +/**

> + * intel_ppat_put - put back the PPAT entry got from intel_ppat_get()

> + * @entry: an intel PPAT entry

> + *

> + * Put back the PPAT entry got from intel_ppat_get(). If the PPAT 

> +index of the

> + * entry is dynamically allocated, its reference count will be 

> +decreased. Once

> + * the reference count becomes into zero, the PPAT index becomes free again.

> + */

> +void intel_ppat_put(const struct intel_ppat_entry *entry) {

> +	struct intel_ppat *ppat = entry->ppat;

> +	int index = entry - ppat->entries;

> +

> +	if (WARN_ON(!ppat->max_entries))

> +		return;


This is clearly a kernel programmer error (and a serious one, so could be GEM_BUG_ON).

> +

> +	kref_put(&ppat->entries[index].ref_count, put_ppat); }

> +

> +static void cnl_private_pat_update(struct drm_i915_private *dev_priv) 

> +{

> +	struct intel_ppat *ppat = &dev_priv->ppat;

> +	int i;

> +

> +	for_each_set_bit(i, ppat->dirty, ppat->max_entries) {

> +		clear_bit(i, ppat->dirty);

> +		I915_WRITE(GEN10_PAT_INDEX(i), ppat->entries[i].value);


Clearing the bit after write is the logical thing to do.

> +	}

> +}

> +

> +static void bdw_private_pat_update(struct drm_i915_private *dev_priv) 

> +{

> +	struct intel_ppat *ppat = &dev_priv->ppat;

> +	u64 pat = 0;

> +	int i;

> +

> +	for (i = 0; i < ppat->max_entries; i++)

> +		pat |= GEN8_PPAT(i, ppat->entries[i].value);

> +

> +	bitmap_clear(ppat->dirty, 0, ppat->max_entries);

> +

> +	I915_WRITE(GEN8_PRIVATE_PAT_LO, pat);

> +	I915_WRITE(GEN8_PRIVATE_PAT_HI, pat >> 32);


While moving the code, lower_32_bits and upper_32_bits, it should really warn without them?

> +}

> +

> +#define gen8_pat_ca(v) ((v) & 0x3)

> +#define gen8_pat_tc(v) (((v) >> 2) & 0x3) #define gen8_pat_age(v) 

> +(((v) >> 4) & 0x3)


Magic numbers. I'm also not 100% sure if KMD will want partial matches, or if GVT should pass in a scoring function.

> +

> +static unsigned int bdw_private_pat_match(u8 src, u8 dst) {

> +	unsigned int score = 0;

> +

> +	/* Cache attribute has to be matched. */

> +	if (gen8_pat_ca(src) != gen8_pat_ca(dst))

> +		return 0;

> +

> +	if (gen8_pat_age(src) == gen8_pat_age(dst))

> +		score += 1;

> +

> +	if (gen8_pat_tc(src) == gen8_pat_tc(dst))

> +		score += 2;


I'd lift this check to have them all in importance order.

> +

> +	if (score == 3)

> +		return INTEL_PPAT_PERFECT_MATCH;

> +

> +	return score;

> +}

> +

> +#define chv_get_snoop(v) (((v) >> 6) & 0x1)


Magic numbers, which need to be in i915_reg.h with #define. Should also be chv_pat_snoop();

> +

> +static unsigned int chv_private_pat_match(u8 src, u8 dst) {

> +	if (chv_get_snoop(src) == chv_get_snoop(dst))

> +		return INTEL_PPAT_PERFECT_MATCH;

> +

> +	return 0;


	return chv_pat_snoop(src) == chv_pat_snoop(dst) ?
	       INTEL_PPAT_PERFECT_MATCH : 0;

> +}

> +

> +static void cnl_setup_private_ppat(struct intel_ppat *ppat) {

> +	ppat->max_entries = 8;

> +	ppat->update = cnl_private_pat_update;

> +	ppat->match = bdw_private_pat_match;

> +	ppat->dummy_value = GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3);


"dummy_value" is not a very descriptive name.

> +

>  	/* XXX: spec is unclear if this is still needed for CNL+ */

> -	if (!USES_PPGTT(dev_priv)) {

> -		I915_WRITE(GEN10_PAT_INDEX(0), GEN8_PPAT_UC);

> +	if (!USES_PPGTT(ppat->i915)) {

> +		alloc_ppat_entry(ppat, 0, GEN8_PPAT_UC);

>  		return;

>  	}

>  

> -	I915_WRITE(GEN10_PAT_INDEX(0), GEN8_PPAT_WB | GEN8_PPAT_LLC);

> -	I915_WRITE(GEN10_PAT_INDEX(1), GEN8_PPAT_WC | GEN8_PPAT_LLCELLC);

> -	I915_WRITE(GEN10_PAT_INDEX(2), GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);

> -	I915_WRITE(GEN10_PAT_INDEX(3), GEN8_PPAT_UC);

> -	I915_WRITE(GEN10_PAT_INDEX(4), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));

> -	I915_WRITE(GEN10_PAT_INDEX(5), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1));

> -	I915_WRITE(GEN10_PAT_INDEX(6), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2));

> -	I915_WRITE(GEN10_PAT_INDEX(7), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3));

> +	alloc_ppat_entry(ppat, 0, GEN8_PPAT_WB | GEN8_PPAT_LLC);

> +	alloc_ppat_entry(ppat, 2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);

> +	alloc_ppat_entry(ppat, 3, GEN8_PPAT_UC);

> +	alloc_ppat_entry(ppat, 4, GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));


Maybe I'm not following at all, but this seems like quite a change to me? 0-3 used to be WB, WC, WT, UC, now 1 is unset, like is 5-7.

I'd leave all changes to registers to a later patch and leave this patch to do what the title says, "Introduce private PAT management".

>  static void setup_private_pat(struct drm_i915_private *dev_priv)  {

> +	struct intel_ppat *ppat = &dev_priv->ppat;

> +	int i;

> +

> +	ppat->i915 = dev_priv;

> +

> +	/* Load per-platform PPAT configurations */


This comment is again just taking space.

>  	if (INTEL_GEN(dev_priv) >= 10)

> -		cnl_setup_private_ppat(dev_priv);

> +		cnl_setup_private_ppat(ppat);

>  	else if (IS_CHERRYVIEW(dev_priv) || IS_GEN9_LP(dev_priv))

> -		chv_setup_private_ppat(dev_priv);

> +		chv_setup_private_ppat(ppat);

>  	else

> -		bdw_setup_private_ppat(dev_priv);

> +		bdw_setup_private_ppat(ppat);

> +

> +	GEM_BUG_ON(ppat->max_entries > INTEL_MAX_PPAT_ENTRIES);

> +

> +	if (!ppat->max_entries)

> +		return;


I'm not sure why we have "!max_entries" checks going, we should have one GEM_BUG_ON at this point and in the rest you can assume it to be nonzero.

> +

> +	/* Fill unused PPAT entries with dummy PPAT value */

> +	for_each_clear_bit(i, ppat->used, ppat->max_entries) {

> +		ppat->entries[i].value = ppat->dummy_value;

> +		ppat->entries[i].ppat = ppat;

> +		set_bit(i, ppat->dirty);

> +	}

> +

> +	/* Write the HW */


If the callback was named update_hw(), this comment would be unnecessary.

> +	ppat->update(dev_priv);

>  }


<SNIP>

> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h

> @@ -536,6 +536,40 @@ i915_vm_to_ggtt(struct i915_address_space *vm)

>  	return container_of(vm, struct i915_ggtt, base);  }

>  

> +#define INTEL_MAX_PPAT_ENTRIES 8

> +#define INTEL_PPAT_PERFECT_MATCH (~0U)

> +

> +struct intel_ppat;

> +

> +struct intel_ppat_entry {

> +	struct intel_ppat *ppat;

> +	struct kref ref_count;


	Just "ref", like elsewhere in code.

> +	u8 value;

> +};

> +

> +struct intel_ppat {

> +	struct intel_ppat_entry entries[INTEL_MAX_PPAT_ENTRIES];

> +	DECLARE_BITMAP(used, INTEL_MAX_PPAT_ENTRIES);

> +	DECLARE_BITMAP(dirty, INTEL_MAX_PPAT_ENTRIES);

> +


This really goes with the previous group of variable. So no newline.

> +	unsigned int max_entries;

> +

> +	u8 dummy_value;


Better name, like "clear_value" and short description may be useful.

> +	/*

> +	 * Return a score to show how two PPAT values match,

> +	 * a INTEL_PPAT_PERFECT_MATCH indicates a perfect match

> +	 */

> +	unsigned int (*match)(u8 src, u8 dst);

> +	/* Write the PPAT configuration into HW. */

> +	void (*update)(struct drm_i915_private *i915);

> +

> +	struct drm_i915_private *i915;

> +};

> +

> +const struct intel_ppat_entry *intel_ppat_get(struct drm_i915_private 

> +*i915,


Same here with type vs name.

> +					      u8 value);

> +void intel_ppat_put(const struct intel_ppat_entry *entry);


Regards, joonas
--
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
Chris Wilson Aug. 29, 2017, 11:23 a.m. UTC | #6
Quoting Wang, Zhi A (2017-08-29 12:13:26)
> Hi Chris:
>     There is mapping between i915 cache level -> PPAT index. Currently it's:
> 
> static gen8_pte_t gen8_pte_encode(dma_addr_t addr,
>                                   enum i915_cache_level level)
> {
> ...
>         switch (level) {
>         case I915_CACHE_NONE:
>                 pte |= PPAT_UNCACHED_INDEX;
>                 break;
>         case I915_CACHE_WT:
>                 pte |= PPAT_DISPLAY_ELLC_INDEX;
>                 break;
>         default:
>                 pte |= PPAT_CACHED_INDEX;
>                 break;
>         }
> ...
> 
> According to bspec, the PPAT index filled in the page table is calculated as:
> 
> PPAT index = 4 * PAT + 2 * PCD + PWT
> 
> In the i915_gem_gtt.c
> 
> #define PPAT_UNCACHED_INDEX             (_PAGE_PWT | _PAGE_PCD)     // PPAT INDEX =  1 + 2 * 1 = 3
> #define PPAT_CACHED_PDE_INDEX           0 /* WB LLC */                  //  PPAT INDEX = 0
> #define PPAT_CACHED_INDEX               _PAGE_PAT /* WB LLCeLLC */      // PPAT INDEX = 4 * 1 = 4
> #define PPAT_DISPLAY_ELLC_INDEX         _PAGE_PCD /* WT eLLC */           // PPAT INDEX = 2 * 1 = 2
> 
> Actually the PPAT index used by i915 are: 0 2 3 4, which has already been reserved in the RFC patches.

So we can use these values in alloc_ppat, right? Still very concerned
about the hole -- it kind of implies there is an entry we should be
using but have forgotten!

> > > @@ -2864,17 +3038,22 @@ static void bdw_setup_private_ppat(struct drm_i915_private *dev_priv)
> > >                  * So we can still hold onto all our assumptions wrt cpu
> > >                  * clflushing on LLC machines.
> > >                  */
> > > -               pat = GEN8_PPAT(0, GEN8_PPAT_UC);
> > > +               alloc_ppat_entry(ppat, 0, GEN8_PPAT_UC);
> > > +               return;
> > > +       }
> > >  
> > > -       /* XXX: spec defines this as 2 distinct registers. It's unclear if a 64b
> > > -        * write would work. */
> > > -       I915_WRITE(GEN8_PRIVATE_PAT_LO, pat);
> > > -       I915_WRITE(GEN8_PRIVATE_PAT_HI, pat >> 32);
> > > +       alloc_ppat_entry(ppat, 0, GEN8_PPAT_WB | GEN8_PPAT_LLC);     /* for normal objects, no eLLC */
> > > +       alloc_ppat_entry(ppat, 2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC); /* for scanout with eLLC */
> > > +       alloc_ppat_entry(ppat, 3, GEN8_PPAT_UC);                     /* Uncached objects, mostly for scanout */
> > > +       alloc_ppat_entry(ppat, 4, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | 

/* See gen8_pte_encode() for the mapping from cache-level to ppat */
alloc_ppage_entry(ppat, PPAT_CACHED_PDE_INDEX, GEN8_PPAT_WB | GEN8_PPAT_LLC);
alloc_ppage_entry(ppat, PPAT_DISPLAY_ELLC_INDEX, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);
alloc_ppage_entry(ppat, PPAT_UNCACHED_INDEX, GEN8_PPAT_UC);
alloc_ppage_entry(ppat, PPAT_CACHED_INDEX, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));

etc.
-Chris
Wang, Zhi A Aug. 29, 2017, 11:31 a.m. UTC | #7
Thanks for the reply! 

For the hole, per my understanding, the author wanted the mapping to be consistent: i915 cache level <-> PPAT index <-> cache attribute in IA page table in case the GPU and IA may share page tables in future, since actually PPAT index is represented as PAT/PCD/PWT bits in GPU page table entries. We can see the PPAT index is defined with those PAT/PCD/PWT bits from IA page tables. Maybe that's the reason of the hole. :)

Thanks,
Zhi. 

-----Original Message-----
From: Chris Wilson [mailto:chris@chris-wilson.co.uk] 

Sent: Tuesday, August 29, 2017 2:24 PM
To: Wang, Zhi A <zhi.a.wang@intel.com>; intel-gfx@lists.freedesktop.org; intel-gvt-dev@lists.freedesktop.org
Cc: joonas.lahtinen@linux.intel.com; zhenyuw@linux.intel.com; Widawsky, Benjamin <benjamin.widawsky@intel.com>; Vivi, Rodrigo <rodrigo.vivi@intel.com>
Subject: RE: [RFCv5 2/2] drm/i915: Introduce private PAT management

Quoting Wang, Zhi A (2017-08-29 12:13:26)
> Hi Chris:

>     There is mapping between i915 cache level -> PPAT index. Currently it's:

> 

> static gen8_pte_t gen8_pte_encode(dma_addr_t addr,

>                                   enum i915_cache_level level) { ...

>         switch (level) {

>         case I915_CACHE_NONE:

>                 pte |= PPAT_UNCACHED_INDEX;

>                 break;

>         case I915_CACHE_WT:

>                 pte |= PPAT_DISPLAY_ELLC_INDEX;

>                 break;

>         default:

>                 pte |= PPAT_CACHED_INDEX;

>                 break;

>         }

> ...

> 

> According to bspec, the PPAT index filled in the page table is calculated as:

> 

> PPAT index = 4 * PAT + 2 * PCD + PWT

> 

> In the i915_gem_gtt.c

> 

> #define PPAT_UNCACHED_INDEX             (_PAGE_PWT | _PAGE_PCD)     // PPAT INDEX =  1 + 2 * 1 = 3

> #define PPAT_CACHED_PDE_INDEX           0 /* WB LLC */                  //  PPAT INDEX = 0

> #define PPAT_CACHED_INDEX               _PAGE_PAT /* WB LLCeLLC */      // PPAT INDEX = 4 * 1 = 4

> #define PPAT_DISPLAY_ELLC_INDEX         _PAGE_PCD /* WT eLLC */           // PPAT INDEX = 2 * 1 = 2

> 

> Actually the PPAT index used by i915 are: 0 2 3 4, which has already been reserved in the RFC patches.


So we can use these values in alloc_ppat, right? Still very concerned about the hole -- it kind of implies there is an entry we should be using but have forgotten!

> > > @@ -2864,17 +3038,22 @@ static void bdw_setup_private_ppat(struct drm_i915_private *dev_priv)

> > >                  * So we can still hold onto all our assumptions wrt cpu

> > >                  * clflushing on LLC machines.

> > >                  */

> > > -               pat = GEN8_PPAT(0, GEN8_PPAT_UC);

> > > +               alloc_ppat_entry(ppat, 0, GEN8_PPAT_UC);

> > > +               return;

> > > +       }

> > >  

> > > -       /* XXX: spec defines this as 2 distinct registers. It's unclear if a 64b

> > > -        * write would work. */

> > > -       I915_WRITE(GEN8_PRIVATE_PAT_LO, pat);

> > > -       I915_WRITE(GEN8_PRIVATE_PAT_HI, pat >> 32);

> > > +       alloc_ppat_entry(ppat, 0, GEN8_PPAT_WB | GEN8_PPAT_LLC);     /* for normal objects, no eLLC */

> > > +       alloc_ppat_entry(ppat, 2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC); /* for scanout with eLLC */

> > > +       alloc_ppat_entry(ppat, 3, GEN8_PPAT_UC);                     /* Uncached objects, mostly for scanout */

> > > +       alloc_ppat_entry(ppat, 4, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC 

> > > + |


/* See gen8_pte_encode() for the mapping from cache-level to ppat */ alloc_ppage_entry(ppat, PPAT_CACHED_PDE_INDEX, GEN8_PPAT_WB | GEN8_PPAT_LLC); alloc_ppage_entry(ppat, PPAT_DISPLAY_ELLC_INDEX, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC); alloc_ppage_entry(ppat, PPAT_UNCACHED_INDEX, GEN8_PPAT_UC); alloc_ppage_entry(ppat, PPAT_CACHED_INDEX, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));

etc.
-Chris
Joonas Lahtinen Aug. 29, 2017, 11:38 a.m. UTC | #8
On Tue, 2017-08-29 at 12:23 +0100, Chris Wilson wrote:
> Quoting Wang, Zhi A (2017-08-29 12:13:26)
> > Hi Chris:
> >     There is mapping between i915 cache level -> PPAT index. Currently it's:
> > 
> > static gen8_pte_t gen8_pte_encode(dma_addr_t addr,
> >                                   enum i915_cache_level level)
> > {
> > ...
> >         switch (level) {
> >         case I915_CACHE_NONE:
> >                 pte |= PPAT_UNCACHED_INDEX;
> >                 break;
> >         case I915_CACHE_WT:
> >                 pte |= PPAT_DISPLAY_ELLC_INDEX;
> >                 break;
> >         default:
> >                 pte |= PPAT_CACHED_INDEX;
> >                 break;
> >         }
> > ...
> > 
> > According to bspec, the PPAT index filled in the page table is calculated as:
> > 
> > PPAT index = 4 * PAT + 2 * PCD + PWT
> > 
> > In the i915_gem_gtt.c
> > 
> > #define PPAT_UNCACHED_INDEX             (_PAGE_PWT | _PAGE_PCD)     // PPAT INDEX =  1 + 2 * 1 = 3
> > #define PPAT_CACHED_PDE_INDEX           0 /* WB LLC */                  //  PPAT INDEX = 0
> > #define PPAT_CACHED_INDEX               _PAGE_PAT /* WB LLCeLLC */      // PPAT INDEX = 4 * 1 = 4
> > #define PPAT_DISPLAY_ELLC_INDEX         _PAGE_PCD /* WT eLLC */           // PPAT INDEX = 2 * 1 = 2
> > 
> > Actually the PPAT index used by i915 are: 0 2 3 4, which has already been reserved in the RFC patches.
> 
> So we can use these values in alloc_ppat, right? Still very concerned
> about the hole -- it kind of implies there is an entry we should be
> using but have forgotten!
> 
> > > > @@ -2864,17 +3038,22 @@ static void bdw_setup_private_ppat(struct drm_i915_private *dev_priv)
> > > >                  * So we can still hold onto all our assumptions wrt cpu
> > > >                  * clflushing on LLC machines.
> > > >                  */
> > > > -               pat = GEN8_PPAT(0, GEN8_PPAT_UC);
> > > > +               alloc_ppat_entry(ppat, 0, GEN8_PPAT_UC);
> > > > +               return;
> > > > +       }
> > > >  
> > > > -       /* XXX: spec defines this as 2 distinct registers. It's unclear if a 64b
> > > > -        * write would work. */
> > > > -       I915_WRITE(GEN8_PRIVATE_PAT_LO, pat);
> > > > -       I915_WRITE(GEN8_PRIVATE_PAT_HI, pat >> 32);
> > > > +       alloc_ppat_entry(ppat, 0, GEN8_PPAT_WB | GEN8_PPAT_LLC);     /* for normal objects, no eLLC */
> > > > +       alloc_ppat_entry(ppat, 2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC); /* for scanout with eLLC */
> > > > +       alloc_ppat_entry(ppat, 3, GEN8_PPAT_UC);                     /* Uncached objects, mostly for scanout */
> > > > +       alloc_ppat_entry(ppat, 4, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | 
> 
> /* See gen8_pte_encode() for the mapping from cache-level to ppat */
> alloc_ppage_entry(ppat, PPAT_CACHED_PDE_INDEX, GEN8_PPAT_WB | GEN8_PPAT_LLC);
> alloc_ppage_entry(ppat, PPAT_DISPLAY_ELLC_INDEX, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);
> alloc_ppage_entry(ppat, PPAT_UNCACHED_INDEX, GEN8_PPAT_UC);
> alloc_ppage_entry(ppat, PPAT_CACHED_INDEX, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));
> 

+1 on that idea as follow-up patch which should do these entry changes.

I'd first make sure this patch leaves all actual register writes in
equal state as they were before the patch, just the code is
restructured. Bisecting will be easier if the actual added holes will
be brought in by separate patch.

Regards, Joonas
Chris Wilson Aug. 29, 2017, 12:12 p.m. UTC | #9
Quoting Zhi Wang (2017-08-29 09:00:51)
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index b74fa9d..b99b6ca 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -2816,41 +2816,215 @@ static int ggtt_probe_common(struct i915_ggtt *ggtt, u64 size)
>         return 0;
>  }
>  
> -static void cnl_setup_private_ppat(struct drm_i915_private *dev_priv)
> +static struct intel_ppat_entry *alloc_ppat_entry(struct intel_ppat *ppat,
> +                                                unsigned int index,
> +                                                u8 value)
>  {
> +       struct intel_ppat_entry *entry = &ppat->entries[index];

Considering the magic numbers flying around (using values defined
elsewhere) add

GEM_BUG_ON(index >= ppat->max_entries);
GEM_BUG_ON(test_bit(index, ppat->used));

> +
> +       entry->ppat = ppat;
> +       entry->value = value;
> +       kref_init(&entry->ref_count);
> +       set_bit(index, ppat->used);
> +       set_bit(index, ppat->dirty);
> +
> +       return entry;
> +}
> +
> +static void free_ppat_entry(struct intel_ppat_entry *entry)
> +{
> +       struct intel_ppat *ppat = entry->ppat;
> +       int index = entry - ppat->entries;

unsigned int index;

GEM_BUG_ON(index >= ppat->max_entries);
GEM_BUG_ON(!test_bit(index, ppat->used));

> +
> +       entry->value = ppat->dummy_value;
> +       clear_bit(index, ppat->used);
> +       set_bit(index, ppat->dirty);
> +}
Chris Wilson Aug. 29, 2017, 12:18 p.m. UTC | #10
Quoting Zhi Wang (2017-08-29 09:00:51)
> The private PAT management is to support PPAT entry manipulation. Two
> APIs are introduced for dynamically managing PPAT entries: intel_ppat_get
> and intel_ppat_put.
> 
> intel_ppat_get will search for an existing PPAT entry which perfectly
> matches the required PPAT value. If not, it will try to allocate or
> return a partially matched PPAT entry if there is any available PPAT
> indexes or not.
> 
> intel_ppat_put will put back the PPAT entry which comes from
> intel_ppat_get. If it's dynamically allocated, the reference count will
> be decreased. If the reference count turns into zero, the PPAT index is
> freed again.
> 
> Besides, another two callbacks are introduced to support the private PAT
> management framework. One is ppat->update(), which writes the PPAT
> configurations in ppat->entries into HW. Another one is ppat->match, which
> will return a score to show how two PPAT values match with each other.

Oh and since you are exporting an interface, I bet you would appreciate
it if we had some unittests in selftests/ ;)
-Chris
Wang, Zhi A Aug. 29, 2017, 12:41 p.m. UTC | #11
Sure. I'm digging selftest now.

-----Original Message-----
From: Chris Wilson [mailto:chris@chris-wilson.co.uk] 

Sent: Tuesday, August 29, 2017 3:18 PM
To: Wang, Zhi A <zhi.a.wang@intel.com>; intel-gfx@lists.freedesktop.org; intel-gvt-dev@lists.freedesktop.org
Cc: joonas.lahtinen@linux.intel.com; zhenyuw@linux.intel.com; Wang, Zhi A <zhi.a.wang@intel.com>; Widawsky, Benjamin <benjamin.widawsky@intel.com>; Vivi, Rodrigo <rodrigo.vivi@intel.com>
Subject: Re: [RFCv5 2/2] drm/i915: Introduce private PAT management

Quoting Zhi Wang (2017-08-29 09:00:51)
> The private PAT management is to support PPAT entry manipulation. Two 

> APIs are introduced for dynamically managing PPAT entries: 

> intel_ppat_get and intel_ppat_put.

> 

> intel_ppat_get will search for an existing PPAT entry which perfectly 

> matches the required PPAT value. If not, it will try to allocate or 

> return a partially matched PPAT entry if there is any available PPAT 

> indexes or not.

> 

> intel_ppat_put will put back the PPAT entry which comes from 

> intel_ppat_get. If it's dynamically allocated, the reference count 

> will be decreased. If the reference count turns into zero, the PPAT 

> index is freed again.

> 

> Besides, another two callbacks are introduced to support the private 

> PAT management framework. One is ppat->update(), which writes the PPAT 

> configurations in ppat->entries into HW. Another one is ppat->match, 

> which will return a score to show how two PPAT values match with each other.


Oh and since you are exporting an interface, I bet you would appreciate it if we had some unittests in selftests/ ;) -Chris
Wang, Zhi A Aug. 29, 2017, 5:54 p.m. UTC | #12
Another finding during the re-factoring are:

a)It looks like that the PPAT_CACHE_INDEX on BDW/SKL is mapped to:
GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0);

But the PPAT_CACHE_INDEX on CNL is mapped to 
GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0);

GEN8_PPAT_WB is missing here and by default the cache attribute is UC.

Is this set intentionally?

b) Looks like all the ages of PPAT in windows driver is AGE(3) because of some performance gains, is there any reason that i915 has to set it to AGE(0)?

Thanks,
Zhi.

-----Original Message-----
From: Wang, Zhi A 

Sent: Tuesday, August 29, 2017 2:19 PM
To: 'Joonas Lahtinen' <joonas.lahtinen@linux.intel.com>; intel-gfx@lists.freedesktop.org; intel-gvt-dev@lists.freedesktop.org
Cc: chris@chris-wilson.co.uk; zhenyuw@linux.intel.com; Widawsky, Benjamin <benjamin.widawsky@intel.com>; Vivi, Rodrigo <rodrigo.vivi@intel.com>
Subject: RE: [RFCv5 2/2] drm/i915: Introduce private PAT management

Thanks Joonas! :)

-----Original Message-----
From: Joonas Lahtinen [mailto:joonas.lahtinen@linux.intel.com]

Sent: Tuesday, August 29, 2017 12:37 PM
To: Wang, Zhi A <zhi.a.wang@intel.com>; intel-gfx@lists.freedesktop.org; intel-gvt-dev@lists.freedesktop.org
Cc: chris@chris-wilson.co.uk; zhenyuw@linux.intel.com; Widawsky, Benjamin <benjamin.widawsky@intel.com>; Vivi, Rodrigo <rodrigo.vivi@intel.com>
Subject: Re: [RFCv5 2/2] drm/i915: Introduce private PAT management

On Tue, 2017-08-29 at 16:00 +0800, Zhi Wang wrote:
> The private PAT management is to support PPAT entry manipulation. Two 

> APIs are introduced for dynamically managing PPAT entries:

> intel_ppat_get and intel_ppat_put.

> 

> intel_ppat_get will search for an existing PPAT entry which perfectly 

> matches the required PPAT value. If not, it will try to allocate or 

> return a partially matched PPAT entry if there is any available PPAT 

> indexes or not.

> 

> intel_ppat_put will put back the PPAT entry which comes from 

> intel_ppat_get. If it's dynamically allocated, the reference count 

> will be decreased. If the reference count turns into zero, the PPAT 

> index is freed again.

> 

> Besides, another two callbacks are introduced to support the private 

> PAT management framework. One is ppat->update(), which writes the PPAT 

> configurations in ppat->entries into HW. Another one is ppat->match, 

> which will return a score to show how two PPAT values match with each other.

> 

> v5:

> 

> - Add check and warnnings for those platforms which don't have PPAT.

> 

> v3:

> 

> - Introduce dirty bitmap for PPAT registers. (Chris)

> - Change the name of the pointer "dev_priv" to "i915". (Chris)

> - intel_ppat_{get, put} returns/takes a const intel_ppat_entry *. 

> (Chris)

> 

> v2:

> 

> - API re-design. (Chris)

> 

> Cc: Ben Widawsky <benjamin.widawsky@intel.com>

> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>

> Cc: Chris Wilson <chris@chris-wilson.co.uk>

> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

> Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>


<SNIP>

> -static void cnl_setup_private_ppat(struct drm_i915_private *dev_priv)

> +static struct intel_ppat_entry *alloc_ppat_entry(struct intel_ppat *ppat,

> +						 unsigned int index,

> +						 u8 value)

>  {

> +	struct intel_ppat_entry *entry = &ppat->entries[index];

> +

> +	entry->ppat = ppat;

> +	entry->value = value;

> +	kref_init(&entry->ref_count);

> +	set_bit(index, ppat->used);

> +	set_bit(index, ppat->dirty);

> +

> +	return entry;

> +}

> +

> +static void free_ppat_entry(struct intel_ppat_entry *entry) {

> +	struct intel_ppat *ppat = entry->ppat;

> +	int index = entry - ppat->entries;

> +

> +	entry->value = ppat->dummy_value;

> +	clear_bit(index, ppat->used);

> +	set_bit(index, ppat->dirty);

> +}


Above functions should be __ prefixed as they do no checking if they override existing data. The suitable names might be __ppat_get and __ppat_put.

> +

> +/**

> + * intel_ppat_get - get a usable PPAT entry

> + * @i915: i915 device instance

> + * @value: the PPAT value required by the caller

> + *

> + * The function tries to search if there is an existing PPAT entry 

> +which

> + * matches with the required value. If perfectly matched, the 

> +existing PPAT

> + * entry will be used. If only partially matched, it will try to 

> +check if

> + * there is any available PPAT index. If yes, it will allocate a new 

> +PPAT

> + * index for the required entry and update the HW. If not, the 

> +partially

> + * matched entry will be used.

> + */

> +const struct intel_ppat_entry *intel_ppat_get(struct drm_i915_private 

> +*i915,


Maybe split the function type and name to avoid exceeding 80 chars on next line.

> +					      u8 value)

> +{

> +	struct intel_ppat *ppat = &i915->ppat;

> +	struct intel_ppat_entry *entry;

> +	int i, used;

> +	unsigned int score, best_score;

> +

> +	if (WARN_ON(!ppat->max_entries))

> +		return ERR_PTR(-ENODEV);


No need for extra check like this, this will just lead to ENOSPC.

> +

> +	score = best_score = 0;

> +	used = 0;


This variable behaves more like scanned.

> +

> +	/* First, find a suitable value from available entries */


The next two lines repeat this information, no need to document "what".

> +	for_each_set_bit(i, ppat->used, ppat->max_entries) {


		score could be declared in this scope.

> +		score = ppat->match(ppat->entries[i].value, value);

> +		/* Perfect match */


This comment is literally repeating what code says.

> +		if (score == INTEL_PPAT_PERFECT_MATCH) {

> +			entry = &ppat->entries[i];

> +			kref_get(&entry->ref_count);

> +			return entry;

> +		}

> +

> +		if (score > best_score) {

> +			entry = &ppat->entries[i];


Above could be simplified:

			if (score == INTEL_PPAT_PERFECT_MATCH) {
				kref_get(&entry->ref);
				return entry;
			}

> +			best_score = score;

> +		}

> +		used++;

> +	}

> +

> +	/* No matched entry and we can't allocate a new entry. */


DRM_ERROR replicates this comment's information.

> +	if (!best_score && used == ppat->max_entries) {

> +		DRM_ERROR("Fail to get PPAT entry\n");


DRM_DEBUG_DRIVER at most.

> +		return ERR_PTR(-ENOSPC);

> +	}

> +

> +	/*

> +	 * Found a matched entry which is not perfect,

> +	 * and we can't allocate a new entry.

> +	 */

> +	if (best_score && used == ppat->max_entries) {

> +		kref_get(&entry->ref_count);

> +		return entry;

> +	}


Above code could be combined:

	if (scanned == ppat->max_entries) {
		if(!best_score)
			return ERR_PTR(-ENOSPC);
		kref_get(&entry->ref);
		return entry;
	}

> +

> +	/* Allocate a new entry */


This comment is also just telling "what", which we can see from code.

> +	i = find_first_zero_bit(ppat->used, ppat->max_entries);

> +	entry = alloc_ppat_entry(ppat, i, value);

> +	ppat->update(i915);

> +	return entry;

> +}

> +

> +static void put_ppat(struct kref *kref)


ppat_release might cause less confusion, otherwise there will be 3 put functions.

> +{

> +	struct intel_ppat_entry *entry =

> +		container_of(kref, struct intel_ppat_entry, ref_count);

> +	struct drm_i915_private *i915 = entry->ppat->i915;

> +

> +	free_ppat_entry(entry);

> +	entry->ppat->update(i915);

> +}

> +

> +/**

> + * intel_ppat_put - put back the PPAT entry got from intel_ppat_get()

> + * @entry: an intel PPAT entry

> + *

> + * Put back the PPAT entry got from intel_ppat_get(). If the PPAT 

> +index of the

> + * entry is dynamically allocated, its reference count will be 

> +decreased. Once

> + * the reference count becomes into zero, the PPAT index becomes free again.

> + */

> +void intel_ppat_put(const struct intel_ppat_entry *entry) {

> +	struct intel_ppat *ppat = entry->ppat;

> +	int index = entry - ppat->entries;

> +

> +	if (WARN_ON(!ppat->max_entries))

> +		return;


This is clearly a kernel programmer error (and a serious one, so could be GEM_BUG_ON).

> +

> +	kref_put(&ppat->entries[index].ref_count, put_ppat); }

> +

> +static void cnl_private_pat_update(struct drm_i915_private *dev_priv) 

> +{

> +	struct intel_ppat *ppat = &dev_priv->ppat;

> +	int i;

> +

> +	for_each_set_bit(i, ppat->dirty, ppat->max_entries) {

> +		clear_bit(i, ppat->dirty);

> +		I915_WRITE(GEN10_PAT_INDEX(i), ppat->entries[i].value);


Clearing the bit after write is the logical thing to do.

> +	}

> +}

> +

> +static void bdw_private_pat_update(struct drm_i915_private *dev_priv) 

> +{

> +	struct intel_ppat *ppat = &dev_priv->ppat;

> +	u64 pat = 0;

> +	int i;

> +

> +	for (i = 0; i < ppat->max_entries; i++)

> +		pat |= GEN8_PPAT(i, ppat->entries[i].value);

> +

> +	bitmap_clear(ppat->dirty, 0, ppat->max_entries);

> +

> +	I915_WRITE(GEN8_PRIVATE_PAT_LO, pat);

> +	I915_WRITE(GEN8_PRIVATE_PAT_HI, pat >> 32);


While moving the code, lower_32_bits and upper_32_bits, it should really warn without them?

> +}

> +

> +#define gen8_pat_ca(v) ((v) & 0x3)

> +#define gen8_pat_tc(v) (((v) >> 2) & 0x3) #define gen8_pat_age(v)

> +(((v) >> 4) & 0x3)


Magic numbers. I'm also not 100% sure if KMD will want partial matches, or if GVT should pass in a scoring function.

> +

> +static unsigned int bdw_private_pat_match(u8 src, u8 dst) {

> +	unsigned int score = 0;

> +

> +	/* Cache attribute has to be matched. */

> +	if (gen8_pat_ca(src) != gen8_pat_ca(dst))

> +		return 0;

> +

> +	if (gen8_pat_age(src) == gen8_pat_age(dst))

> +		score += 1;

> +

> +	if (gen8_pat_tc(src) == gen8_pat_tc(dst))

> +		score += 2;


I'd lift this check to have them all in importance order.

> +

> +	if (score == 3)

> +		return INTEL_PPAT_PERFECT_MATCH;

> +

> +	return score;

> +}

> +

> +#define chv_get_snoop(v) (((v) >> 6) & 0x1)


Magic numbers, which need to be in i915_reg.h with #define. Should also be chv_pat_snoop();

> +

> +static unsigned int chv_private_pat_match(u8 src, u8 dst) {

> +	if (chv_get_snoop(src) == chv_get_snoop(dst))

> +		return INTEL_PPAT_PERFECT_MATCH;

> +

> +	return 0;


	return chv_pat_snoop(src) == chv_pat_snoop(dst) ?
	       INTEL_PPAT_PERFECT_MATCH : 0;

> +}

> +

> +static void cnl_setup_private_ppat(struct intel_ppat *ppat) {

> +	ppat->max_entries = 8;

> +	ppat->update = cnl_private_pat_update;

> +	ppat->match = bdw_private_pat_match;

> +	ppat->dummy_value = GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3);


"dummy_value" is not a very descriptive name.

> +

>  	/* XXX: spec is unclear if this is still needed for CNL+ */

> -	if (!USES_PPGTT(dev_priv)) {

> -		I915_WRITE(GEN10_PAT_INDEX(0), GEN8_PPAT_UC);

> +	if (!USES_PPGTT(ppat->i915)) {

> +		alloc_ppat_entry(ppat, 0, GEN8_PPAT_UC);

>  		return;

>  	}

>  

> -	I915_WRITE(GEN10_PAT_INDEX(0), GEN8_PPAT_WB | GEN8_PPAT_LLC);

> -	I915_WRITE(GEN10_PAT_INDEX(1), GEN8_PPAT_WC | GEN8_PPAT_LLCELLC);

> -	I915_WRITE(GEN10_PAT_INDEX(2), GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);

> -	I915_WRITE(GEN10_PAT_INDEX(3), GEN8_PPAT_UC);

> -	I915_WRITE(GEN10_PAT_INDEX(4), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));

> -	I915_WRITE(GEN10_PAT_INDEX(5), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1));

> -	I915_WRITE(GEN10_PAT_INDEX(6), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2));

> -	I915_WRITE(GEN10_PAT_INDEX(7), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3));

> +	alloc_ppat_entry(ppat, 0, GEN8_PPAT_WB | GEN8_PPAT_LLC);

> +	alloc_ppat_entry(ppat, 2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);

> +	alloc_ppat_entry(ppat, 3, GEN8_PPAT_UC);

> +	alloc_ppat_entry(ppat, 4, GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));


Maybe I'm not following at all, but this seems like quite a change to me? 0-3 used to be WB, WC, WT, UC, now 1 is unset, like is 5-7.

I'd leave all changes to registers to a later patch and leave this patch to do what the title says, "Introduce private PAT management".

>  static void setup_private_pat(struct drm_i915_private *dev_priv)  {

> +	struct intel_ppat *ppat = &dev_priv->ppat;

> +	int i;

> +

> +	ppat->i915 = dev_priv;

> +

> +	/* Load per-platform PPAT configurations */


This comment is again just taking space.

>  	if (INTEL_GEN(dev_priv) >= 10)

> -		cnl_setup_private_ppat(dev_priv);

> +		cnl_setup_private_ppat(ppat);

>  	else if (IS_CHERRYVIEW(dev_priv) || IS_GEN9_LP(dev_priv))

> -		chv_setup_private_ppat(dev_priv);

> +		chv_setup_private_ppat(ppat);

>  	else

> -		bdw_setup_private_ppat(dev_priv);

> +		bdw_setup_private_ppat(ppat);

> +

> +	GEM_BUG_ON(ppat->max_entries > INTEL_MAX_PPAT_ENTRIES);

> +

> +	if (!ppat->max_entries)

> +		return;


I'm not sure why we have "!max_entries" checks going, we should have one GEM_BUG_ON at this point and in the rest you can assume it to be nonzero.

> +

> +	/* Fill unused PPAT entries with dummy PPAT value */

> +	for_each_clear_bit(i, ppat->used, ppat->max_entries) {

> +		ppat->entries[i].value = ppat->dummy_value;

> +		ppat->entries[i].ppat = ppat;

> +		set_bit(i, ppat->dirty);

> +	}

> +

> +	/* Write the HW */


If the callback was named update_hw(), this comment would be unnecessary.

> +	ppat->update(dev_priv);

>  }


<SNIP>

> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h

> @@ -536,6 +536,40 @@ i915_vm_to_ggtt(struct i915_address_space *vm)

>  	return container_of(vm, struct i915_ggtt, base);  }

>  

> +#define INTEL_MAX_PPAT_ENTRIES 8

> +#define INTEL_PPAT_PERFECT_MATCH (~0U)

> +

> +struct intel_ppat;

> +

> +struct intel_ppat_entry {

> +	struct intel_ppat *ppat;

> +	struct kref ref_count;


	Just "ref", like elsewhere in code.

> +	u8 value;

> +};

> +

> +struct intel_ppat {

> +	struct intel_ppat_entry entries[INTEL_MAX_PPAT_ENTRIES];

> +	DECLARE_BITMAP(used, INTEL_MAX_PPAT_ENTRIES);

> +	DECLARE_BITMAP(dirty, INTEL_MAX_PPAT_ENTRIES);

> +


This really goes with the previous group of variable. So no newline.

> +	unsigned int max_entries;

> +

> +	u8 dummy_value;


Better name, like "clear_value" and short description may be useful.

> +	/*

> +	 * Return a score to show how two PPAT values match,

> +	 * a INTEL_PPAT_PERFECT_MATCH indicates a perfect match

> +	 */

> +	unsigned int (*match)(u8 src, u8 dst);

> +	/* Write the PPAT configuration into HW. */

> +	void (*update)(struct drm_i915_private *i915);

> +

> +	struct drm_i915_private *i915;

> +};

> +

> +const struct intel_ppat_entry *intel_ppat_get(struct drm_i915_private 

> +*i915,


Same here with type vs name.

> +					      u8 value);

> +void intel_ppat_put(const struct intel_ppat_entry *entry);


Regards, joonas
--
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
Chris Wilson Aug. 29, 2017, 6:14 p.m. UTC | #13
Quoting Wang, Zhi A (2017-08-29 18:54:51)
> Another finding during the re-factoring are:
> 
> a)It looks like that the PPAT_CACHE_INDEX on BDW/SKL is mapped to:
> GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0);
> 
> But the PPAT_CACHE_INDEX on CNL is mapped to 
> GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0);
> 
> GEN8_PPAT_WB is missing here and by default the cache attribute is UC.
> 
> Is this set intentionally?

That sounds like a nasty little bug.

> b) Looks like all the ages of PPAT in windows driver is AGE(3) because of some performance gains, is there any reason that i915 has to set it to AGE(0)?

Nope, it's never been rigorously tested. On occasion, we've swapped it
around (at least for the older gen) and never found a significant
difference; I haven't even heard if anyone has tried such experiments
on gen8+. Off the top of my head, the age should only matter when you
have PTE with different ages (unless there's some automatic clock
algorithm tracking the age on each page in a shadow, the challenge
being then when you decide to refresh the age from the PTE.)
-Chris
Wang, Zhi A Aug. 29, 2017, 6:19 p.m. UTC | #14
I see. 

For 1) I can fix it in the next RFC.
For 2) I can find some VPG guys to ask for the details.

Thanks,
Zhi.

-----Original Message-----
From: Chris Wilson [mailto:chris@chris-wilson.co.uk] 

Sent: Tuesday, August 29, 2017 9:14 PM
To: Wang, Zhi A <zhi.a.wang@intel.com>; Joonas Lahtinen <joonas.lahtinen@linux.intel.com>; intel-gfx@lists.freedesktop.org; intel-gvt-dev@lists.freedesktop.org
Cc: zhenyuw@linux.intel.com; Widawsky, Benjamin <benjamin.widawsky@intel.com>; Vivi, Rodrigo <rodrigo.vivi@intel.com>
Subject: RE: [RFCv5 2/2] drm/i915: Introduce private PAT management

Quoting Wang, Zhi A (2017-08-29 18:54:51)
> Another finding during the re-factoring are:

> 

> a)It looks like that the PPAT_CACHE_INDEX on BDW/SKL is mapped to:

> GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0);

> 

> But the PPAT_CACHE_INDEX on CNL is mapped to GEN8_PPAT_LLCELLC | 

> GEN8_PPAT_AGE(0);

> 

> GEN8_PPAT_WB is missing here and by default the cache attribute is UC.

> 

> Is this set intentionally?


That sounds like a nasty little bug.

> b) Looks like all the ages of PPAT in windows driver is AGE(3) because of some performance gains, is there any reason that i915 has to set it to AGE(0)?


Nope, it's never been rigorously tested. On occasion, we've swapped it around (at least for the older gen) and never found a significant difference; I haven't even heard if anyone has tried such experiments on gen8+. Off the top of my head, the age should only matter when you have PTE with different ages (unless there's some automatic clock algorithm tracking the age on each page in a shadow, the challenge being then when you decide to refresh the age from the PTE.) -Chris
Joonas Lahtinen Aug. 30, 2017, 11:09 a.m. UTC | #15
On Tue, 2017-08-29 at 18:19 +0000, Wang, Zhi A wrote:
> I see. 
> 
> For 1) I can fix it in the next RFC.

Please send a separate bugfix for this so we can proceed to test and
merge immediately.

Regards, Joonas

> For 2) I can find some VPG guys to ask for the details.
> 
> Thanks,
> Zhi.
> 
> -----Original Message-----
> From: Chris Wilson [mailto:chris@chris-wilson.co.uk] 
> Sent: Tuesday, August 29, 2017 9:14 PM
> To: Wang, Zhi A <zhi.a.wang@intel.com>; Joonas Lahtinen <joonas.lahtinen@linux.intel.com>; intel-gfx@lists.freedesktop.org; intel-gvt-dev@lists.freedesktop.org
> Cc: zhenyuw@linux.intel.com; Widawsky, Benjamin <benjamin.widawsky@intel.com>; Vivi, Rodrigo <rodrigo.vivi@intel.com>
> Subject: RE: [RFCv5 2/2] drm/i915: Introduce private PAT management
> 
> Quoting Wang, Zhi A (2017-08-29 18:54:51)
> > Another finding during the re-factoring are:
> > 
> > a)It looks like that the PPAT_CACHE_INDEX on BDW/SKL is mapped to:
> > GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0);
> > 
> > But the PPAT_CACHE_INDEX on CNL is mapped to GEN8_PPAT_LLCELLC | 
> > GEN8_PPAT_AGE(0);
> > 
> > GEN8_PPAT_WB is missing here and by default the cache attribute is UC.
> > 
> > Is this set intentionally?
> 
> That sounds like a nasty little bug.
> 
> > b) Looks like all the ages of PPAT in windows driver is AGE(3) because of some performance gains, is there any reason that i915 has to set it to AGE(0)?
> 
> Nope, it's never been rigorously tested. On occasion, we've swapped it around (at least for the older gen) and never found a significant difference; I haven't even heard if anyone has tried such experiments on gen8+. Off the top of my head, the age should only matter when you have PTE with different ages (unless there's some automatic clock algorithm tracking the age on each page in a shadow, the challenge being then when you decide to refresh the age from the PTE.) -Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 7587ef5..5ffde10 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2312,6 +2312,8 @@  struct drm_i915_private {
 	DECLARE_HASHTABLE(mm_structs, 7);
 	struct mutex mm_lock;
 
+	struct intel_ppat ppat;
+
 	/* Kernel Modesetting */
 
 	struct intel_crtc *plane_to_crtc_mapping[I915_MAX_PIPES];
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index b74fa9d..b99b6ca 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -2816,41 +2816,215 @@  static int ggtt_probe_common(struct i915_ggtt *ggtt, u64 size)
 	return 0;
 }
 
-static void cnl_setup_private_ppat(struct drm_i915_private *dev_priv)
+static struct intel_ppat_entry *alloc_ppat_entry(struct intel_ppat *ppat,
+						 unsigned int index,
+						 u8 value)
 {
+	struct intel_ppat_entry *entry = &ppat->entries[index];
+
+	entry->ppat = ppat;
+	entry->value = value;
+	kref_init(&entry->ref_count);
+	set_bit(index, ppat->used);
+	set_bit(index, ppat->dirty);
+
+	return entry;
+}
+
+static void free_ppat_entry(struct intel_ppat_entry *entry)
+{
+	struct intel_ppat *ppat = entry->ppat;
+	int index = entry - ppat->entries;
+
+	entry->value = ppat->dummy_value;
+	clear_bit(index, ppat->used);
+	set_bit(index, ppat->dirty);
+}
+
+/**
+ * intel_ppat_get - get a usable PPAT entry
+ * @i915: i915 device instance
+ * @value: the PPAT value required by the caller
+ *
+ * The function tries to search if there is an existing PPAT entry which
+ * matches with the required value. If perfectly matched, the existing PPAT
+ * entry will be used. If only partially matched, it will try to check if
+ * there is any available PPAT index. If yes, it will allocate a new PPAT
+ * index for the required entry and update the HW. If not, the partially
+ * matched entry will be used.
+ */
+const struct intel_ppat_entry *intel_ppat_get(struct drm_i915_private *i915,
+					      u8 value)
+{
+	struct intel_ppat *ppat = &i915->ppat;
+	struct intel_ppat_entry *entry;
+	int i, used;
+	unsigned int score, best_score;
+
+	if (WARN_ON(!ppat->max_entries))
+		return ERR_PTR(-ENODEV);
+
+	score = best_score = 0;
+	used = 0;
+
+	/* First, find a suitable value from available entries */
+	for_each_set_bit(i, ppat->used, ppat->max_entries) {
+		score = ppat->match(ppat->entries[i].value, value);
+		/* Perfect match */
+		if (score == INTEL_PPAT_PERFECT_MATCH) {
+			entry = &ppat->entries[i];
+			kref_get(&entry->ref_count);
+			return entry;
+		}
+
+		if (score > best_score) {
+			entry = &ppat->entries[i];
+			best_score = score;
+		}
+		used++;
+	}
+
+	/* No matched entry and we can't allocate a new entry. */
+	if (!best_score && used == ppat->max_entries) {
+		DRM_ERROR("Fail to get PPAT entry\n");
+		return ERR_PTR(-ENOSPC);
+	}
+
+	/*
+	 * Found a matched entry which is not perfect,
+	 * and we can't allocate a new entry.
+	 */
+	if (best_score && used == ppat->max_entries) {
+		kref_get(&entry->ref_count);
+		return entry;
+	}
+
+	/* Allocate a new entry */
+	i = find_first_zero_bit(ppat->used, ppat->max_entries);
+	entry = alloc_ppat_entry(ppat, i, value);
+	ppat->update(i915);
+	return entry;
+}
+
+static void put_ppat(struct kref *kref)
+{
+	struct intel_ppat_entry *entry =
+		container_of(kref, struct intel_ppat_entry, ref_count);
+	struct drm_i915_private *i915 = entry->ppat->i915;
+
+	free_ppat_entry(entry);
+	entry->ppat->update(i915);
+}
+
+/**
+ * intel_ppat_put - put back the PPAT entry got from intel_ppat_get()
+ * @entry: an intel PPAT entry
+ *
+ * Put back the PPAT entry got from intel_ppat_get(). If the PPAT index of the
+ * entry is dynamically allocated, its reference count will be decreased. Once
+ * the reference count becomes into zero, the PPAT index becomes free again.
+ */
+void intel_ppat_put(const struct intel_ppat_entry *entry)
+{
+	struct intel_ppat *ppat = entry->ppat;
+	int index = entry - ppat->entries;
+
+	if (WARN_ON(!ppat->max_entries))
+		return;
+
+	kref_put(&ppat->entries[index].ref_count, put_ppat);
+}
+
+static void cnl_private_pat_update(struct drm_i915_private *dev_priv)
+{
+	struct intel_ppat *ppat = &dev_priv->ppat;
+	int i;
+
+	for_each_set_bit(i, ppat->dirty, ppat->max_entries) {
+		clear_bit(i, ppat->dirty);
+		I915_WRITE(GEN10_PAT_INDEX(i), ppat->entries[i].value);
+	}
+}
+
+static void bdw_private_pat_update(struct drm_i915_private *dev_priv)
+{
+	struct intel_ppat *ppat = &dev_priv->ppat;
+	u64 pat = 0;
+	int i;
+
+	for (i = 0; i < ppat->max_entries; i++)
+		pat |= GEN8_PPAT(i, ppat->entries[i].value);
+
+	bitmap_clear(ppat->dirty, 0, ppat->max_entries);
+
+	I915_WRITE(GEN8_PRIVATE_PAT_LO, pat);
+	I915_WRITE(GEN8_PRIVATE_PAT_HI, pat >> 32);
+}
+
+#define gen8_pat_ca(v) ((v) & 0x3)
+#define gen8_pat_tc(v) (((v) >> 2) & 0x3)
+#define gen8_pat_age(v) (((v) >> 4) & 0x3)
+
+static unsigned int bdw_private_pat_match(u8 src, u8 dst)
+{
+	unsigned int score = 0;
+
+	/* Cache attribute has to be matched. */
+	if (gen8_pat_ca(src) != gen8_pat_ca(dst))
+		return 0;
+
+	if (gen8_pat_age(src) == gen8_pat_age(dst))
+		score += 1;
+
+	if (gen8_pat_tc(src) == gen8_pat_tc(dst))
+		score += 2;
+
+	if (score == 3)
+		return INTEL_PPAT_PERFECT_MATCH;
+
+	return score;
+}
+
+#define chv_get_snoop(v) (((v) >> 6) & 0x1)
+
+static unsigned int chv_private_pat_match(u8 src, u8 dst)
+{
+	if (chv_get_snoop(src) == chv_get_snoop(dst))
+		return INTEL_PPAT_PERFECT_MATCH;
+
+	return 0;
+}
+
+static void cnl_setup_private_ppat(struct intel_ppat *ppat)
+{
+	ppat->max_entries = 8;
+	ppat->update = cnl_private_pat_update;
+	ppat->match = bdw_private_pat_match;
+	ppat->dummy_value = GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3);
+
 	/* XXX: spec is unclear if this is still needed for CNL+ */
-	if (!USES_PPGTT(dev_priv)) {
-		I915_WRITE(GEN10_PAT_INDEX(0), GEN8_PPAT_UC);
+	if (!USES_PPGTT(ppat->i915)) {
+		alloc_ppat_entry(ppat, 0, GEN8_PPAT_UC);
 		return;
 	}
 
-	I915_WRITE(GEN10_PAT_INDEX(0), GEN8_PPAT_WB | GEN8_PPAT_LLC);
-	I915_WRITE(GEN10_PAT_INDEX(1), GEN8_PPAT_WC | GEN8_PPAT_LLCELLC);
-	I915_WRITE(GEN10_PAT_INDEX(2), GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);
-	I915_WRITE(GEN10_PAT_INDEX(3), GEN8_PPAT_UC);
-	I915_WRITE(GEN10_PAT_INDEX(4), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));
-	I915_WRITE(GEN10_PAT_INDEX(5), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1));
-	I915_WRITE(GEN10_PAT_INDEX(6), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2));
-	I915_WRITE(GEN10_PAT_INDEX(7), GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3));
+	alloc_ppat_entry(ppat, 0, GEN8_PPAT_WB | GEN8_PPAT_LLC);
+	alloc_ppat_entry(ppat, 2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);
+	alloc_ppat_entry(ppat, 3, GEN8_PPAT_UC);
+	alloc_ppat_entry(ppat, 4, GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));
 }
 
 /* The GGTT and PPGTT need a private PPAT setup in order to handle cacheability
  * bits. When using advanced contexts each context stores its own PAT, but
  * writing this data shouldn't be harmful even in those cases. */
-static void bdw_setup_private_ppat(struct drm_i915_private *dev_priv)
+static void bdw_setup_private_ppat(struct intel_ppat *ppat)
 {
-	u64 pat;
-
-	pat = GEN8_PPAT(0, GEN8_PPAT_WB | GEN8_PPAT_LLC)     | /* for normal objects, no eLLC */
-	      GEN8_PPAT(1, GEN8_PPAT_WC | GEN8_PPAT_LLCELLC) | /* for something pointing to ptes? */
-	      GEN8_PPAT(2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC) | /* for scanout with eLLC */
-	      GEN8_PPAT(3, GEN8_PPAT_UC)                     | /* Uncached objects, mostly for scanout */
-	      GEN8_PPAT(4, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0)) |
-	      GEN8_PPAT(5, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1)) |
-	      GEN8_PPAT(6, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2)) |
-	      GEN8_PPAT(7, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3));
+	ppat->max_entries = 8;
+	ppat->update = bdw_private_pat_update;
+	ppat->match = bdw_private_pat_match;
+	ppat->dummy_value = GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3);
 
-	if (!USES_PPGTT(dev_priv))
+	if (!USES_PPGTT(dev_priv)) {
 		/* Spec: "For GGTT, there is NO pat_sel[2:0] from the entry,
 		 * so RTL will always use the value corresponding to
 		 * pat_sel = 000".
@@ -2864,17 +3038,22 @@  static void bdw_setup_private_ppat(struct drm_i915_private *dev_priv)
 		 * So we can still hold onto all our assumptions wrt cpu
 		 * clflushing on LLC machines.
 		 */
-		pat = GEN8_PPAT(0, GEN8_PPAT_UC);
+		alloc_ppat_entry(ppat, 0, GEN8_PPAT_UC);
+		return;
+	}
 
-	/* XXX: spec defines this as 2 distinct registers. It's unclear if a 64b
-	 * write would work. */
-	I915_WRITE(GEN8_PRIVATE_PAT_LO, pat);
-	I915_WRITE(GEN8_PRIVATE_PAT_HI, pat >> 32);
+	alloc_ppat_entry(ppat, 0, GEN8_PPAT_WB | GEN8_PPAT_LLC);     /* for normal objects, no eLLC */
+	alloc_ppat_entry(ppat, 2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC); /* for scanout with eLLC */
+	alloc_ppat_entry(ppat, 3, GEN8_PPAT_UC);		     /* Uncached objects, mostly for scanout */
+	alloc_ppat_entry(ppat, 4, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));
 }
 
-static void chv_setup_private_ppat(struct drm_i915_private *dev_priv)
+static void chv_setup_private_ppat(struct intel_ppat *ppat)
 {
-	u64 pat;
+	ppat->max_entries = 8;
+	ppat->update = bdw_private_pat_update;
+	ppat->match = chv_private_pat_match;
+	ppat->dummy_value = CHV_PPAT_SNOOP;
 
 	/*
 	 * Map WB on BDW to snooped on CHV.
@@ -2894,17 +3073,11 @@  static void chv_setup_private_ppat(struct drm_i915_private *dev_priv)
 	 * Which means we must set the snoop bit in PAT entry 0
 	 * in order to keep the global status page working.
 	 */
-	pat = GEN8_PPAT(0, CHV_PPAT_SNOOP) |
-	      GEN8_PPAT(1, 0) |
-	      GEN8_PPAT(2, 0) |
-	      GEN8_PPAT(3, 0) |
-	      GEN8_PPAT(4, CHV_PPAT_SNOOP) |
-	      GEN8_PPAT(5, CHV_PPAT_SNOOP) |
-	      GEN8_PPAT(6, CHV_PPAT_SNOOP) |
-	      GEN8_PPAT(7, CHV_PPAT_SNOOP);
 
-	I915_WRITE(GEN8_PRIVATE_PAT_LO, pat);
-	I915_WRITE(GEN8_PRIVATE_PAT_HI, pat >> 32);
+	alloc_ppat_entry(ppat, 0, CHV_PPAT_SNOOP);
+	alloc_ppat_entry(ppat, 2, 0);
+	alloc_ppat_entry(ppat, 3, 0);
+	alloc_ppat_entry(ppat, 4, CHV_PPAT_SNOOP);
 }
 
 static void gen6_gmch_remove(struct i915_address_space *vm)
@@ -2917,12 +3090,33 @@  static void gen6_gmch_remove(struct i915_address_space *vm)
 
 static void setup_private_pat(struct drm_i915_private *dev_priv)
 {
+	struct intel_ppat *ppat = &dev_priv->ppat;
+	int i;
+
+	ppat->i915 = dev_priv;
+
+	/* Load per-platform PPAT configurations */
 	if (INTEL_GEN(dev_priv) >= 10)
-		cnl_setup_private_ppat(dev_priv);
+		cnl_setup_private_ppat(ppat);
 	else if (IS_CHERRYVIEW(dev_priv) || IS_GEN9_LP(dev_priv))
-		chv_setup_private_ppat(dev_priv);
+		chv_setup_private_ppat(ppat);
 	else
-		bdw_setup_private_ppat(dev_priv);
+		bdw_setup_private_ppat(ppat);
+
+	GEM_BUG_ON(ppat->max_entries > INTEL_MAX_PPAT_ENTRIES);
+
+	if (!ppat->max_entries)
+		return;
+
+	/* Fill unused PPAT entries with dummy PPAT value */
+	for_each_clear_bit(i, ppat->used, ppat->max_entries) {
+		ppat->entries[i].value = ppat->dummy_value;
+		ppat->entries[i].ppat = ppat;
+		set_bit(i, ppat->dirty);
+	}
+
+	/* Write the HW */
+	ppat->update(dev_priv);
 }
 
 static int gen8_gmch_probe(struct i915_ggtt *ggtt)
@@ -3236,13 +3430,10 @@  void i915_gem_restore_gtt_mappings(struct drm_i915_private *dev_priv)
 	ggtt->base.closed = false;
 
 	if (INTEL_GEN(dev_priv) >= 8) {
-		if (INTEL_GEN(dev_priv) >= 10)
-			cnl_setup_private_ppat(dev_priv);
-		else if (IS_CHERRYVIEW(dev_priv) || IS_GEN9_LP(dev_priv))
-			chv_setup_private_ppat(dev_priv);
-		else
-			bdw_setup_private_ppat(dev_priv);
+		struct intel_ppat *ppat = &dev_priv->ppat;
 
+		bitmap_set(ppat->dirty, 0, ppat->max_entries);
+		dev_priv->ppat.update(dev_priv);
 		return;
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index b4e3aa7..6bf5521 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -536,6 +536,40 @@  i915_vm_to_ggtt(struct i915_address_space *vm)
 	return container_of(vm, struct i915_ggtt, base);
 }
 
+#define INTEL_MAX_PPAT_ENTRIES 8
+#define INTEL_PPAT_PERFECT_MATCH (~0U)
+
+struct intel_ppat;
+
+struct intel_ppat_entry {
+	struct intel_ppat *ppat;
+	struct kref ref_count;
+	u8 value;
+};
+
+struct intel_ppat {
+	struct intel_ppat_entry entries[INTEL_MAX_PPAT_ENTRIES];
+	DECLARE_BITMAP(used, INTEL_MAX_PPAT_ENTRIES);
+	DECLARE_BITMAP(dirty, INTEL_MAX_PPAT_ENTRIES);
+
+	unsigned int max_entries;
+
+	u8 dummy_value;
+	/*
+	 * Return a score to show how two PPAT values match,
+	 * a INTEL_PPAT_PERFECT_MATCH indicates a perfect match
+	 */
+	unsigned int (*match)(u8 src, u8 dst);
+	/* Write the PPAT configuration into HW. */
+	void (*update)(struct drm_i915_private *i915);
+
+	struct drm_i915_private *i915;
+};
+
+const struct intel_ppat_entry *intel_ppat_get(struct drm_i915_private *i915,
+					      u8 value);
+void intel_ppat_put(const struct intel_ppat_entry *entry);
+
 int i915_gem_init_aliasing_ppgtt(struct drm_i915_private *i915);
 void i915_gem_fini_aliasing_ppgtt(struct drm_i915_private *i915);