diff mbox series

[1/5] drm/i915: document caching related bits

Message ID 20210713104554.2381406-1-matthew.auld@intel.com (mailing list archive)
State New, archived
Headers show
Series [1/5] drm/i915: document caching related bits | expand

Commit Message

Matthew Auld July 13, 2021, 10:45 a.m. UTC
Try to document the object caching related bits, like cache_coherent and
cache_dirty.

Suggested-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Matthew Auld <matthew.auld@intel.com>
---
 .../gpu/drm/i915/gem/i915_gem_object_types.h  | 135 +++++++++++++++++-
 drivers/gpu/drm/i915/i915_drv.h               |   9 --
 2 files changed, 131 insertions(+), 13 deletions(-)

Comments

Mika Kuoppala July 13, 2021, 11:41 a.m. UTC | #1
Matthew Auld <matthew.auld@intel.com> writes:

> Try to document the object caching related bits, like cache_coherent and
> cache_dirty.
>
> Suggested-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> Signed-off-by: Matthew Auld <matthew.auld@intel.com>
> ---
>  .../gpu/drm/i915/gem/i915_gem_object_types.h  | 135 +++++++++++++++++-
>  drivers/gpu/drm/i915/i915_drv.h               |   9 --
>  2 files changed, 131 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
> index ef3de2ae9723..02c3529b774c 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
> @@ -92,6 +92,57 @@ struct drm_i915_gem_object_ops {
>  	const char *name; /* friendly name for debug, e.g. lockdep classes */
>  };
>  
> +/**
> + * enum i915_cache_level - The supported GTT caching values for system memory
> + * pages.
> + *
> + * These translate to some special GTT PTE bits when binding pages into some
> + * address space. It also determines whether an object, or rather its pages are
> + * coherent with the GPU, when also reading or writing through the CPU cache
> + * with those pages.
> + *
> + * Userspace can also control this through struct drm_i915_gem_caching.
> + */
> +enum i915_cache_level {
> +	/**
> +	 * @I915_CACHE_NONE:
> +	 *
> +	 * Not coherent with the CPU cache. If the cache is dirty and we need
> +	 * the underlying pages to be coherent with some later GPU access then
> +	 * we need to manually flush the pages.
> +	 *
> +	 * Note that on shared-LLC platforms reads through the CPU cache are
> +	 * still coherent even with this setting. See also
> +	 * I915_BO_CACHE_COHERENT_FOR_READ for more details.
> +	 */
> +	I915_CACHE_NONE = 0,
> +	/**
> +	 * @I915_CACHE_LLC:
> +	 *
> +	 * Coherent with the CPU cache. If the cache is dirty, then the GPU will
> +	 * ensure that access remains coherent, when both reading and writing
> +	 * through the CPU cache.
> +	 *
> +	 * Applies to both platforms with shared-LLC(HAS_LLC), and snooping
> +	 * based platforms(HAS_SNOOP).
> +	 */
> +	I915_CACHE_LLC,
> +	/**
> +	 * @I915_CACHE_L3_LLC:
> +	 *
> +	 * gen7+, L3 sits between the domain specifc caches, eg sampler/render

typo: specifc

> +	 * caches, and the large Last-Level-Cache. LLC is coherent with the CPU,
> +	 * but L3 is only visible to the GPU.
> +	 */

I dont get the difference between this and I915_CACHE_LLC.
Could the diff between LLC and L3_LLC be described here with example?

Thanks,
-Mika

> +	I915_CACHE_L3_LLC,
> +	/**
> +	 * @I915_CACHE_WT:
> +	 *
> +	 * hsw:gt3e Write-through for scanout buffers.
> +	 */
> +	I915_CACHE_WT,
> +};
> +
>  enum i915_map_type {
>  	I915_MAP_WB = 0,
>  	I915_MAP_WC,
> @@ -228,14 +279,90 @@ struct drm_i915_gem_object {
>  	unsigned int mem_flags;
>  #define I915_BO_FLAG_STRUCT_PAGE BIT(0) /* Object backed by struct pages */
>  #define I915_BO_FLAG_IOMEM       BIT(1) /* Object backed by IO memory */
> -	/*
> -	 * Is the object to be mapped as read-only to the GPU
> -	 * Only honoured if hardware has relevant pte bit
> +	/**
> +	 * @cache_level: The desired GTT caching level.
> +	 *
> +	 * See enum i915_cache_level for possible values, along with what
> +	 * each does.
>  	 */
>  	unsigned int cache_level:3;
> -	unsigned int cache_coherent:2;
> +	/**
> +	 * @cache_coherent:
> +	 *
> +	 * Track whether the pages are coherent with the GPU if reading or
> +	 * writing through the CPU cache.
> +	 *
> +	 * This largely depends on the @cache_level, for example if the object
> +	 * is marked as I915_CACHE_LLC, then GPU access is coherent for both
> +	 * reads and writes through the CPU cache.
> +	 *
> +	 * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> +	 * the CPU cache are always coherent, regardless of the @cache_level. On
> +	 * snooping based platforms this is not the case, unless the full
> +	 * I915_CACHE_LLC or similar setting is used.
> +	 *
> +	 * As a result of this we need to track coherency separately for reads
> +	 * and writes, in order to avoid superfluous flushing on shared-LLC
> +	 * platforms, for reads.
> +	 *
> +	 * I915_BO_CACHE_COHERENT_FOR_READ:
> +	 *
> +	 * When reading through the CPU cache, the GPU is still coherent. Note
> +	 * that no data has actually been modified here, so it might seem
> +	 * strange that we care about this.
> +	 *
> +	 * As an example, if some object is mapped on the CPU with write-back
> +	 * caching, and we read some page, then the cache likely now contains
> +	 * the data from that read. At this point the cache and main memory
> +	 * match up, so all good. But next the GPU needs to write some data to
> +	 * that same page. Now if the @cache_level is I915_CACHE_NONE and the
> +	 * the platform doesn't have the shared-LLC, then the GPU will
> +	 * effectively skip invalidating the cache(or however that works
> +	 * internally) when writing the new value.  This is really bad since the
> +	 * GPU has just written some new data to main memory, but the CPU cache
> +	 * is still valid and now contains stale data. As a result the next time
> +	 * we do a cached read with the CPU, we are rewarded with stale data.
> +	 * Likewise if the cache is later flushed, we might be rewarded with
> +	 * overwriting main memory with stale data.
> +	 *
> +	 * I915_BO_CACHE_COHERENT_FOR_WRITE:
> +	 *
> +	 * When writing through the CPU cache, the GPU is still coherent. Note
> +	 * that this also implies I915_BO_CACHE_COHERENT_FOR_READ.
> +	 *
> +	 * This is never set when I915_CACHE_NONE is used for @cache_level,
> +	 * where instead we have to manually flush the caches after writing
> +	 * through the CPU cache. For other cache levels this should be set and
> +	 * the object is therefore considered coherent for both reads and writes
> +	 * through the CPU cache.
> +	 */
>  #define I915_BO_CACHE_COHERENT_FOR_READ BIT(0)
>  #define I915_BO_CACHE_COHERENT_FOR_WRITE BIT(1)
> +	unsigned int cache_coherent:2;
> +	/**
> +	 * @cache_dirty:
> +	 *
> +	 * Track if the cache might be dirty for the @pages i.e it has yet to be
> +	 * written back to main memory. As a result reading directly from main
> +	 * memory might yield stale data.
> +	 *
> +	 * This also ties into whether the kernel is tracking the object as
> +	 * coherent with the GPU, as per @cache_coherent, as it determines if
> +	 * flushing might be needed at various points.
> +	 *
> +	 * Another part of @cache_dirty is managing flushing when first
> +	 * acquiring the pages for system memory, at this point the pages are
> +	 * considered foreign, so the default assumption is that the cache is
> +	 * dirty, for example the page zeroing done my the kernel might leave
> +	 * writes though the CPU cache, or swapping-in, while the actual data in
> +	 * main memory is potentially stale.  Note that this is a potential
> +	 * security issue when dealing with userspace objects and zeroing. Now,
> +	 * whether we actually need apply the big sledgehammer of flushing all
> +	 * the pages on acquire depends on if @cache_coherent is marked as
> +	 * I915_BO_CACHE_COHERENT_FOR_WRITE, i.e that the GPU will be coherent
> +	 * for both reads and writes though the CPU cache. So pretty much this
> +	 * should only be needed for I915_CACHE_NONE objects.
> +	 */
>  	unsigned int cache_dirty:1;
>  
>  	/**
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index c4747f4407ef..37bb1a3cadd4 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -394,15 +394,6 @@ struct drm_i915_display_funcs {
>  	void (*read_luts)(struct intel_crtc_state *crtc_state);
>  };
>  
> -enum i915_cache_level {
> -	I915_CACHE_NONE = 0,
> -	I915_CACHE_LLC, /* also used for snoopable memory on non-LLC */
> -	I915_CACHE_L3_LLC, /* gen7+, L3 sits between the domain specifc
> -			      caches, eg sampler/render caches, and the
> -			      large Last-Level-Cache. LLC is coherent with
> -			      the CPU, but L3 is only visible to the GPU. */
> -	I915_CACHE_WT, /* hsw:gt3e WriteThrough for scanouts */
> -};
>  
>  #define I915_COLOR_UNEVICTABLE (-1) /* a non-vma sharing the address space */
>  
> -- 
> 2.26.3
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Ville Syrjälä July 13, 2021, 3:55 p.m. UTC | #2
On Tue, Jul 13, 2021 at 11:45:50AM +0100, Matthew Auld wrote:
> +	/**
> +	 * @cache_coherent:
> +	 *
> +	 * Track whether the pages are coherent with the GPU if reading or
> +	 * writing through the CPU cache.
> +	 *
> +	 * This largely depends on the @cache_level, for example if the object
> +	 * is marked as I915_CACHE_LLC, then GPU access is coherent for both
> +	 * reads and writes through the CPU cache.
> +	 *
> +	 * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> +	 * the CPU cache are always coherent, regardless of the @cache_level. On
> +	 * snooping based platforms this is not the case, unless the full
> +	 * I915_CACHE_LLC or similar setting is used.
> +	 *
> +	 * As a result of this we need to track coherency separately for reads
> +	 * and writes, in order to avoid superfluous flushing on shared-LLC
> +	 * platforms, for reads.
> +	 *
> +	 * I915_BO_CACHE_COHERENT_FOR_READ:
> +	 *
> +	 * When reading through the CPU cache, the GPU is still coherent. Note
> +	 * that no data has actually been modified here, so it might seem
> +	 * strange that we care about this.
> +	 *
> +	 * As an example, if some object is mapped on the CPU with write-back
> +	 * caching, and we read some page, then the cache likely now contains
> +	 * the data from that read. At this point the cache and main memory
> +	 * match up, so all good. But next the GPU needs to write some data to
> +	 * that same page. Now if the @cache_level is I915_CACHE_NONE and the
> +	 * the platform doesn't have the shared-LLC, then the GPU will
> +	 * effectively skip invalidating the cache(or however that works
> +	 * internally) when writing the new value.  This is really bad since the
> +	 * GPU has just written some new data to main memory, but the CPU cache
> +	 * is still valid and now contains stale data. As a result the next time
> +	 * we do a cached read with the CPU, we are rewarded with stale data.
> +	 * Likewise if the cache is later flushed, we might be rewarded with
> +	 * overwriting main memory with stale data.
> +	 *
> +	 * I915_BO_CACHE_COHERENT_FOR_WRITE:
> +	 *
> +	 * When writing through the CPU cache, the GPU is still coherent. Note
> +	 * that this also implies I915_BO_CACHE_COHERENT_FOR_READ.
> +	 *
> +	 * This is never set when I915_CACHE_NONE is used for @cache_level,
> +	 * where instead we have to manually flush the caches after writing
> +	 * through the CPU cache. For other cache levels this should be set and
> +	 * the object is therefore considered coherent for both reads and writes
> +	 * through the CPU cache.

I don't remember why we have this read vs. write split and this new
documentation doesn't seem to really explain it either.

Is it for optimizing some display related case where we can omit the
invalidates but still have to do the writeback to keep the display
engine happy?
Matthew Auld July 13, 2021, 4:13 p.m. UTC | #3
On Tue, 13 Jul 2021 at 16:55, Ville Syrjälä
<ville.syrjala@linux.intel.com> wrote:
>
> On Tue, Jul 13, 2021 at 11:45:50AM +0100, Matthew Auld wrote:
> > +     /**
> > +      * @cache_coherent:
> > +      *
> > +      * Track whether the pages are coherent with the GPU if reading or
> > +      * writing through the CPU cache.
> > +      *
> > +      * This largely depends on the @cache_level, for example if the object
> > +      * is marked as I915_CACHE_LLC, then GPU access is coherent for both
> > +      * reads and writes through the CPU cache.
> > +      *
> > +      * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> > +      * the CPU cache are always coherent, regardless of the @cache_level. On
> > +      * snooping based platforms this is not the case, unless the full
> > +      * I915_CACHE_LLC or similar setting is used.
> > +      *
> > +      * As a result of this we need to track coherency separately for reads
> > +      * and writes, in order to avoid superfluous flushing on shared-LLC
> > +      * platforms, for reads.
> > +      *
> > +      * I915_BO_CACHE_COHERENT_FOR_READ:
> > +      *
> > +      * When reading through the CPU cache, the GPU is still coherent. Note
> > +      * that no data has actually been modified here, so it might seem
> > +      * strange that we care about this.
> > +      *
> > +      * As an example, if some object is mapped on the CPU with write-back
> > +      * caching, and we read some page, then the cache likely now contains
> > +      * the data from that read. At this point the cache and main memory
> > +      * match up, so all good. But next the GPU needs to write some data to
> > +      * that same page. Now if the @cache_level is I915_CACHE_NONE and the
> > +      * the platform doesn't have the shared-LLC, then the GPU will
> > +      * effectively skip invalidating the cache(or however that works
> > +      * internally) when writing the new value.  This is really bad since the
> > +      * GPU has just written some new data to main memory, but the CPU cache
> > +      * is still valid and now contains stale data. As a result the next time
> > +      * we do a cached read with the CPU, we are rewarded with stale data.
> > +      * Likewise if the cache is later flushed, we might be rewarded with
> > +      * overwriting main memory with stale data.
> > +      *
> > +      * I915_BO_CACHE_COHERENT_FOR_WRITE:
> > +      *
> > +      * When writing through the CPU cache, the GPU is still coherent. Note
> > +      * that this also implies I915_BO_CACHE_COHERENT_FOR_READ.
> > +      *
> > +      * This is never set when I915_CACHE_NONE is used for @cache_level,
> > +      * where instead we have to manually flush the caches after writing
> > +      * through the CPU cache. For other cache levels this should be set and
> > +      * the object is therefore considered coherent for both reads and writes
> > +      * through the CPU cache.
>
> I don't remember why we have this read vs. write split and this new
> documentation doesn't seem to really explain it either.

Hmm, I attempted to explain that earlier:

* Note that on platforms with shared-LLC support(HAS_LLC) reads through
* the CPU cache are always coherent, regardless of the @cache_level. On
* snooping based platforms this is not the case, unless the full
* I915_CACHE_LLC or similar setting is used.
*
* As a result of this we need to track coherency separately for reads
* and writes, in order to avoid superfluous flushing on shared-LLC
* platforms, for reads.

So AFAIK it's just because shared-LLC can be coherent for reads, while
also not being coherent for writes(CACHE_NONE), so being able to track
each separately is kind of needed to avoid unnecessary flushing for
the read cases i.e simple boolean for coherent vs non-coherent is not
enough.

I can try to reword things to make that more clear.

>
> Is it for optimizing some display related case where we can omit the
> invalidates but still have to do the writeback to keep the display
> engine happy?
>
> --
> Ville Syrjälä
> Intel
Daniel Vetter July 13, 2021, 4:50 p.m. UTC | #4
On Tue, Jul 13, 2021 at 6:14 PM Matthew Auld
<matthew.william.auld@gmail.com> wrote:
> On Tue, 13 Jul 2021 at 16:55, Ville Syrjälä
> <ville.syrjala@linux.intel.com> wrote:
> >
> > On Tue, Jul 13, 2021 at 11:45:50AM +0100, Matthew Auld wrote:
> > > +     /**
> > > +      * @cache_coherent:
> > > +      *
> > > +      * Track whether the pages are coherent with the GPU if reading or
> > > +      * writing through the CPU cache.
> > > +      *
> > > +      * This largely depends on the @cache_level, for example if the object
> > > +      * is marked as I915_CACHE_LLC, then GPU access is coherent for both
> > > +      * reads and writes through the CPU cache.
> > > +      *
> > > +      * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> > > +      * the CPU cache are always coherent, regardless of the @cache_level. On
> > > +      * snooping based platforms this is not the case, unless the full
> > > +      * I915_CACHE_LLC or similar setting is used.
> > > +      *
> > > +      * As a result of this we need to track coherency separately for reads
> > > +      * and writes, in order to avoid superfluous flushing on shared-LLC
> > > +      * platforms, for reads.
> > > +      *
> > > +      * I915_BO_CACHE_COHERENT_FOR_READ:
> > > +      *
> > > +      * When reading through the CPU cache, the GPU is still coherent. Note
> > > +      * that no data has actually been modified here, so it might seem
> > > +      * strange that we care about this.
> > > +      *
> > > +      * As an example, if some object is mapped on the CPU with write-back
> > > +      * caching, and we read some page, then the cache likely now contains
> > > +      * the data from that read. At this point the cache and main memory
> > > +      * match up, so all good. But next the GPU needs to write some data to
> > > +      * that same page. Now if the @cache_level is I915_CACHE_NONE and the
> > > +      * the platform doesn't have the shared-LLC, then the GPU will
> > > +      * effectively skip invalidating the cache(or however that works
> > > +      * internally) when writing the new value.  This is really bad since the
> > > +      * GPU has just written some new data to main memory, but the CPU cache
> > > +      * is still valid and now contains stale data. As a result the next time
> > > +      * we do a cached read with the CPU, we are rewarded with stale data.
> > > +      * Likewise if the cache is later flushed, we might be rewarded with
> > > +      * overwriting main memory with stale data.
> > > +      *
> > > +      * I915_BO_CACHE_COHERENT_FOR_WRITE:
> > > +      *
> > > +      * When writing through the CPU cache, the GPU is still coherent. Note
> > > +      * that this also implies I915_BO_CACHE_COHERENT_FOR_READ.
> > > +      *
> > > +      * This is never set when I915_CACHE_NONE is used for @cache_level,
> > > +      * where instead we have to manually flush the caches after writing
> > > +      * through the CPU cache. For other cache levels this should be set and
> > > +      * the object is therefore considered coherent for both reads and writes
> > > +      * through the CPU cache.
> >
> > I don't remember why we have this read vs. write split and this new
> > documentation doesn't seem to really explain it either.
>
> Hmm, I attempted to explain that earlier:
>
> * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> * the CPU cache are always coherent, regardless of the @cache_level. On
> * snooping based platforms this is not the case, unless the full
> * I915_CACHE_LLC or similar setting is used.
> *
> * As a result of this we need to track coherency separately for reads
> * and writes, in order to avoid superfluous flushing on shared-LLC
> * platforms, for reads.
>
> So AFAIK it's just because shared-LLC can be coherent for reads, while
> also not being coherent for writes(CACHE_NONE), so being able to track
> each separately is kind of needed to avoid unnecessary flushing for
> the read cases i.e simple boolean for coherent vs non-coherent is not
> enough.
>
> I can try to reword things to make that more clear.

Maybe highlight the security aspect a bit more: When reads are always
coherent, we don't have to force the clflush. If reads are not
coherent we must ensure that the clflush has finished before userspace
can get at the backing storage, like writing ptes and similar things.
Writes otoh can only result in userspace eating cacheling corruption
if it races against the kernel (by e.g. trying to predict where we'll
bind a buffer and issuing gpu access to that location before the
buffer is actually bound from some other engine in parallel with an
execbuf that binds the buffer).

Atm we don't do a great job with that, but that's something that I
think is getting looked into.
-Daniel

> > Is it for optimizing some display related case where we can omit the
> > invalidates but still have to do the writeback to keep the display
> > engine happy?
> >
> > --
> > Ville Syrjälä
> > Intel
Ville Syrjälä July 13, 2021, 5:47 p.m. UTC | #5
On Tue, Jul 13, 2021 at 05:13:37PM +0100, Matthew Auld wrote:
> On Tue, 13 Jul 2021 at 16:55, Ville Syrjälä
> <ville.syrjala@linux.intel.com> wrote:
> >
> > On Tue, Jul 13, 2021 at 11:45:50AM +0100, Matthew Auld wrote:
> > > +     /**
> > > +      * @cache_coherent:
> > > +      *
> > > +      * Track whether the pages are coherent with the GPU if reading or
> > > +      * writing through the CPU cache.
> > > +      *
> > > +      * This largely depends on the @cache_level, for example if the object
> > > +      * is marked as I915_CACHE_LLC, then GPU access is coherent for both
> > > +      * reads and writes through the CPU cache.
> > > +      *
> > > +      * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> > > +      * the CPU cache are always coherent, regardless of the @cache_level. On
> > > +      * snooping based platforms this is not the case, unless the full
> > > +      * I915_CACHE_LLC or similar setting is used.
> > > +      *
> > > +      * As a result of this we need to track coherency separately for reads
> > > +      * and writes, in order to avoid superfluous flushing on shared-LLC
> > > +      * platforms, for reads.
> > > +      *
> > > +      * I915_BO_CACHE_COHERENT_FOR_READ:
> > > +      *
> > > +      * When reading through the CPU cache, the GPU is still coherent. Note
> > > +      * that no data has actually been modified here, so it might seem
> > > +      * strange that we care about this.
> > > +      *
> > > +      * As an example, if some object is mapped on the CPU with write-back
> > > +      * caching, and we read some page, then the cache likely now contains
> > > +      * the data from that read. At this point the cache and main memory
> > > +      * match up, so all good. But next the GPU needs to write some data to
> > > +      * that same page. Now if the @cache_level is I915_CACHE_NONE and the
> > > +      * the platform doesn't have the shared-LLC, then the GPU will
> > > +      * effectively skip invalidating the cache(or however that works
> > > +      * internally) when writing the new value.  This is really bad since the
> > > +      * GPU has just written some new data to main memory, but the CPU cache
> > > +      * is still valid and now contains stale data. As a result the next time
> > > +      * we do a cached read with the CPU, we are rewarded with stale data.
> > > +      * Likewise if the cache is later flushed, we might be rewarded with
> > > +      * overwriting main memory with stale data.
> > > +      *
> > > +      * I915_BO_CACHE_COHERENT_FOR_WRITE:
> > > +      *
> > > +      * When writing through the CPU cache, the GPU is still coherent. Note
> > > +      * that this also implies I915_BO_CACHE_COHERENT_FOR_READ.
> > > +      *
> > > +      * This is never set when I915_CACHE_NONE is used for @cache_level,
> > > +      * where instead we have to manually flush the caches after writing
> > > +      * through the CPU cache. For other cache levels this should be set and
> > > +      * the object is therefore considered coherent for both reads and writes
> > > +      * through the CPU cache.
> >
> > I don't remember why we have this read vs. write split and this new
> > documentation doesn't seem to really explain it either.
> 
> Hmm, I attempted to explain that earlier:
> 
> * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> * the CPU cache are always coherent, regardless of the @cache_level. On
> * snooping based platforms this is not the case, unless the full
> * I915_CACHE_LLC or similar setting is used.
> *
> * As a result of this we need to track coherency separately for reads
> * and writes, in order to avoid superfluous flushing on shared-LLC
> * platforms, for reads.
> 
> So AFAIK it's just because shared-LLC can be coherent for reads, while
> also not being coherent for writes(CACHE_NONE),

CPU vs. GPU is fully coherent when it comes to LLC. Or at least I've
never heard of any mechanism that would make it only partially coherent.
Matthew Auld July 13, 2021, 6:24 p.m. UTC | #6
On Tue, 13 Jul 2021 at 18:47, Ville Syrjälä
<ville.syrjala@linux.intel.com> wrote:
>
> On Tue, Jul 13, 2021 at 05:13:37PM +0100, Matthew Auld wrote:
> > On Tue, 13 Jul 2021 at 16:55, Ville Syrjälä
> > <ville.syrjala@linux.intel.com> wrote:
> > >
> > > On Tue, Jul 13, 2021 at 11:45:50AM +0100, Matthew Auld wrote:
> > > > +     /**
> > > > +      * @cache_coherent:
> > > > +      *
> > > > +      * Track whether the pages are coherent with the GPU if reading or
> > > > +      * writing through the CPU cache.
> > > > +      *
> > > > +      * This largely depends on the @cache_level, for example if the object
> > > > +      * is marked as I915_CACHE_LLC, then GPU access is coherent for both
> > > > +      * reads and writes through the CPU cache.
> > > > +      *
> > > > +      * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> > > > +      * the CPU cache are always coherent, regardless of the @cache_level. On
> > > > +      * snooping based platforms this is not the case, unless the full
> > > > +      * I915_CACHE_LLC or similar setting is used.
> > > > +      *
> > > > +      * As a result of this we need to track coherency separately for reads
> > > > +      * and writes, in order to avoid superfluous flushing on shared-LLC
> > > > +      * platforms, for reads.
> > > > +      *
> > > > +      * I915_BO_CACHE_COHERENT_FOR_READ:
> > > > +      *
> > > > +      * When reading through the CPU cache, the GPU is still coherent. Note
> > > > +      * that no data has actually been modified here, so it might seem
> > > > +      * strange that we care about this.
> > > > +      *
> > > > +      * As an example, if some object is mapped on the CPU with write-back
> > > > +      * caching, and we read some page, then the cache likely now contains
> > > > +      * the data from that read. At this point the cache and main memory
> > > > +      * match up, so all good. But next the GPU needs to write some data to
> > > > +      * that same page. Now if the @cache_level is I915_CACHE_NONE and the
> > > > +      * the platform doesn't have the shared-LLC, then the GPU will
> > > > +      * effectively skip invalidating the cache(or however that works
> > > > +      * internally) when writing the new value.  This is really bad since the
> > > > +      * GPU has just written some new data to main memory, but the CPU cache
> > > > +      * is still valid and now contains stale data. As a result the next time
> > > > +      * we do a cached read with the CPU, we are rewarded with stale data.
> > > > +      * Likewise if the cache is later flushed, we might be rewarded with
> > > > +      * overwriting main memory with stale data.
> > > > +      *
> > > > +      * I915_BO_CACHE_COHERENT_FOR_WRITE:
> > > > +      *
> > > > +      * When writing through the CPU cache, the GPU is still coherent. Note
> > > > +      * that this also implies I915_BO_CACHE_COHERENT_FOR_READ.
> > > > +      *
> > > > +      * This is never set when I915_CACHE_NONE is used for @cache_level,
> > > > +      * where instead we have to manually flush the caches after writing
> > > > +      * through the CPU cache. For other cache levels this should be set and
> > > > +      * the object is therefore considered coherent for both reads and writes
> > > > +      * through the CPU cache.
> > >
> > > I don't remember why we have this read vs. write split and this new
> > > documentation doesn't seem to really explain it either.
> >
> > Hmm, I attempted to explain that earlier:
> >
> > * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> > * the CPU cache are always coherent, regardless of the @cache_level. On
> > * snooping based platforms this is not the case, unless the full
> > * I915_CACHE_LLC or similar setting is used.
> > *
> > * As a result of this we need to track coherency separately for reads
> > * and writes, in order to avoid superfluous flushing on shared-LLC
> > * platforms, for reads.
> >
> > So AFAIK it's just because shared-LLC can be coherent for reads, while
> > also not being coherent for writes(CACHE_NONE),
>
> CPU vs. GPU is fully coherent when it comes to LLC. Or at least I've
> never heard of any mechanism that would make it only partially coherent.

What do you mean by "comes to LLC", are you talking about HAS_LLC() or
I915_CACHE_LLC?

If you set I915_CACHE_LLC, then yes it is fully coherent for both
HAS_LLC() and HAS_SNOOP().

If you set I915_CACHE_NONE, then reads are still coherent on
HAS_LLC(), for HAS_SNOOP() they are not. Or at least that is the
existing behaviour in the driver AFAIK.

>
> --
> Ville Syrjälä
> Intel
Ville Syrjälä July 13, 2021, 6:46 p.m. UTC | #7
On Tue, Jul 13, 2021 at 07:24:23PM +0100, Matthew Auld wrote:
> On Tue, 13 Jul 2021 at 18:47, Ville Syrjälä
> <ville.syrjala@linux.intel.com> wrote:
> >
> > On Tue, Jul 13, 2021 at 05:13:37PM +0100, Matthew Auld wrote:
> > > On Tue, 13 Jul 2021 at 16:55, Ville Syrjälä
> > > <ville.syrjala@linux.intel.com> wrote:
> > > >
> > > > On Tue, Jul 13, 2021 at 11:45:50AM +0100, Matthew Auld wrote:
> > > > > +     /**
> > > > > +      * @cache_coherent:
> > > > > +      *
> > > > > +      * Track whether the pages are coherent with the GPU if reading or
> > > > > +      * writing through the CPU cache.
> > > > > +      *
> > > > > +      * This largely depends on the @cache_level, for example if the object
> > > > > +      * is marked as I915_CACHE_LLC, then GPU access is coherent for both
> > > > > +      * reads and writes through the CPU cache.
> > > > > +      *
> > > > > +      * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> > > > > +      * the CPU cache are always coherent, regardless of the @cache_level. On
> > > > > +      * snooping based platforms this is not the case, unless the full
> > > > > +      * I915_CACHE_LLC or similar setting is used.
> > > > > +      *
> > > > > +      * As a result of this we need to track coherency separately for reads
> > > > > +      * and writes, in order to avoid superfluous flushing on shared-LLC
> > > > > +      * platforms, for reads.
> > > > > +      *
> > > > > +      * I915_BO_CACHE_COHERENT_FOR_READ:
> > > > > +      *
> > > > > +      * When reading through the CPU cache, the GPU is still coherent. Note
> > > > > +      * that no data has actually been modified here, so it might seem
> > > > > +      * strange that we care about this.
> > > > > +      *
> > > > > +      * As an example, if some object is mapped on the CPU with write-back
> > > > > +      * caching, and we read some page, then the cache likely now contains
> > > > > +      * the data from that read. At this point the cache and main memory
> > > > > +      * match up, so all good. But next the GPU needs to write some data to
> > > > > +      * that same page. Now if the @cache_level is I915_CACHE_NONE and the
> > > > > +      * the platform doesn't have the shared-LLC, then the GPU will
> > > > > +      * effectively skip invalidating the cache(or however that works
> > > > > +      * internally) when writing the new value.  This is really bad since the
> > > > > +      * GPU has just written some new data to main memory, but the CPU cache
> > > > > +      * is still valid and now contains stale data. As a result the next time
> > > > > +      * we do a cached read with the CPU, we are rewarded with stale data.
> > > > > +      * Likewise if the cache is later flushed, we might be rewarded with
> > > > > +      * overwriting main memory with stale data.
> > > > > +      *
> > > > > +      * I915_BO_CACHE_COHERENT_FOR_WRITE:
> > > > > +      *
> > > > > +      * When writing through the CPU cache, the GPU is still coherent. Note
> > > > > +      * that this also implies I915_BO_CACHE_COHERENT_FOR_READ.
> > > > > +      *
> > > > > +      * This is never set when I915_CACHE_NONE is used for @cache_level,
> > > > > +      * where instead we have to manually flush the caches after writing
> > > > > +      * through the CPU cache. For other cache levels this should be set and
> > > > > +      * the object is therefore considered coherent for both reads and writes
> > > > > +      * through the CPU cache.
> > > >
> > > > I don't remember why we have this read vs. write split and this new
> > > > documentation doesn't seem to really explain it either.
> > >
> > > Hmm, I attempted to explain that earlier:
> > >
> > > * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> > > * the CPU cache are always coherent, regardless of the @cache_level. On
> > > * snooping based platforms this is not the case, unless the full
> > > * I915_CACHE_LLC or similar setting is used.
> > > *
> > > * As a result of this we need to track coherency separately for reads
> > > * and writes, in order to avoid superfluous flushing on shared-LLC
> > > * platforms, for reads.
> > >
> > > So AFAIK it's just because shared-LLC can be coherent for reads, while
> > > also not being coherent for writes(CACHE_NONE),
> >
> > CPU vs. GPU is fully coherent when it comes to LLC. Or at least I've
> > never heard of any mechanism that would make it only partially coherent.
> 
> What do you mean by "comes to LLC", are you talking about HAS_LLC() or
> I915_CACHE_LLC?

I'm talking about the actual cache.

> 
> If you set I915_CACHE_LLC, then yes it is fully coherent for both
> HAS_LLC() and HAS_SNOOP().
> 
> If you set I915_CACHE_NONE, then reads are still coherent on
> HAS_LLC(),

Reads and writes both. The only thing that's not coherent is the
display engine.

> for HAS_SNOOP() they are not. Or at least that is the
> existing behaviour in the driver AFAIK.
> 
> >
> > --
> > Ville Syrjälä
> > Intel
Daniel Vetter July 14, 2021, 11:16 a.m. UTC | #8
On Tue, Jul 13, 2021 at 09:46:30PM +0300, Ville Syrjälä wrote:
> On Tue, Jul 13, 2021 at 07:24:23PM +0100, Matthew Auld wrote:
> > On Tue, 13 Jul 2021 at 18:47, Ville Syrjälä
> > <ville.syrjala@linux.intel.com> wrote:
> > >
> > > On Tue, Jul 13, 2021 at 05:13:37PM +0100, Matthew Auld wrote:
> > > > On Tue, 13 Jul 2021 at 16:55, Ville Syrjälä
> > > > <ville.syrjala@linux.intel.com> wrote:
> > > > >
> > > > > On Tue, Jul 13, 2021 at 11:45:50AM +0100, Matthew Auld wrote:
> > > > > > +     /**
> > > > > > +      * @cache_coherent:
> > > > > > +      *
> > > > > > +      * Track whether the pages are coherent with the GPU if reading or
> > > > > > +      * writing through the CPU cache.
> > > > > > +      *
> > > > > > +      * This largely depends on the @cache_level, for example if the object
> > > > > > +      * is marked as I915_CACHE_LLC, then GPU access is coherent for both
> > > > > > +      * reads and writes through the CPU cache.
> > > > > > +      *
> > > > > > +      * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> > > > > > +      * the CPU cache are always coherent, regardless of the @cache_level. On
> > > > > > +      * snooping based platforms this is not the case, unless the full
> > > > > > +      * I915_CACHE_LLC or similar setting is used.
> > > > > > +      *
> > > > > > +      * As a result of this we need to track coherency separately for reads
> > > > > > +      * and writes, in order to avoid superfluous flushing on shared-LLC
> > > > > > +      * platforms, for reads.
> > > > > > +      *
> > > > > > +      * I915_BO_CACHE_COHERENT_FOR_READ:
> > > > > > +      *
> > > > > > +      * When reading through the CPU cache, the GPU is still coherent. Note
> > > > > > +      * that no data has actually been modified here, so it might seem
> > > > > > +      * strange that we care about this.
> > > > > > +      *
> > > > > > +      * As an example, if some object is mapped on the CPU with write-back
> > > > > > +      * caching, and we read some page, then the cache likely now contains
> > > > > > +      * the data from that read. At this point the cache and main memory
> > > > > > +      * match up, so all good. But next the GPU needs to write some data to
> > > > > > +      * that same page. Now if the @cache_level is I915_CACHE_NONE and the
> > > > > > +      * the platform doesn't have the shared-LLC, then the GPU will
> > > > > > +      * effectively skip invalidating the cache(or however that works
> > > > > > +      * internally) when writing the new value.  This is really bad since the
> > > > > > +      * GPU has just written some new data to main memory, but the CPU cache
> > > > > > +      * is still valid and now contains stale data. As a result the next time
> > > > > > +      * we do a cached read with the CPU, we are rewarded with stale data.
> > > > > > +      * Likewise if the cache is later flushed, we might be rewarded with
> > > > > > +      * overwriting main memory with stale data.
> > > > > > +      *
> > > > > > +      * I915_BO_CACHE_COHERENT_FOR_WRITE:
> > > > > > +      *
> > > > > > +      * When writing through the CPU cache, the GPU is still coherent. Note
> > > > > > +      * that this also implies I915_BO_CACHE_COHERENT_FOR_READ.
> > > > > > +      *
> > > > > > +      * This is never set when I915_CACHE_NONE is used for @cache_level,
> > > > > > +      * where instead we have to manually flush the caches after writing
> > > > > > +      * through the CPU cache. For other cache levels this should be set and
> > > > > > +      * the object is therefore considered coherent for both reads and writes
> > > > > > +      * through the CPU cache.
> > > > >
> > > > > I don't remember why we have this read vs. write split and this new
> > > > > documentation doesn't seem to really explain it either.
> > > >
> > > > Hmm, I attempted to explain that earlier:
> > > >
> > > > * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> > > > * the CPU cache are always coherent, regardless of the @cache_level. On
> > > > * snooping based platforms this is not the case, unless the full
> > > > * I915_CACHE_LLC or similar setting is used.
> > > > *
> > > > * As a result of this we need to track coherency separately for reads
> > > > * and writes, in order to avoid superfluous flushing on shared-LLC
> > > > * platforms, for reads.
> > > >
> > > > So AFAIK it's just because shared-LLC can be coherent for reads, while
> > > > also not being coherent for writes(CACHE_NONE),
> > >
> > > CPU vs. GPU is fully coherent when it comes to LLC. Or at least I've
> > > never heard of any mechanism that would make it only partially coherent.
> > 
> > What do you mean by "comes to LLC", are you talking about HAS_LLC() or
> > I915_CACHE_LLC?
> 
> I'm talking about the actual cache.
> 
> > 
> > If you set I915_CACHE_LLC, then yes it is fully coherent for both
> > HAS_LLC() and HAS_SNOOP().
> > 
> > If you set I915_CACHE_NONE, then reads are still coherent on
> > HAS_LLC(),
> 
> Reads and writes both. The only thing that's not coherent is the
> display engine.

There's a lot of code which seems to disagree, plus there's now this new
MOCS thing. I really hope we don't have all those cache coherency bits
just because the code complexity is entertaining?

Are there some igts to verify this all? Or selftests probably more
appropriate.
-Daniel


> > for HAS_SNOOP() they are not. Or at least that is the
> > existing behaviour in the driver AFAIK.
> > 
> > >
> > > --
> > > Ville Syrjälä
> > > Intel
> 
> -- 
> Ville Syrjälä
> Intel
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Ville Syrjälä July 14, 2021, 11:42 a.m. UTC | #9
On Wed, Jul 14, 2021 at 01:16:57PM +0200, Daniel Vetter wrote:
> On Tue, Jul 13, 2021 at 09:46:30PM +0300, Ville Syrjälä wrote:
> > On Tue, Jul 13, 2021 at 07:24:23PM +0100, Matthew Auld wrote:
> > > On Tue, 13 Jul 2021 at 18:47, Ville Syrjälä
> > > <ville.syrjala@linux.intel.com> wrote:
> > > >
> > > > On Tue, Jul 13, 2021 at 05:13:37PM +0100, Matthew Auld wrote:
> > > > > On Tue, 13 Jul 2021 at 16:55, Ville Syrjälä
> > > > > <ville.syrjala@linux.intel.com> wrote:
> > > > > >
> > > > > > On Tue, Jul 13, 2021 at 11:45:50AM +0100, Matthew Auld wrote:
> > > > > > > +     /**
> > > > > > > +      * @cache_coherent:
> > > > > > > +      *
> > > > > > > +      * Track whether the pages are coherent with the GPU if reading or
> > > > > > > +      * writing through the CPU cache.
> > > > > > > +      *
> > > > > > > +      * This largely depends on the @cache_level, for example if the object
> > > > > > > +      * is marked as I915_CACHE_LLC, then GPU access is coherent for both
> > > > > > > +      * reads and writes through the CPU cache.
> > > > > > > +      *
> > > > > > > +      * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> > > > > > > +      * the CPU cache are always coherent, regardless of the @cache_level. On
> > > > > > > +      * snooping based platforms this is not the case, unless the full
> > > > > > > +      * I915_CACHE_LLC or similar setting is used.
> > > > > > > +      *
> > > > > > > +      * As a result of this we need to track coherency separately for reads
> > > > > > > +      * and writes, in order to avoid superfluous flushing on shared-LLC
> > > > > > > +      * platforms, for reads.
> > > > > > > +      *
> > > > > > > +      * I915_BO_CACHE_COHERENT_FOR_READ:
> > > > > > > +      *
> > > > > > > +      * When reading through the CPU cache, the GPU is still coherent. Note
> > > > > > > +      * that no data has actually been modified here, so it might seem
> > > > > > > +      * strange that we care about this.
> > > > > > > +      *
> > > > > > > +      * As an example, if some object is mapped on the CPU with write-back
> > > > > > > +      * caching, and we read some page, then the cache likely now contains
> > > > > > > +      * the data from that read. At this point the cache and main memory
> > > > > > > +      * match up, so all good. But next the GPU needs to write some data to
> > > > > > > +      * that same page. Now if the @cache_level is I915_CACHE_NONE and the
> > > > > > > +      * the platform doesn't have the shared-LLC, then the GPU will
> > > > > > > +      * effectively skip invalidating the cache(or however that works
> > > > > > > +      * internally) when writing the new value.  This is really bad since the
> > > > > > > +      * GPU has just written some new data to main memory, but the CPU cache
> > > > > > > +      * is still valid and now contains stale data. As a result the next time
> > > > > > > +      * we do a cached read with the CPU, we are rewarded with stale data.
> > > > > > > +      * Likewise if the cache is later flushed, we might be rewarded with
> > > > > > > +      * overwriting main memory with stale data.
> > > > > > > +      *
> > > > > > > +      * I915_BO_CACHE_COHERENT_FOR_WRITE:
> > > > > > > +      *
> > > > > > > +      * When writing through the CPU cache, the GPU is still coherent. Note
> > > > > > > +      * that this also implies I915_BO_CACHE_COHERENT_FOR_READ.
> > > > > > > +      *
> > > > > > > +      * This is never set when I915_CACHE_NONE is used for @cache_level,
> > > > > > > +      * where instead we have to manually flush the caches after writing
> > > > > > > +      * through the CPU cache. For other cache levels this should be set and
> > > > > > > +      * the object is therefore considered coherent for both reads and writes
> > > > > > > +      * through the CPU cache.
> > > > > >
> > > > > > I don't remember why we have this read vs. write split and this new
> > > > > > documentation doesn't seem to really explain it either.
> > > > >
> > > > > Hmm, I attempted to explain that earlier:
> > > > >
> > > > > * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> > > > > * the CPU cache are always coherent, regardless of the @cache_level. On
> > > > > * snooping based platforms this is not the case, unless the full
> > > > > * I915_CACHE_LLC or similar setting is used.
> > > > > *
> > > > > * As a result of this we need to track coherency separately for reads
> > > > > * and writes, in order to avoid superfluous flushing on shared-LLC
> > > > > * platforms, for reads.
> > > > >
> > > > > So AFAIK it's just because shared-LLC can be coherent for reads, while
> > > > > also not being coherent for writes(CACHE_NONE),
> > > >
> > > > CPU vs. GPU is fully coherent when it comes to LLC. Or at least I've
> > > > never heard of any mechanism that would make it only partially coherent.
> > > 
> > > What do you mean by "comes to LLC", are you talking about HAS_LLC() or
> > > I915_CACHE_LLC?
> > 
> > I'm talking about the actual cache.
> > 
> > > 
> > > If you set I915_CACHE_LLC, then yes it is fully coherent for both
> > > HAS_LLC() and HAS_SNOOP().
> > > 
> > > If you set I915_CACHE_NONE, then reads are still coherent on
> > > HAS_LLC(),
> > 
> > Reads and writes both. The only thing that's not coherent is the
> > display engine.
> 
> There's a lot of code which seems to disagree,

Can't even imagine why anyone would make a cache coherency protocol
that only handles reads but not writes...

> plus there's now this new
> MOCS thing.

That's just a full LLC bypass AFAICS. Can't omit invalidates if
you use that one or you'll just get stale data from the cache
on reads as well.

> I really hope we don't have all those cache coherency bits
> just because the code complexity is entertaining?

They were definitely added to fix a display issue, and before
that it was just a single flag, which wasn't doing what the display
needed. I think before the flag was added we used some other indicators
to check when we need to clflush, or maybe we did a some extra pointless
clflushes here and there and the broken single flag was supposed to
avoid those. Not quite sure.

I suppose these two flags should maybe have been named more like
"needs invalidate" and "needs writeback" to make it clear what 
one needs to do.
Ville Syrjälä July 14, 2021, 12:08 p.m. UTC | #10
On Wed, Jul 14, 2021 at 02:42:53PM +0300, Ville Syrjälä wrote:
> On Wed, Jul 14, 2021 at 01:16:57PM +0200, Daniel Vetter wrote:
> > On Tue, Jul 13, 2021 at 09:46:30PM +0300, Ville Syrjälä wrote:
> > > On Tue, Jul 13, 2021 at 07:24:23PM +0100, Matthew Auld wrote:
> > > > On Tue, 13 Jul 2021 at 18:47, Ville Syrjälä
> > > > <ville.syrjala@linux.intel.com> wrote:
> > > > >
> > > > > On Tue, Jul 13, 2021 at 05:13:37PM +0100, Matthew Auld wrote:
> > > > > > On Tue, 13 Jul 2021 at 16:55, Ville Syrjälä
> > > > > > <ville.syrjala@linux.intel.com> wrote:
> > > > > > >
> > > > > > > On Tue, Jul 13, 2021 at 11:45:50AM +0100, Matthew Auld wrote:
> > > > > > > > +     /**
> > > > > > > > +      * @cache_coherent:
> > > > > > > > +      *
> > > > > > > > +      * Track whether the pages are coherent with the GPU if reading or
> > > > > > > > +      * writing through the CPU cache.
> > > > > > > > +      *
> > > > > > > > +      * This largely depends on the @cache_level, for example if the object
> > > > > > > > +      * is marked as I915_CACHE_LLC, then GPU access is coherent for both
> > > > > > > > +      * reads and writes through the CPU cache.
> > > > > > > > +      *
> > > > > > > > +      * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> > > > > > > > +      * the CPU cache are always coherent, regardless of the @cache_level. On
> > > > > > > > +      * snooping based platforms this is not the case, unless the full
> > > > > > > > +      * I915_CACHE_LLC or similar setting is used.
> > > > > > > > +      *
> > > > > > > > +      * As a result of this we need to track coherency separately for reads
> > > > > > > > +      * and writes, in order to avoid superfluous flushing on shared-LLC
> > > > > > > > +      * platforms, for reads.
> > > > > > > > +      *
> > > > > > > > +      * I915_BO_CACHE_COHERENT_FOR_READ:
> > > > > > > > +      *
> > > > > > > > +      * When reading through the CPU cache, the GPU is still coherent. Note
> > > > > > > > +      * that no data has actually been modified here, so it might seem
> > > > > > > > +      * strange that we care about this.
> > > > > > > > +      *
> > > > > > > > +      * As an example, if some object is mapped on the CPU with write-back
> > > > > > > > +      * caching, and we read some page, then the cache likely now contains
> > > > > > > > +      * the data from that read. At this point the cache and main memory
> > > > > > > > +      * match up, so all good. But next the GPU needs to write some data to
> > > > > > > > +      * that same page. Now if the @cache_level is I915_CACHE_NONE and the
> > > > > > > > +      * the platform doesn't have the shared-LLC, then the GPU will
> > > > > > > > +      * effectively skip invalidating the cache(or however that works
> > > > > > > > +      * internally) when writing the new value.  This is really bad since the
> > > > > > > > +      * GPU has just written some new data to main memory, but the CPU cache
> > > > > > > > +      * is still valid and now contains stale data. As a result the next time
> > > > > > > > +      * we do a cached read with the CPU, we are rewarded with stale data.
> > > > > > > > +      * Likewise if the cache is later flushed, we might be rewarded with
> > > > > > > > +      * overwriting main memory with stale data.
> > > > > > > > +      *
> > > > > > > > +      * I915_BO_CACHE_COHERENT_FOR_WRITE:
> > > > > > > > +      *
> > > > > > > > +      * When writing through the CPU cache, the GPU is still coherent. Note
> > > > > > > > +      * that this also implies I915_BO_CACHE_COHERENT_FOR_READ.
> > > > > > > > +      *
> > > > > > > > +      * This is never set when I915_CACHE_NONE is used for @cache_level,
> > > > > > > > +      * where instead we have to manually flush the caches after writing
> > > > > > > > +      * through the CPU cache. For other cache levels this should be set and
> > > > > > > > +      * the object is therefore considered coherent for both reads and writes
> > > > > > > > +      * through the CPU cache.
> > > > > > >
> > > > > > > I don't remember why we have this read vs. write split and this new
> > > > > > > documentation doesn't seem to really explain it either.
> > > > > >
> > > > > > Hmm, I attempted to explain that earlier:
> > > > > >
> > > > > > * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> > > > > > * the CPU cache are always coherent, regardless of the @cache_level. On
> > > > > > * snooping based platforms this is not the case, unless the full
> > > > > > * I915_CACHE_LLC or similar setting is used.
> > > > > > *
> > > > > > * As a result of this we need to track coherency separately for reads
> > > > > > * and writes, in order to avoid superfluous flushing on shared-LLC
> > > > > > * platforms, for reads.
> > > > > >
> > > > > > So AFAIK it's just because shared-LLC can be coherent for reads, while
> > > > > > also not being coherent for writes(CACHE_NONE),
> > > > >
> > > > > CPU vs. GPU is fully coherent when it comes to LLC. Or at least I've
> > > > > never heard of any mechanism that would make it only partially coherent.
> > > > 
> > > > What do you mean by "comes to LLC", are you talking about HAS_LLC() or
> > > > I915_CACHE_LLC?
> > > 
> > > I'm talking about the actual cache.
> > > 
> > > > 
> > > > If you set I915_CACHE_LLC, then yes it is fully coherent for both
> > > > HAS_LLC() and HAS_SNOOP().
> > > > 
> > > > If you set I915_CACHE_NONE, then reads are still coherent on
> > > > HAS_LLC(),
> > > 
> > > Reads and writes both. The only thing that's not coherent is the
> > > display engine.
> > 
> > There's a lot of code which seems to disagree,
> 
> Can't even imagine why anyone would make a cache coherency protocol
> that only handles reads but not writes...
> 
> > plus there's now this new
> > MOCS thing.
> 
> That's just a full LLC bypass AFAICS. Can't omit invalidates if
> you use that one or you'll just get stale data from the cache
> on reads as well.

And just to reiterate that the current "reads are coherent" thing
works is because the only non-coherent agent (display engine) never
writes anything. If/when we implement writeback support we can no
longer skip the invalidate even on LLC machines when reading from
a writeback buffer.
Daniel Vetter July 14, 2021, 12:57 p.m. UTC | #11
On Wed, Jul 14, 2021 at 02:42:53PM +0300, Ville Syrjälä wrote:
> On Wed, Jul 14, 2021 at 01:16:57PM +0200, Daniel Vetter wrote:
> > On Tue, Jul 13, 2021 at 09:46:30PM +0300, Ville Syrjälä wrote:
> > > On Tue, Jul 13, 2021 at 07:24:23PM +0100, Matthew Auld wrote:
> > > > On Tue, 13 Jul 2021 at 18:47, Ville Syrjälä
> > > > <ville.syrjala@linux.intel.com> wrote:
> > > > >
> > > > > On Tue, Jul 13, 2021 at 05:13:37PM +0100, Matthew Auld wrote:
> > > > > > On Tue, 13 Jul 2021 at 16:55, Ville Syrjälä
> > > > > > <ville.syrjala@linux.intel.com> wrote:
> > > > > > >
> > > > > > > On Tue, Jul 13, 2021 at 11:45:50AM +0100, Matthew Auld wrote:
> > > > > > > > +     /**
> > > > > > > > +      * @cache_coherent:
> > > > > > > > +      *
> > > > > > > > +      * Track whether the pages are coherent with the GPU if reading or
> > > > > > > > +      * writing through the CPU cache.
> > > > > > > > +      *
> > > > > > > > +      * This largely depends on the @cache_level, for example if the object
> > > > > > > > +      * is marked as I915_CACHE_LLC, then GPU access is coherent for both
> > > > > > > > +      * reads and writes through the CPU cache.
> > > > > > > > +      *
> > > > > > > > +      * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> > > > > > > > +      * the CPU cache are always coherent, regardless of the @cache_level. On
> > > > > > > > +      * snooping based platforms this is not the case, unless the full
> > > > > > > > +      * I915_CACHE_LLC or similar setting is used.
> > > > > > > > +      *
> > > > > > > > +      * As a result of this we need to track coherency separately for reads
> > > > > > > > +      * and writes, in order to avoid superfluous flushing on shared-LLC
> > > > > > > > +      * platforms, for reads.
> > > > > > > > +      *
> > > > > > > > +      * I915_BO_CACHE_COHERENT_FOR_READ:
> > > > > > > > +      *
> > > > > > > > +      * When reading through the CPU cache, the GPU is still coherent. Note
> > > > > > > > +      * that no data has actually been modified here, so it might seem
> > > > > > > > +      * strange that we care about this.
> > > > > > > > +      *
> > > > > > > > +      * As an example, if some object is mapped on the CPU with write-back
> > > > > > > > +      * caching, and we read some page, then the cache likely now contains
> > > > > > > > +      * the data from that read. At this point the cache and main memory
> > > > > > > > +      * match up, so all good. But next the GPU needs to write some data to
> > > > > > > > +      * that same page. Now if the @cache_level is I915_CACHE_NONE and the
> > > > > > > > +      * the platform doesn't have the shared-LLC, then the GPU will
> > > > > > > > +      * effectively skip invalidating the cache(or however that works
> > > > > > > > +      * internally) when writing the new value.  This is really bad since the
> > > > > > > > +      * GPU has just written some new data to main memory, but the CPU cache
> > > > > > > > +      * is still valid and now contains stale data. As a result the next time
> > > > > > > > +      * we do a cached read with the CPU, we are rewarded with stale data.
> > > > > > > > +      * Likewise if the cache is later flushed, we might be rewarded with
> > > > > > > > +      * overwriting main memory with stale data.
> > > > > > > > +      *
> > > > > > > > +      * I915_BO_CACHE_COHERENT_FOR_WRITE:
> > > > > > > > +      *
> > > > > > > > +      * When writing through the CPU cache, the GPU is still coherent. Note
> > > > > > > > +      * that this also implies I915_BO_CACHE_COHERENT_FOR_READ.
> > > > > > > > +      *
> > > > > > > > +      * This is never set when I915_CACHE_NONE is used for @cache_level,
> > > > > > > > +      * where instead we have to manually flush the caches after writing
> > > > > > > > +      * through the CPU cache. For other cache levels this should be set and
> > > > > > > > +      * the object is therefore considered coherent for both reads and writes
> > > > > > > > +      * through the CPU cache.
> > > > > > >
> > > > > > > I don't remember why we have this read vs. write split and this new
> > > > > > > documentation doesn't seem to really explain it either.
> > > > > >
> > > > > > Hmm, I attempted to explain that earlier:
> > > > > >
> > > > > > * Note that on platforms with shared-LLC support(HAS_LLC) reads through
> > > > > > * the CPU cache are always coherent, regardless of the @cache_level. On
> > > > > > * snooping based platforms this is not the case, unless the full
> > > > > > * I915_CACHE_LLC or similar setting is used.
> > > > > > *
> > > > > > * As a result of this we need to track coherency separately for reads
> > > > > > * and writes, in order to avoid superfluous flushing on shared-LLC
> > > > > > * platforms, for reads.
> > > > > >
> > > > > > So AFAIK it's just because shared-LLC can be coherent for reads, while
> > > > > > also not being coherent for writes(CACHE_NONE),
> > > > >
> > > > > CPU vs. GPU is fully coherent when it comes to LLC. Or at least I've
> > > > > never heard of any mechanism that would make it only partially coherent.
> > > > 
> > > > What do you mean by "comes to LLC", are you talking about HAS_LLC() or
> > > > I915_CACHE_LLC?
> > > 
> > > I'm talking about the actual cache.
> > > 
> > > > 
> > > > If you set I915_CACHE_LLC, then yes it is fully coherent for both
> > > > HAS_LLC() and HAS_SNOOP().
> > > > 
> > > > If you set I915_CACHE_NONE, then reads are still coherent on
> > > > HAS_LLC(),
> > > 
> > > Reads and writes both. The only thing that's not coherent is the
> > > display engine.
> > 
> > There's a lot of code which seems to disagree,
> 
> Can't even imagine why anyone would make a cache coherency protocol
> that only handles reads but not writes...
> 
> > plus there's now this new
> > MOCS thing.
> 
> That's just a full LLC bypass AFAICS. Can't omit invalidates if
> you use that one or you'll just get stale data from the cache
> on reads as well.
> 
> > I really hope we don't have all those cache coherency bits
> > just because the code complexity is entertaining?
> 
> They were definitely added to fix a display issue, and before
> that it was just a single flag, which wasn't doing what the display
> needed. I think before the flag was added we used some other indicators
> to check when we need to clflush, or maybe we did a some extra pointless
> clflushes here and there and the broken single flag was supposed to
> avoid those. Not quite sure.

Hm I thought cache_dirty was added for that display case? But yeah this
entire model is maybe not super well-defined in terms of all the use-cases
and what exactly it means ...

I'm also not clear why this is on the object, and not some kind of
function which computes it from the platform + cache_level.

> I suppose these two flags should maybe have been named more like
> "needs invalidate" and "needs writeback" to make it clear what 
> one needs to do.

tbh I have no idea, this all started to get added after I disappeared for
a few years into display.
-Daniel
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
index ef3de2ae9723..02c3529b774c 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
@@ -92,6 +92,57 @@  struct drm_i915_gem_object_ops {
 	const char *name; /* friendly name for debug, e.g. lockdep classes */
 };
 
+/**
+ * enum i915_cache_level - The supported GTT caching values for system memory
+ * pages.
+ *
+ * These translate to some special GTT PTE bits when binding pages into some
+ * address space. It also determines whether an object, or rather its pages are
+ * coherent with the GPU, when also reading or writing through the CPU cache
+ * with those pages.
+ *
+ * Userspace can also control this through struct drm_i915_gem_caching.
+ */
+enum i915_cache_level {
+	/**
+	 * @I915_CACHE_NONE:
+	 *
+	 * Not coherent with the CPU cache. If the cache is dirty and we need
+	 * the underlying pages to be coherent with some later GPU access then
+	 * we need to manually flush the pages.
+	 *
+	 * Note that on shared-LLC platforms reads through the CPU cache are
+	 * still coherent even with this setting. See also
+	 * I915_BO_CACHE_COHERENT_FOR_READ for more details.
+	 */
+	I915_CACHE_NONE = 0,
+	/**
+	 * @I915_CACHE_LLC:
+	 *
+	 * Coherent with the CPU cache. If the cache is dirty, then the GPU will
+	 * ensure that access remains coherent, when both reading and writing
+	 * through the CPU cache.
+	 *
+	 * Applies to both platforms with shared-LLC(HAS_LLC), and snooping
+	 * based platforms(HAS_SNOOP).
+	 */
+	I915_CACHE_LLC,
+	/**
+	 * @I915_CACHE_L3_LLC:
+	 *
+	 * gen7+, L3 sits between the domain specifc caches, eg sampler/render
+	 * caches, and the large Last-Level-Cache. LLC is coherent with the CPU,
+	 * but L3 is only visible to the GPU.
+	 */
+	I915_CACHE_L3_LLC,
+	/**
+	 * @I915_CACHE_WT:
+	 *
+	 * hsw:gt3e Write-through for scanout buffers.
+	 */
+	I915_CACHE_WT,
+};
+
 enum i915_map_type {
 	I915_MAP_WB = 0,
 	I915_MAP_WC,
@@ -228,14 +279,90 @@  struct drm_i915_gem_object {
 	unsigned int mem_flags;
 #define I915_BO_FLAG_STRUCT_PAGE BIT(0) /* Object backed by struct pages */
 #define I915_BO_FLAG_IOMEM       BIT(1) /* Object backed by IO memory */
-	/*
-	 * Is the object to be mapped as read-only to the GPU
-	 * Only honoured if hardware has relevant pte bit
+	/**
+	 * @cache_level: The desired GTT caching level.
+	 *
+	 * See enum i915_cache_level for possible values, along with what
+	 * each does.
 	 */
 	unsigned int cache_level:3;
-	unsigned int cache_coherent:2;
+	/**
+	 * @cache_coherent:
+	 *
+	 * Track whether the pages are coherent with the GPU if reading or
+	 * writing through the CPU cache.
+	 *
+	 * This largely depends on the @cache_level, for example if the object
+	 * is marked as I915_CACHE_LLC, then GPU access is coherent for both
+	 * reads and writes through the CPU cache.
+	 *
+	 * Note that on platforms with shared-LLC support(HAS_LLC) reads through
+	 * the CPU cache are always coherent, regardless of the @cache_level. On
+	 * snooping based platforms this is not the case, unless the full
+	 * I915_CACHE_LLC or similar setting is used.
+	 *
+	 * As a result of this we need to track coherency separately for reads
+	 * and writes, in order to avoid superfluous flushing on shared-LLC
+	 * platforms, for reads.
+	 *
+	 * I915_BO_CACHE_COHERENT_FOR_READ:
+	 *
+	 * When reading through the CPU cache, the GPU is still coherent. Note
+	 * that no data has actually been modified here, so it might seem
+	 * strange that we care about this.
+	 *
+	 * As an example, if some object is mapped on the CPU with write-back
+	 * caching, and we read some page, then the cache likely now contains
+	 * the data from that read. At this point the cache and main memory
+	 * match up, so all good. But next the GPU needs to write some data to
+	 * that same page. Now if the @cache_level is I915_CACHE_NONE and the
+	 * the platform doesn't have the shared-LLC, then the GPU will
+	 * effectively skip invalidating the cache(or however that works
+	 * internally) when writing the new value.  This is really bad since the
+	 * GPU has just written some new data to main memory, but the CPU cache
+	 * is still valid and now contains stale data. As a result the next time
+	 * we do a cached read with the CPU, we are rewarded with stale data.
+	 * Likewise if the cache is later flushed, we might be rewarded with
+	 * overwriting main memory with stale data.
+	 *
+	 * I915_BO_CACHE_COHERENT_FOR_WRITE:
+	 *
+	 * When writing through the CPU cache, the GPU is still coherent. Note
+	 * that this also implies I915_BO_CACHE_COHERENT_FOR_READ.
+	 *
+	 * This is never set when I915_CACHE_NONE is used for @cache_level,
+	 * where instead we have to manually flush the caches after writing
+	 * through the CPU cache. For other cache levels this should be set and
+	 * the object is therefore considered coherent for both reads and writes
+	 * through the CPU cache.
+	 */
 #define I915_BO_CACHE_COHERENT_FOR_READ BIT(0)
 #define I915_BO_CACHE_COHERENT_FOR_WRITE BIT(1)
+	unsigned int cache_coherent:2;
+	/**
+	 * @cache_dirty:
+	 *
+	 * Track if the cache might be dirty for the @pages i.e it has yet to be
+	 * written back to main memory. As a result reading directly from main
+	 * memory might yield stale data.
+	 *
+	 * This also ties into whether the kernel is tracking the object as
+	 * coherent with the GPU, as per @cache_coherent, as it determines if
+	 * flushing might be needed at various points.
+	 *
+	 * Another part of @cache_dirty is managing flushing when first
+	 * acquiring the pages for system memory, at this point the pages are
+	 * considered foreign, so the default assumption is that the cache is
+	 * dirty, for example the page zeroing done my the kernel might leave
+	 * writes though the CPU cache, or swapping-in, while the actual data in
+	 * main memory is potentially stale.  Note that this is a potential
+	 * security issue when dealing with userspace objects and zeroing. Now,
+	 * whether we actually need apply the big sledgehammer of flushing all
+	 * the pages on acquire depends on if @cache_coherent is marked as
+	 * I915_BO_CACHE_COHERENT_FOR_WRITE, i.e that the GPU will be coherent
+	 * for both reads and writes though the CPU cache. So pretty much this
+	 * should only be needed for I915_CACHE_NONE objects.
+	 */
 	unsigned int cache_dirty:1;
 
 	/**
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index c4747f4407ef..37bb1a3cadd4 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -394,15 +394,6 @@  struct drm_i915_display_funcs {
 	void (*read_luts)(struct intel_crtc_state *crtc_state);
 };
 
-enum i915_cache_level {
-	I915_CACHE_NONE = 0,
-	I915_CACHE_LLC, /* also used for snoopable memory on non-LLC */
-	I915_CACHE_L3_LLC, /* gen7+, L3 sits between the domain specifc
-			      caches, eg sampler/render caches, and the
-			      large Last-Level-Cache. LLC is coherent with
-			      the CPU, but L3 is only visible to the GPU. */
-	I915_CACHE_WT, /* hsw:gt3e WriteThrough for scanouts */
-};
 
 #define I915_COLOR_UNEVICTABLE (-1) /* a non-vma sharing the address space */