
[02/27] drm/i915: Mark CPU cache as dirty on every transition for CPU writes

Message ID 20170419094143.16922-3-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson April 19, 2017, 9:41 a.m. UTC
Currently, we only mark the CPU cache as dirty if we skip a clflush.
This leads to some confusion where we have to ask if the object is in
the write domain or missed a clflush. If we always mark the cache as
dirty, this becomes a much simpler question to answer.

The goal remains to do as few clflushes as required and to do them as
late as possible, in the hope of deferring the work to a kthread and not
blocking the caller (e.g. execbuf, flips).

Reported-by: Dongwon Kim <dongwon.kim@intel.com>
Fixes: a6a7cc4b7db6 ("drm/i915: Always flush the dirty CPU cache when pinning the scanout")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dongwon Kim <dongwon.kim@intel.com>
Cc: Matt Roper <matthew.d.roper@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c                  | 78 +++++++++++++++---------
 drivers/gpu/drm/i915/i915_gem_clflush.c          | 15 +++--
 drivers/gpu/drm/i915/i915_gem_execbuffer.c       | 21 +++----
 drivers/gpu/drm/i915/i915_gem_internal.c         |  3 +-
 drivers/gpu/drm/i915/i915_gem_userptr.c          |  5 +-
 drivers/gpu/drm/i915/selftests/huge_gem_object.c |  3 +-
 6 files changed, 70 insertions(+), 55 deletions(-)
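
A rough, self-contained illustration of the bookkeeping change (a userspace
model with invented names, not driver code): before the patch, answering
"does this object still need a clflush?" meant consulting both the write
domain and the cache_dirty flag; after it, cache_dirty alone is meant to
carry that information.

#include <stdbool.h>
#include <stdio.h>

#define DOMAIN_CPU 0x1			/* stand-in for I915_GEM_DOMAIN_CPU */

struct model_obj {
	unsigned int write_domain;
	bool cache_dirty;
};

/* Old question: is the object in the CPU write domain, or did it miss a
 * clflush earlier? */
static bool needs_clflush_old(const struct model_obj *o)
{
	return (o->write_domain & DOMAIN_CPU) || o->cache_dirty;
}

/* New question: every transition that dirties the CPU cache sets the flag,
 * so only the flag needs to be checked. */
static bool needs_clflush_new(const struct model_obj *o)
{
	return o->cache_dirty;
}

int main(void)
{
	struct model_obj o = { .write_domain = 0, .cache_dirty = true };

	printf("old: %d new: %d\n", needs_clflush_old(&o), needs_clflush_new(&o));
	return 0;
}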

Comments

Kim, Dongwon April 19, 2017, 4:52 p.m. UTC | #1
I tried your patch but it didn't fix the original 
problem. I think it is somehow related to the flushing condition
here:

@@ -1129,10 +1129,8 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 	if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
 		continue;

-	if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
+	if (obj->base.write_domain & obj->cache_dirty)
 		i915_gem_clflush_object(obj, 0);
-		obj->base.write_domain = 0;
-	}

Here, after your patch is applied, we do a clflush only if write_domain is
non-zero, even if the cache_dirty flag is set.

And now please look at this:

@@ -753,6 +766,11 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
        case I915_GEM_DOMAIN_CPU:
                i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
                break;
+
+       case I915_GEM_DOMAIN_RENDER:
+               if (gpu_write_needs_clflush(obj))
+                       obj->cache_dirty = true;
+               break;
        }

        obj->base.write_domain = 0;

So here, if the write_domain is I915_GEM_DOMAIN_RENDER, we set cache_dirty to true
and then clear write_domain.

So right after this flush_write_domain call, write_domain will be 0 but the cache is
still dirty. I am wondering if this is where the condition (write_domain==0 and
cache_dirty==1) originally came from.
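
As a minimal, compilable sketch of that situation (userspace model; only the
domain bit value is taken from include/uapi/drm/i915_drm.h, the rest is
illustrative): because cache_dirty is a bool, "write_domain & obj->cache_dirty"
can only ever test bit 0 of write_domain, so once flush_write_domain() has
cleared write_domain the clflush is skipped even though the cache is dirty.

#include <stdbool.h>
#include <stdio.h>

#define I915_GEM_DOMAIN_CPU 0x00000001	/* bit 0, as defined in i915_drm.h */

int main(void)
{
	unsigned int write_domain = 0;	/* already cleared by flush_write_domain() */
	bool cache_dirty = true;	/* deferred clflush still pending */

	/* The patched eb_move_to_gpu() test: with cache_dirty being a bool,
	 * this is "write_domain & 1", i.e. it still requires the CPU
	 * write-domain bit and can never fire once write_domain is 0. */
	if (write_domain & cache_dirty)
		printf("clflush issued\n");
	else
		printf("clflush skipped although the cache is dirty\n");

	return 0;
}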

On Wed, Apr 19, 2017 at 10:41:18AM +0100, Chris Wilson wrote:
> Currently, we only mark the CPU cache as dirty if we skip a clflush.
> This leads to some confusion where we have to ask if the object is in
> the write domain or missed a clflush. If we always mark the cache as
> dirty, this becomes a much simply question to answer.
> 
> The goal remains to do as few clflushes as required and to do them as
> late as possible, in the hope of deferring the work to a kthread and not
> block the caller (e.g. execbuf, flips).
> 
> Reported-by: Dongwon Kim <dongwon.kim@intel.com>
> Fixes: a6a7cc4b7db6 ("drm/i915: Always flush the dirty CPU cache when pinning the scanout")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dongwon Kim <dongwon.kim@intel.com>
> Cc: Matt Roper <matthew.d.roper@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c                  | 78 +++++++++++++++---------
>  drivers/gpu/drm/i915/i915_gem_clflush.c          | 15 +++--
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c       | 21 +++----
>  drivers/gpu/drm/i915/i915_gem_internal.c         |  3 +-
>  drivers/gpu/drm/i915/i915_gem_userptr.c          |  5 +-
>  drivers/gpu/drm/i915/selftests/huge_gem_object.c |  3 +-
>  6 files changed, 70 insertions(+), 55 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 33fb11cc5acc..488ca7733c1e 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -49,7 +49,7 @@ static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
>  
>  static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
>  {
> -	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
> +	if (obj->cache_dirty)
>  		return false;
>  
>  	if (!i915_gem_object_is_coherent(obj))
> @@ -233,6 +233,14 @@ i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
>  	return st;
>  }
>  
> +static void __start_cpu_write(struct drm_i915_gem_object *obj)
> +{
> +	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> +	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> +	if (cpu_write_needs_clflush(obj))
> +		obj->cache_dirty = true;
> +}
> +
>  static void
>  __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
>  				struct sg_table *pages,
> @@ -248,8 +256,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
>  	    !i915_gem_object_is_coherent(obj))
>  		drm_clflush_sg(pages);
>  
> -	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> -	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> +	__start_cpu_write(obj);
>  }
>  
>  static void
> @@ -684,6 +691,12 @@ i915_gem_dumb_create(struct drm_file *file,
>  			       args->size, &args->handle);
>  }
>  
> +static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
> +{
> +	return !(obj->cache_level == I915_CACHE_NONE ||
> +		 obj->cache_level == I915_CACHE_WT);
> +}
> +
>  /**
>   * Creates a new mm object and returns a handle to it.
>   * @dev: drm device pointer
> @@ -753,6 +766,11 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
>  	case I915_GEM_DOMAIN_CPU:
>  		i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
>  		break;
> +
> +	case I915_GEM_DOMAIN_RENDER:
> +		if (gpu_write_needs_clflush(obj))
> +			obj->cache_dirty = true;
> +		break;
>  	}
>  
>  	obj->base.write_domain = 0;
> @@ -854,7 +872,8 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
>  	 * optimizes for the case when the gpu will dirty the data
>  	 * anyway again before the next pread happens.
>  	 */
> -	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
> +	if (!obj->cache_dirty &&
> +	    !(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
>  		*needs_clflush = CLFLUSH_BEFORE;
>  
>  out:
> @@ -906,14 +925,15 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
>  	 * This optimizes for the case when the gpu will use the data
>  	 * right away and we therefore have to clflush anyway.
>  	 */
> -	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU)
> +	if (!obj->cache_dirty) {
>  		*needs_clflush |= CLFLUSH_AFTER;
>  
> -	/* Same trick applies to invalidate partially written cachelines read
> -	 * before writing.
> -	 */
> -	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
> -		*needs_clflush |= CLFLUSH_BEFORE;
> +		/* Same trick applies to invalidate partially written
> +		 * cachelines read before writing.
> +		 */
> +		if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
> +			*needs_clflush |= CLFLUSH_BEFORE;
> +	}
>  
>  out:
>  	intel_fb_obj_invalidate(obj, ORIGIN_CPU);
> @@ -3374,10 +3394,12 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
>  
>  static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
>  {
> -	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU && !obj->cache_dirty)
> -		return;
> -
> -	i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
> +	/* We manually flush the CPU domain so that we can override and
> +	 * force the flush for the display, and perform it asynchronously.
> +	 */
> +	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
> +	if (obj->cache_dirty)
> +		i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
>  	obj->base.write_domain = 0;
>  }
>  
> @@ -3636,14 +3658,17 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
>  		}
>  	}
>  
> -	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU &&
> -	    i915_gem_object_is_coherent(obj))
> -		obj->cache_dirty = true;
> +	/* Catch any deferred obj->cache_dirty markups */
> +	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
>  
>  	list_for_each_entry(vma, &obj->vma_list, obj_link)
>  		vma->node.color = cache_level;
>  	obj->cache_level = cache_level;
>  
> +	if (obj->base.write_domain & I915_GEM_DOMAIN_CPU &&
> +	    cpu_write_needs_clflush(obj))
> +		obj->cache_dirty = true;
> +
>  	return 0;
>  }
>  
> @@ -3864,9 +3889,6 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
>  	if (ret)
>  		return ret;
>  
> -	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
> -		return 0;
> -
>  	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
>  
>  	/* Flush the CPU cache if it's still invalid. */
> @@ -3878,15 +3900,13 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
>  	/* It should now be out of any other write domains, and we can update
>  	 * the domain values for our changes.
>  	 */
> -	GEM_BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) != 0);
> +	GEM_BUG_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU);
>  
>  	/* If we're writing through the CPU, then the GPU read domains will
>  	 * need to be invalidated at next use.
>  	 */
> -	if (write) {
> -		obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> -		obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> -	}
> +	if (write)
> +		__start_cpu_write(obj);
>  
>  	return 0;
>  }
> @@ -4306,6 +4326,8 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
>  	} else
>  		obj->cache_level = I915_CACHE_NONE;
>  
> +	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
> +
>  	trace_i915_gem_object_create(obj);
>  
>  	return obj;
> @@ -4968,10 +4990,8 @@ int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
>  
>  	mutex_lock(&dev_priv->drm.struct_mutex);
>  	for (p = phases; *p; p++) {
> -		list_for_each_entry(obj, *p, global_link) {
> -			obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> -			obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> -		}
> +		list_for_each_entry(obj, *p, global_link)
> +			__start_cpu_write(obj);
>  	}
>  	mutex_unlock(&dev_priv->drm.struct_mutex);
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
> index ffd01e02fe94..a895643c4dc4 100644
> --- a/drivers/gpu/drm/i915/i915_gem_clflush.c
> +++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
> @@ -72,8 +72,6 @@ static const struct dma_fence_ops i915_clflush_ops = {
>  static void __i915_do_clflush(struct drm_i915_gem_object *obj)
>  {
>  	drm_clflush_sg(obj->mm.pages);
> -	obj->cache_dirty = false;
> -
>  	intel_fb_obj_flush(obj, ORIGIN_CPU);
>  }
>  
> @@ -82,9 +80,6 @@ static void i915_clflush_work(struct work_struct *work)
>  	struct clflush *clflush = container_of(work, typeof(*clflush), work);
>  	struct drm_i915_gem_object *obj = clflush->obj;
>  
> -	if (!obj->cache_dirty)
> -		goto out;
> -
>  	if (i915_gem_object_pin_pages(obj)) {
>  		DRM_ERROR("Failed to acquire obj->pages for clflushing\n");
>  		goto out;
> @@ -132,10 +127,10 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
>  	 * anything not backed by physical memory we consider to be always
>  	 * coherent and not need clflushing.
>  	 */
> -	if (!i915_gem_object_has_struct_page(obj))
> +	if (!i915_gem_object_has_struct_page(obj)) {
> +		obj->cache_dirty = false;
>  		return;
> -
> -	obj->cache_dirty = true;
> +	}
>  
>  	/* If the GPU is snooping the contents of the CPU cache,
>  	 * we do not need to manually clear the CPU cache lines.  However,
> @@ -154,6 +149,8 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
>  	if (!(flags & I915_CLFLUSH_SYNC))
>  		clflush = kmalloc(sizeof(*clflush), GFP_KERNEL);
>  	if (clflush) {
> +		GEM_BUG_ON(!obj->cache_dirty);
> +
>  		dma_fence_init(&clflush->dma,
>  			       &i915_clflush_ops,
>  			       &clflush_lock,
> @@ -181,6 +178,8 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
>  	} else {
>  		GEM_BUG_ON(obj->base.write_domain != I915_GEM_DOMAIN_CPU);
>  	}
> +
> +	obj->cache_dirty = false;
>  }
>  
>  void i915_gem_clflush_init(struct drm_i915_private *i915)
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index af1965774e7b..ddc011ef5480 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -291,7 +291,7 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
>  		return DBG_USE_CPU_RELOC > 0;
>  
>  	return (HAS_LLC(to_i915(obj->base.dev)) ||
> -		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
> +		obj->cache_dirty ||
>  		obj->cache_level != I915_CACHE_NONE);
>  }
>  
> @@ -1129,10 +1129,8 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
>  		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
>  			continue;
>  
> -		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
> +		if (obj->base.write_domain & obj->cache_dirty)
>  			i915_gem_clflush_object(obj, 0);
> -			obj->base.write_domain = 0;
> -		}
>  
>  		ret = i915_gem_request_await_object
>  			(req, obj, obj->base.pending_write_domain);
> @@ -1265,12 +1263,6 @@ i915_gem_validate_context(struct drm_device *dev, struct drm_file *file,
>  	return ctx;
>  }
>  
> -static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
> -{
> -	return !(obj->cache_level == I915_CACHE_NONE ||
> -		 obj->cache_level == I915_CACHE_WT);
> -}
> -
>  void i915_vma_move_to_active(struct i915_vma *vma,
>  			     struct drm_i915_gem_request *req,
>  			     unsigned int flags)
> @@ -1294,15 +1286,16 @@ void i915_vma_move_to_active(struct i915_vma *vma,
>  	i915_gem_active_set(&vma->last_read[idx], req);
>  	list_move_tail(&vma->vm_link, &vma->vm->active_list);
>  
> +	obj->base.write_domain = 0;
>  	if (flags & EXEC_OBJECT_WRITE) {
> +		obj->base.write_domain = I915_GEM_DOMAIN_RENDER;
> +
>  		if (intel_fb_obj_invalidate(obj, ORIGIN_CS))
>  			i915_gem_active_set(&obj->frontbuffer_write, req);
>  
> -		/* update for the implicit flush after a batch */
> -		obj->base.write_domain &= ~I915_GEM_GPU_DOMAINS;
> -		if (!obj->cache_dirty && gpu_write_needs_clflush(obj))
> -			obj->cache_dirty = true;
> +		obj->base.read_domains = 0;
>  	}
> +	obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
>  
>  	if (flags & EXEC_OBJECT_NEEDS_FENCE)
>  		i915_gem_active_set(&vma->last_fence, req);
> diff --git a/drivers/gpu/drm/i915/i915_gem_internal.c b/drivers/gpu/drm/i915/i915_gem_internal.c
> index fc950abbe400..58e93e87d573 100644
> --- a/drivers/gpu/drm/i915/i915_gem_internal.c
> +++ b/drivers/gpu/drm/i915/i915_gem_internal.c
> @@ -188,9 +188,10 @@ i915_gem_object_create_internal(struct drm_i915_private *i915,
>  	drm_gem_private_object_init(&i915->drm, &obj->base, size);
>  	i915_gem_object_init(obj, &i915_gem_object_internal_ops);
>  
> -	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>  	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> +	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>  	obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
> +	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
>  
>  	return obj;
>  }
> diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
> index 58ccf8b8ca1c..9f84be171ad2 100644
> --- a/drivers/gpu/drm/i915/i915_gem_userptr.c
> +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
> @@ -802,9 +802,10 @@ i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file
>  
>  	drm_gem_private_object_init(dev, &obj->base, args->user_size);
>  	i915_gem_object_init(obj, &i915_gem_userptr_ops);
> -	obj->cache_level = I915_CACHE_LLC;
> -	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>  	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> +	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> +	obj->cache_level = I915_CACHE_LLC;
> +	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
>  
>  	obj->userptr.ptr = args->user_ptr;
>  	obj->userptr.read_only = !!(args->flags & I915_USERPTR_READ_ONLY);
> diff --git a/drivers/gpu/drm/i915/selftests/huge_gem_object.c b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> index 4e681fc13be4..0ca867a877b6 100644
> --- a/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> +++ b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> @@ -126,9 +126,10 @@ huge_gem_object(struct drm_i915_private *i915,
>  	drm_gem_private_object_init(&i915->drm, &obj->base, dma_size);
>  	i915_gem_object_init(obj, &huge_ops);
>  
> -	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>  	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> +	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>  	obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
> +	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
>  	obj->scratch = phys_size;
>  
>  	return obj;
> -- 
> 2.11.0
>
Chris Wilson April 19, 2017, 5:15 p.m. UTC | #2
On Wed, Apr 19, 2017 at 09:52:28AM -0700, Dongwon Kim wrote:
> I tried your patch but it didn't fix the original 
> problem. I think it is somehow related to the flushing condition
> here:
> 
> @@ -1129,10 +1129,8 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
>  	if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
>  		continue;
> 
> 	if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
> +	if (obj->base.write_domain & obj->cache_dirty)
>  		i915_gem_clflush_object(obj, 0);
> -		obj->base.write_domain = 0;
> -	}
> 
> here, we do clflush only if write_domain is not 0 even if cache_dirty
> flag is set after your patch is applied.
> 
> And now please look at this:
> 
> @@ -753,6 +766,11 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
>         case I915_GEM_DOMAIN_CPU:
>                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
>                 break;
> +
> +       case I915_GEM_DOMAIN_RENDER:
> +               if (gpu_write_needs_clflush(obj))
> +                       obj->cache_dirty = true;
> +               break;
>         }
> 
>         obj->base.write_domain=0;
> 
> So here, if the write_domain is I915_GEM_DOMAIN_RENDER, we set cache_dirty to true
> then reset write_domain.
> 
> So right after this flush_write_domain call, write_domain will be 0 but cache is
> still dirty. I am wondering if this is where that condition (write_domain==0 and 
> cache_dirty==1) originally came from.

And we definitely do not want to be flushing the cache for the GPU after
a GPU write. We only want that cache to be flushed after a GPU write if
the object is moved to a non-coherent domain. That's the challenge.

I was also expecting that (incoherent) reads from the GPU would go through
the cache - we make the same assumption for CPU reads.

Thanks for testing, definitely back to the drawing board. Hmm, it might be
worth taking the alternative approach and always scheduling an async
clflush in set-cache-domain. Just have to check that we have appropriate
waits and don't have any inappropriate ones.
-Chris
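
Restated as a tiny hypothetical helper (names invented here, not part of the
series): the flush after a GPU write should hang off the domain transition,
not off the write itself.

#include <stdbool.h>

/* Hypothetical sketch only: after a GPU write, flush the CPU cache when,
 * and only when, the object is being moved to a domain that cannot snoop
 * it (e.g. scanout on parts without an LLC). */
bool flush_after_gpu_write(bool gpu_wrote, bool target_domain_is_coherent)
{
	return gpu_wrote && !target_domain_is_coherent;
}
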
Chris Wilson April 19, 2017, 5:46 p.m. UTC | #3
On Wed, Apr 19, 2017 at 09:52:28AM -0700, Dongwon Kim wrote:
> I tried your patch but it didn't fix the original 
> problem. I think it is somehow related to the flushing condition
> here:

I don't think I actually checked which GPU you observed it on - I was
assuming LLC, since that was the last bug we had.
-Chris
Chris Wilson April 19, 2017, 6:08 p.m. UTC | #4
On Wed, Apr 19, 2017 at 09:52:28AM -0700, Dongwon Kim wrote:
> I tried your patch but it didn't fix the original 
> problem. I think it is somehow related to the flushing condition
> here:
> 
> @@ -1129,10 +1129,8 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
>  	if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
>  		continue;
> 
> 	if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
> +	if (obj->base.write_domain & obj->cache_dirty)
>  		i915_gem_clflush_object(obj, 0);
> -		obj->base.write_domain = 0;
> -	}
> 
> here, we do clflush only if write_domain is not 0 even if cache_dirty
> flag is set after your patch is applied.

This can be just reduced to if (obj->cache_dirty) clflush().

We're slightly better in that we don't set obj->cache_dirty so often for
normal gpu rendering (just on transitions away from the gpu now), but it
still means we will be redundantly checking for clflushes prior to
rendering.

Can you double check that this patch + if (obj->cache_dirty) works for you?

What I guess I really want here is
	if (obj->cache_dirty & !obj->cache_coherent)
essentially inlining the check from clflush.
-Chris
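
In the context of the hunk quoted above, the suggested reduction would look
roughly like this (illustrative fragment only; obj->cache_coherent is not an
existing field at this point, so the second form is the eventual goal rather
than something that builds against this series):

		if (obj->cache_dirty)
			i915_gem_clflush_object(obj, 0);

		/* and, once a per-object coherency bit is tracked, the
		 * clflush's own check could be inlined as: */
		if (obj->cache_dirty && !obj->cache_coherent)
			i915_gem_clflush_object(obj, 0);
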
Kim, Dongwon April 19, 2017, 6:13 p.m. UTC | #5
Chris,

Just to make sure: you want me to just remove the write_domain check from
the if statement before the clflush in execbuffer_move_to_gpu, am I right?
I will try both (cache_dirty only vs cache_dirty & !cache_coherent)
and get back to you shortly.

On Wed, Apr 19, 2017 at 07:08:33PM +0100, Chris Wilson wrote:
> On Wed, Apr 19, 2017 at 09:52:28AM -0700, Dongwon Kim wrote:
> > I tried your patch but it didn't fix the original 
> > problem. I think it is somehow related to the flushing condition
> > here:
> > 
> > @@ -1129,10 +1129,8 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
> >  	if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
> >  		continue;
> > 
> > 	if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
> > +	if (obj->base.write_domain & obj->cache_dirty)
> >  		i915_gem_clflush_object(obj, 0);
> > -		obj->base.write_domain = 0;
> > -	}
> > 
> > here, we do clflush only if write_domain is not 0 even if cache_dirty
> > flag is set after your patch is applied.
> 
> This can be just reduced to if (obj->cache_dirty) clflush().
> 
> We're slightly better in that we don't set obj->cache_dirty so often for
> normal gpu rendering (just on transitions away from the gpu now), but it
> still means we will be redundantly checking for clflushes prior to
> rendering.
> 
> Can you double check that this patch + if (obj->cache_dirty) works for you?
> 
> What I guess I really want here is
> 	if (obj->cache_dirty & !obj->cache_coherent)
> essentially inlining the check from clflush.
> -Chris
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre
Chris Wilson April 19, 2017, 6:26 p.m. UTC | #6
On Wed, Apr 19, 2017 at 11:13:17AM -0700, Dongwon Kim wrote:
> Chris,
> 
> Just to make sure, you want to just remove write_domain check from 
> if statement before clflush in execbuffer_move_to_gpu. Am I right?
> I will try both (cache_dirty only vs cache_dirty & !cache_coherent)
> and get back to you shortly. 

Yes, I just don't have a single bit for cache_coherent yet, so you
might as well let that call i915_gem_object_clflush().
-Chris
Kim, Dongwon April 19, 2017, 8:30 p.m. UTC | #7
Chris,

I think my assumption was not correct. I took out the write_domain
check but it is still failing. However, here are a couple of
observations from the experiments I ran. One of them was to take out
even the cache_dirty check from eb_move_to_gpu. With this, all sample
tests passed, but as you might expect they ran very slowly, which
shows how much the clflushes cost.

Then I put the cache_dirty check back into eb_move_to_gpu and instead
removed 'if (gpu_write_needs_clflush(obj))' from flush_write_domain
for the I915_GEM_DOMAIN_RENDER case:

@@ -753,6 +766,11 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
+
+       case I915_GEM_DOMAIN_RENDER:
+               if (gpu_write_needs_clflush(obj)) <-- took out this line
+                       obj->cache_dirty = true;
+               break;

so that cache_dirty is set unconditionally whenever write_domain is
I915_GEM_DOMAIN_RENDER. With that, some of the failing tests now pass,
but it does not fix all of them.

I will try reverting the other changes in your patch as well.
Please let me know if there is anything else you want me to try
to narrow this down.

On Wed, Apr 19, 2017 at 07:26:29PM +0100, Chris Wilson wrote:
> On Wed, Apr 19, 2017 at 11:13:17AM -0700, Dongwon Kim wrote:
> > Chris,
> > 
> > Just to make sure, you want to just remove write_domain check from 
> > if statement before clflush in execbuffer_move_to_gpu. Am I right?
> > I will try both (cache_dirty only vs cache_dirty & !cache_coherent)
> > and get back to you shortly. 
> 
> Yes, I just don't have a single bit for cache_coherent yet, so you
> might as well let that call i915_gem_object_clflush().
> -Chris
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre
Kim, Dongwon April 19, 2017, 8:49 p.m. UTC | #8
Chris,

I am sorry that I didn't tell you which GPU I am working on. It is
GEN9LP; our target is APL-I, so no LLC is available.

On Wed, Apr 19, 2017 at 07:26:29PM +0100, Chris Wilson wrote:
> On Wed, Apr 19, 2017 at 11:13:17AM -0700, Dongwon Kim wrote:
> > Chris,
> > 
> > Just to make sure, you want to just remove write_domain check from 
> > if statement before clflush in execbuffer_move_to_gpu. Am I right?
> > I will try both (cache_dirty only vs cache_dirty & !cache_coherent)
> > and get back to you shortly. 
> 
> Yes, I just don't have a single bit for cache_coherent yet, so you
> might as well let that call i915_gem_object_clflush().
> -Chris
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 33fb11cc5acc..488ca7733c1e 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -49,7 +49,7 @@  static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
 
 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
 {
-	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
+	if (obj->cache_dirty)
 		return false;
 
 	if (!i915_gem_object_is_coherent(obj))
@@ -233,6 +233,14 @@  i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
 	return st;
 }
 
+static void __start_cpu_write(struct drm_i915_gem_object *obj)
+{
+	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+	if (cpu_write_needs_clflush(obj))
+		obj->cache_dirty = true;
+}
+
 static void
 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
 				struct sg_table *pages,
@@ -248,8 +256,7 @@  __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
 	    !i915_gem_object_is_coherent(obj))
 		drm_clflush_sg(pages);
 
-	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
-	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+	__start_cpu_write(obj);
 }
 
 static void
@@ -684,6 +691,12 @@  i915_gem_dumb_create(struct drm_file *file,
 			       args->size, &args->handle);
 }
 
+static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
+{
+	return !(obj->cache_level == I915_CACHE_NONE ||
+		 obj->cache_level == I915_CACHE_WT);
+}
+
 /**
  * Creates a new mm object and returns a handle to it.
  * @dev: drm device pointer
@@ -753,6 +766,11 @@  flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
 	case I915_GEM_DOMAIN_CPU:
 		i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
 		break;
+
+	case I915_GEM_DOMAIN_RENDER:
+		if (gpu_write_needs_clflush(obj))
+			obj->cache_dirty = true;
+		break;
 	}
 
 	obj->base.write_domain = 0;
@@ -854,7 +872,8 @@  int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
 	 * optimizes for the case when the gpu will dirty the data
 	 * anyway again before the next pread happens.
 	 */
-	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
+	if (!obj->cache_dirty &&
+	    !(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
 		*needs_clflush = CLFLUSH_BEFORE;
 
 out:
@@ -906,14 +925,15 @@  int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
 	 * This optimizes for the case when the gpu will use the data
 	 * right away and we therefore have to clflush anyway.
 	 */
-	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU)
+	if (!obj->cache_dirty) {
 		*needs_clflush |= CLFLUSH_AFTER;
 
-	/* Same trick applies to invalidate partially written cachelines read
-	 * before writing.
-	 */
-	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
-		*needs_clflush |= CLFLUSH_BEFORE;
+		/* Same trick applies to invalidate partially written
+		 * cachelines read before writing.
+		 */
+		if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
+			*needs_clflush |= CLFLUSH_BEFORE;
+	}
 
 out:
 	intel_fb_obj_invalidate(obj, ORIGIN_CPU);
@@ -3374,10 +3394,12 @@  int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
 
 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
 {
-	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU && !obj->cache_dirty)
-		return;
-
-	i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
+	/* We manually flush the CPU domain so that we can override and
+	 * force the flush for the display, and perform it asynchronously.
+	 */
+	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
+	if (obj->cache_dirty)
+		i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
 	obj->base.write_domain = 0;
 }
 
@@ -3636,14 +3658,17 @@  int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 		}
 	}
 
-	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU &&
-	    i915_gem_object_is_coherent(obj))
-		obj->cache_dirty = true;
+	/* Catch any deferred obj->cache_dirty markups */
+	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 
 	list_for_each_entry(vma, &obj->vma_list, obj_link)
 		vma->node.color = cache_level;
 	obj->cache_level = cache_level;
 
+	if (obj->base.write_domain & I915_GEM_DOMAIN_CPU &&
+	    cpu_write_needs_clflush(obj))
+		obj->cache_dirty = true;
+
 	return 0;
 }
 
@@ -3864,9 +3889,6 @@  i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 	if (ret)
 		return ret;
 
-	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
-		return 0;
-
 	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 
 	/* Flush the CPU cache if it's still invalid. */
@@ -3878,15 +3900,13 @@  i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 	/* It should now be out of any other write domains, and we can update
 	 * the domain values for our changes.
 	 */
-	GEM_BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) != 0);
+	GEM_BUG_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU);
 
 	/* If we're writing through the CPU, then the GPU read domains will
 	 * need to be invalidated at next use.
 	 */
-	if (write) {
-		obj->base.read_domains = I915_GEM_DOMAIN_CPU;
-		obj->base.write_domain = I915_GEM_DOMAIN_CPU;
-	}
+	if (write)
+		__start_cpu_write(obj);
 
 	return 0;
 }
@@ -4306,6 +4326,8 @@  i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
 	} else
 		obj->cache_level = I915_CACHE_NONE;
 
+	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
+
 	trace_i915_gem_object_create(obj);
 
 	return obj;
@@ -4968,10 +4990,8 @@  int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
 
 	mutex_lock(&dev_priv->drm.struct_mutex);
 	for (p = phases; *p; p++) {
-		list_for_each_entry(obj, *p, global_link) {
-			obj->base.read_domains = I915_GEM_DOMAIN_CPU;
-			obj->base.write_domain = I915_GEM_DOMAIN_CPU;
-		}
+		list_for_each_entry(obj, *p, global_link)
+			__start_cpu_write(obj);
 	}
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
index ffd01e02fe94..a895643c4dc4 100644
--- a/drivers/gpu/drm/i915/i915_gem_clflush.c
+++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
@@ -72,8 +72,6 @@  static const struct dma_fence_ops i915_clflush_ops = {
 static void __i915_do_clflush(struct drm_i915_gem_object *obj)
 {
 	drm_clflush_sg(obj->mm.pages);
-	obj->cache_dirty = false;
-
 	intel_fb_obj_flush(obj, ORIGIN_CPU);
 }
 
@@ -82,9 +80,6 @@  static void i915_clflush_work(struct work_struct *work)
 	struct clflush *clflush = container_of(work, typeof(*clflush), work);
 	struct drm_i915_gem_object *obj = clflush->obj;
 
-	if (!obj->cache_dirty)
-		goto out;
-
 	if (i915_gem_object_pin_pages(obj)) {
 		DRM_ERROR("Failed to acquire obj->pages for clflushing\n");
 		goto out;
@@ -132,10 +127,10 @@  void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
 	 * anything not backed by physical memory we consider to be always
 	 * coherent and not need clflushing.
 	 */
-	if (!i915_gem_object_has_struct_page(obj))
+	if (!i915_gem_object_has_struct_page(obj)) {
+		obj->cache_dirty = false;
 		return;
-
-	obj->cache_dirty = true;
+	}
 
 	/* If the GPU is snooping the contents of the CPU cache,
 	 * we do not need to manually clear the CPU cache lines.  However,
@@ -154,6 +149,8 @@  void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
 	if (!(flags & I915_CLFLUSH_SYNC))
 		clflush = kmalloc(sizeof(*clflush), GFP_KERNEL);
 	if (clflush) {
+		GEM_BUG_ON(!obj->cache_dirty);
+
 		dma_fence_init(&clflush->dma,
 			       &i915_clflush_ops,
 			       &clflush_lock,
@@ -181,6 +178,8 @@  void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
 	} else {
 		GEM_BUG_ON(obj->base.write_domain != I915_GEM_DOMAIN_CPU);
 	}
+
+	obj->cache_dirty = false;
 }
 
 void i915_gem_clflush_init(struct drm_i915_private *i915)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index af1965774e7b..ddc011ef5480 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -291,7 +291,7 @@  static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
 		return DBG_USE_CPU_RELOC > 0;
 
 	return (HAS_LLC(to_i915(obj->base.dev)) ||
-		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
+		obj->cache_dirty ||
 		obj->cache_level != I915_CACHE_NONE);
 }
 
@@ -1129,10 +1129,8 @@  i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
 			continue;
 
-		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
+		if (obj->base.write_domain & obj->cache_dirty)
 			i915_gem_clflush_object(obj, 0);
-			obj->base.write_domain = 0;
-		}
 
 		ret = i915_gem_request_await_object
 			(req, obj, obj->base.pending_write_domain);
@@ -1265,12 +1263,6 @@  i915_gem_validate_context(struct drm_device *dev, struct drm_file *file,
 	return ctx;
 }
 
-static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
-{
-	return !(obj->cache_level == I915_CACHE_NONE ||
-		 obj->cache_level == I915_CACHE_WT);
-}
-
 void i915_vma_move_to_active(struct i915_vma *vma,
 			     struct drm_i915_gem_request *req,
 			     unsigned int flags)
@@ -1294,15 +1286,16 @@  void i915_vma_move_to_active(struct i915_vma *vma,
 	i915_gem_active_set(&vma->last_read[idx], req);
 	list_move_tail(&vma->vm_link, &vma->vm->active_list);
 
+	obj->base.write_domain = 0;
 	if (flags & EXEC_OBJECT_WRITE) {
+		obj->base.write_domain = I915_GEM_DOMAIN_RENDER;
+
 		if (intel_fb_obj_invalidate(obj, ORIGIN_CS))
 			i915_gem_active_set(&obj->frontbuffer_write, req);
 
-		/* update for the implicit flush after a batch */
-		obj->base.write_domain &= ~I915_GEM_GPU_DOMAINS;
-		if (!obj->cache_dirty && gpu_write_needs_clflush(obj))
-			obj->cache_dirty = true;
+		obj->base.read_domains = 0;
 	}
+	obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
 
 	if (flags & EXEC_OBJECT_NEEDS_FENCE)
 		i915_gem_active_set(&vma->last_fence, req);
diff --git a/drivers/gpu/drm/i915/i915_gem_internal.c b/drivers/gpu/drm/i915/i915_gem_internal.c
index fc950abbe400..58e93e87d573 100644
--- a/drivers/gpu/drm/i915/i915_gem_internal.c
+++ b/drivers/gpu/drm/i915/i915_gem_internal.c
@@ -188,9 +188,10 @@  i915_gem_object_create_internal(struct drm_i915_private *i915,
 	drm_gem_private_object_init(&i915->drm, &obj->base, size);
 	i915_gem_object_init(obj, &i915_gem_object_internal_ops);
 
-	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
+	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
 
 	return obj;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 58ccf8b8ca1c..9f84be171ad2 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -802,9 +802,10 @@  i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file
 
 	drm_gem_private_object_init(dev, &obj->base, args->user_size);
 	i915_gem_object_init(obj, &i915_gem_userptr_ops);
-	obj->cache_level = I915_CACHE_LLC;
-	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+	obj->cache_level = I915_CACHE_LLC;
+	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
 
 	obj->userptr.ptr = args->user_ptr;
 	obj->userptr.read_only = !!(args->flags & I915_USERPTR_READ_ONLY);
diff --git a/drivers/gpu/drm/i915/selftests/huge_gem_object.c b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
index 4e681fc13be4..0ca867a877b6 100644
--- a/drivers/gpu/drm/i915/selftests/huge_gem_object.c
+++ b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
@@ -126,9 +126,10 @@  huge_gem_object(struct drm_i915_private *i915,
 	drm_gem_private_object_init(&i915->drm, &obj->base, dma_size);
 	i915_gem_object_init(obj, &huge_ops);
 
-	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
+	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
 	obj->scratch = phys_size;
 
 	return obj;