Message ID | 20170419094143.16922-3-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
I tried your patch but it didn't fix the original
problem. I think it is somehow related to the flushing condition
here:

@@ -1129,10 +1129,8 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
 			continue;

 		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
+		if (obj->base.write_domain & obj->cache_dirty)
 			i915_gem_clflush_object(obj, 0);
-			obj->base.write_domain = 0;
-		}

here, we do clflush only if write_domain is not 0 even if cache_dirty
flag is set after your patch is applied.

And now please look at this:

@@ -753,6 +766,11 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
 	case I915_GEM_DOMAIN_CPU:
 		i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
 		break;
+
+	case I915_GEM_DOMAIN_RENDER:
+		if (gpu_write_needs_clflush(obj))
+			obj->cache_dirty = true;
+		break;
 	}

 	obj->base.write_domain=0;

So here, if the write_domain is I915_GEM_DOMAIN_RENDER, we set cache_dirty to true
then reset write_domain.

So right after this flush_write_domain call, write_domain will be 0 but cache is
still dirty. I am wondering if this is where that condition (write_domain==0 and
cache_dirty==1) originally came from.

On Wed, Apr 19, 2017 at 10:41:18AM +0100, Chris Wilson wrote:
> Currently, we only mark the CPU cache as dirty if we skip a clflush.
> This leads to some confusion where we have to ask if the object is in
> the write domain or missed a clflush. If we always mark the cache as
> dirty, this becomes a much simply question to answer.
>
> The goal remains to do as few clflushes as required and to do them as
> late as possible, in the hope of deferring the work to a kthread and not
> block the caller (e.g. execbuf, flips).
>
> Reported-by: Dongwon Kim <dongwon.kim@intel.com>
> Fixes: a6a7cc4b7db6 ("drm/i915: Always flush the dirty CPU cache when pinning the scanout")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dongwon Kim <dongwon.kim@intel.com>
> Cc: Matt Roper <matthew.d.roper@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c                  | 78 +++++++++++++++---------
>  drivers/gpu/drm/i915/i915_gem_clflush.c          | 15 +++--
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c       | 21 +++----
>  drivers/gpu/drm/i915/i915_gem_internal.c         |  3 +-
>  drivers/gpu/drm/i915/i915_gem_userptr.c          |  5 +-
>  drivers/gpu/drm/i915/selftests/huge_gem_object.c |  3 +-
>  6 files changed, 70 insertions(+), 55 deletions(-)
>
> [snip]
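To restate the sequence described in the message above as a short trace (names are taken from the patch under discussion; this is an illustration, not code from the series):

	/* After a GPU write is processed by flush_write_domain():
	 *
	 *     case I915_GEM_DOMAIN_RENDER:
	 *             if (gpu_write_needs_clflush(obj))
	 *                     obj->cache_dirty = true;        // cache_dirty == 1
	 *             break;
	 *     ...
	 *     obj->base.write_domain = 0;                      // write_domain == 0
	 *
	 * The next execbuf then evaluates
	 *
	 *     if (obj->base.write_domain & obj->cache_dirty)   // 0 & 1 == 0
	 *             i915_gem_clflush_object(obj, 0);         // never reached
	 *
	 * so the dirty cachelines are never flushed before the GPU reads them.
	 */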
On Wed, Apr 19, 2017 at 09:52:28AM -0700, Dongwon Kim wrote:
> I tried your patch but it didn't fix the original
> problem. I think it is somehow related to the flushing condition
> here:
>
> @@ -1129,10 +1129,8 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
>  		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
>  			continue;
>
>  		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
> +		if (obj->base.write_domain & obj->cache_dirty)
>  			i915_gem_clflush_object(obj, 0);
> -			obj->base.write_domain = 0;
> -		}
>
> here, we do clflush only if write_domain is not 0 even if cache_dirty
> flag is set after your patch is applied.
>
> And now please look at this:
>
> @@ -753,6 +766,11 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
>  	case I915_GEM_DOMAIN_CPU:
>  		i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
>  		break;
> +
> +	case I915_GEM_DOMAIN_RENDER:
> +		if (gpu_write_needs_clflush(obj))
> +			obj->cache_dirty = true;
> +		break;
>  	}
>
>  	obj->base.write_domain=0;
>
> So here, if the write_domain is I915_GEM_DOMAIN_RENDER, we set cache_dirty to true
> then reset write_domain.
>
> So right after this flush_write_domain call, write_domain will be 0 but cache is
> still dirty. I am wondering if this is where that condition (write_domain==0 and
> cache_dirty==1) originally came from.

And we definitely do not want to be flushing the cache for the GPU after
a GPU write. We only want that cache to be flushed after the GPU write if
the object is moved to a non-coherent domain. That's the challenge. I was
also expecting that (incoherent) reads from the GPU would go through the
cache - we make the same assumption for CPU reads.

Thanks for testing, definitely back to the drawing board.

Hmm, might be worth taking the alternative approach and always scheduling
an async clflush in set-cache-domain. Just have to check that we have
appropriate waits and don't have any inappropriate ones.
-Chris
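A rough sketch of the alternative approach floated above, assuming the set-cache-level path is a suitable hook and relying on i915_gem_clflush_object() in this series queuing an asynchronous, fenced flush when I915_CLFLUSH_SYNC is not passed; the placement and condition are assumptions, not part of the posted patch:

	/* Hypothetical: in i915_gem_object_set_cache_level(), after the new
	 * cache_level is committed, schedule the flush now instead of
	 * deferring it to execbuf or a flip. */
	if (obj->cache_dirty && !i915_gem_object_is_coherent(obj))
		i915_gem_clflush_object(obj, 0);	/* async unless SYNC is requested */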
On Wed, Apr 19, 2017 at 09:52:28AM -0700, Dongwon Kim wrote:
> I tried your patch but it didn't fix the original
> problem. I think it is somehow related to the flushing condition
> here:

I don't think I actually checked what GPU you observed it on - I was
assuming llc, since that was the last bug we had.
-Chris
On Wed, Apr 19, 2017 at 09:52:28AM -0700, Dongwon Kim wrote:
> I tried your patch but it didn't fix the original
> problem. I think it is somehow related to the flushing condition
> here:
>
> @@ -1129,10 +1129,8 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
>  		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
>  			continue;
>
>  		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
> +		if (obj->base.write_domain & obj->cache_dirty)
>  			i915_gem_clflush_object(obj, 0);
> -			obj->base.write_domain = 0;
> -		}
>
> here, we do clflush only if write_domain is not 0 even if cache_dirty
> flag is set after your patch is applied.

This can be just reduced to if (obj->cache_dirty) clflush().

We're slightly better in that we don't set obj->cache_dirty so often for
normal gpu rendering (just on transitions away from the gpu now), but it
still means we will be redundantly checking for clflushes prior to
rendering.

Can you double check that this patch + if (obj->cache_dirty) works for you?

What I guess I really want here is
	if (obj->cache_dirty & !obj->cache_coherent)
essentially inlining the check from clflush.
-Chris
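For concreteness, the two variants suggested above would make the check in i915_gem_execbuffer_move_to_gpu() look roughly as follows; obj->cache_coherent does not exist at this point in the series, so the second form is hypothetical:

		/* Variant 1: flush whenever the CPU cache may be dirty;
		 * i915_gem_clflush_object() itself still skips objects that
		 * do not need the flush. */
		if (obj->cache_dirty)
			i915_gem_clflush_object(obj, 0);

		/* Variant 2: also inline the coherency check, avoiding the
		 * call entirely for snooped/LLC objects (assumes a future
		 * obj->cache_coherent bit). */
		if (obj->cache_dirty && !obj->cache_coherent)
			i915_gem_clflush_object(obj, 0);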
Chris,

Just to make sure, you want to just remove write_domain check from
if statement before clflush in execbuffer_move_to_gpu. Am I right?
I will try both (cache_dirty only vs cache_dirty & !cache_coherent)
and get back to you shortly.

On Wed, Apr 19, 2017 at 07:08:33PM +0100, Chris Wilson wrote:
> On Wed, Apr 19, 2017 at 09:52:28AM -0700, Dongwon Kim wrote:
> > I tried your patch but it didn't fix the original
> > problem. I think it is somehow related to the flushing condition
> > here:
> >
> > @@ -1129,10 +1129,8 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
> >  		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
> >  			continue;
> >
> >  		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
> > +		if (obj->base.write_domain & obj->cache_dirty)
> >  			i915_gem_clflush_object(obj, 0);
> > -			obj->base.write_domain = 0;
> > -		}
> >
> > here, we do clflush only if write_domain is not 0 even if cache_dirty
> > flag is set after your patch is applied.
>
> This can be just reduced to if (obj->cache_dirty) clflush().
>
> We're slightly better in that we don't set obj->cache_dirty so often for
> normal gpu rendering (just on transitions away from the gpu now), but it
> still means we will be redundantly checking for clflushes prior to
> rendering.
>
> Can you double check that this patch + if (obj->cache_dirty) works for you?
>
> What I guess I really want here is
> 	if (obj->cache_dirty & !obj->cache_coherent)
> essentially inlining the check from clflush.
> -Chris
>
> --
> Chris Wilson, Intel Open Source Technology Centre
On Wed, Apr 19, 2017 at 11:13:17AM -0700, Dongwon Kim wrote:
> Chris,
>
> Just to make sure, you want to just remove write_domain check from
> if statement before clflush in execbuffer_move_to_gpu. Am I right?
> I will try both (cache_dirty only vs cache_dirty & !cache_coherent)
> and get back to you shortly.

Yes, I just don't have a single bit for cache_coherent yet, so you
might as well let that call i915_gem_object_clflush().
-Chris
Chris,

I think my assumption was not correct. I took out the write_domain check
but it is still failing. However, here are a couple of observations.

I did some experiments. One of them was to take out even the cache_dirty
check from eb_move_to_gpu. With this, all sample tests were passing, but
as you might expect the tests ran very slowly, which shows how much the
clflushes cost.

Then I put the cache_dirty check back into eb_move_to_gpu and removed
'if (gpu_write_needs_clflush(obj))' from flush_write_domain when the
write_domain is I915_GEM_DOMAIN_RENDER:

@@ -753,6 +766,11 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
+
+	case I915_GEM_DOMAIN_RENDER:
+		if (gpu_write_needs_clflush(obj))	<-- took out this line
+			obj->cache_dirty = true;
+		break;

so that cache_dirty is set all the time if the write_domain is
I915_GEM_DOMAIN_RENDER. And I saw some of the failing tests were now
passing, but this doesn't fix all of them.

I will try to revert back other changes in your patch as well. Please let
me know if there's any other thing you want me to try to find a clue.

On Wed, Apr 19, 2017 at 07:26:29PM +0100, Chris Wilson wrote:
> On Wed, Apr 19, 2017 at 11:13:17AM -0700, Dongwon Kim wrote:
> > Chris,
> >
> > Just to make sure, you want to just remove write_domain check from
> > if statement before clflush in execbuffer_move_to_gpu. Am I right?
> > I will try both (cache_dirty only vs cache_dirty & !cache_coherent)
> > and get back to you shortly.
>
> Yes, I just don't have a single bit for cache_coherent yet, so you
> might as well let that call i915_gem_object_clflush().
> -Chris
>
> --
> Chris Wilson, Intel Open Source Technology Centre
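In other words, the second experiment described above effectively turns the new I915_GEM_DOMAIN_RENDER case in flush_write_domain() into an unconditional mark; a sketch of the modified case, shown only to restate that change:

	case I915_GEM_DOMAIN_RENDER:
		/* Experiment: always mark the CPU cache dirty after a GPU
		 * write, even for I915_CACHE_NONE / I915_CACHE_WT objects. */
		obj->cache_dirty = true;
		break;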
Chris,

I am sorry that I didn't tell you about the GPU that I am working on.
It is GEN9LP. Our target is APL-I, so no LLC is available.

On Wed, Apr 19, 2017 at 07:26:29PM +0100, Chris Wilson wrote:
> On Wed, Apr 19, 2017 at 11:13:17AM -0700, Dongwon Kim wrote:
> > Chris,
> >
> > Just to make sure, you want to just remove write_domain check from
> > if statement before clflush in execbuffer_move_to_gpu. Am I right?
> > I will try both (cache_dirty only vs cache_dirty & !cache_coherent)
> > and get back to you shortly.
>
> Yes, I just don't have a single bit for cache_coherent yet, so you
> might as well let that call i915_gem_object_clflush().
> -Chris
>
> --
> Chris Wilson, Intel Open Source Technology Centre
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 33fb11cc5acc..488ca7733c1e 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -49,7 +49,7 @@ static void i915_gem_flush_free_objects(struct drm_i915_private *i915);

 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
 {
-	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
+	if (obj->cache_dirty)
 		return false;

 	if (!i915_gem_object_is_coherent(obj))
@@ -233,6 +233,14 @@ i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
 	return st;
 }

+static void __start_cpu_write(struct drm_i915_gem_object *obj)
+{
+	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+	if (cpu_write_needs_clflush(obj))
+		obj->cache_dirty = true;
+}
+
 static void
 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
 				struct sg_table *pages,
@@ -248,8 +256,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
 	    !i915_gem_object_is_coherent(obj))
 		drm_clflush_sg(pages);

-	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
-	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+	__start_cpu_write(obj);
 }

 static void
@@ -684,6 +691,12 @@ i915_gem_dumb_create(struct drm_file *file,
 			       args->size, &args->handle);
 }

+static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
+{
+	return !(obj->cache_level == I915_CACHE_NONE ||
+		 obj->cache_level == I915_CACHE_WT);
+}
+
 /**
  * Creates a new mm object and returns a handle to it.
  * @dev: drm device pointer
@@ -753,6 +766,11 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
 	case I915_GEM_DOMAIN_CPU:
 		i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
 		break;
+
+	case I915_GEM_DOMAIN_RENDER:
+		if (gpu_write_needs_clflush(obj))
+			obj->cache_dirty = true;
+		break;
 	}

 	obj->base.write_domain = 0;
@@ -854,7 +872,8 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
 	 * optimizes for the case when the gpu will dirty the data
 	 * anyway again before the next pread happens.
 	 */
-	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
+	if (!obj->cache_dirty &&
+	    !(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
 		*needs_clflush = CLFLUSH_BEFORE;

 out:
@@ -906,14 +925,15 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
 	 * This optimizes for the case when the gpu will use the data
 	 * right away and we therefore have to clflush anyway.
 	 */
-	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU)
+	if (!obj->cache_dirty) {
 		*needs_clflush |= CLFLUSH_AFTER;

-	/* Same trick applies to invalidate partially written cachelines read
-	 * before writing.
-	 */
-	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
-		*needs_clflush |= CLFLUSH_BEFORE;
+		/* Same trick applies to invalidate partially written
+		 * cachelines read before writing.
+		 */
+		if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
+			*needs_clflush |= CLFLUSH_BEFORE;
+	}

 out:
 	intel_fb_obj_invalidate(obj, ORIGIN_CPU);
@@ -3374,10 +3394,12 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)

 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
 {
-	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU && !obj->cache_dirty)
-		return;
-
-	i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
+	/* We manually flush the CPU domain so that we can override and
+	 * force the flush for the display, and perform it asyncrhonously.
+	 */
+	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
+	if (obj->cache_dirty)
+		i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
 	obj->base.write_domain = 0;
 }

@@ -3636,14 +3658,17 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 		}
 	}

-	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU &&
-	    i915_gem_object_is_coherent(obj))
-		obj->cache_dirty = true;
+	/* Catch any deferred obj->cache_dirty markups */
+	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);

 	list_for_each_entry(vma, &obj->vma_list, obj_link)
 		vma->node.color = cache_level;
 	obj->cache_level = cache_level;

+	if (obj->base.write_domain & I915_GEM_DOMAIN_CPU &&
+	    cpu_write_needs_clflush(obj))
+		obj->cache_dirty = true;
+
 	return 0;
 }

@@ -3864,9 +3889,6 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 	if (ret)
 		return ret;

-	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
-		return 0;
-
 	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);

 	/* Flush the CPU cache if it's still invalid. */
@@ -3878,15 +3900,13 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 	/* It should now be out of any other write domains, and we can update
 	 * the domain values for our changes.
 	 */
-	GEM_BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) != 0);
+	GEM_BUG_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU);

 	/* If we're writing through the CPU, then the GPU read domains will
 	 * need to be invalidated at next use.
 	 */
-	if (write) {
-		obj->base.read_domains = I915_GEM_DOMAIN_CPU;
-		obj->base.write_domain = I915_GEM_DOMAIN_CPU;
-	}
+	if (write)
+		__start_cpu_write(obj);

 	return 0;
 }
@@ -4306,6 +4326,8 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
 	} else
 		obj->cache_level = I915_CACHE_NONE;

+	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
+
 	trace_i915_gem_object_create(obj);

 	return obj;
@@ -4968,10 +4990,8 @@ int i915_gem_freeze_late(struct drm_i915_private *dev_priv)

 	mutex_lock(&dev_priv->drm.struct_mutex);
 	for (p = phases; *p; p++) {
-		list_for_each_entry(obj, *p, global_link) {
-			obj->base.read_domains = I915_GEM_DOMAIN_CPU;
-			obj->base.write_domain = I915_GEM_DOMAIN_CPU;
-		}
+		list_for_each_entry(obj, *p, global_link)
+			__start_cpu_write(obj);
 	}
 	mutex_unlock(&dev_priv->drm.struct_mutex);

diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
index ffd01e02fe94..a895643c4dc4 100644
--- a/drivers/gpu/drm/i915/i915_gem_clflush.c
+++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
@@ -72,8 +72,6 @@ static const struct dma_fence_ops i915_clflush_ops = {
 static void __i915_do_clflush(struct drm_i915_gem_object *obj)
 {
 	drm_clflush_sg(obj->mm.pages);
-	obj->cache_dirty = false;
-
 	intel_fb_obj_flush(obj, ORIGIN_CPU);
 }

@@ -82,9 +80,6 @@ static void i915_clflush_work(struct work_struct *work)
 	struct clflush *clflush = container_of(work, typeof(*clflush), work);
 	struct drm_i915_gem_object *obj = clflush->obj;

-	if (!obj->cache_dirty)
-		goto out;
-
 	if (i915_gem_object_pin_pages(obj)) {
 		DRM_ERROR("Failed to acquire obj->pages for clflushing\n");
 		goto out;
@@ -132,10 +127,10 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
 	 * anything not backed by physical memory we consider to be always
 	 * coherent and not need clflushing.
 	 */
-	if (!i915_gem_object_has_struct_page(obj))
+	if (!i915_gem_object_has_struct_page(obj)) {
+		obj->cache_dirty = false;
 		return;
-
-	obj->cache_dirty = true;
+	}

 	/* If the GPU is snooping the contents of the CPU cache,
 	 * we do not need to manually clear the CPU cache lines. However,
@@ -154,6 +149,8 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
 	if (!(flags & I915_CLFLUSH_SYNC))
 		clflush = kmalloc(sizeof(*clflush), GFP_KERNEL);
 	if (clflush) {
+		GEM_BUG_ON(!obj->cache_dirty);
+
 		dma_fence_init(&clflush->dma,
 			       &i915_clflush_ops,
 			       &clflush_lock,
@@ -181,6 +178,8 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
 	} else {
 		GEM_BUG_ON(obj->base.write_domain != I915_GEM_DOMAIN_CPU);
 	}
+
+	obj->cache_dirty = false;
 }

 void i915_gem_clflush_init(struct drm_i915_private *i915)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index af1965774e7b..ddc011ef5480 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -291,7 +291,7 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
 		return DBG_USE_CPU_RELOC > 0;

 	return (HAS_LLC(to_i915(obj->base.dev)) ||
-		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
+		obj->cache_dirty ||
 		obj->cache_level != I915_CACHE_NONE);
 }

@@ -1129,10 +1129,8 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
 			continue;

-		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
+		if (obj->base.write_domain & obj->cache_dirty)
 			i915_gem_clflush_object(obj, 0);
-			obj->base.write_domain = 0;
-		}

 		ret = i915_gem_request_await_object
 			(req, obj, obj->base.pending_write_domain);
@@ -1265,12 +1263,6 @@ i915_gem_validate_context(struct drm_device *dev, struct drm_file *file,
 	return ctx;
 }

-static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
-{
-	return !(obj->cache_level == I915_CACHE_NONE ||
-		 obj->cache_level == I915_CACHE_WT);
-}
-
 void i915_vma_move_to_active(struct i915_vma *vma,
			     struct drm_i915_gem_request *req,
			     unsigned int flags)
@@ -1294,15 +1286,16 @@ void i915_vma_move_to_active(struct i915_vma *vma,
 	i915_gem_active_set(&vma->last_read[idx], req);
 	list_move_tail(&vma->vm_link, &vma->vm->active_list);

+	obj->base.write_domain = 0;
 	if (flags & EXEC_OBJECT_WRITE) {
+		obj->base.write_domain = I915_GEM_DOMAIN_RENDER;
+
 		if (intel_fb_obj_invalidate(obj, ORIGIN_CS))
 			i915_gem_active_set(&obj->frontbuffer_write, req);

-		/* update for the implicit flush after a batch */
-		obj->base.write_domain &= ~I915_GEM_GPU_DOMAINS;
-		if (!obj->cache_dirty && gpu_write_needs_clflush(obj))
-			obj->cache_dirty = true;
+		obj->base.read_domains = 0;
 	}
+	obj->base.read_domains |= I915_GEM_GPU_DOMAINS;

 	if (flags & EXEC_OBJECT_NEEDS_FENCE)
 		i915_gem_active_set(&vma->last_fence, req);
diff --git a/drivers/gpu/drm/i915/i915_gem_internal.c b/drivers/gpu/drm/i915/i915_gem_internal.c
index fc950abbe400..58e93e87d573 100644
--- a/drivers/gpu/drm/i915/i915_gem_internal.c
+++ b/drivers/gpu/drm/i915/i915_gem_internal.c
@@ -188,9 +188,10 @@ i915_gem_object_create_internal(struct drm_i915_private *i915,
 	drm_gem_private_object_init(&i915->drm, &obj->base, size);
 	i915_gem_object_init(obj, &i915_gem_object_internal_ops);

-	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
+	obj->cache_dirty = !i915_gem_object_is_coherent(obj);

 	return obj;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 58ccf8b8ca1c..9f84be171ad2 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -802,9 +802,10 @@ i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file

 	drm_gem_private_object_init(dev, &obj->base, args->user_size);
 	i915_gem_object_init(obj, &i915_gem_userptr_ops);
-	obj->cache_level = I915_CACHE_LLC;
-	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+	obj->cache_level = I915_CACHE_LLC;
+	obj->cache_dirty = !i915_gem_object_is_coherent(obj);

 	obj->userptr.ptr = args->user_ptr;
 	obj->userptr.read_only = !!(args->flags & I915_USERPTR_READ_ONLY);
diff --git a/drivers/gpu/drm/i915/selftests/huge_gem_object.c b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
index 4e681fc13be4..0ca867a877b6 100644
--- a/drivers/gpu/drm/i915/selftests/huge_gem_object.c
+++ b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
@@ -126,9 +126,10 @@ huge_gem_object(struct drm_i915_private *i915,
 	drm_gem_private_object_init(&i915->drm, &obj->base, dma_size);
 	i915_gem_object_init(obj, &huge_ops);

-	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
+	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
 	obj->scratch = phys_size;

 	return obj;
Currently, we only mark the CPU cache as dirty if we skip a clflush.
This leads to some confusion where we have to ask if the object is in
the write domain or missed a clflush. If we always mark the cache as
dirty, this becomes a much simpler question to answer.

The goal remains to do as few clflushes as required and to do them as
late as possible, in the hope of deferring the work to a kthread and not
block the caller (e.g. execbuf, flips).

Reported-by: Dongwon Kim <dongwon.kim@intel.com>
Fixes: a6a7cc4b7db6 ("drm/i915: Always flush the dirty CPU cache when pinning the scanout")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dongwon Kim <dongwon.kim@intel.com>
Cc: Matt Roper <matthew.d.roper@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c                  | 78 +++++++++++++++---------
 drivers/gpu/drm/i915/i915_gem_clflush.c          | 15 +++--
 drivers/gpu/drm/i915/i915_gem_execbuffer.c       | 21 +++----
 drivers/gpu/drm/i915/i915_gem_internal.c         |  3 +-
 drivers/gpu/drm/i915/i915_gem_userptr.c          |  5 +-
 drivers/gpu/drm/i915/selftests/huge_gem_object.c |  3 +-
 6 files changed, 70 insertions(+), 55 deletions(-)