Message ID | 1418526504-26316-5-git-send-email-benjamin.widawsky@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Tested-By: PRC QA PRTS (Patch Regression Test System Contact: shuang.he@intel.com)
-------------------------------------Summary-------------------------------------
Platform Delta drm-intel-nightly Series Applied
PNV 364/364 364/364
ILK +3 362/366 365/366
SNB 448/450 448/450
IVB 497/498 497/498
BYT 289/289 289/289
HSW 563/564 563/564
BDW 417/417 417/417
-------------------------------------Detailed-------------------------------------
Platform Test drm-intel-nightly Series Applied
ILK igt_kms_flip_nonexisting-fb DMESG_WARN(1, M26)PASS(5, M37M26) PASS(1, M37)
ILK igt_kms_flip_rcs-flip-vs-panning-interruptible DMESG_WARN(2, M26)PASS(4, M37M26) PASS(1, M37)
ILK igt_kms_flip_rcs-wf_vblank-vs-dpms-interruptible DMESG_WARN(1, M26)PASS(4, M26M37) PASS(1, M37)
Note: You need to pay more attention to lines starting with '*'
On Sat, Dec 13, 2014 at 07:08:24PM -0800, Ben Widawsky wrote: > If we're moving a bunch of buffers from the CPU domain to the GPU domain, and > we've already blown out the entire cache via a wbinvd, there is nothing more to > do. > > With this and the previous patches, I am seeing a 3x FPS increase on a certain > benchmark which uses a giant 2d array texture. Unless I missed something in the > code, it should only effect non-LLC i915 platforms. > > I haven't yet run any numbers for other benchmarks, nor have I attempted to > check if various conformance tests still pass. > > NOTE: As mentioned in the previous patch, if one can easily obtain the largest > buffer and attempt to flush it first, the results would be even more desirable. So even with that optimization if you only have tons of small buffers that need to be flushed you'd still take the clflush path for every single one. How difficult would it to calculate the total size to be flushed first, and then make the clflush vs. wbinvd decision base on that? 
> > Cc: DRI Development <dri-devel@lists.freedesktop.org> > Signed-off-by: Ben Widawsky <ben@bwidawsk.net> > --- > drivers/gpu/drm/i915/i915_drv.h | 3 ++- > drivers/gpu/drm/i915/i915_gem.c | 12 +++++------- > drivers/gpu/drm/i915/i915_gem_execbuffer.c | 8 +++++--- > drivers/gpu/drm/i915/intel_lrc.c | 8 +++++--- > 4 files changed, 17 insertions(+), 14 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index d68c75f..fdb92a3 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -2642,7 +2642,8 @@ static inline bool i915_stop_ring_allow_warn(struct drm_i915_private *dev_priv) > } > > void i915_gem_reset(struct drm_device *dev); > -bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force); > +enum drm_cache_flush > +i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force); > int __must_check i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj); > int __must_check i915_gem_init(struct drm_device *dev); > int i915_gem_init_rings(struct drm_device *dev); > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index de241eb..3746738 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -3608,7 +3608,7 @@ err_unpin: > return vma; > } > > -bool > +enum drm_cache_flush > i915_gem_clflush_object(struct drm_i915_gem_object *obj, > bool force) > { > @@ -3617,14 +3617,14 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj, > * again at bind time. > */ > if (obj->pages == NULL) > - return false; > + return DRM_CACHE_FLUSH_NONE; > > /* > * Stolen memory is always coherent with the GPU as it is explicitly > * marked as wc by the system, or the system is cache-coherent. > */ > if (obj->stolen || obj->phys_handle) > - return false; > + return DRM_CACHE_FLUSH_NONE; > > /* If the GPU is snooping the contents of the CPU cache, > * we do not need to manually clear the CPU cache lines. 
However, > @@ -3635,12 +3635,10 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj, > * tracking. > */ > if (!force && cpu_cache_is_coherent(obj->base.dev, obj->cache_level)) > - return false; > + return DRM_CACHE_FLUSH_NONE; > > trace_i915_gem_object_clflush(obj); > - drm_clflush_sg(obj->pages); > - > - return true; > + return drm_clflush_sg(obj->pages); > } > > /** Flushes the GTT write domain for the object if it's dirty. */ > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c > index 0c25f62..e8eb9e9 100644 > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c > @@ -827,7 +827,7 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring, > { > struct i915_vma *vma; > uint32_t flush_domains = 0; > - bool flush_chipset = false; > + enum drm_cache_flush flush_chipset = DRM_CACHE_FLUSH_NONE; > int ret; > > list_for_each_entry(vma, vmas, exec_list) { > @@ -836,8 +836,10 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring, > if (ret) > return ret; > > - if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) > - flush_chipset |= i915_gem_clflush_object(obj, false); > + if (obj->base.write_domain & I915_GEM_DOMAIN_CPU && > + flush_chipset != DRM_CACHE_FLUSH_WBINVD) { > + flush_chipset = i915_gem_clflush_object(obj, false); > + } > > flush_domains |= obj->base.write_domain; > } > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c > index 89b5577..a6c6ebd 100644 > --- a/drivers/gpu/drm/i915/intel_lrc.c > +++ b/drivers/gpu/drm/i915/intel_lrc.c > @@ -611,7 +611,7 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf, > struct intel_engine_cs *ring = ringbuf->ring; > struct i915_vma *vma; > uint32_t flush_domains = 0; > - bool flush_chipset = false; > + enum drm_cache_flush flush_chipset = DRM_CACHE_FLUSH_NONE; > int ret; > > list_for_each_entry(vma, vmas, exec_list) { > @@ -621,8 +621,10 @@ static int 
execlists_move_to_gpu(struct intel_ringbuffer *ringbuf, > if (ret) > return ret; > > - if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) > - flush_chipset |= i915_gem_clflush_object(obj, false); > + if (obj->base.write_domain & I915_GEM_DOMAIN_CPU && > + flush_chipset != DRM_CACHE_FLUSH_WBINVD) { > + flush_chipset = i915_gem_clflush_object(obj, false); > + } > > flush_domains |= obj->base.write_domain; > } > -- > 2.1.3 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
On Sun, Dec 14, 2014 at 03:12:21PM +0200, Ville Syrjälä wrote: > On Sat, Dec 13, 2014 at 07:08:24PM -0800, Ben Widawsky wrote: > > If we're moving a bunch of buffers from the CPU domain to the GPU domain, and > > we've already blown out the entire cache via a wbinvd, there is nothing more to > > do. > > > > With this and the previous patches, I am seeing a 3x FPS increase on a certain > > benchmark which uses a giant 2d array texture. Unless I missed something in the > > code, it should only effect non-LLC i915 platforms. > > > > I haven't yet run any numbers for other benchmarks, nor have I attempted to > > check if various conformance tests still pass. > > > > NOTE: As mentioned in the previous patch, if one can easily obtain the largest > > buffer and attempt to flush it first, the results would be even more desirable. > > So even with that optimization if you only have tons of small buffers > that need to be flushed you'd still take the clflush path for every > single one. > > How difficult would it to calculate the total size to be flushed first, > and then make the clflush vs. wbinvd decision base on that? > I'll write the patch and send it to Eero for test. It's not hard, and I think that's a good idea as well. One reason I didn't put such code in this series is that it moves away from a global DRM solution (and like I said in the cover-letter, I am fine with that). Implementing this, I think in the i915 code we'd just iterate through the BOs until we got to a certain threshold, then just call wbinvd() from i915 and not even bother with drm_cache. You could also maybe try to shortcut if there are more than X buffers. However, for what you describe, I think it might make more sense to let userspace specify an execbuf flag to do the wbinvd(). Userspace can trivially determine such info, and it prevents having to iterate through the buffers an extra time in the kernel. I wonder if the clflushing of many small objects is showing up on profiles?
So far, this specific microbenchmark was the only profile I'd seen where the clflushes show up. Thanks. [snip]
On Sun, Dec 14, 2014 at 03:37:36PM -0800, Ben Widawsky wrote: > On Sun, Dec 14, 2014 at 03:12:21PM +0200, Ville Syrjälä wrote: > > On Sat, Dec 13, 2014 at 07:08:24PM -0800, Ben Widawsky wrote: > > > If we're moving a bunch of buffers from the CPU domain to the GPU domain, and > > > we've already blown out the entire cache via a wbinvd, there is nothing more to > > > do. > > > > > > With this and the previous patches, I am seeing a 3x FPS increase on a certain > > > benchmark which uses a giant 2d array texture. Unless I missed something in the > > > code, it should only effect non-LLC i915 platforms. > > > > > > I haven't yet run any numbers for other benchmarks, nor have I attempted to > > > check if various conformance tests still pass. > > > > > > NOTE: As mentioned in the previous patch, if one can easily obtain the largest > > > buffer and attempt to flush it first, the results would be even more desirable. > > > > So even with that optimization if you only have tons of small buffers > > that need to be flushed you'd still take the clflush path for every > > single one. > > > > How difficult would it to calculate the total size to be flushed first, > > and then make the clflush vs. wbinvd decision base on that? > > > > I'll write the patch and send it to Eero for test. > > It's not hard, and I think that's a good idea as well. One reason I didn't put > such code in this series is that moves away from a global DRM solution (and like > I said in the cover-letter, I am fine with that). Implementing this, I think in > the i915 code we'd just iterate through the BOs until we got to a certain > threshold, then just call wbinvd() from i915 and not even both with drm_cache. > You could also maybe try to shorcut if there are more than X buffers. I don't mind an i915 specific solution (we have them already in many places). So will wait for the results of this experiments before merging more patches. 
> However, for what you describe, I think it might make more sense to let > userspace specify an execbuf flag to do the wbinvd(). Userspace can trivially > determine such info, it prevents having to iterate through the buffers an extra > time in the kernel. If we keep the running tally of clflushing in the reservation step and then use that in move_to_gpu we shouldn't need an additional loop. And even if the estimate from the first loop is off a bit (e.g. due to eviction to make space for new buffers) it won't matter - it's just an optimization. Imo asking userspace to do this for the kernel doesn't make much sense since the kernel already keeps track of domains. Cheers, Daniel
On Mon, Dec 15, 2014 at 08:55:32AM +0100, Daniel Vetter wrote: > On Sun, Dec 14, 2014 at 03:37:36PM -0800, Ben Widawsky wrote: > > On Sun, Dec 14, 2014 at 03:12:21PM +0200, Ville Syrjälä wrote: > > > On Sat, Dec 13, 2014 at 07:08:24PM -0800, Ben Widawsky wrote: > > > > If we're moving a bunch of buffers from the CPU domain to the GPU domain, and > > > > we've already blown out the entire cache via a wbinvd, there is nothing more to > > > > do. > > > > > > > > With this and the previous patches, I am seeing a 3x FPS increase on a certain > > > > benchmark which uses a giant 2d array texture. Unless I missed something in the > > > > code, it should only effect non-LLC i915 platforms. > > > > > > > > I haven't yet run any numbers for other benchmarks, nor have I attempted to > > > > check if various conformance tests still pass. > > > > > > > > NOTE: As mentioned in the previous patch, if one can easily obtain the largest > > > > buffer and attempt to flush it first, the results would be even more desirable. > > > > > > So even with that optimization if you only have tons of small buffers > > > that need to be flushed you'd still take the clflush path for every > > > single one. > > > > > > How difficult would it to calculate the total size to be flushed first, > > > and then make the clflush vs. wbinvd decision base on that? > > > > > > > I'll write the patch and send it to Eero for test. > > > > It's not hard, and I think that's a good idea as well. One reason I didn't put > > such code in this series is that moves away from a global DRM solution (and like > > I said in the cover-letter, I am fine with that). Implementing this, I think in > > the i915 code we'd just iterate through the BOs until we got to a certain > > threshold, then just call wbinvd() from i915 and not even both with drm_cache. > > You could also maybe try to shorcut if there are more than X buffers. > > I don't mind an i915 specific solution (we have them already in many > places). 
So will wait for the results of this experiments before merging > more patches. I actually think an i915 specific solution is required, as making drm_clflush_pages autoselect is likely to cause regressions on unaware drivers (e.g. anything that has a reservation loop in execbuf like i915). I do think that execbuf regularly hitting clflush is something to be concerned about in userspace (even a wbinvd every execbuf is not going to be nice, even an mfence per execbuf shows up on profiles!), but we do have to occasionally flush large objects like framebuffers which probably would be faster using wbinvd. -Chris
On Mon, Dec 15, 2014 at 08:20:50AM +0000, Chris Wilson wrote: > On Mon, Dec 15, 2014 at 08:55:32AM +0100, Daniel Vetter wrote: > > On Sun, Dec 14, 2014 at 03:37:36PM -0800, Ben Widawsky wrote: > > > On Sun, Dec 14, 2014 at 03:12:21PM +0200, Ville Syrjälä wrote: > > > > On Sat, Dec 13, 2014 at 07:08:24PM -0800, Ben Widawsky wrote: > > > > > If we're moving a bunch of buffers from the CPU domain to the GPU domain, and > > > > > we've already blown out the entire cache via a wbinvd, there is nothing more to > > > > > do. > > > > > > > > > > With this and the previous patches, I am seeing a 3x FPS increase on a certain > > > > > benchmark which uses a giant 2d array texture. Unless I missed something in the > > > > > code, it should only effect non-LLC i915 platforms. > > > > > > > > > > I haven't yet run any numbers for other benchmarks, nor have I attempted to > > > > > check if various conformance tests still pass. > > > > > > > > > > NOTE: As mentioned in the previous patch, if one can easily obtain the largest > > > > > buffer and attempt to flush it first, the results would be even more desirable. > > > > > > > > So even with that optimization if you only have tons of small buffers > > > > that need to be flushed you'd still take the clflush path for every > > > > single one. > > > > > > > > How difficult would it to calculate the total size to be flushed first, > > > > and then make the clflush vs. wbinvd decision base on that? > > > > > > > > > > I'll write the patch and send it to Eero for test. > > > > > > It's not hard, and I think that's a good idea as well. One reason I didn't put > > > such code in this series is that moves away from a global DRM solution (and like > > > I said in the cover-letter, I am fine with that). Implementing this, I think in > > > the i915 code we'd just iterate through the BOs until we got to a certain > > > threshold, then just call wbinvd() from i915 and not even both with drm_cache. 
> > > You could also maybe try to shorcut if there are more than X buffers. > > > > I don't mind an i915 specific solution (we have them already in many > > places). So will wait for the results of this experiments before merging > > more patches. > > I actually think an i915 specific solution is required, as the making > drm_clflush_pages autoselect is likely to cause regressions on unaware > drivers (e.g. anything that has a reservation loop in execbuf like i915). Assuming the stall is gone as Jesse said in the other thread, I can't envision a scenario where wbinvd would do worse on large objects. > > I do think that execbuf regularly hitting clflush is something to be > concerned about in userspace (even a wbinvd every execbuf is not going > to be nice, even an mfence per execbuf shows up on profiles!), but we do > have to occassionally flush large objects like framebuffers which > probably would be faster using wbinvd. > -Chris > > -- > Chris Wilson, Intel Open Source Technology Centre
On Mon, Dec 15, 2014 at 11:56:05AM -0800, Ben Widawsky wrote: > On Mon, Dec 15, 2014 at 08:20:50AM +0000, Chris Wilson wrote: > > On Mon, Dec 15, 2014 at 08:55:32AM +0100, Daniel Vetter wrote: > > > On Sun, Dec 14, 2014 at 03:37:36PM -0800, Ben Widawsky wrote: > > > > On Sun, Dec 14, 2014 at 03:12:21PM +0200, Ville Syrjälä wrote: > > > > > On Sat, Dec 13, 2014 at 07:08:24PM -0800, Ben Widawsky wrote: > > > > > > If we're moving a bunch of buffers from the CPU domain to the GPU domain, and > > > > > > we've already blown out the entire cache via a wbinvd, there is nothing more to > > > > > > do. > > > > > > > > > > > > With this and the previous patches, I am seeing a 3x FPS increase on a certain > > > > > > benchmark which uses a giant 2d array texture. Unless I missed something in the > > > > > > code, it should only effect non-LLC i915 platforms. > > > > > > > > > > > > I haven't yet run any numbers for other benchmarks, nor have I attempted to > > > > > > check if various conformance tests still pass. > > > > > > > > > > > > NOTE: As mentioned in the previous patch, if one can easily obtain the largest > > > > > > buffer and attempt to flush it first, the results would be even more desirable. > > > > > > > > > > So even with that optimization if you only have tons of small buffers > > > > > that need to be flushed you'd still take the clflush path for every > > > > > single one. > > > > > > > > > > How difficult would it to calculate the total size to be flushed first, > > > > > and then make the clflush vs. wbinvd decision base on that? > > > > > > > > > > > > > I'll write the patch and send it to Eero for test. > > > > > > > > It's not hard, and I think that's a good idea as well. One reason I didn't put > > > > such code in this series is that moves away from a global DRM solution (and like > > > > I said in the cover-letter, I am fine with that). 
Implementing this, I think in > > > > the i915 code we'd just iterate through the BOs until we got to a certain > > > > threshold, then just call wbinvd() from i915 and not even both with drm_cache. > > > > You could also maybe try to shorcut if there are more than X buffers. > > > > > > I don't mind an i915 specific solution (we have them already in many > > > places). So will wait for the results of this experiments before merging > > > more patches. > > > > I actually think an i915 specific solution is required, as the making > > drm_clflush_pages autoselect is likely to cause regressions on unaware > > drivers (e.g. anything that has a reservation loop in execbuf like i915). > > Assuming the stall is gone as Jesse said in the other thread, I can't envision a > scenario where wbinvd would do worse on large objects. It is the multiple wbinvd performed at each execbuffer that is worrisome. -Chris
On Mon, Dec 15, 2014 at 08:39:35PM +0000, Chris Wilson wrote: > On Mon, Dec 15, 2014 at 11:56:05AM -0800, Ben Widawsky wrote: > > On Mon, Dec 15, 2014 at 08:20:50AM +0000, Chris Wilson wrote: > > > On Mon, Dec 15, 2014 at 08:55:32AM +0100, Daniel Vetter wrote: > > > > On Sun, Dec 14, 2014 at 03:37:36PM -0800, Ben Widawsky wrote: > > > > > On Sun, Dec 14, 2014 at 03:12:21PM +0200, Ville Syrjälä wrote: > > > > > > On Sat, Dec 13, 2014 at 07:08:24PM -0800, Ben Widawsky wrote: > > > > > > > If we're moving a bunch of buffers from the CPU domain to the GPU domain, and > > > > > > > we've already blown out the entire cache via a wbinvd, there is nothing more to > > > > > > > do. > > > > > > > > > > > > > > With this and the previous patches, I am seeing a 3x FPS increase on a certain > > > > > > > benchmark which uses a giant 2d array texture. Unless I missed something in the > > > > > > > code, it should only effect non-LLC i915 platforms. > > > > > > > > > > > > > > I haven't yet run any numbers for other benchmarks, nor have I attempted to > > > > > > > check if various conformance tests still pass. > > > > > > > > > > > > > > NOTE: As mentioned in the previous patch, if one can easily obtain the largest > > > > > > > buffer and attempt to flush it first, the results would be even more desirable. > > > > > > > > > > > > So even with that optimization if you only have tons of small buffers > > > > > > that need to be flushed you'd still take the clflush path for every > > > > > > single one. > > > > > > > > > > > > How difficult would it to calculate the total size to be flushed first, > > > > > > and then make the clflush vs. wbinvd decision base on that? > > > > > > > > > > > > > > > > I'll write the patch and send it to Eero for test. > > > > > > > > > > It's not hard, and I think that's a good idea as well. 
One reason I didn't put > > > > > such code in this series is that moves away from a global DRM solution (and like > > > > > I said in the cover-letter, I am fine with that). Implementing this, I think in > > > > > the i915 code we'd just iterate through the BOs until we got to a certain > > > > > threshold, then just call wbinvd() from i915 and not even both with drm_cache. > > > > > You could also maybe try to shorcut if there are more than X buffers. > > > > > > > > I don't mind an i915 specific solution (we have them already in many > > > > places). So will wait for the results of this experiments before merging > > > > more patches. > > > > > > I actually think an i915 specific solution is required, as the making > > > drm_clflush_pages autoselect is likely to cause regressions on unaware > > > drivers (e.g. anything that has a reservation loop in execbuf like i915). > > > > Assuming the stall is gone as Jesse said in the other thread, I can't envision a > > scenario where wbinvd would do worse on large objects. > > It is the multiple wbinvd performed at each execbuffer that is worrisome. > -Chris > This patch attempts to avoid that by dropping all flushing after the first WBINVD.
On Mon, Dec 15, 2014 at 01:06:40PM -0800, Ben Widawsky wrote: > On Mon, Dec 15, 2014 at 08:39:35PM +0000, Chris Wilson wrote: > > On Mon, Dec 15, 2014 at 11:56:05AM -0800, Ben Widawsky wrote: > > > On Mon, Dec 15, 2014 at 08:20:50AM +0000, Chris Wilson wrote: > > > > On Mon, Dec 15, 2014 at 08:55:32AM +0100, Daniel Vetter wrote: > > > > > On Sun, Dec 14, 2014 at 03:37:36PM -0800, Ben Widawsky wrote: > > > > > > It's not hard, and I think that's a good idea as well. One reason I didn't put > > > > > > such code in this series is that moves away from a global DRM solution (and like > > > > > > I said in the cover-letter, I am fine with that). Implementing this, I think in > > > > > > the i915 code we'd just iterate through the BOs until we got to a certain > > > > > > threshold, then just call wbinvd() from i915 and not even both with drm_cache. > > > > > > You could also maybe try to shorcut if there are more than X buffers. > > > > > > > > > > I don't mind an i915 specific solution (we have them already in many > > > > > places). So will wait for the results of this experiments before merging > > > > > more patches. > > > > > > > > I actually think an i915 specific solution is required, as the making > > > > drm_clflush_pages autoselect is likely to cause regressions on unaware > > > > drivers (e.g. anything that has a reservation loop in execbuf like i915). > > > > > > Assuming the stall is gone as Jesse said in the other thread, I can't envision a > > > scenario where wbinvd would do worse on large objects. > > > > It is the multiple wbinvd performed at each execbuffer that is worrisome. > > This patch attempts to avoid that by dropping all flushing after the first > WBINVD. But you can't make the central change to drm_clflush_* without driver specific workarounds like this patch... -Chris
On Tue, Dec 16, 2014 at 07:57:39AM +0000, Chris Wilson wrote: > On Mon, Dec 15, 2014 at 01:06:40PM -0800, Ben Widawsky wrote: > > On Mon, Dec 15, 2014 at 08:39:35PM +0000, Chris Wilson wrote: > > > On Mon, Dec 15, 2014 at 11:56:05AM -0800, Ben Widawsky wrote: > > > > On Mon, Dec 15, 2014 at 08:20:50AM +0000, Chris Wilson wrote: > > > > > On Mon, Dec 15, 2014 at 08:55:32AM +0100, Daniel Vetter wrote: > > > > > > On Sun, Dec 14, 2014 at 03:37:36PM -0800, Ben Widawsky wrote: > > > > > > > It's not hard, and I think that's a good idea as well. One reason I didn't put > > > > > > > such code in this series is that moves away from a global DRM solution (and like > > > > > > > I said in the cover-letter, I am fine with that). Implementing this, I think in > > > > > > > the i915 code we'd just iterate through the BOs until we got to a certain > > > > > > > threshold, then just call wbinvd() from i915 and not even both with drm_cache. > > > > > > > You could also maybe try to shorcut if there are more than X buffers. > > > > > > > > > > > > I don't mind an i915 specific solution (we have them already in many > > > > > > places). So will wait for the results of this experiments before merging > > > > > > more patches. > > > > > > > > > > I actually think an i915 specific solution is required, as the making > > > > > drm_clflush_pages autoselect is likely to cause regressions on unaware > > > > > drivers (e.g. anything that has a reservation loop in execbuf like i915). > > > > > > > > Assuming the stall is gone as Jesse said in the other thread, I can't envision a > > > > scenario where wbinvd would do worse on large objects. > > > > > > It is the multiple wbinvd performed at each execbuffer that is worrisome. > > > > This patch attempts to avoid that by dropping all flushing after the first > > WBINVD. > > But you can't make the central change to drm_clflush_* without driver > specific workarounds like this patch... Good point. I'll do an i915 one when I find the time. 
I was still hoping to get some numbers from Eero on this series.
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index d68c75f..fdb92a3 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2642,7 +2642,8 @@ static inline bool i915_stop_ring_allow_warn(struct drm_i915_private *dev_priv) } void i915_gem_reset(struct drm_device *dev); -bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force); +enum drm_cache_flush +i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force); int __must_check i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj); int __must_check i915_gem_init(struct drm_device *dev); int i915_gem_init_rings(struct drm_device *dev); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index de241eb..3746738 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3608,7 +3608,7 @@ err_unpin: return vma; } -bool +enum drm_cache_flush i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force) { @@ -3617,14 +3617,14 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj, * again at bind time. */ if (obj->pages == NULL) - return false; + return DRM_CACHE_FLUSH_NONE; /* * Stolen memory is always coherent with the GPU as it is explicitly * marked as wc by the system, or the system is cache-coherent. */ if (obj->stolen || obj->phys_handle) - return false; + return DRM_CACHE_FLUSH_NONE; /* If the GPU is snooping the contents of the CPU cache, * we do not need to manually clear the CPU cache lines. However, @@ -3635,12 +3635,10 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj, * tracking. */ if (!force && cpu_cache_is_coherent(obj->base.dev, obj->cache_level)) - return false; + return DRM_CACHE_FLUSH_NONE; trace_i915_gem_object_clflush(obj); - drm_clflush_sg(obj->pages); - - return true; + return drm_clflush_sg(obj->pages); } /** Flushes the GTT write domain for the object if it's dirty. 
*/ diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 0c25f62..e8eb9e9 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -827,7 +827,7 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring, { struct i915_vma *vma; uint32_t flush_domains = 0; - bool flush_chipset = false; + enum drm_cache_flush flush_chipset = DRM_CACHE_FLUSH_NONE; int ret; list_for_each_entry(vma, vmas, exec_list) { @@ -836,8 +836,10 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring, if (ret) return ret; - if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) - flush_chipset |= i915_gem_clflush_object(obj, false); + if (obj->base.write_domain & I915_GEM_DOMAIN_CPU && + flush_chipset != DRM_CACHE_FLUSH_WBINVD) { + flush_chipset = i915_gem_clflush_object(obj, false); + } flush_domains |= obj->base.write_domain; } diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 89b5577..a6c6ebd 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -611,7 +611,7 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf, struct intel_engine_cs *ring = ringbuf->ring; struct i915_vma *vma; uint32_t flush_domains = 0; - bool flush_chipset = false; + enum drm_cache_flush flush_chipset = DRM_CACHE_FLUSH_NONE; int ret; list_for_each_entry(vma, vmas, exec_list) { @@ -621,8 +621,10 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf, if (ret) return ret; - if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) - flush_chipset |= i915_gem_clflush_object(obj, false); + if (obj->base.write_domain & I915_GEM_DOMAIN_CPU && + flush_chipset != DRM_CACHE_FLUSH_WBINVD) { + flush_chipset = i915_gem_clflush_object(obj, false); + } flush_domains |= obj->base.write_domain; }
If we're moving a bunch of buffers from the CPU domain to the GPU domain, and we've already blown out the entire cache via a wbinvd, there is nothing more to do. With this and the previous patches, I am seeing a 3x FPS increase on a certain benchmark which uses a giant 2d array texture. Unless I missed something in the code, it should only affect non-LLC i915 platforms. I haven't yet run any numbers for other benchmarks, nor have I attempted to check if various conformance tests still pass. NOTE: As mentioned in the previous patch, if one can easily obtain the largest buffer and attempt to flush it first, the results would be even more desirable. Cc: DRI Development <dri-devel@lists.freedesktop.org> Signed-off-by: Ben Widawsky <ben@bwidawsk.net> --- drivers/gpu/drm/i915/i915_drv.h | 3 ++- drivers/gpu/drm/i915/i915_gem.c | 12 +++++------- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 8 +++++--- drivers/gpu/drm/i915/intel_lrc.c | 8 +++++--- 4 files changed, 17 insertions(+), 14 deletions(-)