
[3/6] drm/i915: Opportunistically reduce flushing at execbuf

Message ID 1423518859-6199-4-git-send-email-benjamin.widawsky@intel.com (mailing list archive)
State New, archived

Commit Message

Ben Widawsky Feb. 9, 2015, 9:54 p.m. UTC
If we're moving a bunch of buffers from the CPU domain to the GPU domain, and
we've already blown out the entire cache via a wbinvd, there is nothing more to
do.
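
In code terms the heuristic is roughly the following, a sketch of the
check this patch adds to eb_lookup_vmas rather than the literal hunk:

	/* One large, CPU-dirty, non-coherent object is enough to make
	 * the per-object clflush loop pointless; write back and
	 * invalidate the whole cache once instead.
	 */
	if (vma->node.size >= to_i915(obj->base.dev)->wbinvd_threshold &&
	    obj->base.write_domain & I915_GEM_DOMAIN_CPU &&
	    !cpu_cache_is_coherent(obj->base.dev, obj->cache_level))
		eb->do_wbinvd = true;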

With this and the previous patches, I am seeing a 3x FPS increase on a certain
benchmark which uses a giant 2D array texture. Unless I missed something in the
code, it should only affect non-LLC i915 platforms.

I haven't yet run any numbers for other benchmarks, nor have I attempted to
check if various conformance tests still pass.

v2: Rewrite the patch to be i915-only.
Determine up front whether or not we will wbinvd.

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h            |  8 ++++++++
 drivers/gpu/drm/i915/i915_gem.c            | 11 +++++------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 20 ++++++++++++++++----
 drivers/gpu/drm/i915/intel_lrc.c           | 10 ++++++++--
 4 files changed, 37 insertions(+), 12 deletions(-)

Comments

Chris Wilson Feb. 10, 2015, 9:21 a.m. UTC | #1
On Mon, Feb 09, 2015 at 01:54:16PM -0800, Ben Widawsky wrote:
> If we're moving a bunch of buffers from the CPU domain to the GPU domain, and
> we've already blown out the entire cache via a wbinvd, there is nothing more to
> do.
> 
> With this and the previous patches, I am seeing a 3x FPS increase on a certain
> benchmark which uses a giant 2D array texture. Unless I missed something in the
> code, it should only affect non-LLC i915 platforms.

Out of curiosity, have you compared with how this performs with an
improved userspace? There are several techniques which userspace can do
that are much higher performance than either clflush or wbinvd.
-Chris
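
(One class of userspace technique that could be meant here is writing
buffer data with non-temporal stores, so the data never dirties the CPU
cache and needs no flush at execbuf time at all; this is an
illustration, not something the thread spells out. A minimal sketch:)

	#include <stddef.h>
	#include <emmintrin.h>	/* SSE2: _mm_stream_si128, _mm_sfence */

	/* Hypothetical helper: copy into a mapped GEM buffer with
	 * streaming stores that bypass the CPU cache. Assumes dst and
	 * src are 16-byte aligned and len is a multiple of 16.
	 */
	static void copy_nontemporal(void *dst, const void *src, size_t len)
	{
		__m128i *d = dst;
		const __m128i *s = src;
		size_t i;

		for (i = 0; i < len / 16; i++)
			_mm_stream_si128(&d[i], _mm_load_si128(&s[i]));
		_mm_sfence();	/* make the streaming stores globally visible */
	}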

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 90ff6aa..5d2f62d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1643,6 +1643,7 @@ struct i915_workarounds {
 
 struct eb_vmas {
 	struct list_head vmas;
+	bool do_wbinvd;
 	int and;
 	union {
 		struct i915_vma *lut[0];
@@ -1913,6 +1914,8 @@ struct drm_i915_private {
 		void (*stop_ring)(struct intel_engine_cs *ring);
 	} gt;
 
+	size_t wbinvd_threshold;
+
 	uint32_t request_uniq;
 
 	/*
@@ -2810,6 +2813,11 @@ static inline bool i915_stop_ring_allow_warn(struct drm_i915_private *dev_priv)
 
 void i915_gem_reset(struct drm_device *dev);
 bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
+static inline bool cpu_cache_is_coherent(struct drm_device *dev,
+					 enum i915_cache_level level)
+{
+	return HAS_LLC(dev) || level != I915_CACHE_NONE;
+}
 int __must_check i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj);
 int __must_check i915_gem_init(struct drm_device *dev);
 int i915_gem_init_rings(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index fc81889..5bfb332 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -61,12 +61,6 @@ static int i915_gem_shrinker_oom(struct notifier_block *nb,
 				 void *ptr);
 static unsigned long i915_gem_shrink_all(struct drm_i915_private *dev_priv);
 
-static bool cpu_cache_is_coherent(struct drm_device *dev,
-				  enum i915_cache_level level)
-{
-	return HAS_LLC(dev) || level != I915_CACHE_NONE;
-}
-
 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
 {
 	if (!cpu_cache_is_coherent(obj->base.dev, obj->cache_level))
@@ -4878,6 +4872,11 @@ int i915_gem_init(struct drm_device *dev)
 		dev_priv->gt.stop_ring = intel_logical_ring_stop;
 	}
 
+	dev_priv->wbinvd_threshold = boot_cpu_data.x86_cache_size << 10;
+	/* Pick a high default in the unlikely case we got nothing */
+	if (!dev_priv->wbinvd_threshold)
+		dev_priv->wbinvd_threshold = (8 << 20);
+
 	ret = i915_gem_init_userptr(dev);
 	if (ret)
 		goto out_unlock;
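
(A note on the threshold set above: boot_cpu_data.x86_cache_size is
reported in KiB, hence the "<< 10" to convert it to bytes. Standalone,
with cache_size_kb standing in for boot_cpu_data.x86_cache_size, the
computation is simply:)

	size_t threshold = (size_t)cache_size_kb << 10;	/* KiB -> bytes */
	if (!threshold)			/* CPUID reported no cache size */
		threshold = 8 << 20;	/* fall back to 8 MiB */
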
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 13ed13e..56f9268 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -50,7 +50,7 @@ eb_create(struct drm_i915_gem_execbuffer2 *args)
 		unsigned size = args->buffer_count;
 		size *= sizeof(struct i915_vma *);
 		size += sizeof(struct eb_vmas);
-		eb = kmalloc(size, GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
+		eb = kzalloc(size, GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
 	}
 
 	if (eb == NULL) {
@@ -78,6 +78,7 @@ eb_reset(struct eb_vmas *eb)
 {
 	if (eb->and >= 0)
 		memset(eb->buckets, 0, (eb->and+1)*sizeof(struct hlist_head));
+	eb->do_wbinvd = false;
 }
 
 static int
@@ -154,6 +155,11 @@ eb_lookup_vmas(struct eb_vmas *eb,
 			hlist_add_head(&vma->exec_node,
 				       &eb->buckets[handle & eb->and]);
 		}
+
+		if (vma->node.size >= to_i915(obj->base.dev)->wbinvd_threshold &&
+		    obj->base.write_domain & I915_GEM_DOMAIN_CPU &&
+		    !cpu_cache_is_coherent(obj->base.dev, obj->cache_level))
+			eb->do_wbinvd = true;
 		++i;
 	}
 
@@ -826,7 +832,7 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring,
 	struct list_head *vmas = &eb->vmas;
 	struct i915_vma *vma;
 	uint32_t flush_domains = 0;
-	bool flush_chipset = false;
+	bool flush_chipset = eb->do_wbinvd;
 	int ret;
 
 	list_for_each_entry(vma, vmas, exec_list) {
@@ -835,12 +841,18 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring,
 		if (ret)
 			return ret;
 
+		flush_domains |= obj->base.write_domain;
+
+		if (eb->do_wbinvd)
+			continue;
+
 		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
 			flush_chipset |= i915_gem_clflush_object(obj, false);
-
-		flush_domains |= obj->base.write_domain;
 	}
 
+	if (eb->do_wbinvd)
+		wbinvd();
+
 	if (flush_chipset)
 		i915_gem_chipset_flush(ring->dev);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 03741f9..16ca4a2 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -586,12 +586,18 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf,
 		if (ret)
 			return ret;
 
+		flush_domains |= obj->base.write_domain;
+
+		if (eb->do_wbinvd)
+			continue;
+
 		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
 			i915_gem_clflush_object(obj, false);
-
-		flush_domains |= obj->base.write_domain;
 	}
 
+	if (eb->do_wbinvd)
+		wbinvd();
+
 	if (flush_domains & I915_GEM_DOMAIN_GTT)
 		wmb();