@@ -1643,6 +1643,7 @@ struct i915_workarounds {
struct eb_vmas {
struct list_head vmas;
+ bool do_wbinvd;
int and;
union {
struct i915_vma *lut[0];
@@ -1913,6 +1914,8 @@ struct drm_i915_private {
void (*stop_ring)(struct intel_engine_cs *ring);
} gt;
+ size_t wbinvd_threshold;
+
uint32_t request_uniq;
/*
@@ -2810,6 +2813,11 @@ static inline bool i915_stop_ring_allow_warn(struct drm_i915_private *dev_priv)
void i915_gem_reset(struct drm_device *dev);
bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
+static inline bool cpu_cache_is_coherent(struct drm_device *dev,
+ enum i915_cache_level level)
+{
+ return HAS_LLC(dev) || level != I915_CACHE_NONE;
+}
int __must_check i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj);
int __must_check i915_gem_init(struct drm_device *dev);
int i915_gem_init_rings(struct drm_device *dev);
@@ -61,12 +61,6 @@ static int i915_gem_shrinker_oom(struct notifier_block *nb,
void *ptr);
static unsigned long i915_gem_shrink_all(struct drm_i915_private *dev_priv);
-static bool cpu_cache_is_coherent(struct drm_device *dev,
- enum i915_cache_level level)
-{
- return HAS_LLC(dev) || level != I915_CACHE_NONE;
-}
-
static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
{
if (!cpu_cache_is_coherent(obj->base.dev, obj->cache_level))
@@ -4878,6 +4872,11 @@ int i915_gem_init(struct drm_device *dev)
dev_priv->gt.stop_ring = intel_logical_ring_stop;
}
+ dev_priv->wbinvd_threshold = boot_cpu_data.x86_cache_size << 10;
+ /* Pick a high default in the unlikely case we got nothing */
+ if (!dev_priv->wbinvd_threshold)
+ dev_priv->wbinvd_threshold = (8 << 20);
+
ret = i915_gem_init_userptr(dev);
if (ret)
goto out_unlock;
@@ -50,7 +50,7 @@ eb_create(struct drm_i915_gem_execbuffer2 *args)
unsigned size = args->buffer_count;
size *= sizeof(struct i915_vma *);
size += sizeof(struct eb_vmas);
- eb = kmalloc(size, GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
+ eb = kzalloc(size, GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
}
if (eb == NULL) {
@@ -78,6 +78,7 @@ eb_reset(struct eb_vmas *eb)
{
if (eb->and >= 0)
memset(eb->buckets, 0, (eb->and+1)*sizeof(struct hlist_head));
+ eb->do_wbinvd = false;
}
static int
@@ -154,6 +155,11 @@ eb_lookup_vmas(struct eb_vmas *eb,
hlist_add_head(&vma->exec_node,
&eb->buckets[handle & eb->and]);
}
+
+ if (vma->node.size >= to_i915(obj->base.dev)->wbinvd_threshold &&
+ obj->base.write_domain & I915_GEM_DOMAIN_CPU &&
+ !cpu_cache_is_coherent(obj->base.dev, obj->cache_level))
+ eb->do_wbinvd = true;
++i;
}
@@ -826,7 +832,7 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring,
struct list_head *vmas = &eb->vmas;
struct i915_vma *vma;
uint32_t flush_domains = 0;
- bool flush_chipset = false;
+ bool flush_chipset = eb->do_wbinvd;
int ret;
list_for_each_entry(vma, vmas, exec_list) {
@@ -835,12 +841,18 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring,
if (ret)
return ret;
+ flush_domains |= obj->base.write_domain;
+
+ if (eb->do_wbinvd)
+ continue;
+
if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
flush_chipset |= i915_gem_clflush_object(obj, false);
-
- flush_domains |= obj->base.write_domain;
}
+ if (eb->do_wbinvd)
+ wbinvd();
+
if (flush_chipset)
i915_gem_chipset_flush(ring->dev);
@@ -586,12 +586,18 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf,
if (ret)
return ret;
+ flush_domains |= obj->base.write_domain;
+
+ if (eb->do_wbinvd)
+ continue;
+
if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
i915_gem_clflush_object(obj, false);
-
- flush_domains |= obj->base.write_domain;
}
+ if (eb->do_wbinvd)
+ wbinvd();
+
if (flush_domains & I915_GEM_DOMAIN_GTT)
wmb();
If we're moving a bunch of buffers from the CPU domain to the GPU domain, and we've already blown out the entire cache via a wbinvd, there is nothing more to do.

With this and the previous patches, I am seeing a 3x FPS increase on a certain benchmark which uses a giant 2d array texture. Unless I missed something in the code, it should only affect non-LLC i915 platforms.

I haven't yet run any numbers for other benchmarks, nor have I attempted to check if various conformance tests still pass.

v2: Rewrite the patch to be i915 only
    Obtain whether or not we wbinvd up front.

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h | 8 ++++++++
 drivers/gpu/drm/i915/i915_gem.c | 11 +++++------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 20 ++++++++++++++++----
 drivers/gpu/drm/i915/intel_lrc.c | 10 ++++++++--
 4 files changed, 37 insertions(+), 12 deletions(-)