[183/190] drm/i915/cmdparser: Use cached vmappings
diff mbox

Message ID 1452510091-6833-42-git-send-email-chris@chris-wilson.co.uk
State New
Headers show

Commit Message

Chris Wilson Jan. 11, 2016, 11:01 a.m. UTC
The single largest factor in the overhead of parsing the commands is the
setup of the virtual mapping to provide a continuous block for the batch
buffer. If we keep those vmappings around (against the better judgement
of mm/vmalloc.c, which we offset by handwaving and looking suggestively
at the shrinker) we can dramatically improve the performance of the
parser for small batches (such as media workloads). Furthermore, we can
use the prepare shmem read/write functions to determine  how best we
need to clflush the range (rather than every page of the object).

The impact of caching both src/dst vmaps is +80% on ivb and +140% on byt
for the throughput on small batches. (Caching just the dst vmap and
iterating over the src, doing a page by page copy is roughly 5% slower
on both platforms. That may be an acceptable trade-off to eliminate one
cached vmapping, and we may be able to reduce the per-page copying overhead
further.) For *this* simple test case, the cmdparser is now within a
factor of 2 of ideal performance.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_cmd_parser.c     | 123 ++++++++++-------------------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   6 ++
 2 files changed, 48 insertions(+), 81 deletions(-)

Patch
diff mbox

diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 84340eb42e1b..32b83369ae4e 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -866,98 +866,57 @@  find_reg(const struct drm_i915_reg_descriptor *table,
 	return NULL;
 }
 
-static u32 *vmap_batch(struct drm_i915_gem_object *obj,
-		       unsigned start, unsigned len)
-{
-	int i;
-	void *addr = NULL;
-	struct sg_page_iter sg_iter;
-	int first_page = start >> PAGE_SHIFT;
-	int last_page = (len + start + 4095) >> PAGE_SHIFT;
-	int npages = last_page - first_page;
-	struct page **pages;
-
-	pages = drm_malloc_ab(npages, sizeof(*pages));
-	if (pages == NULL) {
-		DRM_DEBUG_DRIVER("Failed to get space for pages\n");
-		goto finish;
-	}
-
-	i = 0;
-	for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, first_page) {
-		pages[i++] = sg_page_iter_page(&sg_iter);
-		if (i == npages)
-			break;
-	}
-
-	addr = vmap(pages, i, 0, PAGE_KERNEL);
-	if (addr == NULL) {
-		DRM_DEBUG_DRIVER("Failed to vmap pages\n");
-		goto finish;
-	}
-
-finish:
-	if (pages)
-		drm_free_large(pages);
-	return (u32*)addr;
-}
-
-/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
-static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
+/* Returns a vmap'd pointer to dst_obj, which the caller must unmap */
+static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
 		       struct drm_i915_gem_object *src_obj,
 		       u32 batch_start_offset,
-		       u32 batch_len)
+		       u32 batch_len,
+		       bool *needs_clflush_after)
 {
-	unsigned needs_clflush;
-	void *src_base, *src;
-	void *dst = NULL;
+	unsigned src_needs_clflush;
+	unsigned dst_needs_clflush;
+	void *src, *dst;
 	int ret;
 
-	if (batch_len > dest_obj->base.size ||
-	    batch_len + batch_start_offset > src_obj->base.size)
-		return ERR_PTR(-E2BIG);
-
-	if (WARN_ON(dest_obj->pages_pin_count == 0))
-		return ERR_PTR(-ENODEV);
-
-	ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
-	if (ret) {
-		DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
+	ret = i915_gem_obj_prepare_shmem_read(src_obj, &src_needs_clflush);
+	if (ret)
 		return ERR_PTR(ret);
-	}
 
-	src_base = vmap_batch(src_obj, batch_start_offset, batch_len);
-	if (!src_base) {
-		DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
-		ret = -ENOMEM;
+	ret = i915_gem_obj_prepare_shmem_write(dst_obj, &dst_needs_clflush);
+	if (ret) {
+		dst = ERR_PTR(ret);
 		goto unpin_src;
 	}
 
-	ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
-	if (ret) {
-		DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
-		goto unmap_src;
+	src = i915_gem_object_pin_vmap(src_obj);
+	if (IS_ERR(src)) {
+		dst = src;
+		goto unpin_dst;
 	}
 
-	dst = vmap_batch(dest_obj, 0, batch_len);
-	if (!dst) {
-		DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
-		ret = -ENOMEM;
+	dst = i915_gem_object_pin_vmap(dst_obj);
+	if (IS_ERR(dst))
 		goto unmap_src;
-	}
 
-	src = src_base + offset_in_page(batch_start_offset);
-	if (needs_clflush)
-		drm_clflush_virt_range(src, batch_len);
+	src += batch_start_offset;
+	if (src_needs_clflush)
+		clflush_cache_range(src, batch_len);
+
+	if (dst_needs_clflush & CLFLUSH_BEFORE)
+		batch_len = roundup(batch_len, boot_cpu_data.x86_clflush_size);
 
 	memcpy(dst, src, batch_len);
 
+	/* dst_obj is returned with vmap pinned */
+	*needs_clflush_after = dst_needs_clflush & CLFLUSH_AFTER;
+
 unmap_src:
-	vunmap(src_base);
+	i915_gem_object_unpin_vmap(src_obj);
+unpin_dst:
+	i915_gem_object_unpin_pages(dst_obj);
 unpin_src:
 	i915_gem_object_unpin_pages(src_obj);
-
-	return ret ? ERR_PTR(ret) : dst;
+	return dst;
 }
 
 static bool check_cmd(const struct intel_engine_cs *ring,
@@ -1106,16 +1065,18 @@  int i915_parse_cmds(struct intel_engine_cs *ring,
 		    u32 batch_len,
 		    bool is_master)
 {
-	u32 *cmd, *batch_base, *batch_end;
+	u32 *cmd, *batch_end;
 	struct drm_i915_cmd_descriptor default_desc = { 0 };
 	bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
+	bool needs_clflush_after = false;
 	int ret = 0;
 
-	batch_base = copy_batch(shadow_batch_obj, batch_obj,
-				batch_start_offset, batch_len);
-	if (IS_ERR(batch_base)) {
+	cmd = copy_batch(shadow_batch_obj, batch_obj,
+			 batch_start_offset, batch_len,
+			 &needs_clflush_after);
+	if (IS_ERR(cmd)) {
 		DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n");
-		return PTR_ERR(batch_base);
+		return PTR_ERR(cmd);
 	}
 
 	/*
@@ -1123,9 +1084,7 @@  int i915_parse_cmds(struct intel_engine_cs *ring,
 	 * large or larger and copy_batch() will write MI_NOPs to the extra
 	 * space. Parsing should be faster in some cases this way.
 	 */
-	batch_end = batch_base + (batch_len / sizeof(*batch_end));
-
-	cmd = batch_base;
+	batch_end = cmd + (batch_len / sizeof(*batch_end));
 	while (cmd < batch_end) {
 		const struct drm_i915_cmd_descriptor *desc;
 		u32 length;
@@ -1184,7 +1143,9 @@  int i915_parse_cmds(struct intel_engine_cs *ring,
 		ret = -EINVAL;
 	}
 
-	vunmap(batch_base);
+	if (ret == 0 && needs_clflush_after)
+		clflush_cache_range(shadow_batch_obj->vmapping, batch_len);
+	i915_gem_object_unpin_vmap(shadow_batch_obj);
 
 	return ret;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 733250afa139..eac3d52f790d 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1616,6 +1616,12 @@  i915_gem_do_execbuffer(struct drm_device *dev,
 		ret = -EINVAL;
 		goto err;
 	}
+	if (args->batch_start_offset > eb.batch_vma->size ||
+	    args->batch_len > eb.batch_vma->size - args->batch_start_offset) {
+		DRM_DEBUG("Attempting to use out-of-bounds batch\n");
+		ret = -EINVAL;
+		goto err;
+	}
 
 	if (intel_engine_needs_cmd_parser(eb.engine) && args->batch_len) {
 		struct i915_vma *vma;