diff mbox

[1/6] drm/i915: Eliminate vmap overhead for cmd parser

Message ID 1443700635-4849-1-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson Oct. 1, 2015, 11:57 a.m. UTC
With a little complexity to handle cmds straddling page boundaries, we
can completely avoid having to vmap the batch and the shadow batch
objects whilst running the command parser.

On ivb i7-3720MQ:

x11perf -dot before 54.3M, after 53.2M (max 203M)
glxgears before 7110 fps, after 7300 fps (max 7860 fps)

Before:
Time to blt 16384 bytes x      1:	 12.400µs, 1.2GiB/s
Time to blt 16384 bytes x   4096:	  3.055µs, 5.0GiB/s

After:
Time to blt 16384 bytes x      1:	  8.600µs, 1.8GiB/s
Time to blt 16384 bytes x   4096:	  2.456µs, 6.2GiB/s

Removing the vmap is mostly a win, except we lose in a few cases where
the batch size is greater than a page due to the extra complexity (loss
of a simple cache efficient large copy, and boundary handling).

v2: Reorder so that we do check oacontrol remaining set at end-of-batch

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_cmd_parser.c | 298 ++++++++++++++++-----------------
 1 file changed, 146 insertions(+), 152 deletions(-)

Comments

Ville Syrjälä Oct. 1, 2015, 12:37 p.m. UTC | #1
On Thu, Oct 01, 2015 at 12:57:10PM +0100, Chris Wilson wrote:
> With a little complexity to handle cmds straddling page boundaries, we
> can completely avoiding having to vmap the batch and the shadow batch
> objects whilst running the command parser.
> 
> On ivb i7-3720MQ:
> 
> x11perf -dot before 54.3M, after 53.2M (max 203M)
> glxgears before 7110 fps, after 7300 fps (max 7860 fps)
> 
> Before:
> Time to blt 16384 bytes x      1:	 12.400µs, 1.2GiB/s
> Time to blt 16384 bytes x   4096:	  3.055µs, 5.0GiB/s
> 
> After:
> Time to blt 16384 bytes x      1:	  8.600µs, 1.8GiB/s
> Time to blt 16384 bytes x   4096:	  2.456µs, 6.2GiB/s
> 
> Removing the vmap is mostly a win, except we lose in a few cases where
> the batch size is greater than a page due to the extra complexity (loss
> of a simple cache efficient large copy, and boundary handling).
> 
> v2: Reorder so that we do check oacontrol remaining set at end-of-batch
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_cmd_parser.c | 298 ++++++++++++++++-----------------
>  1 file changed, 146 insertions(+), 152 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
> index 237ff6884a22..91e4baa0f2b8 100644
> --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> @@ -852,100 +852,6 @@ find_reg(const struct drm_i915_reg_descriptor *table,
>  	return NULL;
>  }
>  
> -static u32 *vmap_batch(struct drm_i915_gem_object *obj,
> -		       unsigned start, unsigned len)
> -{
> -	int i;
> -	void *addr = NULL;
> -	struct sg_page_iter sg_iter;
> -	int first_page = start >> PAGE_SHIFT;
> -	int last_page = (len + start + 4095) >> PAGE_SHIFT;
> -	int npages = last_page - first_page;
> -	struct page **pages;
> -
> -	pages = drm_malloc_ab(npages, sizeof(*pages));
> -	if (pages == NULL) {
> -		DRM_DEBUG_DRIVER("Failed to get space for pages\n");
> -		goto finish;
> -	}
> -
> -	i = 0;
> -	for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, first_page) {
> -		pages[i++] = sg_page_iter_page(&sg_iter);
> -		if (i == npages)
> -			break;
> -	}
> -
> -	addr = vmap(pages, i, 0, PAGE_KERNEL);
> -	if (addr == NULL) {
> -		DRM_DEBUG_DRIVER("Failed to vmap pages\n");
> -		goto finish;
> -	}
> -
> -finish:
> -	if (pages)
> -		drm_free_large(pages);
> -	return (u32*)addr;
> -}
> -
> -/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
> -static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
> -		       struct drm_i915_gem_object *src_obj,
> -		       u32 batch_start_offset,
> -		       u32 batch_len)
> -{
> -	int needs_clflush = 0;
> -	void *src_base, *src;
> -	void *dst = NULL;
> -	int ret;
> -
> -	if (batch_len > dest_obj->base.size ||
> -	    batch_len + batch_start_offset > src_obj->base.size)
> -		return ERR_PTR(-E2BIG);
> -
> -	if (WARN_ON(dest_obj->pages_pin_count == 0))
> -		return ERR_PTR(-ENODEV);
> -
> -	ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
> -		return ERR_PTR(ret);
> -	}
> -
> -	src_base = vmap_batch(src_obj, batch_start_offset, batch_len);
> -	if (!src_base) {
> -		DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
> -		ret = -ENOMEM;
> -		goto unpin_src;
> -	}
> -
> -	ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
> -		goto unmap_src;
> -	}
> -
> -	dst = vmap_batch(dest_obj, 0, batch_len);
> -	if (!dst) {
> -		DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
> -		ret = -ENOMEM;
> -		goto unmap_src;
> -	}
> -
> -	src = src_base + offset_in_page(batch_start_offset);
> -	if (needs_clflush)
> -		drm_clflush_virt_range(src, batch_len);
> -
> -	memcpy(dst, src, batch_len);
> -
> -unmap_src:
> -	vunmap(src_base);
> -unpin_src:
> -	i915_gem_object_unpin_pages(src_obj);
> -
> -	return ret ? ERR_PTR(ret) : dst;
> -}
> -
>  /**
>   * i915_needs_cmd_parser() - should a given ring use software command parsing?
>   * @ring: the ring in question
> @@ -1112,16 +1018,35 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
>  		    u32 batch_len,
>  		    bool is_master)
>  {
> -	u32 *cmd, *batch_base, *batch_end;
> +	u32 tmp[128];
> +	const struct drm_i915_cmd_descriptor *desc;
> +	unsigned dst_iter, src_iter;
> +	int needs_clflush = 0;
> +	struct get_page rewind;
> +	void *src, *dst;
> +	unsigned in, out;
> +	u32 *buf, partial = 0, length;
>  	struct drm_i915_cmd_descriptor default_desc = { 0 };
>  	bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
>  	int ret = 0;
>  
> -	batch_base = copy_batch(shadow_batch_obj, batch_obj,
> -				batch_start_offset, batch_len);
> -	if (IS_ERR(batch_base)) {
> -		DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n");
> -		return PTR_ERR(batch_base);
> +	if (batch_len > shadow_batch_obj->base.size ||
> +	    batch_len + batch_start_offset > batch_obj->base.size)
> +		return -E2BIG;
> +
> +	if (WARN_ON(shadow_batch_obj->pages_pin_count == 0))
> +		return -ENODEV;
> +
> +	ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
> +		return ret;
> +	}
> +
> +	ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
> +		goto unpin;
>  	}
>  
>  	/*
> @@ -1129,69 +1054,138 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
>  	 * large or larger and copy_batch() will write MI_NOPs to the extra
>  	 * space. Parsing should be faster in some cases this way.
>  	 */
> -	batch_end = batch_base + (batch_len / sizeof(*batch_end));
> +	ret = -EINVAL;
> +	rewind = batch_obj->get_page;
> +
> +	dst_iter = 0;
> +	dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, dst_iter));
> +
> +	in = offset_in_page(batch_start_offset);
> +	out = 0;
> +	for (src_iter = batch_start_offset >> PAGE_SHIFT;
> +	     src_iter < batch_obj->base.size >> PAGE_SHIFT;
> +	     src_iter++) {
> +		u32 this, i, j, k;
> +		u32 *cmd, *page_end, *batch_end;
> +
> +		this = batch_len;
> +		if (this > PAGE_SIZE - in)
> +			this = PAGE_SIZE - in;
> +
> +		src = kmap_atomic(i915_gem_object_get_page(batch_obj, src_iter));
> +		if (needs_clflush)
> +			drm_clflush_virt_range(src + in, this);
> +
> +		i = this;
> +		j = in;
> +		do {
> +			/* We keep dst around so that we do not blow
> +			 * the CPU caches immediately after the copy (due
> +			 * to the kunmap_atomic(dst) doing a TLB flush.
> +			 */
> +			if (out == PAGE_SIZE) {
> +				kunmap_atomic(dst);
> +				dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, ++dst_iter));
> +				out = 0;
> +			}
>  
> -	cmd = batch_base;
> -	while (cmd < batch_end) {
> -		const struct drm_i915_cmd_descriptor *desc;
> -		u32 length;
> +			k = i;
> +			if (k > PAGE_SIZE - out)
> +				k = PAGE_SIZE - out;
> +			if (k == PAGE_SIZE)
> +				copy_page(dst, src);
> +			else
> +				memcpy(dst + out, src + j, k);
> +
> +			out += k;
> +			j += k;
> +			i -= k;
> +		} while (i);
> +
> +		cmd = src + in;

So you're now checking the src batch? What prevents userspace from
overwriting it with eg. NOPS between the time you copied it and the
time you check it?

> +		page_end = (void *)cmd + this;
> +		batch_end = (void *)cmd + batch_len;
> +
> +		if (partial) {
> +			memcpy(tmp + partial, cmd, (length - partial)*4);
> +			cmd -= partial;
> +			partial = 0;
> +			buf = tmp;
> +			goto check;
> +		}
>  
> -		if (*cmd == MI_BATCH_BUFFER_END)
> -			break;
> +		do {
> +			if (*cmd == MI_BATCH_BUFFER_END) {
> +				if (oacontrol_set) {
> +					DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
> +					ret = -EINVAL;
> +				} else
> +					ret = 0;
> +				goto unmap;
> +			}
>  
> -		desc = find_cmd(ring, *cmd, &default_desc);
> -		if (!desc) {
> -			DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
> -					 *cmd);
> -			ret = -EINVAL;
> -			break;
> -		}
> +			desc = find_cmd(ring, *cmd, &default_desc);
> +			if (!desc) {
> +				DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
> +						 *cmd);
> +				goto unmap;
> +			}
>  
> -		/*
> -		 * If the batch buffer contains a chained batch, return an
> -		 * error that tells the caller to abort and dispatch the
> -		 * workload as a non-secure batch.
> -		 */
> -		if (desc->cmd.value == MI_BATCH_BUFFER_START) {
> -			ret = -EACCES;
> -			break;
> -		}
> +			/*
> +			 * If the batch buffer contains a chained batch, return an
> +			 * error that tells the caller to abort and dispatch the
> +			 * workload as a non-secure batch.
> +			 */
> +			if (desc->cmd.value == MI_BATCH_BUFFER_START) {
> +				ret = -EACCES;
> +				goto unmap;
> +			}
>  
> -		if (desc->flags & CMD_DESC_FIXED)
> -			length = desc->length.fixed;
> -		else
> -			length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
> -
> -		if ((batch_end - cmd) < length) {
> -			DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
> -					 *cmd,
> -					 length,
> -					 batch_end - cmd);
> -			ret = -EINVAL;
> -			break;
> -		}
> +			if (desc->flags & CMD_DESC_FIXED)
> +				length = desc->length.fixed;
> +			else
> +				length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
>  
> -		if (!check_cmd(ring, desc, cmd, length, is_master,
> -			       &oacontrol_set)) {
> -			ret = -EINVAL;
> -			break;
> -		}
> +			if (cmd + length > page_end) {
> +				if (length + cmd > batch_end) {
> +					DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
> +							 *cmd, length, batch_end - cmd);
> +					goto unmap;
> +				}
>  
> -		cmd += length;
> -	}
> +				if (WARN_ON(length > sizeof(tmp)/4)) {
> +					ret = -ENODEV;
> +					goto unmap;
> +				}
>  
> -	if (oacontrol_set) {
> -		DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
> -		ret = -EINVAL;
> -	}
> +				partial = page_end - cmd;
> +				memcpy(tmp, cmd, partial*4);
> +				break;
> +			}
>  
> -	if (cmd >= batch_end) {
> -		DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n");
> -		ret = -EINVAL;
> -	}
> +			buf = cmd;
> +check:
> +			if (!check_cmd(ring, desc, buf, length, is_master,
> +				       &oacontrol_set))
> +				goto unmap;
>  
> -	vunmap(batch_base);
> +			cmd += length;
> +		} while (cmd < page_end);
> +
> +		batch_len -= this;
> +		if (batch_len == 0)
> +			break;
> +
> +		kunmap_atomic(src);
> +		in = 0;
> +	}
>  
> +unmap:
> +	kunmap_atomic(src);
> +	kunmap_atomic(dst);
> +	batch_obj->get_page = rewind;
> +unpin:
> +	i915_gem_object_unpin_pages(batch_obj);
>  	return ret;
>  }
>  
> -- 
> 2.6.0
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson Oct. 1, 2015, 1:24 p.m. UTC | #2
On Thu, Oct 01, 2015 at 03:37:21PM +0300, Ville Syrjälä wrote:
> On Thu, Oct 01, 2015 at 12:57:10PM +0100, Chris Wilson wrote:
> > -	while (cmd < batch_end) {
> > -		const struct drm_i915_cmd_descriptor *desc;
> > -		u32 length;
> > +			k = i;
> > +			if (k > PAGE_SIZE - out)
> > +				k = PAGE_SIZE - out;
> > +			if (k == PAGE_SIZE)
> > +				copy_page(dst, src);
> > +			else
> > +				memcpy(dst + out, src + j, k);
> > +
> > +			out += k;
> > +			j += k;
> > +			i -= k;
> > +		} while (i);
> > +
> > +		cmd = src + in;
> 
> So you're now checking the src batch? What prevents userspace from
> overwriting it with eg. NOPS between the time you copied it and the
> time you check it?

Zilch. I picked src as it was already in the CPU cache, whereas dst will
be WC later. To satisfy you and byt, I need to stage the copy into a
temporary page, scan it, then copy into dst.

The silver lining is that it does remove some lines of code. I'm pretty
confident that the double copy should not be noticed if I remember about
the cache-trashing of kunmap and carefully manage that.
-Chris
Ville Syrjälä Oct. 1, 2015, 1:29 p.m. UTC | #3
On Thu, Oct 01, 2015 at 02:24:53PM +0100, Chris Wilson wrote:
> On Thu, Oct 01, 2015 at 03:37:21PM +0300, Ville Syrjälä wrote:
> > On Thu, Oct 01, 2015 at 12:57:10PM +0100, Chris Wilson wrote:
> > > -	while (cmd < batch_end) {
> > > -		const struct drm_i915_cmd_descriptor *desc;
> > > -		u32 length;
> > > +			k = i;
> > > +			if (k > PAGE_SIZE - out)
> > > +				k = PAGE_SIZE - out;
> > > +			if (k == PAGE_SIZE)
> > > +				copy_page(dst, src);
> > > +			else
> > > +				memcpy(dst + out, src + j, k);
> > > +
> > > +			out += k;
> > > +			j += k;
> > > +			i -= k;
> > > +		} while (i);
> > > +
> > > +		cmd = src + in;
> > 
> > So you're now checking the src batch? What prevents userspace from
> > overwriting it with eg. NOPS between the time you copied it and the
> > time you check it?
> 
> Zilch. I picked src as it was already in the CPU cache, whereas dst will
> be WC later. To satisfy you and byt, I need to stage the copy into a
> temporary page, scan it, then copy into dst.
> 
> The silver lining is that it does remove some lines of code. I'm pretty
> confident that the double copy should not be noticed if I remember about
> the cache-trashing of kunmap and carefully manage that.

Yeah, I was thinking that optimally we'd do the copy+scan in cacheline
units, but maybe that's too small to make it actually efficient?
Chris Wilson Oct. 1, 2015, 1:39 p.m. UTC | #4
On Thu, Oct 01, 2015 at 04:29:48PM +0300, Ville Syrjälä wrote:
> On Thu, Oct 01, 2015 at 02:24:53PM +0100, Chris Wilson wrote:
> > On Thu, Oct 01, 2015 at 03:37:21PM +0300, Ville Syrjälä wrote:
> > > On Thu, Oct 01, 2015 at 12:57:10PM +0100, Chris Wilson wrote:
> > > > -	while (cmd < batch_end) {
> > > > -		const struct drm_i915_cmd_descriptor *desc;
> > > > -		u32 length;
> > > > +			k = i;
> > > > +			if (k > PAGE_SIZE - out)
> > > > +				k = PAGE_SIZE - out;
> > > > +			if (k == PAGE_SIZE)
> > > > +				copy_page(dst, src);
> > > > +			else
> > > > +				memcpy(dst + out, src + j, k);
> > > > +
> > > > +			out += k;
> > > > +			j += k;
> > > > +			i -= k;
> > > > +		} while (i);
> > > > +
> > > > +		cmd = src + in;
> > > 
> > > So you're now checking the src batch? What prevents userspace from
> > > overwriting it with eg. NOPS between the time you copied it and the
> > > time you check it?
> > 
> > Zilch. I picked src as it was already in the CPU cache, whereas dst will
> > be WC later. To satisfy you and byt, I need to stage the copy into a
> > temporary page, scan it, then copy into dst.
> > 
> > The silver lining is that it does remove some lines of code. I'm pretty
> > confident that the double copy should not be noticed if I remember about
> > the cache-trashing of kunmap and carefully manage that.
> 
> Yeah, I was thinking that optimally we'd do the copy+scan in cacheline
> units, but maybe that's too small to make it actually efficient?

Yes, the issue there becomes that we need up to 256 bytes (though I
think the largest command is actually ~130 bytes) in order to pass a
complete copy of a command to the validator. If we make the temporary
too small we waste time building up partial commands, and the maximum
sane size is one page. __get_free_page() seems a reasonable first
choice, though a kmalloc() from a fresh slab tends to be faster to
allocate.
-Chris
Daniel Vetter Oct. 6, 2015, 12:44 p.m. UTC | #5
On Thu, Oct 01, 2015 at 12:57:10PM +0100, Chris Wilson wrote:
> With a little complexity to handle cmds straddling page boundaries, we
> can completely avoiding having to vmap the batch and the shadow batch
> objects whilst running the command parser.
> 
> On ivb i7-3720MQ:
> 
> x11perf -dot before 54.3M, after 53.2M (max 203M)
> glxgears before 7110 fps, after 7300 fps (max 7860 fps)
> 
> Before:
> Time to blt 16384 bytes x      1:	 12.400µs, 1.2GiB/s
> Time to blt 16384 bytes x   4096:	  3.055µs, 5.0GiB/s
> 
> After:
> Time to blt 16384 bytes x      1:	  8.600µs, 1.8GiB/s
> Time to blt 16384 bytes x   4096:	  2.456µs, 6.2GiB/s

Numbers for the overall series (or individual patches would be even
better) are needed. I thought you have this neat script now to do that for
an entire series?
-Daniel

> 
> Removing the vmap is mostly a win, except we lose in a few cases where
> the batch size is greater than a page due to the extra complexity (loss
> of a simple cache efficient large copy, and boundary handling).
> 
> v2: Reorder so that we do check oacontrol remaining set at end-of-batch
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_cmd_parser.c | 298 ++++++++++++++++-----------------
>  1 file changed, 146 insertions(+), 152 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
> index 237ff6884a22..91e4baa0f2b8 100644
> --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> @@ -852,100 +852,6 @@ find_reg(const struct drm_i915_reg_descriptor *table,
>  	return NULL;
>  }
>  
> -static u32 *vmap_batch(struct drm_i915_gem_object *obj,
> -		       unsigned start, unsigned len)
> -{
> -	int i;
> -	void *addr = NULL;
> -	struct sg_page_iter sg_iter;
> -	int first_page = start >> PAGE_SHIFT;
> -	int last_page = (len + start + 4095) >> PAGE_SHIFT;
> -	int npages = last_page - first_page;
> -	struct page **pages;
> -
> -	pages = drm_malloc_ab(npages, sizeof(*pages));
> -	if (pages == NULL) {
> -		DRM_DEBUG_DRIVER("Failed to get space for pages\n");
> -		goto finish;
> -	}
> -
> -	i = 0;
> -	for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, first_page) {
> -		pages[i++] = sg_page_iter_page(&sg_iter);
> -		if (i == npages)
> -			break;
> -	}
> -
> -	addr = vmap(pages, i, 0, PAGE_KERNEL);
> -	if (addr == NULL) {
> -		DRM_DEBUG_DRIVER("Failed to vmap pages\n");
> -		goto finish;
> -	}
> -
> -finish:
> -	if (pages)
> -		drm_free_large(pages);
> -	return (u32*)addr;
> -}
> -
> -/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
> -static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
> -		       struct drm_i915_gem_object *src_obj,
> -		       u32 batch_start_offset,
> -		       u32 batch_len)
> -{
> -	int needs_clflush = 0;
> -	void *src_base, *src;
> -	void *dst = NULL;
> -	int ret;
> -
> -	if (batch_len > dest_obj->base.size ||
> -	    batch_len + batch_start_offset > src_obj->base.size)
> -		return ERR_PTR(-E2BIG);
> -
> -	if (WARN_ON(dest_obj->pages_pin_count == 0))
> -		return ERR_PTR(-ENODEV);
> -
> -	ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
> -		return ERR_PTR(ret);
> -	}
> -
> -	src_base = vmap_batch(src_obj, batch_start_offset, batch_len);
> -	if (!src_base) {
> -		DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
> -		ret = -ENOMEM;
> -		goto unpin_src;
> -	}
> -
> -	ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
> -		goto unmap_src;
> -	}
> -
> -	dst = vmap_batch(dest_obj, 0, batch_len);
> -	if (!dst) {
> -		DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
> -		ret = -ENOMEM;
> -		goto unmap_src;
> -	}
> -
> -	src = src_base + offset_in_page(batch_start_offset);
> -	if (needs_clflush)
> -		drm_clflush_virt_range(src, batch_len);
> -
> -	memcpy(dst, src, batch_len);
> -
> -unmap_src:
> -	vunmap(src_base);
> -unpin_src:
> -	i915_gem_object_unpin_pages(src_obj);
> -
> -	return ret ? ERR_PTR(ret) : dst;
> -}
> -
>  /**
>   * i915_needs_cmd_parser() - should a given ring use software command parsing?
>   * @ring: the ring in question
> @@ -1112,16 +1018,35 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
>  		    u32 batch_len,
>  		    bool is_master)
>  {
> -	u32 *cmd, *batch_base, *batch_end;
> +	u32 tmp[128];
> +	const struct drm_i915_cmd_descriptor *desc;
> +	unsigned dst_iter, src_iter;
> +	int needs_clflush = 0;
> +	struct get_page rewind;
> +	void *src, *dst;
> +	unsigned in, out;
> +	u32 *buf, partial = 0, length;
>  	struct drm_i915_cmd_descriptor default_desc = { 0 };
>  	bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
>  	int ret = 0;
>  
> -	batch_base = copy_batch(shadow_batch_obj, batch_obj,
> -				batch_start_offset, batch_len);
> -	if (IS_ERR(batch_base)) {
> -		DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n");
> -		return PTR_ERR(batch_base);
> +	if (batch_len > shadow_batch_obj->base.size ||
> +	    batch_len + batch_start_offset > batch_obj->base.size)
> +		return -E2BIG;
> +
> +	if (WARN_ON(shadow_batch_obj->pages_pin_count == 0))
> +		return -ENODEV;
> +
> +	ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
> +		return ret;
> +	}
> +
> +	ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
> +		goto unpin;
>  	}
>  
>  	/*
> @@ -1129,69 +1054,138 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
>  	 * large or larger and copy_batch() will write MI_NOPs to the extra
>  	 * space. Parsing should be faster in some cases this way.
>  	 */
> -	batch_end = batch_base + (batch_len / sizeof(*batch_end));
> +	ret = -EINVAL;
> +	rewind = batch_obj->get_page;
> +
> +	dst_iter = 0;
> +	dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, dst_iter));
> +
> +	in = offset_in_page(batch_start_offset);
> +	out = 0;
> +	for (src_iter = batch_start_offset >> PAGE_SHIFT;
> +	     src_iter < batch_obj->base.size >> PAGE_SHIFT;
> +	     src_iter++) {
> +		u32 this, i, j, k;
> +		u32 *cmd, *page_end, *batch_end;
> +
> +		this = batch_len;
> +		if (this > PAGE_SIZE - in)
> +			this = PAGE_SIZE - in;
> +
> +		src = kmap_atomic(i915_gem_object_get_page(batch_obj, src_iter));
> +		if (needs_clflush)
> +			drm_clflush_virt_range(src + in, this);
> +
> +		i = this;
> +		j = in;
> +		do {
> +			/* We keep dst around so that we do not blow
> +			 * the CPU caches immediately after the copy (due
> +			 * to the kunmap_atomic(dst) doing a TLB flush.
> +			 */
> +			if (out == PAGE_SIZE) {
> +				kunmap_atomic(dst);
> +				dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, ++dst_iter));
> +				out = 0;
> +			}
>  
> -	cmd = batch_base;
> -	while (cmd < batch_end) {
> -		const struct drm_i915_cmd_descriptor *desc;
> -		u32 length;
> +			k = i;
> +			if (k > PAGE_SIZE - out)
> +				k = PAGE_SIZE - out;
> +			if (k == PAGE_SIZE)
> +				copy_page(dst, src);
> +			else
> +				memcpy(dst + out, src + j, k);
> +
> +			out += k;
> +			j += k;
> +			i -= k;
> +		} while (i);
> +
> +		cmd = src + in;
> +		page_end = (void *)cmd + this;
> +		batch_end = (void *)cmd + batch_len;
> +
> +		if (partial) {
> +			memcpy(tmp + partial, cmd, (length - partial)*4);
> +			cmd -= partial;
> +			partial = 0;
> +			buf = tmp;
> +			goto check;
> +		}
>  
> -		if (*cmd == MI_BATCH_BUFFER_END)
> -			break;
> +		do {
> +			if (*cmd == MI_BATCH_BUFFER_END) {
> +				if (oacontrol_set) {
> +					DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
> +					ret = -EINVAL;
> +				} else
> +					ret = 0;
> +				goto unmap;
> +			}
>  
> -		desc = find_cmd(ring, *cmd, &default_desc);
> -		if (!desc) {
> -			DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
> -					 *cmd);
> -			ret = -EINVAL;
> -			break;
> -		}
> +			desc = find_cmd(ring, *cmd, &default_desc);
> +			if (!desc) {
> +				DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
> +						 *cmd);
> +				goto unmap;
> +			}
>  
> -		/*
> -		 * If the batch buffer contains a chained batch, return an
> -		 * error that tells the caller to abort and dispatch the
> -		 * workload as a non-secure batch.
> -		 */
> -		if (desc->cmd.value == MI_BATCH_BUFFER_START) {
> -			ret = -EACCES;
> -			break;
> -		}
> +			/*
> +			 * If the batch buffer contains a chained batch, return an
> +			 * error that tells the caller to abort and dispatch the
> +			 * workload as a non-secure batch.
> +			 */
> +			if (desc->cmd.value == MI_BATCH_BUFFER_START) {
> +				ret = -EACCES;
> +				goto unmap;
> +			}
>  
> -		if (desc->flags & CMD_DESC_FIXED)
> -			length = desc->length.fixed;
> -		else
> -			length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
> -
> -		if ((batch_end - cmd) < length) {
> -			DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
> -					 *cmd,
> -					 length,
> -					 batch_end - cmd);
> -			ret = -EINVAL;
> -			break;
> -		}
> +			if (desc->flags & CMD_DESC_FIXED)
> +				length = desc->length.fixed;
> +			else
> +				length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
>  
> -		if (!check_cmd(ring, desc, cmd, length, is_master,
> -			       &oacontrol_set)) {
> -			ret = -EINVAL;
> -			break;
> -		}
> +			if (cmd + length > page_end) {
> +				if (length + cmd > batch_end) {
> +					DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
> +							 *cmd, length, batch_end - cmd);
> +					goto unmap;
> +				}
>  
> -		cmd += length;
> -	}
> +				if (WARN_ON(length > sizeof(tmp)/4)) {
> +					ret = -ENODEV;
> +					goto unmap;
> +				}
>  
> -	if (oacontrol_set) {
> -		DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
> -		ret = -EINVAL;
> -	}
> +				partial = page_end - cmd;
> +				memcpy(tmp, cmd, partial*4);
> +				break;
> +			}
>  
> -	if (cmd >= batch_end) {
> -		DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n");
> -		ret = -EINVAL;
> -	}
> +			buf = cmd;
> +check:
> +			if (!check_cmd(ring, desc, buf, length, is_master,
> +				       &oacontrol_set))
> +				goto unmap;
>  
> -	vunmap(batch_base);
> +			cmd += length;
> +		} while (cmd < page_end);
> +
> +		batch_len -= this;
> +		if (batch_len == 0)
> +			break;
> +
> +		kunmap_atomic(src);
> +		in = 0;
> +	}
>  
> +unmap:
> +	kunmap_atomic(src);
> +	kunmap_atomic(dst);
> +	batch_obj->get_page = rewind;
> +unpin:
> +	i915_gem_object_unpin_pages(batch_obj);
>  	return ret;
>  }
>  
> -- 
> 2.6.0
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson Oct. 6, 2015, 1 p.m. UTC | #6
On Tue, Oct 06, 2015 at 02:44:22PM +0200, Daniel Vetter wrote:
> On Thu, Oct 01, 2015 at 12:57:10PM +0100, Chris Wilson wrote:
> > With a little complexity to handle cmds straddling page boundaries, we
> > can completely avoiding having to vmap the batch and the shadow batch
> > objects whilst running the command parser.
> > 
> > On ivb i7-3720MQ:
> > 
> > x11perf -dot before 54.3M, after 53.2M (max 203M)
> > glxgears before 7110 fps, after 7300 fps (max 7860 fps)
> > 
> > Before:
> > Time to blt 16384 bytes x      1:	 12.400µs, 1.2GiB/s
> > Time to blt 16384 bytes x   4096:	  3.055µs, 5.0GiB/s
> > 
> > After:
> > Time to blt 16384 bytes x      1:	  8.600µs, 1.8GiB/s
> > Time to blt 16384 bytes x   4096:	  2.456µs, 6.2GiB/s
> 
> Numbers for the overall series (or individual patches would be even
> better) are needed. I thought you have this neat script now to do that for
> an entire series?

Note that numbers on a patch are for the patch unless otherwise stated.
Doubly so since these are numbers from when I first posted it and didn't
have anything else to boost cmdparser perf.

I do and I even added the benchmark to demonstrate one case. Then I
forgot to enable the cmdparser in mesa and so its numbers are bunk.
Fancy scripts still can't eliminate pebkac.

However, it did show that we still get the throughput improvement from
killing the vmap even with the temporary copy.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 237ff6884a22..91e4baa0f2b8 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -852,100 +852,6 @@  find_reg(const struct drm_i915_reg_descriptor *table,
 	return NULL;
 }
 
-static u32 *vmap_batch(struct drm_i915_gem_object *obj,
-		       unsigned start, unsigned len)
-{
-	int i;
-	void *addr = NULL;
-	struct sg_page_iter sg_iter;
-	int first_page = start >> PAGE_SHIFT;
-	int last_page = (len + start + 4095) >> PAGE_SHIFT;
-	int npages = last_page - first_page;
-	struct page **pages;
-
-	pages = drm_malloc_ab(npages, sizeof(*pages));
-	if (pages == NULL) {
-		DRM_DEBUG_DRIVER("Failed to get space for pages\n");
-		goto finish;
-	}
-
-	i = 0;
-	for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, first_page) {
-		pages[i++] = sg_page_iter_page(&sg_iter);
-		if (i == npages)
-			break;
-	}
-
-	addr = vmap(pages, i, 0, PAGE_KERNEL);
-	if (addr == NULL) {
-		DRM_DEBUG_DRIVER("Failed to vmap pages\n");
-		goto finish;
-	}
-
-finish:
-	if (pages)
-		drm_free_large(pages);
-	return (u32*)addr;
-}
-
-/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
-static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
-		       struct drm_i915_gem_object *src_obj,
-		       u32 batch_start_offset,
-		       u32 batch_len)
-{
-	int needs_clflush = 0;
-	void *src_base, *src;
-	void *dst = NULL;
-	int ret;
-
-	if (batch_len > dest_obj->base.size ||
-	    batch_len + batch_start_offset > src_obj->base.size)
-		return ERR_PTR(-E2BIG);
-
-	if (WARN_ON(dest_obj->pages_pin_count == 0))
-		return ERR_PTR(-ENODEV);
-
-	ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
-	if (ret) {
-		DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
-		return ERR_PTR(ret);
-	}
-
-	src_base = vmap_batch(src_obj, batch_start_offset, batch_len);
-	if (!src_base) {
-		DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
-		ret = -ENOMEM;
-		goto unpin_src;
-	}
-
-	ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
-	if (ret) {
-		DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
-		goto unmap_src;
-	}
-
-	dst = vmap_batch(dest_obj, 0, batch_len);
-	if (!dst) {
-		DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
-		ret = -ENOMEM;
-		goto unmap_src;
-	}
-
-	src = src_base + offset_in_page(batch_start_offset);
-	if (needs_clflush)
-		drm_clflush_virt_range(src, batch_len);
-
-	memcpy(dst, src, batch_len);
-
-unmap_src:
-	vunmap(src_base);
-unpin_src:
-	i915_gem_object_unpin_pages(src_obj);
-
-	return ret ? ERR_PTR(ret) : dst;
-}
-
 /**
  * i915_needs_cmd_parser() - should a given ring use software command parsing?
  * @ring: the ring in question
@@ -1112,16 +1018,35 @@  int i915_parse_cmds(struct intel_engine_cs *ring,
 		    u32 batch_len,
 		    bool is_master)
 {
-	u32 *cmd, *batch_base, *batch_end;
+	u32 tmp[128];
+	const struct drm_i915_cmd_descriptor *desc;
+	unsigned dst_iter, src_iter;
+	int needs_clflush = 0;
+	struct get_page rewind;
+	void *src, *dst;
+	unsigned in, out;
+	u32 *buf, partial = 0, length;
 	struct drm_i915_cmd_descriptor default_desc = { 0 };
 	bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
 	int ret = 0;
 
-	batch_base = copy_batch(shadow_batch_obj, batch_obj,
-				batch_start_offset, batch_len);
-	if (IS_ERR(batch_base)) {
-		DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n");
-		return PTR_ERR(batch_base);
+	if (batch_len > shadow_batch_obj->base.size ||
+	    batch_len + batch_start_offset > batch_obj->base.size)
+		return -E2BIG;
+
+	if (WARN_ON(shadow_batch_obj->pages_pin_count == 0))
+		return -ENODEV;
+
+	ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush);
+	if (ret) {
+		DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
+		return ret;
+	}
+
+	ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
+	if (ret) {
+		DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
+		goto unpin;
 	}
 
 	/*
@@ -1129,69 +1054,138 @@  int i915_parse_cmds(struct intel_engine_cs *ring,
 	 * large or larger and copy_batch() will write MI_NOPs to the extra
 	 * space. Parsing should be faster in some cases this way.
 	 */
-	batch_end = batch_base + (batch_len / sizeof(*batch_end));
+	ret = -EINVAL;
+	rewind = batch_obj->get_page;
+
+	dst_iter = 0;
+	dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, dst_iter));
+
+	in = offset_in_page(batch_start_offset);
+	out = 0;
+	for (src_iter = batch_start_offset >> PAGE_SHIFT;
+	     src_iter < batch_obj->base.size >> PAGE_SHIFT;
+	     src_iter++) {
+		u32 this, i, j, k;
+		u32 *cmd, *page_end, *batch_end;
+
+		this = batch_len;
+		if (this > PAGE_SIZE - in)
+			this = PAGE_SIZE - in;
+
+		src = kmap_atomic(i915_gem_object_get_page(batch_obj, src_iter));
+		if (needs_clflush)
+			drm_clflush_virt_range(src + in, this);
+
+		i = this;
+		j = in;
+		do {
+			/* We keep dst around so that we do not blow
+			 * the CPU caches immediately after the copy (due
+			 * to the kunmap_atomic(dst) doing a TLB flush.
+			 */
+			if (out == PAGE_SIZE) {
+				kunmap_atomic(dst);
+				dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, ++dst_iter));
+				out = 0;
+			}
 
-	cmd = batch_base;
-	while (cmd < batch_end) {
-		const struct drm_i915_cmd_descriptor *desc;
-		u32 length;
+			k = i;
+			if (k > PAGE_SIZE - out)
+				k = PAGE_SIZE - out;
+			if (k == PAGE_SIZE)
+				copy_page(dst, src);
+			else
+				memcpy(dst + out, src + j, k);
+
+			out += k;
+			j += k;
+			i -= k;
+		} while (i);
+
+		cmd = src + in;
+		page_end = (void *)cmd + this;
+		batch_end = (void *)cmd + batch_len;
+
+		if (partial) {
+			memcpy(tmp + partial, cmd, (length - partial)*4);
+			cmd -= partial;
+			partial = 0;
+			buf = tmp;
+			goto check;
+		}
 
-		if (*cmd == MI_BATCH_BUFFER_END)
-			break;
+		do {
+			if (*cmd == MI_BATCH_BUFFER_END) {
+				if (oacontrol_set) {
+					DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
+					ret = -EINVAL;
+				} else
+					ret = 0;
+				goto unmap;
+			}
 
-		desc = find_cmd(ring, *cmd, &default_desc);
-		if (!desc) {
-			DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
-					 *cmd);
-			ret = -EINVAL;
-			break;
-		}
+			desc = find_cmd(ring, *cmd, &default_desc);
+			if (!desc) {
+				DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
+						 *cmd);
+				goto unmap;
+			}
 
-		/*
-		 * If the batch buffer contains a chained batch, return an
-		 * error that tells the caller to abort and dispatch the
-		 * workload as a non-secure batch.
-		 */
-		if (desc->cmd.value == MI_BATCH_BUFFER_START) {
-			ret = -EACCES;
-			break;
-		}
+			/*
+			 * If the batch buffer contains a chained batch, return an
+			 * error that tells the caller to abort and dispatch the
+			 * workload as a non-secure batch.
+			 */
+			if (desc->cmd.value == MI_BATCH_BUFFER_START) {
+				ret = -EACCES;
+				goto unmap;
+			}
 
-		if (desc->flags & CMD_DESC_FIXED)
-			length = desc->length.fixed;
-		else
-			length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
-
-		if ((batch_end - cmd) < length) {
-			DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
-					 *cmd,
-					 length,
-					 batch_end - cmd);
-			ret = -EINVAL;
-			break;
-		}
+			if (desc->flags & CMD_DESC_FIXED)
+				length = desc->length.fixed;
+			else
+				length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
 
-		if (!check_cmd(ring, desc, cmd, length, is_master,
-			       &oacontrol_set)) {
-			ret = -EINVAL;
-			break;
-		}
+			if (cmd + length > page_end) {
+				if (length + cmd > batch_end) {
+					DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
+							 *cmd, length, batch_end - cmd);
+					goto unmap;
+				}
 
-		cmd += length;
-	}
+				if (WARN_ON(length > sizeof(tmp)/4)) {
+					ret = -ENODEV;
+					goto unmap;
+				}
 
-	if (oacontrol_set) {
-		DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
-		ret = -EINVAL;
-	}
+				partial = page_end - cmd;
+				memcpy(tmp, cmd, partial*4);
+				break;
+			}
 
-	if (cmd >= batch_end) {
-		DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n");
-		ret = -EINVAL;
-	}
+			buf = cmd;
+check:
+			if (!check_cmd(ring, desc, buf, length, is_master,
+				       &oacontrol_set))
+				goto unmap;
 
-	vunmap(batch_base);
+			cmd += length;
+		} while (cmd < page_end);
+
+		batch_len -= this;
+		if (batch_len == 0)
+			break;
+
+		kunmap_atomic(src);
+		in = 0;
+	}
 
+unmap:
+	kunmap_atomic(src);
+	kunmap_atomic(dst);
+	batch_obj->get_page = rewind;
+unpin:
+	i915_gem_object_unpin_pages(batch_obj);
 	return ret;
 }