[v3] drm/i915: Eliminate vmap overhead for cmd parser

Message ID 1448033483-3884-1-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson Nov. 20, 2015, 3:31 p.m. UTC
With a little complexity to handle cmds straddling page boundaries, we
can completely avoid having to vmap the batch and the shadow batch
objects whilst running the command parser.

On ivb i7-3720MQ:

x11perf -dot before 54.3M, after 53.2M (max 203M)
glxgears before 7110 fps, after 7300 fps (max 7860 fps)

Before:
Time to blt 16384 bytes x      1:	 12.400µs, 1.2GiB/s
Time to blt 16384 bytes x   4096:	  3.055µs, 5.0GiB/s

After:
Time to blt 16384 bytes x      1:	  8.600µs, 1.8GiB/s
Time to blt 16384 bytes x   4096:	  2.456µs, 6.2GiB/s

Removing the vmap is mostly a win, except that we lose in a few cases
where the batch size is greater than a page, due to the extra complexity
(loss of a simple, cache-efficient large copy, and the boundary handling).

v2: Reorder so that we still check for OACONTROL being left set at end-of-batch
v3: Stage the copy through a temporary page so that we only copy commands
that have been verified into dst.
v4: Move the batch offset/length validation to execbuf.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_cmd_parser.c | 258 ++++++++++++++++-----------------
 1 file changed, 127 insertions(+), 131 deletions(-)
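
[Editor's note: the staging scheme described by the v3 note above,
reduced to a standalone user-space sketch. Everything here is invented
for illustration (tiny pages, a made-up command encoding whose first
dword is its length in dwords), and the real loop also terminates at
MI_BATCH_BUFFER_END and tracks batch_len, which is omitted here; the
actual implementation is the patch in this thread.]

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define PG 32	/* tiny "page" size in bytes, for the demo */

/* invented encoding: dword 0 of each command = its length in dwords */
static int cmd_len(const uint32_t *cmd)
{
	return *cmd & 0xff;
}

static int parse(const uint32_t *batch, unsigned int npages)
{
	uint32_t tmp[2 * PG / 4];	/* a page plus room for a carried partial */
	unsigned int partial = 0;	/* bytes of a straddling cmd carried over */
	unsigned int i;

	for (i = 0; i < npages; i++) {
		const uint32_t *src = batch + i * (PG / 4);
		uint32_t *cmd = tmp, *page_end;

		/* stage the source page after any carried partial cmd */
		memcpy(tmp + partial / 4, src, PG);
		page_end = tmp + partial / 4 + PG / 4;
		partial = 0;

		while (cmd < page_end) {
			int len = cmd_len(cmd);

			if (len == 0)	/* the real parser's LENGTH_BIAS avoids 0 */
				return -1;
			if (cmd + len > page_end) {
				/* cmd straddles the page boundary: carry it */
				partial = 4 * (page_end - cmd);
				break;
			}
			/* check_cmd() would validate *cmd here */
			cmd += len;
		}

		/* only the fully validated bytes [tmp, cmd) would now be
		 * copied on into the shadow batch */
		printf("page %u: validated %td bytes, carried %u\n",
		       i, (char *)cmd - (char *)tmp, partial);

		if (partial)
			memmove(tmp, cmd, partial);
	}
	return partial ? -1 : 0;
}

int main(void)
{
	/* two 8-dword pages; the command at dword 6 (len 4) straddles them */
	uint32_t batch[2 * PG / 4] = {
		3, 0, 0,  3, 0, 0,  4, 0,	/* page 0 */
		0, 0,  6, 0, 0, 0, 0, 0,	/* page 1 */
	};

	return parse(batch, 2);
}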

Comments

Ville Syrjälä Nov. 25, 2015, 7:51 p.m. UTC | #1
On Fri, Nov 20, 2015 at 03:31:23PM +0000, Chris Wilson wrote:
> With a little complexity to handle cmds straddling page boundaries, we
> can completely avoid having to vmap the batch and the shadow batch
> objects whilst running the command parser.
> 
> On ivb i7-3720MQ:
> 
> x11perf -dot before 54.3M, after 53.2M (max 203M)
> glxgears before 7110 fps, after 7300 fps (max 7860 fps)
> 
> Before:
> Time to blt 16384 bytes x      1:	 12.400µs, 1.2GiB/s
> Time to blt 16384 bytes x   4096:	  3.055µs, 5.0GiB/s
> 
> After:
> Time to blt 16384 bytes x      1:	  8.600µs, 1.8GiB/s
> Time to blt 16384 bytes x   4096:	  2.456µs, 6.2GiB/s
> 
> Removing the vmap is mostly a win, except that we lose in a few cases
> where the batch size is greater than a page, due to the extra complexity
> (loss of a simple, cache-efficient large copy, and the boundary handling).
> 
> v2: Reorder so that we still check for OACONTROL being left set at end-of-batch
> v3: Stage the copy through a temporary page so that we only copy commands
> that have been verified into dst.
> v4: Move the batch offset/length validation to execbuf.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_cmd_parser.c | 258 ++++++++++++++++-----------------
>  1 file changed, 127 insertions(+), 131 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
> index 814d894ed925..86910e17505a 100644
> --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> @@ -860,100 +860,6 @@ find_reg(const struct drm_i915_reg_descriptor *table,
>  	return NULL;
>  }
>  
> -static u32 *vmap_batch(struct drm_i915_gem_object *obj,
> -		       unsigned start, unsigned len)
> -{
> -	int i;
> -	void *addr = NULL;
> -	struct sg_page_iter sg_iter;
> -	int first_page = start >> PAGE_SHIFT;
> -	int last_page = (len + start + 4095) >> PAGE_SHIFT;
> -	int npages = last_page - first_page;
> -	struct page **pages;
> -
> -	pages = drm_malloc_ab(npages, sizeof(*pages));
> -	if (pages == NULL) {
> -		DRM_DEBUG_DRIVER("Failed to get space for pages\n");
> -		goto finish;
> -	}
> -
> -	i = 0;
> -	for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, first_page) {
> -		pages[i++] = sg_page_iter_page(&sg_iter);
> -		if (i == npages)
> -			break;
> -	}
> -
> -	addr = vmap(pages, i, 0, PAGE_KERNEL);
> -	if (addr == NULL) {
> -		DRM_DEBUG_DRIVER("Failed to vmap pages\n");
> -		goto finish;
> -	}
> -
> -finish:
> -	if (pages)
> -		drm_free_large(pages);
> -	return (u32*)addr;
> -}
> -
> -/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
> -static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
> -		       struct drm_i915_gem_object *src_obj,
> -		       u32 batch_start_offset,
> -		       u32 batch_len)
> -{
> -	int needs_clflush = 0;
> -	void *src_base, *src;
> -	void *dst = NULL;
> -	int ret;
> -
> -	if (batch_len > dest_obj->base.size ||
> -	    batch_len + batch_start_offset > src_obj->base.size)
> -		return ERR_PTR(-E2BIG);
> -
> -	if (WARN_ON(dest_obj->pages_pin_count == 0))
> -		return ERR_PTR(-ENODEV);
> -
> -	ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
> -		return ERR_PTR(ret);
> -	}
> -
> -	src_base = vmap_batch(src_obj, batch_start_offset, batch_len);
> -	if (!src_base) {
> -		DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
> -		ret = -ENOMEM;
> -		goto unpin_src;
> -	}
> -
> -	ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
> -		goto unmap_src;
> -	}
> -
> -	dst = vmap_batch(dest_obj, 0, batch_len);
> -	if (!dst) {
> -		DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
> -		ret = -ENOMEM;
> -		goto unmap_src;
> -	}
> -
> -	src = src_base + offset_in_page(batch_start_offset);
> -	if (needs_clflush)
> -		drm_clflush_virt_range(src, batch_len);
> -
> -	memcpy(dst, src, batch_len);
> -
> -unmap_src:
> -	vunmap(src_base);
> -unpin_src:
> -	i915_gem_object_unpin_pages(src_obj);
> -
> -	return ret ? ERR_PTR(ret) : dst;
> -}
> -
>  /**
>   * i915_needs_cmd_parser() - should a given ring use software command parsing?
>   * @ring: the ring in question
> @@ -1097,6 +1003,7 @@ static bool check_cmd(const struct intel_engine_cs *ring,
>  }
>  
>  #define LENGTH_BIAS 2
> +#define MAX_PARTIAL 256

There seems to be some confusion over whether this is bytes or dwords.

Also I guess we already end up allocating two pages anyway, so
maybe MAX_PARTIAL should just be one page? It's still not big
enough to cover the max legal cmd length AFAICS, so I think
the WARN in the check needs to be removed.
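
[Editor's note: the two uses in the patch do disagree on units:

	tmp = kmalloc(PAGE_SIZE + MAX_PARTIAL, GFP_TEMPORARY);	/* bytes */
	...
	if (WARN_ON(length > MAX_PARTIAL))	/* length is a dword count */
	...
	partial = 4*(page_end - cmd);		/* bytes again */

If length is a dword count, the check permits a straddling command to
carry up to 4*(MAX_PARTIAL-1) bytes into tmp, past the end of the
PAGE_SIZE + MAX_PARTIAL byte allocation.]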

>  
>  /**
>   * i915_parse_cmds() - parse a submitted batch buffer for privilege violations
> @@ -1120,39 +1027,91 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
>  		    u32 batch_len,
>  		    bool is_master)
>  {
> -	u32 *cmd, *batch_base, *batch_end;
> +	const struct drm_i915_cmd_descriptor *desc;
> +	unsigned dst_iter, src_iter;
> +	int needs_clflush = 0;
> +	struct get_page rewind;
> +	void *src, *dst, *tmp;
> +	u32 partial, length;
> +	unsigned in, out;
>  	struct drm_i915_cmd_descriptor default_desc = { 0 };
>  	bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
> -	int ret = 0;
> +	int ret;
>  
> -	batch_base = copy_batch(shadow_batch_obj, batch_obj,
> -				batch_start_offset, batch_len);
> -	if (IS_ERR(batch_base)) {
> -		DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n");
> -		return PTR_ERR(batch_base);
> +	if (WARN_ON(shadow_batch_obj->pages_pin_count == 0))
> +		return -ENODEV;
> +
> +	ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
> +		return ret;
>  	}
>  
> -	/*
> -	 * We use the batch length as size because the shadow object is as
> -	 * large or larger and copy_batch() will write MI_NOPs to the extra
> -	 * space. Parsing should be faster in some cases this way.
> +	ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
> +		goto unpin;
> +	}
> +
> +	tmp = kmalloc(PAGE_SIZE + MAX_PARTIAL, GFP_TEMPORARY);
> +	if (tmp == NULL)
> +		return -ENOMEM;
> +
> +	/* Keep the original cached iterator around as that is likely
> +	 * to be more useful in future ioctls.
>  	 */
> -	batch_end = batch_base + (batch_len / sizeof(*batch_end));
> +	rewind = batch_obj->get_page;
>  
> -	cmd = batch_base;
> -	while (cmd < batch_end) {
> -		const struct drm_i915_cmd_descriptor *desc;
> -		u32 length;
> +	ret = -EINVAL;
> +
> +	dst_iter = 0;
> +	dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, dst_iter));
> +	out = 0;
> +
> +	in = offset_in_page(batch_start_offset);
> +	partial = 0;
> +	for (src_iter = batch_start_offset >> PAGE_SHIFT;
> +	     src_iter < batch_obj->base.size >> PAGE_SHIFT;
> +	     src_iter++) {

So we're iterating over all the pages. Should be enough to iterate
until batch_start_offset+batch_len I suppose, but as long as we bail
out when we run out of batch it should be fine.
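
[Editor's note: a sketch of that tighter bound, reusing the round-up
idiom from the deleted vmap_batch(); not the posted patch:

	unsigned last_page =
		(batch_start_offset + batch_len + PAGE_SIZE - 1) >> PAGE_SHIFT;

	for (src_iter = batch_start_offset >> PAGE_SHIFT;
	     src_iter < last_page;
	     src_iter++) {

As noted, the wider bound is harmless given the break once batch_len
reaches zero.]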

I see there's a batch_len check at the end, but I don't see us handling
the case when the user already gives us something with batch_len==0.
Maybe that should be rejected somewhere higher up?

Also what happens if we don't find MI_BATCH_BUFFER_END before running
out of batch? Oh, I see, we set ret=-EINVAL, and clear it to 0 when we
find MI_BATCH_BUFFER_END. So that part seems to be fine.

> +		u32 *cmd, *page_end, *batch_end;
> +		u32 this;
> +
> +		this = batch_len;

I was a bit concerned about batch_len & 3, but we already check for
batch_len&7==0 in i915_gem_check_execbuffer(), so it should be good here.
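
[Editor's note: for reference, that check is approximately the
following (reconstructed from i915_gem_check_execbuffer() of this
vintage; details may differ):

static bool
i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec)
{
	return ((exec->batch_start_offset | exec->batch_len) & 0x7) == 0;
}

i.e. both the batch start offset and length must be 8-byte aligned.]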

> +		if (this > PAGE_SIZE - in)
> +			this = PAGE_SIZE - in;
>  
> -		if (*cmd == MI_BATCH_BUFFER_END)
> +		src = kmap_atomic(i915_gem_object_get_page(batch_obj, src_iter));
> +		if (needs_clflush)
> +			drm_clflush_virt_range(src + in, this);
> +
> +		if (this == PAGE_SIZE && partial == 0)
> +			copy_page(tmp, src);
> +		else
> +			memcpy(tmp+partial, src+in, this);
> +
> +		cmd = tmp;
> +		page_end = tmp + this + partial;
> +		batch_end = tmp + batch_len + partial;
> +		partial = 0;
> +
> +		do {
> +			if (*cmd == MI_BATCH_BUFFER_END) {
> +				if (oacontrol_set) {
> +					DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
> +					goto unmap;
> +				}
> +
> +				cmd++; /* copy this cmd to dst */
> +				batch_len = this; /* no more src */
> +				ret = 0;
>  				break;
> +			}
>  
>  			desc = find_cmd(ring, *cmd, &default_desc);
>  			if (!desc) {
>  				DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
>  						 *cmd);
> -			ret = -EINVAL;
> -			break;
> +				goto unmap;
>  			}
>  
>  			/*
> @@ -1162,7 +1121,7 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
>  			 */
>  			if (desc->cmd.value == MI_BATCH_BUFFER_START) {
>  				ret = -EACCES;

Bleh. I wonder why we even bother with this thing on VLV/IVB when
it's that easy to bypass...

> -			break;
> +				goto unmap;
>  			}
>  
>  			if (desc->flags & CMD_DESC_FIXED)
> @@ -1170,36 +1129,73 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
>  			else
>  				length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
>  
> -		if ((batch_end - cmd) < length) {
> +			if (unlikely(cmd + length > page_end)) {
> +				if (unlikely(cmd + length > batch_end)) {
>  					DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
> -					 *cmd,
> -					 length,
> -					 batch_end - cmd);
> -			ret = -EINVAL;
> -			break;
> +							 *cmd, length, batch_end - cmd);
> +					goto unmap;
>  				}
>  
> -		if (!check_cmd(ring, desc, cmd, length, is_master,
> -			       &oacontrol_set)) {
> -			ret = -EINVAL;
> +				if (WARN_ON(length > MAX_PARTIAL)) {
> +					ret = -ENODEV;
> +					goto unmap;
> +				}
> +
> +				partial = 4*(page_end - cmd);
>  				break;
>  			}
>  
> +			if (!check_cmd(ring, desc, cmd, length, is_master,
> +				       &oacontrol_set))
> +				goto unmap;
> +
>  			cmd += length;
> -	}
> +		} while (cmd < page_end);
>  
> -	if (oacontrol_set) {
> -		DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
> -		ret = -EINVAL;
> +		/* Copy the validated page to the GPU batch */
> +		{
> +			/* exclude partial cmd, we copy it next time */
> +			unsigned dst_length = (void *)cmd - tmp;
> +			in = 0;
> +			do {
> +				int len;
> +
> +				if (out == PAGE_SIZE) {
> +					kunmap_atomic(dst);
> +					dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, ++dst_iter));
> +					out = 0;
>  				}
>  
> -	if (cmd >= batch_end) {
> -		DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n");
> -		ret = -EINVAL;
> +				len = dst_length;
> +				if (len + out > PAGE_SIZE)
> +					len = PAGE_SIZE - out;
> +				if (len == PAGE_SIZE)
> +					copy_page(dst, tmp + in);
> +				else
> +					memcpy(dst + out, tmp + in, len);
> +				in += len;
> +				out += len;
> +				dst_length -= len;
> +			} while (dst_length);
>  		}
>  
> -	vunmap(batch_base);
> +		batch_len -= this;
> +		if (batch_len == 0)
> +			break;
>  
> +		if (partial)
> +			memmove(tmp, cmd, partial);
> +
> +		kunmap_atomic(src);
> +		in = 0;
> +	}
> +unmap:
> +	kunmap_atomic(src);
> +	kunmap_atomic(dst);
> +	batch_obj->get_page = rewind;
> +	kfree(tmp);
> +unpin:
> +	i915_gem_object_unpin_pages(batch_obj);
>  	return ret;
>  }
>  
> -- 
> 2.6.2
Chris Wilson Nov. 25, 2015, 8:13 p.m. UTC | #2
On Wed, Nov 25, 2015 at 09:51:08PM +0200, Ville Syrjälä wrote:
> On Fri, Nov 20, 2015 at 03:31:23PM +0000, Chris Wilson wrote:
> > @@ -1097,6 +1003,7 @@ static bool check_cmd(const struct intel_engine_cs *ring,
> >  }
> >  
> >  #define LENGTH_BIAS 2
> > +#define MAX_PARTIAL 256
> 
> There seems to be some confusion over whether this is bytes or dwords.

Indeed, I can't remember off the top of my head.

(Double-checked that the set of commands I was thinking of were 132
bytes.)
 
> Also I guess we already end up allocating two pages anyway, so
> maybe MAX_PARTIAL should just be one page? It's still not big
> enough to cover the max legal cmd length AFAICS, so I think
> the WARN in the check needs to be removed.

Sure, rounding up to the next 8192-byte slab cache doesn't seem like it
will bite us.

So #define MAX_PARTIAL_BYTES PAGE_SIZE
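
[Editor's note: spelled out with the units consistently in bytes, a
sketch of the direction agreed here; not a posted patch:

#define MAX_PARTIAL_BYTES PAGE_SIZE

	tmp = kmalloc(PAGE_SIZE + MAX_PARTIAL_BYTES, GFP_TEMPORARY);
	...
	/* length is a dword count; plain rejection, WARN dropped */
	if (unlikely(4 * length > MAX_PARTIAL_BYTES)) {
		ret = -EINVAL;
		goto unmap;
	}
	...
	partial = 4*(page_end - cmd);	/* < 4*length <= MAX_PARTIAL_BYTES */
]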

> > +	in = offset_in_page(batch_start_offset);
> > +	partial = 0;
> > +	for (src_iter = batch_start_offset >> PAGE_SHIFT;
> > +	     src_iter < batch_obj->base.size >> PAGE_SHIFT;
> > +	     src_iter++) {
> 
> So we're iterating over all the pages. Should be enough to iterate
> until batch_start_offset+batch_len I suppose, but as long as we bail
> out when we run out of batch it should be fine.

Right, this was mostly convenience for writing the loop bounds - it was
more or less a simple conversion from the old iterator.

> I see there's a batch_len check at the end, but I don't see us handling
> the case when the user already gives us something with batch_len==0.
> Maybe that should be rejected somewhere higher up?

batch_len = 0 is filtered out in the caller...
 
> Also what happens if we don't find MI_BATCH_BUFFER_END before running
> out of batch? Oh, I see, we set ret=-EINVAL, and clear it to 0 when we
> find MI_BATCH_BUFFER_END. So that part seems to be fine.
> 
> > +		u32 *cmd, *page_end, *batch_end;
> > +		u32 this;
> > +
> > +		this = batch_len;
> 
> I was a bit concerned about batch_len & 3, but we already check for
> batch_len&7==0 in i915_gem_check_execbuffer(), so it should be good here.

cmdparser_assert(batch_len > 0 && (batch_len & 3) == 0);

as documentation for the contract?
-Chris
Ville Syrjälä Nov. 25, 2015, 9:15 p.m. UTC | #3
On Wed, Nov 25, 2015 at 08:13:43PM +0000, Chris Wilson wrote:
> On Wed, Nov 25, 2015 at 09:51:08PM +0200, Ville Syrjälä wrote:
> > On Fri, Nov 20, 2015 at 03:31:23PM +0000, Chris Wilson wrote:
> > > @@ -1097,6 +1003,7 @@ static bool check_cmd(const struct intel_engine_cs *ring,
> > >  }
> > >  
> > >  #define LENGTH_BIAS 2
> > > +#define MAX_PARTIAL 256
> > 
> > There seems to be some confusion over whether this is bytes or dwords.
> 
> Indeed, I can't remember off the top of my head.
> 
> (Double-checked that the set of commands I was thinking of were 132
> bytes.)
>  
> > Also I guess we already end up allocating two pages anyway, so
> > maybe MAX_PARTIAL should just be one page? It's still not big
> > enough to cover the max legal cmd length AFAICS, so I think
> > the WARN in the check needs to be removed.
> 
> Sure, rounding up to the next 8192-byte slab cache doesn't seem like it
> will bite us.
> 
> So #define MAX_PARTIAL_BYTES PAGE_SIZE
> 
> > > +	in = offset_in_page(batch_start_offset);
> > > +	partial = 0;
> > > +	for (src_iter = batch_start_offset >> PAGE_SHIFT;
> > > +	     src_iter < batch_obj->base.size >> PAGE_SHIFT;
> > > +	     src_iter++) {
> > 
> > So we're iterating over all the pages. Should be enough to iterate
> > until batch_start_offset+batch_len I suppose, but as long as we bail
> > out when we run out of batch it should be fine.
> 
> Right, this was mostly convenience for writing the loop bounds - it was
> more or less a simple conversion from the old iterator.
> 
> > I see there's a batch_len check at the end, but I don't see us handling
> > the case when the user already gives us something with batch_len==0.
> > Maybe that should be rejected somewhere higher up?
> 
> batch_len = 0 is filtered out in the caller...

Oh yeah, see it now.

>  
> > Also what happens if we don't find MI_BATCH_BUFFER_END before running
> > out of batch? Oh, I see, we set ret=-EINVAL, and clear it to 0 when we
> > find MI_BATCH_BUFFER_END. So that part seems to be fine.
> > 
> > > +		u32 *cmd, *page_end, *batch_end;
> > > +		u32 this;
> > > +
> > > +		this = batch_len;
> > 
> > I was a bit concerned about batch_len & 3, but we already check for
> > batch_len&7==0 in i915_gem_check_execbuffer(), so it should be good here.
> 
> cmdparser_assert(batch_len > 0 && (batch_len & 3) == 0);
> 
> as documentation for the contract?

I won't insist, but feel free to add something like that if you
wish.
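
[Editor's note: a minimal way to spell that contract; cmdparser_assert()
is hypothetical, nothing by that name exists in the tree:

#define cmdparser_assert(x) WARN_ON(!(x))

	cmdparser_assert(batch_len > 0 && (batch_len & 3) == 0);
]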

Patch

diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 814d894ed925..86910e17505a 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -860,100 +860,6 @@  find_reg(const struct drm_i915_reg_descriptor *table,
 	return NULL;
 }
 
-static u32 *vmap_batch(struct drm_i915_gem_object *obj,
-		       unsigned start, unsigned len)
-{
-	int i;
-	void *addr = NULL;
-	struct sg_page_iter sg_iter;
-	int first_page = start >> PAGE_SHIFT;
-	int last_page = (len + start + 4095) >> PAGE_SHIFT;
-	int npages = last_page - first_page;
-	struct page **pages;
-
-	pages = drm_malloc_ab(npages, sizeof(*pages));
-	if (pages == NULL) {
-		DRM_DEBUG_DRIVER("Failed to get space for pages\n");
-		goto finish;
-	}
-
-	i = 0;
-	for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, first_page) {
-		pages[i++] = sg_page_iter_page(&sg_iter);
-		if (i == npages)
-			break;
-	}
-
-	addr = vmap(pages, i, 0, PAGE_KERNEL);
-	if (addr == NULL) {
-		DRM_DEBUG_DRIVER("Failed to vmap pages\n");
-		goto finish;
-	}
-
-finish:
-	if (pages)
-		drm_free_large(pages);
-	return (u32*)addr;
-}
-
-/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
-static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
-		       struct drm_i915_gem_object *src_obj,
-		       u32 batch_start_offset,
-		       u32 batch_len)
-{
-	int needs_clflush = 0;
-	void *src_base, *src;
-	void *dst = NULL;
-	int ret;
-
-	if (batch_len > dest_obj->base.size ||
-	    batch_len + batch_start_offset > src_obj->base.size)
-		return ERR_PTR(-E2BIG);
-
-	if (WARN_ON(dest_obj->pages_pin_count == 0))
-		return ERR_PTR(-ENODEV);
-
-	ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
-	if (ret) {
-		DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
-		return ERR_PTR(ret);
-	}
-
-	src_base = vmap_batch(src_obj, batch_start_offset, batch_len);
-	if (!src_base) {
-		DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
-		ret = -ENOMEM;
-		goto unpin_src;
-	}
-
-	ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
-	if (ret) {
-		DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
-		goto unmap_src;
-	}
-
-	dst = vmap_batch(dest_obj, 0, batch_len);
-	if (!dst) {
-		DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
-		ret = -ENOMEM;
-		goto unmap_src;
-	}
-
-	src = src_base + offset_in_page(batch_start_offset);
-	if (needs_clflush)
-		drm_clflush_virt_range(src, batch_len);
-
-	memcpy(dst, src, batch_len);
-
-unmap_src:
-	vunmap(src_base);
-unpin_src:
-	i915_gem_object_unpin_pages(src_obj);
-
-	return ret ? ERR_PTR(ret) : dst;
-}
-
 /**
  * i915_needs_cmd_parser() - should a given ring use software command parsing?
  * @ring: the ring in question
@@ -1097,6 +1003,7 @@  static bool check_cmd(const struct intel_engine_cs *ring,
 }
 
 #define LENGTH_BIAS 2
+#define MAX_PARTIAL 256
 
 /**
  * i915_parse_cmds() - parse a submitted batch buffer for privilege violations
@@ -1120,39 +1027,91 @@  int i915_parse_cmds(struct intel_engine_cs *ring,
 		    u32 batch_len,
 		    bool is_master)
 {
-	u32 *cmd, *batch_base, *batch_end;
+	const struct drm_i915_cmd_descriptor *desc;
+	unsigned dst_iter, src_iter;
+	int needs_clflush = 0;
+	struct get_page rewind;
+	void *src, *dst, *tmp;
+	u32 partial, length;
+	unsigned in, out;
 	struct drm_i915_cmd_descriptor default_desc = { 0 };
 	bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
-	int ret = 0;
+	int ret;
 
-	batch_base = copy_batch(shadow_batch_obj, batch_obj,
-				batch_start_offset, batch_len);
-	if (IS_ERR(batch_base)) {
-		DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n");
-		return PTR_ERR(batch_base);
+	if (WARN_ON(shadow_batch_obj->pages_pin_count == 0))
+		return -ENODEV;
+
+	ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush);
+	if (ret) {
+		DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
+		return ret;
 	}
 
-	/*
-	 * We use the batch length as size because the shadow object is as
-	 * large or larger and copy_batch() will write MI_NOPs to the extra
-	 * space. Parsing should be faster in some cases this way.
+	ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
+	if (ret) {
+		DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
+		goto unpin;
+	}
+
+	tmp = kmalloc(PAGE_SIZE + MAX_PARTIAL, GFP_TEMPORARY);
+	if (tmp == NULL)
+		return -ENOMEM;
+
+	/* Keep the original cached iterator around as that is likely
+	 * to be more useful in future ioctls.
 	 */
-	batch_end = batch_base + (batch_len / sizeof(*batch_end));
+	rewind = batch_obj->get_page;
 
-	cmd = batch_base;
-	while (cmd < batch_end) {
-		const struct drm_i915_cmd_descriptor *desc;
-		u32 length;
+	ret = -EINVAL;
+
+	dst_iter = 0;
+	dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, dst_iter));
+	out = 0;
+
+	in = offset_in_page(batch_start_offset);
+	partial = 0;
+	for (src_iter = batch_start_offset >> PAGE_SHIFT;
+	     src_iter < batch_obj->base.size >> PAGE_SHIFT;
+	     src_iter++) {
+		u32 *cmd, *page_end, *batch_end;
+		u32 this;
+
+		this = batch_len;
+		if (this > PAGE_SIZE - in)
+			this = PAGE_SIZE - in;
 
-		if (*cmd == MI_BATCH_BUFFER_END)
+		src = kmap_atomic(i915_gem_object_get_page(batch_obj, src_iter));
+		if (needs_clflush)
+			drm_clflush_virt_range(src + in, this);
+
+		if (this == PAGE_SIZE && partial == 0)
+			copy_page(tmp, src);
+		else
+			memcpy(tmp+partial, src+in, this);
+
+		cmd = tmp;
+		page_end = tmp + this + partial;
+		batch_end = tmp + batch_len + partial;
+		partial = 0;
+
+		do {
+			if (*cmd == MI_BATCH_BUFFER_END) {
+				if (oacontrol_set) {
+					DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
+					goto unmap;
+				}
+
+				cmd++; /* copy this cmd to dst */
+				batch_len = this; /* no more src */
+				ret = 0;
 				break;
+			}
 
 			desc = find_cmd(ring, *cmd, &default_desc);
 			if (!desc) {
 				DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
 						 *cmd);
-			ret = -EINVAL;
-			break;
+				goto unmap;
 			}
 
 			/*
@@ -1162,7 +1121,7 @@  int i915_parse_cmds(struct intel_engine_cs *ring,
 			 */
 			if (desc->cmd.value == MI_BATCH_BUFFER_START) {
 				ret = -EACCES;
-			break;
+				goto unmap;
 			}
 
 			if (desc->flags & CMD_DESC_FIXED)
@@ -1170,36 +1129,73 @@  int i915_parse_cmds(struct intel_engine_cs *ring,
 			else
 				length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
 
-		if ((batch_end - cmd) < length) {
+			if (unlikely(cmd + length > page_end)) {
+				if (unlikely(cmd + length > batch_end)) {
 					DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
-					 *cmd,
-					 length,
-					 batch_end - cmd);
-			ret = -EINVAL;
-			break;
+							 *cmd, length, batch_end - cmd);
+					goto unmap;
 				}
 
-		if (!check_cmd(ring, desc, cmd, length, is_master,
-			       &oacontrol_set)) {
-			ret = -EINVAL;
+				if (WARN_ON(length > MAX_PARTIAL)) {
+					ret = -ENODEV;
+					goto unmap;
+				}
+
+				partial = 4*(page_end - cmd);
 				break;
 			}
 
+			if (!check_cmd(ring, desc, cmd, length, is_master,
+				       &oacontrol_set))
+				goto unmap;
+
 			cmd += length;
-	}
+		} while (cmd < page_end);
 
-	if (oacontrol_set) {
-		DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
-		ret = -EINVAL;
+		/* Copy the validated page to the GPU batch */
+		{
+			/* exclude partial cmd, we copy it next time */
+			unsigned dst_length = (void *)cmd - tmp;
+			in = 0;
+			do {
+				int len;
+
+				if (out == PAGE_SIZE) {
+					kunmap_atomic(dst);
+					dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, ++dst_iter));
+					out = 0;
 				}
 
-	if (cmd >= batch_end) {
-		DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n");
-		ret = -EINVAL;
+				len = dst_length;
+				if (len + out > PAGE_SIZE)
+					len = PAGE_SIZE - out;
+				if (len == PAGE_SIZE)
+					copy_page(dst, tmp + in);
+				else
+					memcpy(dst + out, tmp + in, len);
+				in += len;
+				out += len;
+				dst_length -= len;
+			} while (dst_length);
 		}
 
-	vunmap(batch_base);
+		batch_len -= this;
+		if (batch_len == 0)
+			break;
 
+		if (partial)
+			memmove(tmp, cmd, partial);
+
+		kunmap_atomic(src);
+		in = 0;
+	}
+unmap:
+	kunmap_atomic(src);
+	kunmap_atomic(dst);
+	batch_obj->get_page = rewind;
+	kfree(tmp);
+unpin:
+	i915_gem_object_unpin_pages(batch_obj);
 	return ret;
 }