[RFC,2/4] drm/i915: Use batch pools with the command parser

Message ID 1403109376-23452-3-git-send-email-bradley.d.volkin@intel.com (mailing list archive)
State New, archived

Commit Message

bradley.d.volkin@intel.com June 18, 2014, 4:36 p.m. UTC
From: Brad Volkin <bradley.d.volkin@intel.com>

This patch sets up all of the tracking and copying necessary to
use batch pools with the command parser, but does not actually
dispatch the copied (shadow) batch to the hardware yet. We still
aren't quite ready to set the secure bit during dispatch.

Note that performance takes a hit from the copy in some cases
and will likely need some work. At a rough pass, the memcpy
appears to be the bottleneck. Without having done a deeper
analysis, two ideas that come to mind are:
1) Copy sections of the batch at a time, as they are reached
   by parsing. Might improve cache locality.
2) Copy only up to the userspace-supplied batch length and
   memset the rest of the buffer. Reduces the number of reads.
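
As a rough illustration of idea 2, something like the following could work
(sketch only; batch_len stands in for the userspace-supplied length, e.g.
args->batch_len, and copy_and_pad() is a hypothetical helper that is not
part of this series):

/*
 * Sketch only: assumes the userspace-supplied batch length is passed down
 * alongside the objects. copy_and_pad() is not part of this patch.
 */
static void copy_and_pad(u32 *dest, const u32 *src,
			 u32 batch_len, u32 dest_size)
{
	/* Read only the bytes userspace claims the batch occupies... */
	memcpy(dest, src, batch_len);

	/*
	 * ...and fill the rest of the (possibly recycled) pool object with
	 * MI_NOOP (0) so stale contents can never be executed.
	 */
	if (dest_size > batch_len)
		memset((char *)dest + batch_len, 0, dest_size - batch_len);
}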

Signed-off-by: Brad Volkin <bradley.d.volkin@intel.com>
---
 drivers/gpu/drm/i915/i915_cmd_parser.c       | 75 ++++++++++++++++++++++------
 drivers/gpu/drm/i915/i915_drv.h              |  7 ++-
 drivers/gpu/drm/i915/i915_gem.c              |  8 ++-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c   | 45 +++++++++++++++--
 drivers/gpu/drm/i915/i915_gem_render_state.c |  2 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c      | 12 +++++
 drivers/gpu/drm/i915/intel_ringbuffer.h      |  7 +++
 7 files changed, 134 insertions(+), 22 deletions(-)

Comments

Chris Wilson June 18, 2014, 4:52 p.m. UTC | #1
On Wed, Jun 18, 2014 at 09:36:14AM -0700, bradley.d.volkin@intel.com wrote:
> From: Brad Volkin <bradley.d.volkin@intel.com>
> 
> This patch sets up all of the tracking and copying necessary to
> use batch pools with the command parser, but does not actually
> dispatch the copied (shadow) batch to the hardware yet. We still
> aren't quite ready to set the secure bit during dispatch.
> 
> Note that performance takes a hit from the copy in some cases
> and will likely need some work. At a rough pass, the memcpy
> appears to be the bottleneck. Without having done a deeper
> analysis, two ideas that come to mind are:
> 1) Copy sections of the batch at a time, as they are reached
>    by parsing. Might improve cache locality.
> 2) Copy only up to the userspace-supplied batch length and
>    memset the rest of the buffer. Reduces the number of reads.
> 
> Signed-off-by: Brad Volkin <bradley.d.volkin@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_cmd_parser.c       | 75 ++++++++++++++++++++++------
>  drivers/gpu/drm/i915/i915_drv.h              |  7 ++-
>  drivers/gpu/drm/i915/i915_gem.c              |  8 ++-
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c   | 45 +++++++++++++++--
>  drivers/gpu/drm/i915/i915_gem_render_state.c |  2 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.c      | 12 +++++
>  drivers/gpu/drm/i915/intel_ringbuffer.h      |  7 +++
>  7 files changed, 134 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
> index dea99d9..669afb0 100644
> --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> @@ -831,6 +831,53 @@ finish:
>  	return (u32*)addr;
>  }
>  
> +/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
> +static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
> +		       struct drm_i915_gem_object *src_obj)
> +{
> +	int ret = 0;
> +	int needs_clflush = 0;
> +	u32 *src_addr, *dest_addr = NULL;
> +
> +	ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("CMD: failed to prep read\n");
> +		return ERR_PTR(ret);
> +	}
> +
> +	src_addr = vmap_batch(src_obj);
> +	if (!src_addr) {
> +		DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
> +		ret = -ENOMEM;
> +		goto unpin_src;
> +	}
> +
> +	if (needs_clflush)
> +		drm_clflush_virt_range((char *)src_addr, src_obj->base.size);
> +
> +	ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("CMD: Failed to set batch CPU domain\n");
> +		goto unmap_src;
> +	}
> +
> +	dest_addr = vmap_batch(dest_obj);
> +	if (!dest_addr) {
> +		DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
> +		ret = -ENOMEM;
> +		goto unmap_src;
> +	}
> +
> +	memcpy(dest_addr, src_addr, src_obj->base.size);
> +
> +unmap_src:
> +	vunmap(src_addr);
> +unpin_src:
> +	i915_gem_object_unpin_pages(src_obj);
> +
> +	return ret ? ERR_PTR(ret) : dest_addr;
> +}

src_obj->size? You should perhaps borrow a byt.

>  static int
> @@ -1087,6 +1088,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	struct eb_vmas *eb;
>  	struct drm_i915_gem_object *batch_obj;
> +	struct drm_i915_gem_object *shadow_batch_obj = NULL;
>  	struct drm_clip_rect *cliprects = NULL;
>  	struct intel_engine_cs *ring;
>  	struct intel_context *ctx;
> @@ -1288,8 +1290,31 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  	batch_obj->base.pending_read_domains |= I915_GEM_DOMAIN_COMMAND;
>  
>  	if (i915_needs_cmd_parser(ring)) {
> +		shadow_batch_obj =
> +			i915_gem_batch_pool_get(ring->batch_pool,
> +						batch_obj->base.size);
> +		if (IS_ERR(shadow_batch_obj)) {
> +			ret = PTR_ERR(shadow_batch_obj);
> +			/* Don't try to clean up the obj in the error path */
> +			shadow_batch_obj = NULL;
> +
> +			/*
> +			 * If the pool is at capacity, retiring requests might
> +			 * return some buffers.
> +			 */
> +			if (ret == -EAGAIN)
> +				i915_gem_retire_requests_ring(ring);
> +
> +			goto err;
> +		}
> +
> +		ret = i915_gem_obj_ggtt_pin(shadow_batch_obj, 4096, 0);
> +		if (ret)
> +			goto err;
> +
>  		ret = i915_parse_cmds(ring,
>  				      batch_obj,
> +				      shadow_batch_obj,
>  				      args->batch_start_offset,
>  				      file->is_master);

You could just allocate the shadow inside parse_cmds and return it. Then
replace the conventional batch_obj with the new pointer and add it to
the eb list. Similarly, you do not need to track shadow_batch_obj
explicitly in the requests; the pool can do its own busy tracking
external to the core and reduce the invasiveness of the patch.
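
Something along these lines (sketch only; the changed i915_parse_cmds()
return is hypothetical, and hooking the shadow's vma into the eb list is
omitted):

/*
 * Sketch only: i915_parse_cmds() allocates the shadow from the pool itself
 * and returns it, so execbuffer just swaps the batch pointer.
 */
if (i915_needs_cmd_parser(ring)) {
	struct drm_i915_gem_object *shadow;

	shadow = i915_parse_cmds(ring, batch_obj,
				 args->batch_start_offset,
				 file->is_master);
	if (IS_ERR(shadow)) {
		ret = PTR_ERR(shadow);
		goto err;
	}

	/* From here on the validated copy is the batch we dispatch. */
	batch_obj = shadow;
}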

> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index e72017b..82aae29 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -188,6 +188,13 @@ struct  intel_engine_cs {
>  	bool needs_cmd_parser;
>  
>  	/*
> +	 * A pool of objects to use as shadow copies of client batch buffers
> +	 * when the command parser is enabled. Prevents the client from
> +	 * modifying the batch contents after software parsing.
> +	 */
> +	struct i915_gem_batch_pool *batch_pool;

Why are they tied to a ring?
-Chris
bradley.d.volkin@intel.com June 18, 2014, 5:49 p.m. UTC | #2
On Wed, Jun 18, 2014 at 09:52:52AM -0700, Chris Wilson wrote:
> On Wed, Jun 18, 2014 at 09:36:14AM -0700, bradley.d.volkin@intel.com wrote:
> > From: Brad Volkin <bradley.d.volkin@intel.com>
> > 
> > This patch sets up all of the tracking and copying necessary to
> > use batch pools with the command parser, but does not actually
> > dispatch the copied (shadow) batch to the hardware yet. We still
> > aren't quite ready to set the secure bit during dispatch.
> > 
> > Note that performance takes a hit from the copy in some cases
> > and will likely need some work. At a rough pass, the memcpy
> > appears to be the bottleneck. Without having done a deeper
> > analysis, two ideas that come to mind are:
> > 1) Copy sections of the batch at a time, as they are reached
> >    by parsing. Might improve cache locality.
> > 2) Copy only up to the userspace-supplied batch length and
> >    memset the rest of the buffer. Reduces the number of reads.
> > 
> > Signed-off-by: Brad Volkin <bradley.d.volkin@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_cmd_parser.c       | 75 ++++++++++++++++++++++------
> >  drivers/gpu/drm/i915/i915_drv.h              |  7 ++-
> >  drivers/gpu/drm/i915/i915_gem.c              |  8 ++-
> >  drivers/gpu/drm/i915/i915_gem_execbuffer.c   | 45 +++++++++++++++--
> >  drivers/gpu/drm/i915/i915_gem_render_state.c |  2 +-
> >  drivers/gpu/drm/i915/intel_ringbuffer.c      | 12 +++++
> >  drivers/gpu/drm/i915/intel_ringbuffer.h      |  7 +++
> >  7 files changed, 134 insertions(+), 22 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
> > index dea99d9..669afb0 100644
> > --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> > +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> > @@ -831,6 +831,53 @@ finish:
> >  	return (u32*)addr;
> >  }
> >  
> > +/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
> > +static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
> > +		       struct drm_i915_gem_object *src_obj)
> > +{
> > +	int ret = 0;
> > +	int needs_clflush = 0;
> > +	u32 *src_addr, *dest_addr = NULL;
> > +
> > +	ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
> > +	if (ret) {
> > +		DRM_DEBUG_DRIVER("CMD: failed to prep read\n");
> > +		return ERR_PTR(ret);
> > +	}
> > +
> > +	src_addr = vmap_batch(src_obj);
> > +	if (!src_addr) {
> > +		DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
> > +		ret = -ENOMEM;
> > +		goto unpin_src;
> > +	}
> > +
> > +	if (needs_clflush)
> > +		drm_clflush_virt_range((char *)src_addr, src_obj->base.size);
> > +
> > +	ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
> > +	if (ret) {
> > +		DRM_DEBUG_DRIVER("CMD: Failed to set batch CPU domain\n");
> > +		goto unmap_src;
> > +	}
> > +
> > +	dest_addr = vmap_batch(dest_obj);
> > +	if (!dest_addr) {
> > +		DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
> > +		ret = -ENOMEM;
> > +		goto unmap_src;
> > +	}
> > +
> > +	memcpy(dest_addr, src_addr, src_obj->base.size);
> > +
> > +unmap_src:
> > +	vunmap(src_addr);
> > +unpin_src:
> > +	i915_gem_object_unpin_pages(src_obj);
> > +
> > +	return ret ? ERR_PTR(ret) : dest_addr;
> > +}
> 
> src_obj->size? You should perhaps borrow a byt.

Not sure I completely follow you here. The dest_obj is as big or bigger than
the source, so I think copying only as much data as exists in the source
object is right. I should be writing nops into any extra space though. And in
parse_cmds, I should update the batch_end calculation. Were those the issues
that you saw?
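
i.e. something like this for the batch_end part (sketch, assuming
args->batch_len were passed into i915_parse_cmds(), which this patch does
not do yet):

/*
 * Sketch only: bound parsing by the userspace-supplied length rather than
 * the full (possibly much larger) object size.
 */
cmd = batch_base + (batch_start_offset / sizeof(*cmd));
batch_end = cmd + (batch_len / sizeof(*batch_end));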

> 
> >  static int
> > @@ -1087,6 +1088,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> >  	struct drm_i915_private *dev_priv = dev->dev_private;
> >  	struct eb_vmas *eb;
> >  	struct drm_i915_gem_object *batch_obj;
> > +	struct drm_i915_gem_object *shadow_batch_obj = NULL;
> >  	struct drm_clip_rect *cliprects = NULL;
> >  	struct intel_engine_cs *ring;
> >  	struct intel_context *ctx;
> > @@ -1288,8 +1290,31 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> >  	batch_obj->base.pending_read_domains |= I915_GEM_DOMAIN_COMMAND;
> >  
> >  	if (i915_needs_cmd_parser(ring)) {
> > +		shadow_batch_obj =
> > +			i915_gem_batch_pool_get(ring->batch_pool,
> > +						batch_obj->base.size);
> > +		if (IS_ERR(shadow_batch_obj)) {
> > +			ret = PTR_ERR(shadow_batch_obj);
> > +			/* Don't try to clean up the obj in the error path */
> > +			shadow_batch_obj = NULL;
> > +
> > +			/*
> > +			 * If the pool is at capacity, retiring requests might
> > +			 * return some buffers.
> > +			 */
> > +			if (ret == -EAGAIN)
> > +				i915_gem_retire_requests_ring(ring);
> > +
> > +			goto err;
> > +		}
> > +
> > +		ret = i915_gem_obj_ggtt_pin(shadow_batch_obj, 4096, 0);
> > +		if (ret)
> > +			goto err;
> > +
> >  		ret = i915_parse_cmds(ring,
> >  				      batch_obj,
> > +				      shadow_batch_obj,
> >  				      args->batch_start_offset,
> >  				      file->is_master);
> 
> You could just allocate the shadow inside parse_cmds and return it. Then
> replace the conventional batch_obj with the new pointer and add it to
> the eb list. Similarly, you do not need to track shadow_batch_obj
> explicitly in the requests; the pool can do its own busy tracking
> external to the core and reduce the invasiveness of the patch.

Overall, I agree that this touched more places than I would have liked.

I initially thought about replacing batch_obj and hooking into the eb list,
but that seemed like it would involve trickier code w.r.t. ref counting,
making up an exec_entry for the vma, etc. Not the end of the world, but
it felt less obvious. I can look again though if you think it's worth it.

What specifically are you thinking in terms of implementing busy tracking
in the pool? The idea with adding the shadow object to the request was just
to get it back in the pool and purgeable asap. I also thought it would limit
some additional code given that we know buffers in the pool have had any
pending work completed. Maybe the suggested approach would do a better job
of those things though.

> 
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > index e72017b..82aae29 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > @@ -188,6 +188,13 @@ struct  intel_engine_cs {
> >  	bool needs_cmd_parser;
> >  
> >  	/*
> > +	 * A pool of objects to use as shadow copies of client batch buffers
> > +	 * when the command parser is enabled. Prevents the client from
> > +	 * modifying the batch contents after software parsing.
> > +	 */
> > +	struct i915_gem_batch_pool *batch_pool;
> 
> Why are they tied to a ring?

There was a question as to whether to make a global pool or per-ring
pools way back when I first sent the parser series. Daniel expressed
a slight preference for per-ring, but I don't object to a global pool.
I suppose a potential benefit to per-ring would be if userspace uses
different sized buffers on different rings, we'd more easily find a
suitable buffer. But enhancing the pool management could fix that too.

Brad

> -Chris
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre
Chris Wilson June 18, 2014, 6:11 p.m. UTC | #3
On Wed, Jun 18, 2014 at 10:49:05AM -0700, Volkin, Bradley D wrote:
> On Wed, Jun 18, 2014 at 09:52:52AM -0700, Chris Wilson wrote:
> > On Wed, Jun 18, 2014 at 09:36:14AM -0700, bradley.d.volkin@intel.com wrote:
> > > From: Brad Volkin <bradley.d.volkin@intel.com>
> > > 
> > > This patch sets up all of the tracking and copying necessary to
> > > use batch pools with the command parser, but does not actually
> > > dispatch the copied (shadow) batch to the hardware yet. We still
> > > aren't quite ready to set the secure bit during dispatch.
> > > 
> > > Note that performance takes a hit from the copy in some cases
> > > and will likely need some work. At a rough pass, the memcpy
> > > appears to be the bottleneck. Without having done a deeper
> > > analysis, two ideas that come to mind are:
> > > 1) Copy sections of the batch at a time, as they are reached
> > >    by parsing. Might improve cache locality.
> > > 2) Copy only up to the userspace-supplied batch length and
> > >    memset the rest of the buffer. Reduces the number of reads.
> > > 
> > > Signed-off-by: Brad Volkin <bradley.d.volkin@intel.com>
> > > ---
> > >  drivers/gpu/drm/i915/i915_cmd_parser.c       | 75 ++++++++++++++++++++++------
> > >  drivers/gpu/drm/i915/i915_drv.h              |  7 ++-
> > >  drivers/gpu/drm/i915/i915_gem.c              |  8 ++-
> > >  drivers/gpu/drm/i915/i915_gem_execbuffer.c   | 45 +++++++++++++++--
> > >  drivers/gpu/drm/i915/i915_gem_render_state.c |  2 +-
> > >  drivers/gpu/drm/i915/intel_ringbuffer.c      | 12 +++++
> > >  drivers/gpu/drm/i915/intel_ringbuffer.h      |  7 +++
> > >  7 files changed, 134 insertions(+), 22 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
> > > index dea99d9..669afb0 100644
> > > --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> > > +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> > > @@ -831,6 +831,53 @@ finish:
> > >  	return (u32*)addr;
> > >  }
> > >  
> > > +/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
> > > +static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
> > > +		       struct drm_i915_gem_object *src_obj)
> > > +{
> > > +	int ret = 0;
> > > +	int needs_clflush = 0;
> > > +	u32 *src_addr, *dest_addr = NULL;
> > > +
> > > +	ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
> > > +	if (ret) {
> > > +		DRM_DEBUG_DRIVER("CMD: failed to prep read\n");
> > > +		return ERR_PTR(ret);
> > > +	}
> > > +
> > > +	src_addr = vmap_batch(src_obj);
> > > +	if (!src_addr) {
> > > +		DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
> > > +		ret = -ENOMEM;
> > > +		goto unpin_src;
> > > +	}
> > > +
> > > +	if (needs_clflush)
> > > +		drm_clflush_virt_range((char *)src_addr, src_obj->base.size);
> > > +
> > > +	ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
> > > +	if (ret) {
> > > +		DRM_DEBUG_DRIVER("CMD: Failed to set batch CPU domain\n");
> > > +		goto unmap_src;
> > > +	}
> > > +
> > > +	dest_addr = vmap_batch(dest_obj);
> > > +	if (!dest_addr) {
> > > +		DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
> > > +		ret = -ENOMEM;
> > > +		goto unmap_src;
> > > +	}
> > > +
> > > +	memcpy(dest_addr, src_addr, src_obj->base.size);
> > > +
> > > +unmap_src:
> > > +	vunmap(src_addr);
> > > +unpin_src:
> > > +	i915_gem_object_unpin_pages(src_obj);
> > > +
> > > +	return ret ? ERR_PTR(ret) : dest_addr;
> > > +}
> > 
> > src_obj->size? You should perhaps borrow a byt.
> 
> Not sure I completely follow you here. The dest_obj is as big or bigger than
> the source, so I think copying only as much data as exists in the source
> object is right. I should be writing nops into any extra space though. And in
> parse_cmds, I should update the batch_end calculation. Were those the issues
> that you saw?

Just that generally src->size >> batch len and clflush will make it
twice as expensive on byt. (The object may be about 1000 times larger
than the batch at the extreme, libva *cough*.)
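
i.e. (sketch only; batch_start_offset/batch_len are not plumbed into
copy_batch() by this patch):

/*
 * Sketch only: restrict the flush and the copy to the user-supplied batch
 * range instead of the whole object.
 */
if (needs_clflush)
	drm_clflush_virt_range((char *)src_addr + batch_start_offset,
			       batch_len);

memcpy((char *)dest_addr + batch_start_offset,
       (char *)src_addr + batch_start_offset,
       batch_len);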

> > >  static int
> > > @@ -1087,6 +1088,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> > >  	struct drm_i915_private *dev_priv = dev->dev_private;
> > >  	struct eb_vmas *eb;
> > >  	struct drm_i915_gem_object *batch_obj;
> > > +	struct drm_i915_gem_object *shadow_batch_obj = NULL;
> > >  	struct drm_clip_rect *cliprects = NULL;
> > >  	struct intel_engine_cs *ring;
> > >  	struct intel_context *ctx;
> > > @@ -1288,8 +1290,31 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> > >  	batch_obj->base.pending_read_domains |= I915_GEM_DOMAIN_COMMAND;
> > >  
> > >  	if (i915_needs_cmd_parser(ring)) {
> > > +		shadow_batch_obj =
> > > +			i915_gem_batch_pool_get(ring->batch_pool,
> > > +						batch_obj->base.size);
> > > +		if (IS_ERR(shadow_batch_obj)) {
> > > +			ret = PTR_ERR(shadow_batch_obj);
> > > +			/* Don't try to clean up the obj in the error path */
> > > +			shadow_batch_obj = NULL;
> > > +
> > > +			/*
> > > +			 * If the pool is at capacity, retiring requests might
> > > +			 * return some buffers.
> > > +			 */
> > > +			if (ret == -EAGAIN)
> > > +				i915_gem_retire_requests_ring(ring);
> > > +
> > > +			goto err;
> > > +		}
> > > +
> > > +		ret = i915_gem_obj_ggtt_pin(shadow_batch_obj, 4096, 0);
> > > +		if (ret)
> > > +			goto err;
> > > +
> > >  		ret = i915_parse_cmds(ring,
> > >  				      batch_obj,
> > > +				      shadow_batch_obj,
> > >  				      args->batch_start_offset,
> > >  				      file->is_master);
> > 
> > You could just allocate the shadow inside parse_cmds and return it. Then
> > replace the conventional batch_obj with the new pointer and add it to
> > the eb list. Similarly, you do not need to track shadow_batch_obj
> > explicitly in the requests; the pool can do its own busy tracking
> > external to the core and reduce the invasiveness of the patch.
> 
> Overall, I agree that this touched more places than I would have liked.
> 
> I initially thought about replacing batch_obj and hooking into the eb list,
> but that seemed like it would involve trickier code w.r.t. ref counting,
> making up an exec_entry for the vma, etc. Not the end of the world, but
> it felt less obvious. I can look again though if you think it's worth it.

I think so. The request is already complicated enough and I think the
shadow batch can fit neatly inside the rules we already have with a
modicum of stitching here.
 
> What specifically are you thinking in terms of implementing busy tracking
> in the pool? The idea with adding the shadow object to the request was just
> to get it back in the pool and purgeable asap. I also thought it would limit
> some additional code given that we know buffers in the pool have had any
> pending work completed. Maybe the suggested approach would do a better job
> of those things though.

I was thinking that a pool is basically a list of bo, and you simply
query whether the oldest was still busy when we need a new bo. Which is
the same as how userspace implements its pool of active/inactive objects.
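
Roughly (sketch only; the list field names and the active-flag check are
illustrative, not the actual batch pool code in this series):

/*
 * Sketch only: the oldest buffer sits at the head of the list; if it is
 * still active on the GPU, everything newer is too, so the caller must
 * allocate a fresh object instead of reusing one.
 */
static struct drm_i915_gem_object *
batch_pool_find_idle(struct i915_gem_batch_pool *pool, size_t size)
{
	struct drm_i915_gem_object *obj;

	obj = list_first_entry_or_null(&pool->cache_list,
				       struct drm_i915_gem_object,
				       batch_pool_list);
	if (obj && !obj->active && obj->base.size >= size) {
		/* Reuse it and rotate it to the back as the newest entry. */
		list_move_tail(&obj->batch_pool_list, &pool->cache_list);
		return obj;
	}

	return NULL;	/* caller allocates a fresh object */
}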
 
> > > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > > index e72017b..82aae29 100644
> > > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> > > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > > @@ -188,6 +188,13 @@ struct  intel_engine_cs {
> > >  	bool needs_cmd_parser;
> > >  
> > >  	/*
> > > +	 * A pool of objects to use as shadow copies of client batch buffers
> > > +	 * when the command parser is enabled. Prevents the client from
> > > +	 * modifying the batch contents after software parsing.
> > > +	 */
> > > +	struct i915_gem_batch_pool *batch_pool;
> > 
> > Why are they tied to a ring?
> 
> There was a question as to whether to make a global pool or per-ring
> pools way back when I first sent the parser series. Daniel expressed
> a slight preference for per-ring, but I don't object to a global pool.
> I suppose a potential benefit to per-ring would be if userspace uses
> different sized buffers on different rings, we'd more easily find a
> suitable buffer. But enhancing the pool management could fix that too.

Batch buffer sizes are not fixed per ring anyway. I guess Daniel thought
it might work out easier, but memory is a global resource and should be
tracked primarily at the device level. Userspace segregates its caches
based on size to speed up searches.
-Chris
Daniel Vetter June 18, 2014, 7:59 p.m. UTC | #4
On Wed, Jun 18, 2014 at 07:11:46PM +0100, Chris Wilson wrote:
> On Wed, Jun 18, 2014 at 10:49:05AM -0700, Volkin, Bradley D wrote:
> > What specifically are you thinking in terms of implementing busy tracking
> > in the pool? The idea with adding the shadow object to the request was just
> > to get it back in the pool and purgeable asap. I also thought it would limit
> > some additional code given that we know buffers in the pool have had any
> > pending work completed. Maybe the suggested approach would do a better job
> > of those things though.
> 
> I was thinking that a pool is basically a list of bo, and you simply
> query whether the oldest was still busy when we need a new bo. Which is
> the same as how userspace implements its pool of active/inactive objects.

Yeah, linking the ggtt vma of the shadow batch into the eb list should be
the most natural solution. We need the same trickery for secure batches to
make sure they're bound into the ggtt with full ppgtt anyway. That's
one of the tasks I'm signed up for and have slacked off on :(
-Daniel

Patch

diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index dea99d9..669afb0 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -831,6 +831,53 @@  finish:
 	return (u32*)addr;
 }
 
+/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
+static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
+		       struct drm_i915_gem_object *src_obj)
+{
+	int ret = 0;
+	int needs_clflush = 0;
+	u32 *src_addr, *dest_addr = NULL;
+
+	ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
+	if (ret) {
+		DRM_DEBUG_DRIVER("CMD: failed to prep read\n");
+		return ERR_PTR(ret);
+	}
+
+	src_addr = vmap_batch(src_obj);
+	if (!src_addr) {
+		DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
+		ret = -ENOMEM;
+		goto unpin_src;
+	}
+
+	if (needs_clflush)
+		drm_clflush_virt_range((char *)src_addr, src_obj->base.size);
+
+	ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
+	if (ret) {
+		DRM_DEBUG_DRIVER("CMD: Failed to set batch CPU domain\n");
+		goto unmap_src;
+	}
+
+	dest_addr = vmap_batch(dest_obj);
+	if (!dest_addr) {
+		DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
+		ret = -ENOMEM;
+		goto unmap_src;
+	}
+
+	memcpy(dest_addr, src_addr, src_obj->base.size);
+
+unmap_src:
+	vunmap(src_addr);
+unpin_src:
+	i915_gem_object_unpin_pages(src_obj);
+
+	return ret ? ERR_PTR(ret) : dest_addr;
+}
+
 /**
  * i915_needs_cmd_parser() - should a given ring use software command parsing?
  * @ring: the ring in question
@@ -952,6 +999,7 @@  static bool check_cmd(const struct intel_engine_cs *ring,
  * i915_parse_cmds() - parse a submitted batch buffer for privilege violations
  * @ring: the ring on which the batch is to execute
  * @batch_obj: the batch buffer in question
+ * @shadow_batch_obj: copy of the batch buffer in question
  * @batch_start_offset: byte offset in the batch at which execution starts
  * @is_master: is the submitting process the drm master?
  *
@@ -962,31 +1010,21 @@  static bool check_cmd(const struct intel_engine_cs *ring,
  */
 int i915_parse_cmds(struct intel_engine_cs *ring,
 		    struct drm_i915_gem_object *batch_obj,
+		    struct drm_i915_gem_object *shadow_batch_obj,
 		    u32 batch_start_offset,
 		    bool is_master)
 {
 	int ret = 0;
 	u32 *cmd, *batch_base, *batch_end;
 	struct drm_i915_cmd_descriptor default_desc = { 0 };
-	int needs_clflush = 0;
 	bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
 
-	ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush);
-	if (ret) {
-		DRM_DEBUG_DRIVER("CMD: failed to prep read\n");
-		return ret;
-	}
-
-	batch_base = vmap_batch(batch_obj);
-	if (!batch_base) {
-		DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
-		i915_gem_object_unpin_pages(batch_obj);
-		return -ENOMEM;
+	batch_base = copy_batch(shadow_batch_obj, batch_obj);
+	if (IS_ERR(batch_base)) {
+		DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n");
+		return PTR_ERR(batch_base);
 	}
 
-	if (needs_clflush)
-		drm_clflush_virt_range((char *)batch_base, batch_obj->base.size);
-
 	cmd = batch_base + (batch_start_offset / sizeof(*cmd));
 	batch_end = cmd + (batch_obj->base.size / sizeof(*batch_end));
 
@@ -1039,7 +1077,12 @@  int i915_parse_cmds(struct intel_engine_cs *ring,
 
 	vunmap(batch_base);
 
-	i915_gem_object_unpin_pages(batch_obj);
+	if (!ret) {
+		ret = i915_gem_object_set_to_gtt_domain(shadow_batch_obj,
+							false);
+		if (ret)
+			DRM_DEBUG_DRIVER("CMD: Failed to set batch GTT domain\n");
+	}
 
 	return ret;
 }
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2a88b5e..cecf0f8 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1766,6 +1766,9 @@  struct drm_i915_gem_request {
 	/** Batch buffer related to this request if any */
 	struct drm_i915_gem_object *batch_obj;
 
+	/** Shadow copy of the batch buffer, if any, for the command parser */
+	struct drm_i915_gem_object *shadow_batch_obj;
+
 	/** Time at which this request was emitted, in jiffies. */
 	unsigned long emitted_jiffies;
 
@@ -2297,9 +2300,10 @@  int __must_check i915_gem_suspend(struct drm_device *dev);
 int __i915_add_request(struct intel_engine_cs *ring,
 		       struct drm_file *file,
 		       struct drm_i915_gem_object *batch_obj,
+		       struct drm_i915_gem_object *shadow_batch_obj,
 		       u32 *seqno);
 #define i915_add_request(ring, seqno) \
-	__i915_add_request(ring, NULL, NULL, seqno)
+	__i915_add_request(ring, NULL, NULL, NULL, seqno)
 int __must_check i915_wait_seqno(struct intel_engine_cs *ring,
 				 uint32_t seqno);
 int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
@@ -2534,6 +2538,7 @@  void i915_cmd_parser_fini_ring(struct intel_engine_cs *ring);
 bool i915_needs_cmd_parser(struct intel_engine_cs *ring);
 int i915_parse_cmds(struct intel_engine_cs *ring,
 		    struct drm_i915_gem_object *batch_obj,
+		    struct drm_i915_gem_object *shadow_batch_obj,
 		    u32 batch_start_offset,
 		    bool is_master);
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d5e3001..a477818 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2325,6 +2325,7 @@  i915_gem_get_seqno(struct drm_device *dev, u32 *seqno)
 int __i915_add_request(struct intel_engine_cs *ring,
 		       struct drm_file *file,
 		       struct drm_i915_gem_object *obj,
+		       struct drm_i915_gem_object *shadow_obj,
 		       u32 *out_seqno)
 {
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
@@ -2368,9 +2369,10 @@  int __i915_add_request(struct intel_engine_cs *ring,
 	 * active_list, and so will hold the active reference. Only when this
 	 * request is retired will the the batch_obj be moved onto the
 	 * inactive_list and lose its active reference. Hence we do not need
-	 * to explicitly hold another reference here.
+	 * to explicitly hold another reference here. Same for shadow batch.
 	 */
 	request->batch_obj = obj;
+	request->shadow_batch_obj = shadow_obj;
 
 	/* Hold a reference to the current context so that we can inspect
 	 * it later in case a hangcheck error event fires.
@@ -2478,6 +2480,10 @@  static void i915_gem_free_request(struct drm_i915_gem_request *request)
 	if (request->ctx)
 		i915_gem_context_unreference(request->ctx);
 
+	if (request->shadow_batch_obj)
+		i915_gem_batch_pool_put(request->ring->batch_pool,
+					request->shadow_batch_obj);
+
 	kfree(request);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 93d7f72..0b263aa 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -992,13 +992,14 @@  static void
 i915_gem_execbuffer_retire_commands(struct drm_device *dev,
 				    struct drm_file *file,
 				    struct intel_engine_cs *ring,
-				    struct drm_i915_gem_object *obj)
+				    struct drm_i915_gem_object *obj,
+				    struct drm_i915_gem_object *shadow_obj)
 {
 	/* Unconditionally force add_request to emit a full flush. */
 	ring->gpu_caches_dirty = true;
 
 	/* Add a breadcrumb for the completion of the batch buffer */
-	(void)__i915_add_request(ring, file, obj, NULL);
+	(void)__i915_add_request(ring, file, obj, shadow_obj, NULL);
 }
 
 static int
@@ -1087,6 +1088,7 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct eb_vmas *eb;
 	struct drm_i915_gem_object *batch_obj;
+	struct drm_i915_gem_object *shadow_batch_obj = NULL;
 	struct drm_clip_rect *cliprects = NULL;
 	struct intel_engine_cs *ring;
 	struct intel_context *ctx;
@@ -1288,8 +1290,31 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	batch_obj->base.pending_read_domains |= I915_GEM_DOMAIN_COMMAND;
 
 	if (i915_needs_cmd_parser(ring)) {
+		shadow_batch_obj =
+			i915_gem_batch_pool_get(ring->batch_pool,
+						batch_obj->base.size);
+		if (IS_ERR(shadow_batch_obj)) {
+			ret = PTR_ERR(shadow_batch_obj);
+			/* Don't try to clean up the obj in the error path */
+			shadow_batch_obj = NULL;
+
+			/*
+			 * If the pool is at capacity, retiring requests might
+			 * return some buffers.
+			 */
+			if (ret == -EAGAIN)
+				i915_gem_retire_requests_ring(ring);
+
+			goto err;
+		}
+
+		ret = i915_gem_obj_ggtt_pin(shadow_batch_obj, 4096, 0);
+		if (ret)
+			goto err;
+
 		ret = i915_parse_cmds(ring,
 				      batch_obj,
+				      shadow_batch_obj,
 				      args->batch_start_offset,
 				      file->is_master);
 		if (ret)
@@ -1377,13 +1402,27 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	trace_i915_gem_ring_dispatch(ring, intel_ring_get_seqno(ring), flags);
 
 	i915_gem_execbuffer_move_to_active(&eb->vmas, ring);
-	i915_gem_execbuffer_retire_commands(dev, file, ring, batch_obj);
+	if (shadow_batch_obj) {
+		struct i915_vma *vma = i915_gem_obj_to_ggtt(shadow_batch_obj);
+
+		i915_vma_move_to_active(vma, ring);
+		i915_gem_object_ggtt_unpin(shadow_batch_obj);
+	}
+	i915_gem_execbuffer_retire_commands(dev, file, ring, batch_obj,
+					    shadow_batch_obj);
 
 err:
 	/* the request owns the ref now */
 	i915_gem_context_unreference(ctx);
 	eb_destroy(eb);
 
+	if (ret && ring && shadow_batch_obj) {
+		if (i915_gem_obj_is_pinned(shadow_batch_obj))
+			i915_gem_object_ggtt_unpin(shadow_batch_obj);
+
+		i915_gem_batch_pool_put(ring->batch_pool, shadow_batch_obj);
+	}
+
 	mutex_unlock(&dev->struct_mutex);
 
 pre_mutex_err:
diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.c b/drivers/gpu/drm/i915/i915_gem_render_state.c
index e60be3f..f614406 100644
--- a/drivers/gpu/drm/i915/i915_gem_render_state.c
+++ b/drivers/gpu/drm/i915/i915_gem_render_state.c
@@ -161,7 +161,7 @@  int i915_gem_render_state_init(struct intel_engine_cs *ring)
 
 	i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), ring);
 
-	ret = __i915_add_request(ring, NULL, so.obj, NULL);
+	ret = __i915_add_request(ring, NULL, so.obj, NULL, NULL);
 	/* __i915_add_request moves object to inactive if it fails */
 out:
 	render_state_fini(&so);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index b96edaf..4cd123a 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1476,6 +1476,17 @@  static int intel_init_ring_buffer(struct drm_device *dev,
 	if (ret)
 		goto error;
 
+	/*
+	 * Benchmarking shows that the actual maximum number of buffers
+	 * in the pool varies a bit with workload, but 64 seems to cover
+	 * us fairly well.
+	 */
+	ring->batch_pool = i915_gem_batch_pool_alloc(dev, 64);
+	if (IS_ERR(ring->batch_pool)) {
+		ret = PTR_ERR(ring->batch_pool);
+		goto error;
+	}
+
 	ret = ring->init(ring);
 	if (ret)
 		goto error;
@@ -1513,6 +1524,7 @@  void intel_cleanup_ring_buffer(struct intel_engine_cs *ring)
 	cleanup_status_page(ring);
 
 	i915_cmd_parser_fini_ring(ring);
+	i915_gem_batch_pool_free(ring->batch_pool);
 
 	kfree(ringbuf);
 	ring->buffer = NULL;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index e72017b..82aae29 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -188,6 +188,13 @@  struct  intel_engine_cs {
 	bool needs_cmd_parser;
 
 	/*
+	 * A pool of objects to use as shadow copies of client batch buffers
+	 * when the command parser is enabled. Prevents the client from
+	 * modifying the batch contents after software parsing.
+	 */
+	struct i915_gem_batch_pool *batch_pool;
+
+	/*
 	 * Table of commands the command parser needs to know about
 	 * for this ring.
 	 */