diff mbox

[V2,6/6] drm/i915:Use the coarse ping-pong mechanism based on drm fd to dispatch the BSD command on BDW GT3

Message ID 1397449304-3224-7-git-send-email-yakui.zhao@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Zhao, Yakui April 14, 2014, 4:21 a.m. UTC
V1->V2: Follow Daniel's comment and use the simple ping-pong mechanism.
This is only to add the support of dual BSD rings on BDW GT3 machine.
The further optimization will be considered in another patch set.

The BDW GT3 has two independent BSD rings, which can be used to process the
video commands. To be simpler, it is transparent to user-space driver/middle.
Instead the kernel driver will decide which ring is to dispatch the BSD video
command.

As every BSD ring is powerful, it is enough to dispatch the BSD video command
based on the drm fd. In such case it can play back video stream while encoding
another video stream. The coarse ping-pong mechanism is used to determine
which BSD ring is used to dispatch the BSD video command.

Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
---
 drivers/gpu/drm/i915/i915_dma.c            |    3 +++
 drivers/gpu/drm/i915/i915_drv.h            |    3 +++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   37 +++++++++++++++++++++++++++-
 3 files changed, 42 insertions(+), 1 deletion(-)

Comments

Daniel Vetter April 14, 2014, 7:22 a.m. UTC | #1
On Mon, Apr 14, 2014 at 12:21:44PM +0800, Zhao Yakui wrote:
> V1->V2: Follow Daniel's comment and use the simple ping-pong mechanism.
> This is only to add the support of dual BSD rings on BDW GT3 machine.
> The further optimization will be considered in another patch set.
> 
> The BDW GT3 has two independent BSD rings, which can be used to process the
> video commands. To be simpler, it is transparent to user-space driver/middle.
> Instead the kernel driver will decide which ring is to dispatch the BSD video
> command.
> 
> As every BSD ring is powerful, it is enough to dispatch the BSD video command
> based on the drm fd. In such case it can play back video stream while encoding
> another video stream. The coarse ping-pong mechanism is used to determine
> which BSD ring is used to dispatch the BSD video command.
> 
> Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_dma.c            |    3 +++
>  drivers/gpu/drm/i915/i915_drv.h            |    3 +++
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |   37 +++++++++++++++++++++++++++-
>  3 files changed, 42 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 0b38f88..4d27cf4 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1572,6 +1572,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
>  	spin_lock_init(&dev_priv->backlight_lock);
>  	spin_lock_init(&dev_priv->uncore.lock);
>  	spin_lock_init(&dev_priv->mm.object_stat_lock);
> +	atomic_set(&dev_priv->bsd_cmd_counter, 0);
>  	mutex_init(&dev_priv->dpio_lock);
>  	mutex_init(&dev_priv->modeset_restore_lock);
>  
> @@ -1929,6 +1930,8 @@ void i915_driver_postclose(struct drm_device *dev, struct drm_file *file)
>  {
>  	struct drm_i915_file_private *file_priv = file->driver_priv;
>  
> +	if (file_priv && file_priv->bsd_ring)
> +		file_priv->bsd_ring = NULL;
>  	kfree(file_priv);
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index ac5598c3..68e8166 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1466,6 +1466,8 @@ struct drm_i915_private {
>  	struct i915_dri1_state dri1;
>  	/* Old ums support infrastructure, same warning applies. */
>  	struct i915_ums_state ums;
> +	/* the lock for dispatch video commands on two BSD rings */
> +	atomic_t bsd_cmd_counter;

You're still using atomic_t for no real good reason.
gen8_dispatch_bsd_ring is always called with the dev->struct_mutex lock
held, so there's really no reason for it.
-Daniel

>  };
>  
>  static inline struct drm_i915_private *to_i915(const struct drm_device *dev)
> @@ -1673,6 +1675,7 @@ struct drm_i915_file_private {
>  
>  	struct i915_hw_context *private_default_ctx;
>  	atomic_t rps_wait_boost;
> +	struct  intel_ring_buffer *bsd_ring;
>  };
>  
>  /*
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 341ec68..720ef17 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -999,6 +999,34 @@ i915_reset_gen7_sol_offsets(struct drm_device *dev,
>  	return 0;
>  }
>  
> +/**
> + * Find one BSD ring to dispatch the corresponding BSD command.
> + * The Ring ID is returned.
> + */
> +static int gen8_dispatch_bsd_ring(struct drm_device *dev,
> +				  struct drm_file *file)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct drm_i915_file_private *file_priv = file->driver_priv;
> +
> +	/* Check whether the file_priv is using one ring */
> +	if (file_priv->bsd_ring)
> +		return file_priv->bsd_ring->id;
> +	else {
> +		/* If no, use the ping-pong mechanism to select one ring */
> +		int counter, ring_id;
> +		smp_mb__before_atomic_inc();
> +		counter = atomic_inc_return(&dev_priv->bsd_cmd_counter);
> +		if (counter % 2 == 0)
> +			ring_id = VCS;
> +		else
> +			ring_id = VCS2;
> +
> +		file_priv->bsd_ring = &dev_priv->ring[ring_id];
> +		return ring_id;
> +	}
> +}
> +
>  static int
>  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  		       struct drm_file *file,
> @@ -1043,7 +1071,14 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  
>  	if ((args->flags & I915_EXEC_RING_MASK) == I915_EXEC_DEFAULT)
>  		ring = &dev_priv->ring[RCS];
> -	else
> +	else if ((args->flags & I915_EXEC_RING_MASK) == I915_EXEC_BSD) {
> +		if (HAS_BSD2(dev)) {
> +			int ring_id;
> +			ring_id = gen8_dispatch_bsd_ring(dev, file);
> +			ring = &dev_priv->ring[ring_id];
> +		} else
> +			ring = &dev_priv->ring[VCS];
> +	} else
>  		ring = &dev_priv->ring[(args->flags & I915_EXEC_RING_MASK) - 1];
>  
>  	if (!intel_ring_initialized(ring)) {
> -- 
> 1.7.10.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Zhao, Yakui April 14, 2014, 8:05 a.m. UTC | #2
On Mon, 2014-04-14 at 01:22 -0600, Daniel Vetter wrote:
> On Mon, Apr 14, 2014 at 12:21:44PM +0800, Zhao Yakui wrote:
> > V1->V2: Follow Daniel's comment and use the simple ping-pong mechanism.
> > This is only to add the support of dual BSD rings on BDW GT3 machine.
> > The further optimization will be considered in another patch set.
> > 
> > The BDW GT3 has two independent BSD rings, which can be used to process the
> > video commands. To be simpler, it is transparent to user-space driver/middle.
> > Instead the kernel driver will decide which ring is to dispatch the BSD video
> > command.
> > 
> > As every BSD ring is powerful, it is enough to dispatch the BSD video command
> > based on the drm fd. In such case it can play back video stream while encoding
> > another video stream. The coarse ping-pong mechanism is used to determine
> > which BSD ring is used to dispatch the BSD video command.
> > 
> > Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_dma.c            |    3 +++
> >  drivers/gpu/drm/i915/i915_drv.h            |    3 +++
> >  drivers/gpu/drm/i915/i915_gem_execbuffer.c |   37 +++++++++++++++++++++++++++-
> >  3 files changed, 42 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> > index 0b38f88..4d27cf4 100644
> > --- a/drivers/gpu/drm/i915/i915_dma.c
> > +++ b/drivers/gpu/drm/i915/i915_dma.c
> > @@ -1572,6 +1572,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
> >  	spin_lock_init(&dev_priv->backlight_lock);
> >  	spin_lock_init(&dev_priv->uncore.lock);
> >  	spin_lock_init(&dev_priv->mm.object_stat_lock);
> > +	atomic_set(&dev_priv->bsd_cmd_counter, 0);
> >  	mutex_init(&dev_priv->dpio_lock);
> >  	mutex_init(&dev_priv->modeset_restore_lock);
> >  
> > @@ -1929,6 +1930,8 @@ void i915_driver_postclose(struct drm_device *dev, struct drm_file *file)
> >  {
> >  	struct drm_i915_file_private *file_priv = file->driver_priv;
> >  
> > +	if (file_priv && file_priv->bsd_ring)
> > +		file_priv->bsd_ring = NULL;
> >  	kfree(file_priv);
> >  }
> >  
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index ac5598c3..68e8166 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -1466,6 +1466,8 @@ struct drm_i915_private {
> >  	struct i915_dri1_state dri1;
> >  	/* Old ums support infrastructure, same warning applies. */
> >  	struct i915_ums_state ums;
> > +	/* the lock for dispatch video commands on two BSD rings */
> > +	atomic_t bsd_cmd_counter;
> 
> You're still using atomic_t for no real good reason.
> gen8_dispatch_bsd_ring is always called with the dev->struct_mutex lock
> held, so there's really no reason for it.

If the struct_mutex is used in the gen8_dispatch_bsd_ring, I can remove
the atomic_t. 
It seems that the struct_mutex is a big lock and it is used very
frequently(i915_gem.c, i915_dma.c and so on). In my point it is a little
heavier than the atomic_t if one counter is increased and returned. 

If you think that the mutex is better than atomic, I will follow your
advice.

Thanks.
    Yakui

> -Daniel
> 
> >  };
> >  
> >  static inline struct drm_i915_private *to_i915(const struct drm_device *dev)
> > @@ -1673,6 +1675,7 @@ struct drm_i915_file_private {
> >  
> >  	struct i915_hw_context *private_default_ctx;
> >  	atomic_t rps_wait_boost;
> > +	struct  intel_ring_buffer *bsd_ring;
> >  };
> >  
> >  /*
> > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > index 341ec68..720ef17 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > @@ -999,6 +999,34 @@ i915_reset_gen7_sol_offsets(struct drm_device *dev,
> >  	return 0;
> >  }
> >  
> > +/**
> > + * Find one BSD ring to dispatch the corresponding BSD command.
> > + * The Ring ID is returned.
> > + */
> > +static int gen8_dispatch_bsd_ring(struct drm_device *dev,
> > +				  struct drm_file *file)
> > +{
> > +	struct drm_i915_private *dev_priv = dev->dev_private;
> > +	struct drm_i915_file_private *file_priv = file->driver_priv;
> > +
> > +	/* Check whether the file_priv is using one ring */
> > +	if (file_priv->bsd_ring)
> > +		return file_priv->bsd_ring->id;
> > +	else {
> > +		/* If no, use the ping-pong mechanism to select one ring */
> > +		int counter, ring_id;
> > +		smp_mb__before_atomic_inc();
> > +		counter = atomic_inc_return(&dev_priv->bsd_cmd_counter);
> > +		if (counter % 2 == 0)
> > +			ring_id = VCS;
> > +		else
> > +			ring_id = VCS2;
> > +
> > +		file_priv->bsd_ring = &dev_priv->ring[ring_id];
> > +		return ring_id;
> > +	}
> > +}
> > +
> >  static int
> >  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> >  		       struct drm_file *file,
> > @@ -1043,7 +1071,14 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> >  
> >  	if ((args->flags & I915_EXEC_RING_MASK) == I915_EXEC_DEFAULT)
> >  		ring = &dev_priv->ring[RCS];
> > -	else
> > +	else if ((args->flags & I915_EXEC_RING_MASK) == I915_EXEC_BSD) {
> > +		if (HAS_BSD2(dev)) {
> > +			int ring_id;
> > +			ring_id = gen8_dispatch_bsd_ring(dev, file);
> > +			ring = &dev_priv->ring[ring_id];
> > +		} else
> > +			ring = &dev_priv->ring[VCS];
> > +	} else
> >  		ring = &dev_priv->ring[(args->flags & I915_EXEC_RING_MASK) - 1];
> >  
> >  	if (!intel_ring_initialized(ring)) {
> > -- 
> > 1.7.10.1
> > 
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
Chris Wilson April 14, 2014, 8:19 a.m. UTC | #3
On Mon, Apr 14, 2014 at 04:05:19PM +0800, Zhao Yakui wrote:
> On Mon, 2014-04-14 at 01:22 -0600, Daniel Vetter wrote:
> > You're still using atomic_t for no real good reason.
> > gen8_dispatch_bsd_ring is always called with the dev->struct_mutex lock
> > held, so there's really no reason for it.
> 
> If the struct_mutex is used in the gen8_dispatch_bsd_ring, I can remove
> the atomic_t. 
> It seems that the struct_mutex is a big lock and it is used very
> frequently(i915_gem.c, i915_dma.c and so on). In my point it is a little
> heavier than the atomic_t if one counter is increased and returned. 
> 
> If you think that the mutex is better than atomic, I will follow your
> advice.

You are already holding the struct_mutex whenever we touch the ring and
execbuffer. Even in a fine-grained world, there will still be a mutex
around all operations that touch the rings.
-Chris
Zhao, Yakui April 14, 2014, 8:46 a.m. UTC | #4
On Mon, 2014-04-14 at 02:19 -0600, Chris Wilson wrote:
> On Mon, Apr 14, 2014 at 04:05:19PM +0800, Zhao Yakui wrote:
> > On Mon, 2014-04-14 at 01:22 -0600, Daniel Vetter wrote:
> > > You're still using atomic_t for no real good reason.
> > > gen8_dispatch_bsd_ring is always called with the dev->struct_mutex lock
> > > held, so there's really no reason for it.
> > 
> > If the struct_mutex is used in the gen8_dispatch_bsd_ring, I can remove
> > the atomic_t. 
> > It seems that the struct_mutex is a big lock and it is used very
> > frequently(i915_gem.c, i915_dma.c and so on). In my point it is a little
> > heavier than the atomic_t if one counter is increased and returned. 
> > 
> > If you think that the mutex is better than atomic, I will follow your
> > advice.
> 
> You are already holding the struct_mutex whenever we touch the ring and
> execbuffer. Even in a fine-grained world, there will still be a mutex
> around all operations that touch the rings.

Hi, Chris

    I understand your concern. From the source code the struct_mutex
will be held when trying to do the buffer relocation and dispatch the
command in one ring. 
    But my code is only to select one BSD ring. In such case the
atomic_t usage is enough and it is unnecessary to hold the struct_mutex.
    If you also think that the struct_mutex is better, I can update the
code to use the struct_mutex.

Thanks.
    Yakui    


> -Chris
>
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 0b38f88..4d27cf4 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1572,6 +1572,7 @@  int i915_driver_load(struct drm_device *dev, unsigned long flags)
 	spin_lock_init(&dev_priv->backlight_lock);
 	spin_lock_init(&dev_priv->uncore.lock);
 	spin_lock_init(&dev_priv->mm.object_stat_lock);
+	atomic_set(&dev_priv->bsd_cmd_counter, 0);
 	mutex_init(&dev_priv->dpio_lock);
 	mutex_init(&dev_priv->modeset_restore_lock);
 
@@ -1929,6 +1930,8 @@  void i915_driver_postclose(struct drm_device *dev, struct drm_file *file)
 {
 	struct drm_i915_file_private *file_priv = file->driver_priv;
 
+	if (file_priv && file_priv->bsd_ring)
+		file_priv->bsd_ring = NULL;
 	kfree(file_priv);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index ac5598c3..68e8166 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1466,6 +1466,8 @@  struct drm_i915_private {
 	struct i915_dri1_state dri1;
 	/* Old ums support infrastructure, same warning applies. */
 	struct i915_ums_state ums;
+	/* the lock for dispatch video commands on two BSD rings */
+	atomic_t bsd_cmd_counter;
 };
 
 static inline struct drm_i915_private *to_i915(const struct drm_device *dev)
@@ -1673,6 +1675,7 @@  struct drm_i915_file_private {
 
 	struct i915_hw_context *private_default_ctx;
 	atomic_t rps_wait_boost;
+	struct  intel_ring_buffer *bsd_ring;
 };
 
 /*
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 341ec68..720ef17 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -999,6 +999,34 @@  i915_reset_gen7_sol_offsets(struct drm_device *dev,
 	return 0;
 }
 
+/**
+ * Find one BSD ring to dispatch the corresponding BSD command.
+ * The Ring ID is returned.
+ */
+static int gen8_dispatch_bsd_ring(struct drm_device *dev,
+				  struct drm_file *file)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_i915_file_private *file_priv = file->driver_priv;
+
+	/* Check whether the file_priv is using one ring */
+	if (file_priv->bsd_ring)
+		return file_priv->bsd_ring->id;
+	else {
+		/* If no, use the ping-pong mechanism to select one ring */
+		int counter, ring_id;
+		smp_mb__before_atomic_inc();
+		counter = atomic_inc_return(&dev_priv->bsd_cmd_counter);
+		if (counter % 2 == 0)
+			ring_id = VCS;
+		else
+			ring_id = VCS2;
+
+		file_priv->bsd_ring = &dev_priv->ring[ring_id];
+		return ring_id;
+	}
+}
+
 static int
 i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		       struct drm_file *file,
@@ -1043,7 +1071,14 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 
 	if ((args->flags & I915_EXEC_RING_MASK) == I915_EXEC_DEFAULT)
 		ring = &dev_priv->ring[RCS];
-	else
+	else if ((args->flags & I915_EXEC_RING_MASK) == I915_EXEC_BSD) {
+		if (HAS_BSD2(dev)) {
+			int ring_id;
+			ring_id = gen8_dispatch_bsd_ring(dev, file);
+			ring = &dev_priv->ring[ring_id];
+		} else
+			ring = &dev_priv->ring[VCS];
+	} else
 		ring = &dev_priv->ring[(args->flags & I915_EXEC_RING_MASK) - 1];
 
 	if (!intel_ring_initialized(ring)) {