drm/i915: Asynchronously initialise the GPU state
diff mbox

Message ID 1435742841-28454-1-git-send-email-chris@chris-wilson.co.uk
State New
Headers show

Commit Message

Chris Wilson July 1, 2015, 9:27 a.m. UTC
Dave Gordon made the good suggestion that once the ringbuffers were
setup, the actual queuing of commands to program the initial GPU state
could be deferred. Since that initial state contains instructions for
setting up the first power context, we want to execute that as early
as possible, preferably in the background to userspace. Then when
userspace does wake up, the first time it opens the device we just need
to flush the work to be sure that our commands are queued before any of
userspace's. (Hooking into the device open should mean we have to check
less often than say hooking into execbuffer.)

Suggested-by: Dave Gordon <david.s.gordon@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dave Gordon <david.s.gordon@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h |   2 +
 drivers/gpu/drm/i915/i915_gem.c | 113 +++++++++++++++++++++++++++-------------
 2 files changed, 79 insertions(+), 36 deletions(-)

Comments

Daniel Vetter July 1, 2015, 1:07 p.m. UTC | #1
On Wed, Jul 01, 2015 at 10:27:21AM +0100, Chris Wilson wrote:
> Dave Gordon made the good suggestion that once the ringbuffers were
> setup, the actual queuing of commands to program the initial GPU state
> could be deferred. Since that initial state contains instructions for
> setting up the first power context, we want to execute that as earlier
> as possible, preferrably in the background to userspace. Then when
> userspace does wake up, the first time it opens the device we just need
> to flush the work to be sure that our commands are queued before any of
> userspace's. (Hooking into the device open should mean we have to check
> less often than say hooking into execbuffer.)
> 
> Suggested-by: Dave Gordon <david.s.gordon@intel.com>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dave Gordon <david.s.gordon@intel.com>

Just before this gets a bit out of hand with various patches floating
around ... I really meant it when I said that we should have a proper
design discussion about this in Jesse's meeting first.

Looking at all the ideas between you, Dave & me I count about 3-4
approaches to async gem init, and all have upsides and downsides.

Aside from that I concur that if we do async gem init then it better be a
worker and not relying on some arbitrary userspace ioctl/syscall. Of course
we'd still need to place proper synchronization points at a good place
(flush_work in gem_open for Dave's design), but that's really orthogonal
to running it in a worker imo.
-Daniel

> ---
>  drivers/gpu/drm/i915/i915_drv.h |   2 +
>  drivers/gpu/drm/i915/i915_gem.c | 113 +++++++++++++++++++++++++++-------------
>  2 files changed, 79 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 3ea1fe8db63e..d4003dea97eb 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1938,6 +1938,8 @@ struct drm_i915_private {
>  
>  	bool edp_low_vswing;
>  
> +	struct work_struct init_hw_late;
> +
>  	/*
>  	 * NOTE: This is the dri1/ums dungeon, don't add stuff here. Your patch
>  	 * will be rejected. Instead look for a better place.
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 2f0fed1b9dd7..7efa71f8edd7 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -5140,12 +5140,76 @@ cleanup_render_ring:
>  	return ret;
>  }
>  
> +static int
> +i915_gem_init_hw_late(struct drm_i915_private *dev_priv)
> +{
> +	struct intel_engine_cs *ring;
> +	int i, j;
> +
> +	for_each_ring(ring, dev_priv, i) {
> +		struct drm_i915_gem_request *req;
> +		int ret;
> +
> +		if (WARN_ON(!ring->default_context)) {
> +			ret = -ENODEV;
> +			goto err;
> +		}
> +
> +		req = i915_gem_request_alloc(ring, ring->default_context);
> +		if (IS_ERR(req)) {
> +			ret = PTR_ERR(req);
> +			goto err;
> +		}
> +
> +		if (ring->id == RCS) {
> +			for (j = 0; j < NUM_L3_SLICES(dev_priv); j++)
> +				i915_gem_l3_remap(req, j);
> +		}
> +
> +		ret = i915_ppgtt_init_ring(req);
> +		if (ret) {
> +			DRM_ERROR("PPGTT enable ring #%d failed %d\n", i, ret);
> +			goto err_req;
> +		}
> +
> +		ret = i915_gem_context_enable(req);
> +		if (ret) {
> +			DRM_ERROR("Context enable ring #%d failed %d\n", i, ret);
> +			goto err_req;
> +		}
> +
> +		i915_add_request_no_flush(req);
> +		continue;
> +
> +err_req:
> +		i915_gem_request_cancel(req);
> +err:
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static void
> +i915_gem_init_hw_worker(struct work_struct *work)
> +{
> +	struct drm_i915_private *dev_priv =
> +		container_of(work, typeof(*dev_priv), init_hw_late);
> +	mutex_lock(&dev_priv->dev->struct_mutex);
> +	if (i915_gem_init_hw_late(dev_priv)) {
> +		DRM_ERROR("Failed to initialize GPU, declaring it wedged\n");
> +		atomic_set_mask(I915_WEDGED,
> +				&dev_priv->gpu_error.reset_counter);
> +	}
> +	mutex_unlock(&dev_priv->dev->struct_mutex);
> +}
> +
>  int
>  i915_gem_init_hw(struct drm_device *dev)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	struct intel_engine_cs *ring;
> -	int ret, i, j;
> +	int ret, i;
>  
>  	if (INTEL_INFO(dev)->gen < 6 && !intel_enable_gtt())
>  		return -EIO;
> @@ -5198,41 +5262,10 @@ i915_gem_init_hw(struct drm_device *dev)
>  	}
>  
>  	/* Now it is safe to go back round and do everything else: */
> -	for_each_ring(ring, dev_priv, i) {
> -		struct drm_i915_gem_request *req;
> -
> -		WARN_ON(!ring->default_context);
> -
> -		req = i915_gem_request_alloc(ring, ring->default_context);
> -		if (IS_ERR(req)) {
> -			ret = PTR_ERR(req);
> -			i915_gem_cleanup_ringbuffer(dev);
> -			goto out;
> -		}
> -
> -		if (ring->id == RCS) {
> -			for (j = 0; j < NUM_L3_SLICES(dev); j++)
> -				i915_gem_l3_remap(req, j);
> -		}
> -
> -		ret = i915_ppgtt_init_ring(req);
> -		if (ret && ret != -EIO) {
> -			DRM_ERROR("PPGTT enable ring #%d failed %d\n", i, ret);
> -			i915_gem_request_cancel(req);
> -			i915_gem_cleanup_ringbuffer(dev);
> -			goto out;
> -		}
> -
> -		ret = i915_gem_context_enable(req);
> -		if (ret && ret != -EIO) {
> -			DRM_ERROR("Context enable ring #%d failed %d\n", i, ret);
> -			i915_gem_request_cancel(req);
> -			i915_gem_cleanup_ringbuffer(dev);
> -			goto out;
> -		}
> -
> -		i915_add_request_no_flush(req);
> -	}
> +	if (dev->open_count == 0) /* uncontested with userspace, i.e. boot */
> +		queue_work(dev_priv->wq, &dev_priv->init_hw_late);
> +	else
> +		ret = i915_gem_init_hw_late(dev_priv);
>  
>  out:
>  	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
> @@ -5379,6 +5412,7 @@ i915_gem_load(struct drm_device *dev)
>  		init_ring_lists(&dev_priv->ring[i]);
>  	for (i = 0; i < I915_MAX_NUM_FENCES; i++)
>  		INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list);
> +	INIT_WORK(&dev_priv->init_hw_late, i915_gem_init_hw_worker);
>  	INIT_DELAYED_WORK(&dev_priv->mm.retire_work,
>  			  i915_gem_retire_work_handler);
>  	INIT_DELAYED_WORK(&dev_priv->mm.idle_work,
> @@ -5442,11 +5476,18 @@ void i915_gem_release(struct drm_device *dev, struct drm_file *file)
>  
>  int i915_gem_open(struct drm_device *dev, struct drm_file *file)
>  {
> +	struct drm_i915_private *dev_priv = to_i915(dev);
>  	struct drm_i915_file_private *file_priv;
>  	int ret;
>  
>  	DRM_DEBUG_DRIVER("\n");
>  
> +	/* Flush ring initialisation before userspace can submit its own
> +	 * batches, so the hardware initialisation commands are queued
> +	 * first.
> +	 */
> +	flush_work(&dev_priv->init_hw_late);
> +
>  	file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
>  	if (!file_priv)
>  		return -ENOMEM;
> -- 
> 2.1.4
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson July 1, 2015, 1:17 p.m. UTC | #2
On Wed, Jul 01, 2015 at 03:07:18PM +0200, Daniel Vetter wrote:
> On Wed, Jul 01, 2015 at 10:27:21AM +0100, Chris Wilson wrote:
> > Dave Gordon made the good suggestion that once the ringbuffers were
> > setup, the actual queuing of commands to program the initial GPU state
> > could be deferred. Since that initial state contains instructions for
> > setting up the first power context, we want to execute that as earlier
> > as possible, preferrably in the background to userspace. Then when
> > userspace does wake up, the first time it opens the device we just need
> > to flush the work to be sure that our commands are queued before any of
> > userspace's. (Hooking into the device open should mean we have to check
> > less often than say hooking into execbuffer.)
> > 
> > Suggested-by: Dave Gordon <david.s.gordon@intel.com>
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Dave Gordon <david.s.gordon@intel.com>
> 
> Just before this gets a bit out of hand with various patches floating
> around ... I really meant it when I said that we should have a proper
> design discussion about this in Jesse's meeting first.

What more is there to design? Asynchronously loading the submission port
is orthogonal to the task of queuing requests for it, and need not block
request construction (be it kernel or userspace). Dave just identified
some work that we didn't need to do during module load. I don't think he
would propose using it for loading guc firmware, that would just be
silly...
-Chris
Daniel Vetter July 1, 2015, 2:07 p.m. UTC | #3
On Wed, Jul 01, 2015 at 02:17:28PM +0100, Chris Wilson wrote:
> On Wed, Jul 01, 2015 at 03:07:18PM +0200, Daniel Vetter wrote:
> > On Wed, Jul 01, 2015 at 10:27:21AM +0100, Chris Wilson wrote:
> > > Dave Gordon made the good suggestion that once the ringbuffers were
> > > setup, the actual queuing of commands to program the initial GPU state
> > > could be deferred. Since that initial state contains instructions for
> > > setting up the first power context, we want to execute that as earlier
> > > as possible, preferrably in the background to userspace. Then when
> > > userspace does wake up, the first time it opens the device we just need
> > > to flush the work to be sure that our commands are queued before any of
> > > userspace's. (Hooking into the device open should mean we have to check
> > > less often than say hooking into execbuffer.)
> > > 
> > > Suggested-by: Dave Gordon <david.s.gordon@intel.com>
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > Cc: Dave Gordon <david.s.gordon@intel.com>
> > 
> > Just before this gets a bit out of hand with various patches floating
> > around ... I really meant it when I said that we should have a proper
> > design discussion about this in Jesse's meeting first.
> 
> What more is there to design? Asynchronously loading the submission port
> is orthogonal to the task of queuing requests for it, and need not block
> request construction (be it kernel or userspace). Dave just identified
> some work that we didn't need to do during module load. I don't think he
> would propose using it for loading guc firmware, that would just be
> silly...

set_wedged in your patch doesn't have the wakeup to kick waiters. And
maybe we want to be somewhat more synchronous with init fail than gpu
hangs, for userspace to make better decisions. Also we still have that
issue that sometimes an -EIO escapes into modeset code.  And yes this is
meant to provide the async init for the request firmware.
-Daniel
Chris Wilson July 1, 2015, 2:15 p.m. UTC | #4
On Wed, Jul 01, 2015 at 04:07:08PM +0200, Daniel Vetter wrote:
> On Wed, Jul 01, 2015 at 02:17:28PM +0100, Chris Wilson wrote:
> > On Wed, Jul 01, 2015 at 03:07:18PM +0200, Daniel Vetter wrote:
> > > On Wed, Jul 01, 2015 at 10:27:21AM +0100, Chris Wilson wrote:
> > > > Dave Gordon made the good suggestion that once the ringbuffers were
> > > > setup, the actual queuing of commands to program the initial GPU state
> > > > could be deferred. Since that initial state contains instructions for
> > > > setting up the first power context, we want to execute that as earlier
> > > > as possible, preferrably in the background to userspace. Then when
> > > > userspace does wake up, the first time it opens the device we just need
> > > > to flush the work to be sure that our commands are queued before any of
> > > > userspace's. (Hooking into the device open should mean we have to check
> > > > less often than say hooking into execbuffer.)
> > > > 
> > > > Suggested-by: Dave Gordon <david.s.gordon@intel.com>
> > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > Cc: Dave Gordon <david.s.gordon@intel.com>
> > > 
> > > Just before this gets a bit out of hand with various patches floating
> > > around ... I really meant it when I said that we should have a proper
> > > design discussion about this in Jesse's meeting first.
> > 
> > What more is there to design? Asynchronously loading the submission port
> > is orthogonal to the task of queuing requests for it, and need not block
> > request construction (be it kernel or userspace). Dave just identified
> > some work that we didn't need to do during module load. I don't think he
> > would propose using it for loading guc firmware, that would just be
> > silly...
> 
> set_wedged in your patch doesn't have the wakeup to kick waiters.

True. But it has to be impossible for a waiter to exist at this point,
or else the entire async GPU init is broken. These commands have to be
the first requests we send to the GPU. Everything else must wait before
it is allowed to start queuing.

> And
> maybe we want to be somewhat more synchronous with with init fail than gpu
> hangs, for userspace to make better decisions.

The init is still synchronous with userspace using the device, just (and
this is no change) the only communication with userspace that GEM
initialisation failed is the wedged GPU.

> Also we still have that
> issue that sometimes an -EIO escapes into modeset code.

But there are no new wait requests running concurrent with GEM init,
so this patch doesn't alter that.

> And yes this is
> mean to provide the async init for the request firmware.

It is an inappropriate juncture for async request firmware. I can keep
repeating that enabling the submission ports is orthogonal to setting up
the CS engines and allowing requests to be queued, because it is...

There is no need to modify the higher levels for async GuC
initialisation. The serialisation there is when to start feeding requests
into the submission port. At the moment we do that immediately when it 
is idle - but the GuC is not idle until it loaded, as soon as it is
loaded it can simply feed in the first set of requests and start on
its merry way. This also allows GuC failure to always transparently
fall back to execlists.
-Chris

Patch
diff mbox

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 3ea1fe8db63e..d4003dea97eb 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1938,6 +1938,8 @@  struct drm_i915_private {
 
 	bool edp_low_vswing;
 
+	struct work_struct init_hw_late;
+
 	/*
 	 * NOTE: This is the dri1/ums dungeon, don't add stuff here. Your patch
 	 * will be rejected. Instead look for a better place.
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 2f0fed1b9dd7..7efa71f8edd7 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -5140,12 +5140,76 @@  cleanup_render_ring:
 	return ret;
 }
 
+static int
+i915_gem_init_hw_late(struct drm_i915_private *dev_priv)
+{
+	struct intel_engine_cs *ring;
+	int i, j;
+
+	for_each_ring(ring, dev_priv, i) {
+		struct drm_i915_gem_request *req;
+		int ret;
+
+		if (WARN_ON(!ring->default_context)) {
+			ret = -ENODEV;
+			goto err;
+		}
+
+		req = i915_gem_request_alloc(ring, ring->default_context);
+		if (IS_ERR(req)) {
+			ret = PTR_ERR(req);
+			goto err;
+		}
+
+		if (ring->id == RCS) {
+			for (j = 0; j < NUM_L3_SLICES(dev_priv); j++)
+				i915_gem_l3_remap(req, j);
+		}
+
+		ret = i915_ppgtt_init_ring(req);
+		if (ret) {
+			DRM_ERROR("PPGTT enable ring #%d failed %d\n", i, ret);
+			goto err_req;
+		}
+
+		ret = i915_gem_context_enable(req);
+		if (ret) {
+			DRM_ERROR("Context enable ring #%d failed %d\n", i, ret);
+			goto err_req;
+		}
+
+		i915_add_request_no_flush(req);
+		continue;
+
+err_req:
+		i915_gem_request_cancel(req);
+err:
+		return ret;
+	}
+
+	return 0;
+}
+
+static void
+i915_gem_init_hw_worker(struct work_struct *work)
+{
+	struct drm_i915_private *dev_priv =
+		container_of(work, typeof(*dev_priv), init_hw_late);
+	mutex_lock(&dev_priv->dev->struct_mutex);
+	if (i915_gem_init_hw_late(dev_priv)) {
+		DRM_ERROR("Failed to initialize GPU, declaring it wedged\n");
+		atomic_set_mask(I915_WEDGED,
+				&dev_priv->gpu_error.reset_counter);
+	}
+	mutex_unlock(&dev_priv->dev->struct_mutex);
+}
+
 int
 i915_gem_init_hw(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct intel_engine_cs *ring;
-	int ret, i, j;
+	int ret, i;
 
 	if (INTEL_INFO(dev)->gen < 6 && !intel_enable_gtt())
 		return -EIO;
@@ -5198,41 +5262,10 @@  i915_gem_init_hw(struct drm_device *dev)
 	}
 
 	/* Now it is safe to go back round and do everything else: */
-	for_each_ring(ring, dev_priv, i) {
-		struct drm_i915_gem_request *req;
-
-		WARN_ON(!ring->default_context);
-
-		req = i915_gem_request_alloc(ring, ring->default_context);
-		if (IS_ERR(req)) {
-			ret = PTR_ERR(req);
-			i915_gem_cleanup_ringbuffer(dev);
-			goto out;
-		}
-
-		if (ring->id == RCS) {
-			for (j = 0; j < NUM_L3_SLICES(dev); j++)
-				i915_gem_l3_remap(req, j);
-		}
-
-		ret = i915_ppgtt_init_ring(req);
-		if (ret && ret != -EIO) {
-			DRM_ERROR("PPGTT enable ring #%d failed %d\n", i, ret);
-			i915_gem_request_cancel(req);
-			i915_gem_cleanup_ringbuffer(dev);
-			goto out;
-		}
-
-		ret = i915_gem_context_enable(req);
-		if (ret && ret != -EIO) {
-			DRM_ERROR("Context enable ring #%d failed %d\n", i, ret);
-			i915_gem_request_cancel(req);
-			i915_gem_cleanup_ringbuffer(dev);
-			goto out;
-		}
-
-		i915_add_request_no_flush(req);
-	}
+	if (dev->open_count == 0) /* uncontested with userspace, i.e. boot */
+		queue_work(dev_priv->wq, &dev_priv->init_hw_late);
+	else
+		ret = i915_gem_init_hw_late(dev_priv);
 
 out:
 	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
@@ -5379,6 +5412,7 @@  i915_gem_load(struct drm_device *dev)
 		init_ring_lists(&dev_priv->ring[i]);
 	for (i = 0; i < I915_MAX_NUM_FENCES; i++)
 		INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list);
+	INIT_WORK(&dev_priv->init_hw_late, i915_gem_init_hw_worker);
 	INIT_DELAYED_WORK(&dev_priv->mm.retire_work,
 			  i915_gem_retire_work_handler);
 	INIT_DELAYED_WORK(&dev_priv->mm.idle_work,
@@ -5442,11 +5476,18 @@  void i915_gem_release(struct drm_device *dev, struct drm_file *file)
 
 int i915_gem_open(struct drm_device *dev, struct drm_file *file)
 {
+	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct drm_i915_file_private *file_priv;
 	int ret;
 
 	DRM_DEBUG_DRIVER("\n");
 
+	/* Flush ring initialisation before userspace can submit its own
+	 * batches, so the hardware initialisation commands are queued
+	 * first.
+	 */
+	flush_work(&dev_priv->init_hw_late);
+
 	file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
 	if (!file_priv)
 		return -ENOMEM;