[08/17,v2] drm/i915: Defer default hardware context initialisation until first open
diff mbox

Message ID 1435243213-22308-9-git-send-email-david.s.gordon@intel.com
State New
Headers show

Commit Message

Dave Gordon June 25, 2015, 2:40 p.m. UTC
In order to fully initialise the default contexts, we have to execute
batchbuffer commands on the GPU engines. But in the case of GuC-based
batch submission, we can't do that until any required firmware has
been loaded, which may not be possible during driver load, because the
filesystem(s) containing the firmware may not be mounted until later.

Therefore, we now allow the first call to the firmware-loading code to
return -EAGAIN to indicate that it's not yet ready, and that it should
be retried when the device is first opened from user code, by which
time we expect that all required filesystems will have been mounted.
The late-retry code will then re-attempt to load the firmware if the
early attempt failed.

If the late retry fails, the current open-in-progress will fail, but
the recovery code will disable GuC submission and reset the GPU and
driver. The next open will therefore be in non-GuC mode, and will be
allowed to complete even if the GuC cannot be loaded or used.

Issue: VIZ-4884
Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
Signed-off-by: Alex Dai <yu.dai@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |    2 ++
 drivers/gpu/drm/i915/i915_gem.c         |    8 ++++-
 drivers/gpu/drm/i915/i915_gem_context.c |   52 ++++++++++++++++++++++++++++---
 drivers/gpu/drm/i915/i915_irq.c         |   48 ++++++++++++++++++++++++++++
 4 files changed, 104 insertions(+), 6 deletions(-)

Patch
diff mbox

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d0ea74e..c27743f 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1848,6 +1848,7 @@  struct drm_i915_private {
 	/* hda/i915 audio component */
 	bool audio_component_registered;
 
+	bool contexts_ready;
 	uint32_t hw_context_size;
 	struct list_head context_list;
 
@@ -2637,6 +2638,7 @@  void i915_queue_hangcheck(struct drm_device *dev);
 __printf(3, 4)
 void i915_handle_error(struct drm_device *dev, bool wedged,
 		       const char *fmt, ...);
+void i915_handle_guc_error(struct drm_device *dev, int err);
 
 extern void intel_irq_init(struct drm_i915_private *dev_priv);
 int intel_irq_install(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b7b22e3..a3e77db 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -5121,9 +5121,15 @@  i915_gem_init_hw(struct drm_device *dev)
 	}
 
 	/* We can't enable contexts until all firmware is loaded */
-	ret = intel_guc_ucode_load(dev, true);
+	ret = intel_guc_ucode_load(dev, false);
+	if (ret == -EAGAIN) {
+		ret = 0;
+		goto out;		/* too early */
+	}
 
 	ret = i915_gem_init_hw_late(dev);
+	if (ret == 0)
+		dev_priv->contexts_ready = true;
 
 out:
 	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index a7e58a8..d6f74bd 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -438,23 +438,65 @@  static int context_idr_cleanup(int id, void *p, void *data)
 	return 0;
 }
 
+/* Complete any late initialisation here */
+static int i915_gem_context_first_open(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	int ret;
+
+	/*
+	 * We can't enable contexts until all firmware is loaded. This
+	 * call shouldn't return -EAGAIN because we pass wait=true, but
+	 * it can still fail with code -EIO if the GuC doesn't respond,
+	 * or -ENOEXEC if the GuC firmware image is invalid.
+	 */
+	ret = intel_guc_ucode_load(dev, true);
+	WARN_ON(ret == -EAGAIN);
+
+	/*
+	 * If an error occurred and GuC submission has been requested, we can
+	 * attempt recovery by disabling GuC submission and reinitialising
+	 * the GPU and driver. We then fail this open() anyway, but the next
+	 * attempt will find that GuC submission is already disabled, and so
+	 * proceed to complete context initialisation in non-GuC mode instead.
+	 */
+	if (ret && i915.enable_guc_submission) {
+		i915_handle_guc_error(dev, ret);
+		return ret;
+	}
+
+	ret = i915_gem_init_hw_late(dev);
+	if (ret == 0)
+		dev_priv->contexts_ready = true;
+	return ret;
+}
+
 int i915_gem_context_open(struct drm_device *dev, struct drm_file *file)
 {
+	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_file_private *file_priv = file->driver_priv;
 	struct intel_context *ctx;
+	int ret = 0;
 
 	idr_init(&file_priv->context_idr);
 
 	mutex_lock(&dev->struct_mutex);
-	ctx = i915_gem_create_context(dev, file_priv);
+
+	if (!dev_priv->contexts_ready)
+		ret = i915_gem_context_first_open(dev);
+
+	if (ret == 0) {
+		ctx = i915_gem_create_context(dev, file_priv);
+		if (IS_ERR(ctx))
+			ret = PTR_ERR(ctx);
+	}
+
 	mutex_unlock(&dev->struct_mutex);
 
-	if (IS_ERR(ctx)) {
+	if (ret)
 		idr_destroy(&file_priv->context_idr);
-		return PTR_ERR(ctx);
-	}
 
-	return 0;
+	return ret;
 }
 
 void i915_gem_context_close(struct drm_device *dev, struct drm_file *file)
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index a6fbe64..82192ff 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2374,6 +2374,54 @@  void i915_handle_error(struct drm_device *dev, bool wedged,
 	i915_reset_and_wakeup(dev);
 }
 
+/**
+ * i915_handle_error - handle a GuC error
+ * @dev: drm device
+ *
+ * If the GuC can't be (re-)initialised, disable GuC submission and
+ * then reset and reinitialise the rest of the GPU, so that we can
+ * fall back to operating in ELSP mode. Don't bother capturing error
+ * state, because it probably isn't relevant here.
+ *
+ * Unlike i915_handle_error() above, this is called with the global
+ * struct_mutex held, so we need to release it after setting the
+ * reset-in-progress bit so that other threads can make progress,
+ * and reacquire it after the reset is complete.
+ */
+void i915_handle_guc_error(struct drm_device *dev, int err)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	DRM_ERROR("GuC failure %d, disabling GuC submission\n", err);
+	i915.enable_guc_submission = false;
+
+	i915_report_and_clear_eir(dev);	/* unlikely? */
+
+	atomic_set_mask(I915_RESET_IN_PROGRESS_FLAG,
+			&dev_priv->gpu_error.reset_counter);
+
+	mutex_unlock(&dev->struct_mutex);
+
+	/*
+	 * Wakeup waiting processes so that the reset function
+	 * i915_reset_and_wakeup doesn't deadlock trying to grab
+	 * various locks. By bumping the reset counter first, the woken
+	 * processes will see a reset in progress and back off,
+	 * releasing their locks and then wait for the reset completion.
+	 * We must do this for _all_ gpu waiters that might hold locks
+	 * that the reset work needs to acquire.
+	 *
+	 * Note: The wake_up serves as the required memory barrier to
+	 * ensure that the waiters see the updated value of the reset
+	 * counter atomic_t.
+	 */
+	i915_error_wake_up(dev_priv, false);
+
+	i915_reset_and_wakeup(dev);
+
+	mutex_lock(&dev->struct_mutex);
+}
+
 /* Called from drm generic code, passed 'crtc' which
  * we use as a pipe index
  */